自动化基础之爬虫操作1

klmatao 发表于 2021-7-30 11:26

本帖最后由 klmatao 于 2021-7-30 11:33 编辑

1. 使用requests库，爬取绝对领域的软妹子图片
```python
import os

import requests
from lxml import etree
from threading import Thread

# Request headers: a desktop-Chrome User-Agent string.
# The trailing space after "Safari/537.36" is part of the original value.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.164 Safari/537.36 "
)
headers = {"User-Agent": _USER_AGENT}

# Fetch a page and return its source as a string
def get_html(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source as ``str``, or ``None`` when the status code is not
        200 or the request fails (the error is printed, not raised).
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of requests.get() is ``params`` (the query string), so
        # the original call never actually sent the User-Agent header.
        # The timeout keeps a stalled server from blocking the caller forever.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"获取当前【{url}】错误的原因是：", e)
        return None
    if resp.status_code != 200:
        return None
    resp.encoding = 'utf-8'
    return resp.text

# Fetch an image and return its raw bytes
def get_content(url):
    """Download ``url`` and return the response body as bytes.

    Args:
        url: Address of the resource (an image) to fetch.

    Returns:
        The body as ``bytes``, or ``None`` when the status code is not 200
        or the request fails (the error is printed, not raised).
    """
    try:
        # Pass ``headers`` by keyword; positionally it would be interpreted
        # as ``params`` and the User-Agent would not be sent.
        resp = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"获取当前【{url}】错误的原因是：", e)
        return None
    if resp.status_code != 200:
        return None
    return resp.content

# Parse page source and return the parsed tree object
def parse_html(text):
    """Build an lxml element tree from the HTML string ``text``.

    A parsing failure is reported on stdout and results in ``None``.
    """
    try:
        return etree.HTML(text)
    except Exception as err:
        print("解析源代码出错", err)
        return None

# Save the pictures of one gallery
def save_picture(url, title, i):
    """Download every image of the gallery page ``url`` into a folder.

    Images are written to ``.\\软妹子\\<i>\\<title>\\<index>.jpg``.

    Args:
        url: Address of the gallery detail page.
        title: Gallery title, used as the folder name.
        i: Number of the listing page the gallery came from; used as the
            parent folder name.

    Errors are printed and swallowed so that one failing gallery does not
    stop the others.
    """
    try:
        tree = parse_html(get_html(url))
        if tree is None:
            # Page could not be fetched or parsed; nothing to save.
            return
        hrefs = tree.xpath("//div[@class='entry-content']/img/@src")
        # Build the folder path and create it if it does not exist yet.
        base_path = os.path.join(".", "软妹子", str(i), title)
        os.makedirs(base_path, exist_ok=True)
        # Fetch each image URL individually. The original passed the whole
        # ``hrefs`` list to get_content() and reused ``i`` as the loop index.
        for index, href in enumerate(hrefs):
            content = get_content(href)
            if content is None:
                # Skip images that failed to download instead of aborting.
                continue
            with open(os.path.join(base_path, f"{index}.jpg"), "wb") as f:
                f.write(content)
    except Exception as e:
        print(f"sacePic方法调用出错原因：", e)

# Entry function
def main(num):
    """Crawl listing pages 1..``num`` and download every gallery on them.

    Each gallery is downloaded in its own thread. All threads of a page are
    started first and then joined, so the "下载完成" message is printed only
    after the corresponding download has really finished.

    Args:
        num: Number of listing pages to crawl, starting at page 1.
    """
    # Outer loop: one iteration per listing page.
    for i in range(1, num + 1):
        print(f"第【{i}】页下载开始")
        base_url = f"https://www.jdlingyu.com/tuji/mzitu/page/{i}"
        e = parse_html(get_html(base_url))
        if e is None:
            # Fetch/parse failed; skip this page instead of crashing on
            # ``None.xpath``.
            print(f"第【{i}】页获取失败，已跳过")
            print("-" * 30)
            continue
        wide_titles = e.xpath("//div/h2/a/text()")
        wide_hrefs = e.xpath("//div/h2/a/@href")
        # Inner loop: one worker thread per gallery link on the page.
        workers = []
        for wide_title, wide_href in zip(wide_titles, wide_hrefs):
            worker = Thread(target=save_picture, args=(wide_href, wide_title, i))
            worker.start()
            workers.append((wide_title, worker))
        # Wait for the downloads before reporting them as complete.
        for wide_title, worker in workers:
            worker.join()
            print(f"第【{i}】页【{wide_title}】下载完成")

        print("-" * 30)


if __name__ == "__main__":
    main(3)
    print("主线程结束")

```

2. 使用requests库，爬取笔趣阁龙虎榜小说

```python
import requests
from lxml import etree
# Site root
base_url = "https://www.quge6.com"
# Ranking page listing the books to download
targeturl = base_url + "/bqglhb.html"
# Browser-like User-Agent for the HTTP requests
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    ),
}

def chapter_downloader(url, titlename):
    """Download one chapter and append it to the book's text file.

    The chapter heading and body are appended to
    ``<base_path>\\<titlename>.txt`` (``base_path`` is a module-level name).

    Args:
        url: Address of the chapter page.
        titlename: Book title; used as the output file name.
    """
    # Fetch the page and decode it as UTF-8.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    selector = etree.HTML(response.text)
    # xpath() returns a list of text nodes; use the first <h1> as the
    # chapter name (concatenating the list itself with a str raises).
    names = selector.xpath('//h1/text()')
    name = names[0] if names else ''
    # Use the ``titlename`` parameter; the original read a global ``title``.
    print(f'正在下载章节{titlename}...')
    # Chapter body: join all text nodes of the content container.
    content = ''.join(selector.xpath('//div[@id="content"]/text()'))

    with open(r'{0}\{1}.txt'.format(base_path, titlename), 'a', encoding='utf-8') as file:
        file.write(name + '\n' + content + '\n')

import os
from urllib.parse import urljoin

# Output directory; created up front because open(..., 'a') does not
# create missing parent directories.
base_path = r".\txtfile"
os.makedirs(base_path, exist_ok=True)

# ``headers`` must be a keyword argument; positionally it becomes ``params``.
resp = requests.get(targeturl, headers=headers)
resp.encoding = "utf-8"

e = etree.HTML(resp.text)
# Titles and links of every book on the ranking page
titles = e.xpath("//div[@class='topbooks']/ul/li/a/@title")
hrefs = e.xpath("//div[@class='topbooks']/ul/li/a/@href")
# NOTE(review): the right-hand side of this assignment was lost in the
# posted source; resolving each link against base_url is the presumed
# intent. urljoin leaves already-absolute links unchanged.
hrefs = [urljoin(base_url, href) for href in hrefs]

# Visit every book page and download its chapters
for title, href in zip(titles, hrefs):
    response = requests.get(href, headers=headers)
    response.encoding = 'utf-8'
    selector = etree.HTML(response.text)

    urls = selector.xpath('//div[@id="list"]/dl/dd/a/@href')
    urls = [urljoin(base_url, url) for url in urls]
    # Only the first two chapters of each book are downloaded.
    for url in urls[:2]:
        chapter_downloader(url, title)
```

3. 使用selenium库，重写爬取笔趣阁龙虎榜
```python
import os

from selenium import webdriver
from threading import Thread

# Create a browser object, load the page and return the driver
def get_html(url):
    """Open ``url`` in a new PhantomJS driver and return that driver.

    Args:
        url: Address of the page to load.

    Returns:
        The driver with the page loaded, or ``None`` when the driver could
        not be created or the page could not be loaded (the error is
        printed). The caller is responsible for quitting a returned driver.
    """
    phantomjs = None
    try:
        phantomjs = webdriver.PhantomJS(
            executable_path=r"D:\Tools\PythonInstall\phantomjs-2.0.0-windows\bin\phantomjs",
            service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
        phantomjs.get(url)
        return phantomjs
    except Exception as e:
        print(f"【get_html】调用错误原因：{e}")
        # Do not leave a browser process behind when loading failed.
        if phantomjs is not None:
            phantomjs.quit()
        return None

# Visit one chapter page and store its content
def save_books(url, book_name, base_path):
    """Download one chapter and append it to ``<base_path>\\<book_name>.txt``.

    Args:
        url: Address of the chapter page.
        book_name: Book title; used as the output file name.
        base_path: Directory the book file is written to.

    Errors are printed and swallowed. The browser is always shut down, even
    when reading the page fails.
    """
    content_phantomjs = None
    try:
        # Browser for the chapter page
        content_phantomjs = get_html(url)
        if content_phantomjs is None:
            # get_html already reported the failure.
            return
        # Chapter title
        chapter_title = content_phantomjs.find_element_by_xpath("//div/h1").text
        # Chapter body
        chapter_content = content_phantomjs.find_element_by_id("content").text
        print(f"开始下载{chapter_title}")
        # Open the file only once the text is available and write it in a
        # single call, so the file is not held open during the page load.
        with open(base_path + rf"\{book_name}.txt", "a", encoding="utf-8") as file:
            file.write(str(chapter_title) + "\n" + "\n" * 3 + str(chapter_content))
        print(f"{chapter_title}下载完成")
    except Exception as e:
        print(f"【save_books】调用错误原因：{e}")
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        if content_phantomjs is not None:
            content_phantomjs.quit()

# Main entry function
def main(url, books, number):
    """Download the first chapters of the top books listed on ``url``.

    Args:
        url: Address of the ranking page.
        books: Maximum number of books to download.
        number: Maximum number of chapters to download per book.

    Each chapter is fetched in its own thread via ``save_books``. Errors are
    printed and swallowed.
    """
    book_phantomjs = None
    try:
        # Browser for the ranking page
        book_phantomjs = get_html(url)
        if book_phantomjs is None:
            return
        title_elements = book_phantomjs.find_elements_by_xpath("//div[@class='topbooks']/ul/li/a")
        # Book names and links to their chapter lists.
        # NOTE(review): both right-hand sides were lost in the posted source;
        # reading the anchors' "title" and "href" attributes is the presumed
        # intent.
        book_names = [element.get_attribute("title") for element in title_elements]
        books_hrefs = [element.get_attribute("href") for element in title_elements]
        # Limit to exactly ``books`` books (the original counter check ran
        # after processing and therefore let one extra book through).
        for book_name, book_href in list(zip(book_names, books_hrefs))[:books]:
            # Browser for the book's chapter list
            chapter_phantomjs = get_html(book_href)
            if chapter_phantomjs is None:
                continue
            try:
                # Book category taken from the breadcrumb text.
                # NOTE(review): the index after split('>') was lost in the
                # posted source; [1] (second breadcrumb segment) is assumed.
                breadcrumb = chapter_phantomjs.find_element_by_class_name("con_top").text
                parts = breadcrumb.split('>')
                book_type = (parts[1] if len(parts) > 1 else parts[0]).strip()
                # Chapter links; the first 12 entries are skipped as in the
                # original.
                chapter_hrefs_elements = chapter_phantomjs.find_elements_by_xpath("//div[@id='list']/dl/dd/a")
                chapter_hrefs = [chapter_hrefs_element.get_attribute("href")
                                 for chapter_hrefs_element in chapter_hrefs_elements][12:]
            finally:
                # The links are plain strings now; release the browser.
                chapter_phantomjs.quit()
            base_path = rf"E:\47期课程笔记\第二阶段 python\上课练习\selenium_1\小说下载目录\{book_type}"
            # makedirs also creates missing parent directories.
            os.makedirs(base_path, exist_ok=True)
            # Download at most ``number`` chapters, one thread per chapter.
            for chapter_href in chapter_hrefs[:number]:
                s1 = Thread(target=save_books, args=(chapter_href, book_name, base_path))
                s1.start()
    except Exception as e:
        print(f"【main】调用错误原因：{e}")
    finally:
        if book_phantomjs is not None:
            book_phantomjs.quit()


if __name__ == '__main__':
    # Ranking page used as the entry link
    base_url = "https://www.quge6.com/bqglhb.html"
    main(base_url, 5, 3)

```

klmatao 发表于 2021-8-2 22:22

QingYi. 发表于 2021-8-2 15:19
绝对领域简单版如下。import os

import requests

兄弟，headers你都不写，爬取网页绝大多数都会得不到正确的响应

hellomonkiy 发表于 2021-7-30 13:46

Mao19900902 发表于 2021-7-30 11:31
爬虫怎么操作的，小白。

让LZ给你开发一个遥控器。上下左右，前进后退，跳[狗头]

Mao19900902 发表于 2021-7-30 11:31

爬虫怎么操作的，小白。

disk008 发表于 2021-7-30 11:56

感谢分享，学习了

LWWPJ 发表于 2021-7-30 12:01

完全看不懂，好难

54264 发表于 2021-7-30 12:01

感谢兄弟分享

SuigetsuRe 发表于 2021-7-30 12:19

谢谢兄弟

C2021 发表于 2021-7-30 12:19

感谢分享

yemind 发表于 2021-7-30 12:21

赞32个的平方次

blademasterlu 发表于 2021-7-30 12:28

感谢分享，学习学习！！！

93839454 发表于 2021-7-30 12:28

感谢分享，学习学习！！！

页: [1] 2 3 4

吾爱破解 - 52pojie.cn's Archiver

自动化基础之爬虫操作1