I'm a Python beginner who has been following video tutorials for a while. In the spirit of learning by doing, I tried writing this crawler. It still has plenty of rough edges: it is single-threaded, so downloads are slow (still studying the multithreading modules~~~), and I haven't really figured out exception handling yet, so if a single request fails the whole program aborts. I'm still learning how to fix that. I'm posting this partly to ask everyone for advice and speed up my learning.
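While I work that out, here is a minimal sketch of the direction I'm considering for exception handling: wrap each request in a small retry loop so that a single failed request doesn't abort the whole run. fetch_with_retry, the retry count, and the timeout are made-up placeholders for illustration, not part of the crawler below:

import time
import requests
from requests.exceptions import RequestException

# Hypothetical helper, for illustration only: retry a GET a few times
# before giving up, so one bad response doesn't kill the whole run.
def fetch_with_retry(url, headers=None, retries=3, delay=2):
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except RequestException as e:
            print('Attempt {} failed: {}'.format(attempt, e))
            if attempt == retries:
                raise  # out of retries; let the caller decide what to do
            time.sleep(delay)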
What the crawler currently does:
1. Grabs the volume titles from the comic's page, creates a local folder with the same name for each volume, and automatically saves every page of that volume into its folder.
Update, May 2:
1. Reorganized the code; downloads feel faster to me.
2. Added a pick-and-download feature: the crawler now lists all search results, and entering the matching number downloads that resource.
import requests
import re
import os
from lxml import etree
from requests.exceptions import RequestException

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46'}

def get_url(url):
    """Request a page and return its HTML text."""
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # a non-200 now raises instead of silently returning None
        return response.text
    except RequestException:
        # Let the caller see the failure; a retry loop could go here later.
        raise

def select_comic(html_res):
    """Show the search results, let the user pick one, and return the first reader page."""
    tree = etree.HTML(html_res)  # renamed from parse_html so it doesn't shadow the function below
    search_title = tree.xpath('//div[contains(@class,"comicbook-index")]/a/@title')
    search_title_url = tree.xpath('//div[contains(@class,"comicbook-index")]/a/@href')
    search_title_url_full = ['https://www.manhuadb.com' + i for i in search_title_url]
    search_dict = dict(zip(search_title, search_title_url_full))
    index_dict = {}
    print('Search results:')
    print('*' * 60)
    for num, (title, link) in enumerate(search_dict.items(), 1):
        print('{}. {}: {}'.format(num, title, link))
        index_dict[num] = link
    print('*' * 60)
    select_num = int(input('Enter the number of the comic to download: '))
    select_url = index_dict[select_num]  # direct lookup replaces the old loop over the dict
    select_url_html = get_url(select_url)
    # The "开始阅读" (start reading) button links to the first page of the reader.
    select_url_con_re = re.compile(r'<a class="button-read" href="(.*?)">开始阅读</a>', re.S)
    select_url_con = re.findall(select_url_con_re, select_url_html)[0]
    final_url = 'https://www.manhuadb.com/' + select_url_con
    return get_url(final_url)

def parse_html(res):
    """Walk every volume/chapter and yield the HTML of each of its pages."""
    all_cha_recompile = re.compile(
        r'<li class="sort_div.*?" data-sort="\d+">\n.*?<a class=".*?" href="/manhua/(.*?)">.*?</a>\n.*?</li>',
        re.S)
    # Compiled once here instead of once per chapter as before.
    page_num_recompile = re.compile(
        r'<li class="breadcrumb-item active" aria-current="page">\n.*?<a href="/manhua/.*?">.*?</a> / 第 <span class="c_nav_page">\d+</span> 页・共 (\d+) 页\n.*?</li>',
        re.S)
    all_cha_url = re.findall(all_cha_recompile, res)
    for i in all_cha_url:
        all_full_url = 'https://www.manhuadb.com/manhua/' + i
        all_full_content = get_url(all_full_url)
        page_nums = re.findall(page_num_recompile, all_full_content)
        for num in range(1, int(page_nums[0]) + 1):
            # Page URLs insert '_p<n>' before the trailing '.html' (the last 5 characters).
            full_urls = all_full_url[:-5] + '_p' + str(num) + all_full_url[-5:]
            yield get_url(full_urls)  # now sends headers too (they were missing here before)

def img_parse(img_content):
    """Pull the chapter title, image file name, and image bytes out of each page."""
    # All three patterns are compiled once, outside the loop.
    title_recompile = re.compile(
        r'<li class="breadcrumb-item active" aria-current="page">\n.*?<a href="/manhua/.*?">(.*?)</a> / 第 <span class="c_nav_page">\d+</span> 页・共 \d+ 页\n.*?</li>',
        re.S)
    recompile = re.compile(r'<img class="img-fluid show-pic" src="(.*?)" />', re.S)
    img_name_re = re.compile(
        r'<img class="img-fluid show-pic" src="https://i2.manhuadb.com/.*?/.*?/.*?/(.*?)" />', re.S)
    for item in img_content:
        title = re.findall(title_recompile, item)[0]
        src = re.findall(recompile, item)[0]
        pic_request = requests.get(src, headers=headers)
        img_name = re.findall(img_name_re, item)[0]
        yield {'img_name': img_name, 'img_content': pic_request.content, 'chapter_title': title}

def write_res(res):
    """Save each image into a folder named after its volume/chapter."""
    for item in res:
        chapter_dir = item['chapter_title']
        os.makedirs(chapter_dir, exist_ok=True)  # replaces the duplicated if/else write branches
        path = chapter_dir + '/' + item['img_name']
        print('Downloading: ' + path)
        with open(path, 'wb') as f:
            f.write(item['img_content'])

def main(search_url):  # renamed from 'offset': the argument is the search URL
    html = get_url(search_url)
    selection = select_comic(html)
    print('Comic selected, starting to parse!')
    response = parse_html(selection)
    print('Pages parsed, fetching every comic page...')
    img_res = img_parse(response)
    print('Starting download...')
    print('*' * 60)
    write_res(img_res)


if __name__ == '__main__':
    print('===== Program start =====')
    url = 'https://www.manhuadb.com/search?q=' + input('Enter a comic title: ')
    main(url)
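
As for multithreading, this is the rough shape of what I'm studying: hand the per-image downloads to a thread pool from the standard library's concurrent.futures. The sketch assumes img_parse were changed to yield the image URL under a 'src' key instead of the already-downloaded bytes; download_one and download_all are hypothetical names for illustration, and headers is the dict defined at the top of the script.

from concurrent.futures import ThreadPoolExecutor
import os
import requests

# Hypothetical rework, for illustration: fetch and save one image.
# 'job' is assumed to be {'src': ..., 'img_name': ..., 'chapter_title': ...}.
def download_one(job):
    pic = requests.get(job['src'], headers=headers)
    os.makedirs(job['chapter_title'], exist_ok=True)
    path = job['chapter_title'] + '/' + job['img_name']
    with open(path, 'wb') as f:
        f.write(pic.content)
    print('Downloaded: ' + path)

def download_all(jobs, workers=8):
    # Run up to 'workers' image requests at the same time; list() forces
    # the map to finish and re-raises any worker exception.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(download_one, jobs))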