Full-site crawler for the proofread-novel site 知轩藏书 (zxcs.me), supporting both search and batch download
A Python practice project. The batch-download part borrows from the existing 知X藏书 full-site crawler post, though mine is less thorough: it only crawls a category board item by item, starting from its first page. See the screenshots for the results. I'm a newbie still learning and the code is a bit messy, so please go easy on me.
Crawler layout:

spider
├─ index.py
├─ common
│  ├─ classified.ini
│  └─ method.ini
├─ zxcs_category_spider
│  └─ zxcs_category_spider.py
└─ zxcs_search_spider
   └─ zxcs_search_spider.py
1. Entry point: index.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 11:10
# @Author : yinghihi
from zxcs_category_spider import zxcs_category_spider as c
from zxcs_search_spider import zxcs_search_spider as s
if __name__ == "__main__":
    print('>>> zxcs crawler starting')
    with open('common/method.ini', 'r', encoding='utf-8') as f:
        print(f.read())
    select_num = int(input('>>> Enter a download mode: '))
    print('*' * 74)
    if select_num == 1:
        print('>>> You chose [batch download] mode! The following category boards support batch download...')
        with open('common/classified.ini', 'r', encoding='utf-8') as f:
            print(f.read())
        c.main()
    if select_num == 2:
        print('>>> You chose [search download] mode!')
        print('*' * 74)
        url = f'http://www.zxcs.me/index.php?keyword={input(">>> Enter a book title or author: ")}'
        s.main(url)

2. Batch-download module: zxcs_category_spider.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/7 17:12
# @Author : yinghihi
import requests
import re
from lxml import etree
import os
from requests.exceptions import Timeout, HTTPError, RequestException
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46'}
def get_url(url):
    try:
        html_res = requests.get(url, headers=headers, timeout=30)
        if html_res.status_code == 200:
            return html_res.text
    except Timeout:
        print('Request timed out, retrying...')
        # Retry up to 4 more times; swallow further timeouts and keep trying
        for i in range(1, 5):
            print(f'Retry attempt {i}...')
            try:
                html_res_again = requests.get(url, headers=headers, timeout=30)
                return html_res_again.text
            except Timeout:
                continue
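As an aside, requests can delegate this kind of retry handling to urllib3 at the session level instead of a hand-written loop. A minimal sketch, assuming session-wide retries are acceptable (the retry counts and status list here are illustrative choices, not from the original post):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    """Build a session that retries transient failures automatically."""
    retry = Retry(
        total=4,                              # up to 4 retries, mirroring the manual loop above
        backoff_factor=1,                     # waits 1s, 2s, 4s, 8s between attempts
        status_forcelist=[500, 502, 503, 504],
    )
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

# Usage: session = make_session(); session.get(url, headers=headers, timeout=30)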
def parse_page(html):
    # Work out how many pages this category has. The "尾页" (last page) link
    # has the form <a href="http://www.zxcs.me/sort/<board>/page/<n>" title="尾页">,
    # so both the last page number and the board id can be read off its href.
    page_re = re.compile(r'<a href="http://www\.zxcs\.me/sort/\d+/page/(\d+)" title="尾页">', re.S)
    page_nums = re.findall(page_re, html)[0]
    category_re = re.compile(r'<a href="http://www\.zxcs\.me/sort/(\d+)/page/\d+" title="尾页">', re.S)
    category_num = re.findall(category_re, html)[0]
    for num in range(1, int(page_nums) + 1):
        category_url = f'http://www.zxcs.me/sort/{category_num}/page/{num}'
        category_url_html = requests.get(category_url, headers=headers).text
        category_html_parse = etree.HTML(category_url_html)
        book_urls = category_html_parse.xpath('//dl[@id="plist"]/dt/a/@href')
        for d in book_urls:
            book_html = requests.get(d, headers=headers).text
            book_html_parse = etree.HTML(book_html)
            # Extract the cover image (xpath returns a list, so take the first hit)
            img_url = book_html_parse.xpath('//div[@id="content"]//img/@src')[0]
            img_content = requests.get(img_url, headers=headers).content
            # Extract the book title
            book_title = book_html_parse.xpath('//div[@id="content"]/h1/text()')[0]
            # Extract the synopsis (join all paragraphs under #content)
            book_content = book_html_parse.xpath('//*[@id="content"]/p')
            book_content_text = '\n'.join(p.xpath('string(.)') for p in book_content)
            # Extract the download-page url
            book_download_url = book_html_parse.xpath('//*[@class="down_2"]/a/@href')[0]
            # Extract the archive (.rar) mirror urls from the download page
            book_file_html = requests.get(book_download_url, headers=headers).text
            book_file_parse = etree.HTML(book_file_html)
            book_file_urls = book_file_parse.xpath('//span[@class="downfile"]/a/@href')
            rar_name = re.findall(r'http://\d+\.\d+\.\d+\.\d+/\d+/(.*?\.rar)', book_file_urls[0])[0]
            # Try each download mirror in turn; if one fails, fall back to the
            # next one and tell the user which mirror failed
            book_rar_content = None
            for file_url in book_file_urls:
                book_rar = requests.get(file_url, headers=headers)
                if book_rar.status_code == 200:
                    book_rar_content = book_rar.content
                    break
                print(f'[{book_title}] archive url {file_url} failed, switching to the next mirror...')
            if book_rar_content is None:
                print(f'[{book_title}] all mirrors failed; no archive data for this book, skipping it...')
                continue
            print('=' * 74)
            yield {
                'book_title': book_title,
                'img_content': img_content,
                'book_content': book_content_text,
                'book_rar': book_rar_content,
                'rar_name': rar_name
            }
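One caveat with the dict above: book_title comes straight from the page's h1 element and is used verbatim as a directory name in write_res below. A hedged sketch of a sanitizer, in case a title ever contains characters Windows forbids in paths (the safe_name helper is my addition, not part of the original script):

import re

def safe_name(title: str) -> str:
    """Replace characters that are invalid in Windows file names."""
    # \ / : * ? " < > | are all rejected by NTFS; swap them for underscores
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# e.g. safe_name('某书《上/下》?') -> '某书《上_下》_'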
def write_res(res):
    for item in res:
        print(f'Starting download of [{item["book_title"]}]')
        if not os.path.exists(item['book_title']):
            os.mkdir(item['book_title'])
        # Write each file only if it does not exist yet. Note the existence
        # check must happen *before* open(..., 'wb'), which truncates the file.
        img_path = item['book_title'] + '/' + item['book_title'] + '.jpg'
        if os.path.exists(img_path):
            print(item['book_title'] + '.jpg already exists, skipping!')
        else:
            print(f'Writing cover image => {item["book_title"]}.jpg')
            with open(img_path, 'wb') as f:
                f.write(item['img_content'])
        intro_path = item['book_title'] + '/' + '内容简介.txt'
        if os.path.exists(intro_path):
            print('内容简介.txt already exists, skipping!')
        else:
            print(f'Writing synopsis => {item["book_content"]}')
            with open(intro_path, 'w', encoding='utf-8') as f:
                f.write(item['book_content'])
        rar_path = item['book_title'] + '/' + item['rar_name']
        if os.path.exists(rar_path):
            print(f'{item["rar_name"]} already exists, skipping!')
        else:
            print(f'Downloading archive => {item["rar_name"]}')
            with open(rar_path, 'wb') as f:
                f.write(item['book_rar'])
        print(f'Finished downloading [{item["book_title"]}]')
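Note that each archive is held fully in memory (book_rar is the entire response body) before being written out. For large .rar files, a streamed download keeps memory use flat; a minimal sketch using requests' stream/iter_content API (the download_file name and chunk size are my choices, not from the original):

import requests

def download_file(url: str, dest: str, headers: dict) -> None:
    """Stream a large file to disk in 64 KiB chunks instead of buffering it all."""
    with requests.get(url, headers=headers, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                f.write(chunk)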
def main():
    page_num = input('>>> Enter the number of the board to download: ')
    url = f'http://www.zxcs.me/sort/{page_num}'
    html = get_url(url)
    response = parse_page(html)
    write_res(response)
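One thing worth spelling out: parse_page is a generator, so no book is fetched until write_res iterates over it; main only wires the pipeline together. A tiny standalone illustration of that lazy hand-off (toy functions, not from the original code):

def produce():
    for i in range(3):
        print(f'fetching item {i}')   # runs lazily, only when consumed
        yield i

def consume(items):
    for item in items:
        print(f'writing item {item}')

consume(produce())  # interleaves: fetching 0, writing 0, fetching 1, ...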
3. Search-download module: zxcs_search_spider.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 1:34
# @Author : yinghihi
import requests
import re
from lxml import etree
import os
from requests.exceptions import Timeout
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46'}
def get_url(url):
    try:
        html_res = requests.get(url, headers=headers, timeout=30)
        if html_res.status_code == 200:
            return html_res.text
    except Timeout:
        print('Request timed out, retrying...')
        # Retry up to 4 more times; swallow further timeouts and keep trying
        for i in range(1, 5):
            print(f'Retry attempt {i}...')
            try:
                html_res_again = requests.get(url, headers=headers, timeout=30)
                return html_res_again.text
            except Timeout:
                continue
def parse_html(html):
    # Initialise the result index: maps a menu number to a book url
    index_num = 1
    index_dict = {}
    html_parse = etree.HTML(html)
    # Get the pagination bar of the result listing
    search_nums = html_parse.xpath('//div[@id="pagenavi"]/*')
    print('>>> The search returned the following results:')
    print('*' * 60)
    # If there is no pagination bar, all results fit on a single page
    if not search_nums:
        # Collect every book url on the current page
        book_urls = html_parse.xpath('//*[@id="plist"]/dt/a/@href')
        book_titles = html_parse.xpath('//*[@id="plist"]/dt/a/text()')
        book_dict = dict(zip(book_titles, book_urls))
        for k, v in book_dict.items():
            print(f'{index_num}, {k}: {v}')
            index_dict[index_num] = v
            index_num += 1
    else:
        # The results span several pages
        page_num = len(search_nums)
        search_url_demo = html_parse.xpath('//div[@id="pagenavi"]//a/@href')[0]
        for num in range(1, page_num + 1):
            search_full_url = search_url_demo[:-7] + f'&page={num}'
            # Fetch and parse each result page
            search_url_html = requests.get(search_full_url, headers=headers, timeout=10).text
            search_url_parse = etree.HTML(search_url_html)
            # Collect every book url on the current page
            book_urls = search_url_parse.xpath('//*[@id="plist"]/dt/a/@href')
            book_titles = search_url_parse.xpath('//*[@id="plist"]/dt/a/text()')
            book_dict = dict(zip(book_titles, book_urls))
            for k, v in book_dict.items():
                print(f'{index_num}, {k}: {v}')
                index_dict[index_num] = v
                index_num += 1
    select_num = int(input('>>> Enter the number of the ebook to download: '))
    for key, value in index_dict.items():
        if select_num == key:
            select_url = value
            select_url_html = requests.get(select_url, headers=headers).text
            # Parse the chosen book's detail page
            select_url_parse = etree.HTML(select_url_html)
            # Extract the book title
            book_title = re.findall('<h1>.*?《(.*?)》.*?</h1>', select_url_html)[0]
            # Extract the cover image
            book_img_url = select_url_parse.xpath('//*[@id="content"]//a[@target="_blank"]/@href')[0]
            book_img_content = requests.get(book_img_url, headers=headers).content
            # Extract the synopsis (join all paragraphs under #content)
            book_content = select_url_parse.xpath('//*[@id="content"]/p')
            book_content_text = '\n'.join(p.xpath('string(.)') for p in book_content)
            # Extract the archive mirror urls from the download page
            book_download_url = select_url_parse.xpath('//*[@class="down_2"]/a/@href')[0]
            book_file_html = requests.get(book_download_url, headers=headers).text
            book_file_parse = etree.HTML(book_file_html)
            book_file_urls = book_file_parse.xpath('//span[@class="downfile"]/a/@href')
            rar_name = re.findall(r'http://\d+\.\d+\.\d+\.\d+/\d+/(.*?\.rar)', book_file_urls[0])[0]
            # Try each download mirror in turn; if one fails, fall back to the
            # next one and tell the user which mirror failed
            book_rar_content = None
            for file_url in book_file_urls:
                book_rar = requests.get(file_url, headers=headers)
                if book_rar.status_code == 200:
                    book_rar_content = book_rar.content
                    break
                print(f'[{book_title}] archive url {file_url} failed, switching to the next mirror...')
            if book_rar_content is None:
                print(f'[{book_title}] all mirrors failed; no archive data for this book, aborting this download...')
                url1 = f'http://www.zxcs.me/index.php?keyword={input(">>> Please re-enter a book title or author: ")}'
                main(url1)
            else:
                yield {
                    'book_title': book_title,
                    'img_content': book_img_content,
                    'book_content': book_content_text,
                    'book_rar': book_rar_content,
                    'rar_name': rar_name
                }
def write_res(res):
    for item in res:
        print(f'Starting download of [{item["book_title"]}]')
        if not os.path.exists(item['book_title']):
            os.mkdir(item['book_title'])
        # Write each file only if it does not exist yet (as in the batch module,
        # the existence check must come before open(..., 'wb'), which truncates)
        img_path = item['book_title'] + '/' + item['book_title'] + '.jpg'
        if os.path.exists(img_path):
            print(item['book_title'] + '.jpg already exists, skipping!')
        else:
            print(f'Writing cover image => {item["book_title"]}.jpg')
            with open(img_path, 'wb') as f:
                f.write(item['img_content'])
        intro_path = item['book_title'] + '/' + '内容简介.txt'
        if os.path.exists(intro_path):
            print('内容简介.txt already exists, skipping!')
        else:
            print(f'Writing synopsis => {item["book_content"]}')
            with open(intro_path, 'w', encoding='utf-8') as f:
                f.write(item['book_content'])
        rar_path = item['book_title'] + '/' + item['rar_name']
        if os.path.exists(rar_path):
            print(f'{item["rar_name"]} already exists, skipping!')
        else:
            print(f'Downloading archive => {item["rar_name"]}')
            with open(rar_path, 'wb') as f:
                f.write(item['book_rar'])
def main(offset):
    html = get_url(offset)
    response = parse_html(html)
    write_res(response)
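Note that get_url and write_res are duplicated almost verbatim between the two modules. A hedged refactor sketch: move them into a shared module next to the .ini files (the common/utils.py name is my suggestion, not part of the original layout):

# common/utils.py (hypothetical shared module)
import os
import requests

headers = {'User-Agent': 'Mozilla/5.0 ...'}  # the same UA string as above

def get_url(url):
    """The retry-wrapped fetch shown above, defined once."""
    ...

def write_res(res):
    """The existence-checked writer shown above, defined once."""
    ...

# Both spiders then import the shared copies:
# from common.utils import get_url, write_res, headers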
4. Config files: common/method.ini, classified.ini
method.ini:
==========================================================================
*        [1] == Batch download ==    ||    [2] == Search download ==     *
==========================================================================
classified.ini:
==========================================================================
*                                                                        *
*  Urban Life  [23]   Fantasy   [38]   Supernatural [41]   Sports  [44]  *
*                                                                        *
*  Wuxia       [36]   Xuanhuan  [39]   History      [42]   Gaming  [45]  *
*                                                                        *
*  Xianxia     [37]   Sci-Fi    [40]   Military     [43]   ACG     [55]  *
*                                                                        *
==========================================================================
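Neither .ini file is a real INI: index.py simply prints their contents verbatim with f.read(). If you wanted to validate the board number the user types against classified.ini, a hedged sketch could scrape the bracketed codes out of the banner (the valid_board_ids helper is my addition, not in the original project):

import re

def valid_board_ids(path: str = 'common/classified.ini') -> set:
    """Collect the bracketed board codes (e.g. [23] or 【23】) from the banner."""
    with open(path, 'r', encoding='utf-8') as f:
        return set(re.findall(r'[【\[](\d+)[】\]]', f.read()))

# Possible use in zxcs_category_spider.main():
#     if page_num not in valid_board_ids():
#         print('Unknown board number!')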
[Screenshot: batch download]
[Screenshot: search download]
Nicely complete feature set.

I can follow the simple stuff, but this is too complex for me.

You're all experts here, much respect.

Could you use your approach to take a look at this thread for me: https://www.52pojie.cn/thread-1437556-1-1.html

不是童画 posted on 2021-5-12 09:10:
    Could you use your approach to take a look at this thread for me: https://www.52pojie.cn/thread-1437556-1-1.html
Chasing after the big shots all the way.

Suggest compiling a finished build, the kind that can search and download.

Could you package it up? I really need it.
Traceback (most recent call last):
File "/Users/daichaoyu/Desktop/spider/index.py", line 14, in <module>
c.main()
File "/Users/daichaoyu/Desktop/spider/zxcs_category_spider/zxcs_category_spider.py", line 122, in main
write_res(response)
File "/Users/daichaoyu/Desktop/spider/zxcs_category_spider/zxcs_category_spider.py", line 78, in write_res
for item in res:
File "/Users/daichaoyu/Desktop/spider/zxcs_category_spider/zxcs_category_spider.py", line 27, in parse_page
page_nums = re.findall(page_re, html)
IndexError: list index out of range
Process finished with exit code 1
I don't know how to fix this, please help. The search mode works fine, but batch mode always gets stuck at this step.

Thanks for sharing!
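For reference, that IndexError means re.findall(...)[0] in parse_page found no "尾页" (last page) link, e.g. for a category that fits on a single page or after a markup change. A minimal defensive sketch, assuming a missing link should be treated as a single page (my interpretation, not confirmed against the site):

import re

def extract_last_page(html: str) -> str:
    """Return the last-page number from the '尾页' link, or '1' when absent."""
    matches = re.findall(
        r'<a href="http://www\.zxcs\.me/sort/\d+/page/(\d+)" title="尾页">', html)
    return matches[0] if matches else '1'  # assume one page if the link is missing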