python异步下载笔趣阁小说爬虫
本帖最后由 zboi 于 2024-3-21 20:33 编辑自学了一点python爬虫,代码写的不是那么好看,但效果挺不错的,自己亲测下载速度可以达到100章每秒。
爬取的笔趣阁网站是https://www.biqg.cc/,如果想要爬取别的网站,也可以自己修改解析书籍信息的部分。
这个爬虫提供了两种方式,一个是本地搜索下载,一个是笔趣阁书籍id下载。
本地搜索:
由于爬取的笔趣阁网站的搜索功能存在一些问题,搜索失灵时不灵的,所以我直接将网站的书籍信息与书籍id给爬下来,记录到一个all_book.txt文件中,充当书籍目录。使用时只需要输入书籍名或者作者名,会输出100个结果。
书籍id下载:
这个需要自己到上面提供的网站去找书籍id,就是每个小说链接的数字部分。
效果如下:
需要注意的是,下载过程中,书籍章节序号显示是乱的,但最后合并时会将章节按照顺序排好,所以不用担心乱章问题。
代码如下:
import re
import requests
from lxml import etree
import time
from os import remove
import aiofiles
from aiohttp import ClientSession, ClientTimeout
import asyncio
from colorama import init
def len_str(string):
count = 0
for ch in string:
if ch >= '\u007f':
count += 1
return count
def width(string, length):
if length < len_str(string):
return 0
else:
return length - len_str(string)
# 获取小说书名、目录、章节链接
def get_book_info(url):
try:
response1 = requests.get(url, cookies = cookies, headers = headers)
html1 = etree.HTML(response1.text, parser = etree.HTMLParser(encoding = 'utf-8'))
chapter_name1 = html1.xpath('/html/body/div/dl/dd/a/text()')
chapter_name2 = html1.xpath('/html/body/div/dl/span/dd/a/text()')
chapter_url1 = html1.xpath('/html/body/div/dl/dd/a/@href')
chapter_url2 = html1.xpath('/html/body/div/dl/span/dd/a/@href')
chapter_names = chapter_name1 + chapter_name2 + chapter_name1[-10:]
chapter_urls = chapter_url1 + chapter_url2 + chapter_url1[-10:]# 拼接完整章节目录和链接
novel_name = html1.xpath('/html/body/div/div/h1/text()')# 获取小说书名
return chapter_names, chapter_urls, novel_name
except Exception as e:
print(f'\033[31m获取小说书名出错,出错原因\033[0m:{e}')
return [], [], ['error']
# 单章小说内容下载
async def singe_chapter_download(url1, name1, sem):
chapter_url = f"https://www.biqg.cc/{url1}"# 拼接章节网址
i = 0
async with sem:
while i < 5:
i += 1
try:
timeout = ClientTimeout(total = 20)
async with ClientSession(headers = headers, cookies = cookies, timeout = timeout) as session:
async with session.get(chapter_url) as resp1:
html2 = etree.HTML(await resp1.text(), parser = etree.HTMLParser(encoding = 'utf-8'))
singe_content = html2.xpath('//*[@id="chaptercontent"]/text()')# 获取小说章节内容
result = re.findall(r'第(.*?)章', singe_content)
if len(result):
del singe_content# 去除可能出现的重复标题
content = singe_content# 去除网站附带的广告链接
name2 = strinfo.sub('_', name1)# 去除小说章节书名中的特殊字符,避免生成章节文件时出错
async with aiofiles.open(f"./小说/{name2}.txt", "w", encoding = "utf-8") as f:# 在小说目录下创建临时的单章txt
await f.write(name2 + '\r\r\r')
for lists in content:
await f.write(lists + '\r\r')
name2_width = 60 - len_str(name2)
print(f'{name2:<{name2_width}}finish')
break
except Exception as e:
print(f'{name1} false {i}/5')
print(e)
# 创建异步任务
async def create_tasks(name_chapter, url_chapter, lens):
tasks = []
if lens > 1000:
sema = 1000
else:
sema = lens
sem = asyncio.Semaphore(sema)# 设置同时进行的异步数量,可以根据上面自行设定,数量越大,下载越快
for url4, name3 in zip(url_chapter, name_chapter):
tasks.append(asyncio.create_task(singe_chapter_download(url4, name3, sem)))# 创建任务
await asyncio.gather(*tasks)
def start_download(url):
chapter_name, chapter_url, novel_name = get_book_info(f'https://www.biqg.cc/{url}')# 获取小说目录,对应的网页链接,书名
length = len(chapter_name)
if length:
print(f"\033}》共{length}章, 开始下载!!\033[0m\n\n")
time1 = time.time()
loop.run_until_complete(create_tasks(chapter_name, chapter_url, length))# 提交任务
time2 = time.time()
with open(f'./小说/{novel_name}.txt', 'w', encoding = 'utf-8') as f1:# 将分散的小说章节写入一个{书名}.txt中
for chapter_names in chapter_name:
chapter_name2 = strinfo.sub("_", chapter_names)
try:
with open(f'./小说/{chapter_name2}.txt', 'r', encoding = 'utf-8') as f2:
text1 = f2.read()
f1.write(text1)
remove(f"./小说/{chapter_name2}.txt")# 移除已写入{书名}.txt的临时章节
except Exception as e:
print(f'{chapter_names}false 错误原因:{e}')
print('==============================下载完成==============================\n')
print(f'共耗时:\033[33m{time2 - time1:.2f}s\033[0m\n\n')
print(f'\033}》已下载!!!!\033[0m\n\n\n')
else:
print('error')
if __name__ == '__main__':
cookies = {
}
headers = {
'authority': 'www.biqg.cc',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', 'cache-control': 'no-cache', 'pragma': 'no-cache',
'referer': 'https://www.biqg.cc/', 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Microsoft Edge";v="120"', 'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate', 'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
}
# get_title('https://www.bige3.cc/book/66/') 7293788896888884263
init(autoreset = True)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
strinfo = re.compile('[/:*?"<>|\\\\]')# 匹配字符串中特殊的字符
print('小说保存在exe同目录下的小说文件夹下')
a = input('笔趣阁本地搜索: 1\n笔趣阁书籍id下载: 2\n请选择:')
if a == '1':
with open('./小说/all_books.txt', 'r', encoding = 'utf-8') as f:
books = f.read()
books = eval(books)
while True:
k, target = 1, []
a = input('本地搜索已启动:')
for dic in books:
result = re.findall(f'{a}', dic + dic)
if len(result):
target.append(dic)
print(f'{k:<4}{dic:^{width(dic, 60)}}{dic:<{width(dic, 40)}}')
k = k + 1
if k > 100:
break
if len(target) == 0:
print('小说不存在,请重新输入')
continue
choose = input('请输入序号(批量下载请使用空格分隔序号, 重新搜索请输入0, 全部下载请输入101):')
if choose == '0':
continue
elif choose == '101':
for book in target:
start_download(book)
time.sleep(0.5)
else:
choose_list = choose.split(' ')
for ids in choose_list:
if ids.isdigit():
if int(ids) <= len(target):
if int(ids):
start_download(target)
time.sleep(0.5)
else:
continue
else:
print('\033[31m序号超出范围,请重新搜索!!\033[0m')
else:
print('\033[31m请输入正确格式的书籍序号!!!!\033[0m')
elif a == '2':
print('\n请到 \033[32mhttps://www.biqg.cc/\033[0m 网站搜索你想下的小说,并获取相应的的书籍id\n')
while True:
book_id = input('请输入书籍id(即小说链接数字部分):')
start_download(f"/book/{book_id}/")
all_books.txt文件以及源码 all_books.txt文件以及源码链接:
https://boi.lanzoub.com/b0596fkef
密码:2333 本帖最后由 archon1 于 2024-3-21 22:59 编辑
133行前建议增加:
if not os.path.exists(“小说”):
os.mkdir(“小说")
开始import部分增加 import os
否则直接运行时会一直报错,需要手动建立“小说”目录。 def start_download(url):
chapter_name, chapter_url, novel_name = get_book_info(f'https://www.biqg.cc/{url}')# 获取小说目录,对应的网页链接,书名
#print(strinfo.sub("",novel_name))
length = len(chapter_name)
if length:
print(f"\033}》共{length}章, 开始下载!!\033[0m\n\n")
time1 = time.time()
loop.run_until_complete(create_tasks(chapter_name, chapter_url, length))# 提交任务
time2 = time.time()
novel_name=strinfo.sub("",novel_name) # 去除小说名中的特殊字符,避免生成文件名时出错
with open(f'./小说/{novel_name}.txt', 'w', encoding = 'utf-8') as f1:# 将分散的小说章节写入一个{书名}.txt中
for chapter_names in chapter_name:
chapter_name2 = strinfo.sub("_", chapter_names)
try:
with open(f'./小说/{chapter_name2}.txt', 'r', encoding = 'utf-8') as f2:
text1 = f2.read()
f1.write(text1)
remove(f"./小说/{chapter_name2}.txt")# 移除已写入{书名}.txt的临时章节
except Exception as e:
print(f'{chapter_names}false 错误原因:{e}')
print('==============================下载完成==============================\n')
print(f'共耗时:\033[33m{time2 - time1:.2f}s\033[0m\n\n')
print(f'\033[32m《{novel_name}》已下载!!!!\033[0m\n\n\n')
else:
print('error')
例如:book/159889/柯*******女有特殊符号,保存不了文件。 推荐使用下边帖子的方式插入代码
【公告】发帖代码插入以及添加链接教程(有福利)
https://www.52pojie.cn/thread-713042-1-1.html
(出处: 吾爱破解论坛)
论坛上有个类似的fanqie_tool_windows成品可供lz参考
另代码发的很丑{:17_1086:}估计是手机发的没用代码格式 苏紫方璇 发表于 2024-3-21 20:10
推荐使用下边帖子的方式插入代码
【公告】发帖代码插入以及添加链接教程(有福利)
https://www.52pojie. ...
十分感谢,代码直接复制黏贴效果确实不太好看 正在学习模仿中!感谢大神 感谢分享{:1_893:} 很需要,谢谢分享(🙏ˊᗜˋ*) 看了楼主的代码,又学到了