本帖最后由 wanwfy 于 2021-6-17 08:33 编辑
楼主,给你一个我以前写过的版本
借用了大佬写的异步重试装饰器
# -*- coding: utf-8 -*-
# @时间 : 2020-03-03 20:05
# @作者 : 陈祥安
# @文件名 : retry_helper.py
# @公众号: Python学习开发
import asyncio
import random
import traceback
from functools import wraps
from loguru import logger
class RetryTimeout(Exception):
    """Raised when a coroutine keeps failing after the allowed retry attempts."""

    def __init__(self, info):
        # Forward to Exception so args/pickling behave like a normal exception.
        super().__init__(info)
        self.info = info

    def __str__(self):
        return self.info
def aio_retry(**kwargs):
    """Decorator factory that retries an async function on any exception.

    Keyword options (all optional):
        max / max_sleep_time: upper bound (seconds) of the random sleep
            between attempts; no sleep when unset.
        min / min_sleep_time: lower bound of that sleep, default 0.
        attempts: maximum number of times the coroutine may run, default 3.
        error: if True, raise RetryTimeout once attempts are exhausted;
            otherwise log the failure and return None.
    """
    # Accept both the short key ("max") and the descriptive key
    # ("max_sleep_time"), since callers in this file use the long form.
    max_sleep_time: int = kwargs.pop("max", kwargs.pop("max_sleep_time", None))
    min_sleep_time: int = kwargs.pop("min", kwargs.pop("min_sleep_time", 0))
    attempts: int = kwargs.pop("attempts", 3)
    error: bool = kwargs.pop("error", False)

    def retry(func):
        @wraps(func)
        async def decorator(*args, **_kwargs):
            retry_count = 1
            error_info = ""
            while True:
                if retry_count > attempts:
                    if error:
                        raise RetryTimeout("Too many errors")
                    logger.error(f"After retries {retry_count} times, an error occurred.here is detail{error_info}")
                    break
                try:
                    return await func(*args, **_kwargs)
                except Exception:
                    # Keep only the traceback of the final failed attempt.
                    if retry_count == attempts:
                        error_info = f"{traceback.format_exc()}"
                    # BUG FIX: always advance the counter. The original only
                    # incremented on non-final attempts, so once the last
                    # attempt failed the loop retried it forever.
                    retry_count += 1
                    if max_sleep_time:
                        sleep_time = random.randint(min_sleep_time, max_sleep_time)
                        await asyncio.sleep(sleep_time)

        return decorator

    return retry
下载所有章节,按章节保存
import asyncio
import aiohttp
import re
from lxml import etree
from fake_useragent import UserAgent
from tools.retry_helper import aio_retry
from urllib.parse import urljoin
import aiofiles
from asyncio import Queue
from os.path import exists
import os
# Random User-Agent factory (note: unused below — the headers dict
# hard-codes its own User-Agent string).
UA = UserAgent()
# Global queue of chapter dicts shared by the down_chapter workers.
q = Queue()
# Table-of-contents URL of the novel to download.
index = 'http://www.xbiquge.la/0/10/'
# index = 'https://www.xbiquge.cc/book/315/'
# index = 'https://www.biquge.com.cn/book/44017/'
# Browser-like request headers; Host, Cookie and the cache-validation
# fields are specific to www.xbiquge.la — presumably copied from a real
# browser session, so they must be updated if `index` changes site.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '_abcde_qweasd=0; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1606587778; bdshare_firstime=1606587779103; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1606588268',
'Host': 'www.xbiquge.la',
'If-Modified-Since': 'Wed, 23 Sep 2020 23:49:07 GMT',
'If-None-Match': 'W/"5f6bdef3-4bad"',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4331.0 Safari/537.36 Edg/89.0.713.0'
}
# BUG FIX: the decorator pops the key "max", not "max_sleep_time", so the
# original back-off setting was silently ignored (it leaked into the unused
# **kwargs and max stayed None — no sleep between retries).
@aio_retry(attempts=5, max=15)
async def down_html(session, url):
    """Fetch *url* with *session* and return the body decoded as UTF-8.

    raise_for_status() turns HTTP error statuses into
    aiohttp.ClientResponseError, which triggers the retry decorator.
    """
    async with session.get(url, headers=headers) as response:
        response.raise_for_status()
        return await response.text(encoding='utf-8')
async def parse_index(html):
    """Parse the table-of-contents page.

    Returns {'title': <book title>, 'chapter': [{'chapter_link': absolute
    URL, 'chapter_title': text}, ...]} in page order.
    """
    tree = etree.HTML(html)
    book_title = tree.xpath('string(//h1)')
    chapters = [
        {
            'chapter_link': urljoin(index, node.xpath('./a/@href')[0]),
            'chapter_title': node.xpath('string(./a)'),
        }
        for node in tree.xpath('//div[@id="list"]/dl/dd')
    ]
    return {'title': book_title, 'chapter': chapters}
async def down_chapter(session, book):
    """Worker coroutine: drain the global queue, download and save chapters.

    Each chapter is written to <book>/<num>.txt, where <num> is the digit
    run from the chapter URL, so the merge step can order files numerically.
    """
    # makedirs(exist_ok=True) is safe when 30 workers race here; the
    # original `exists(book) or os.mkdir(book)` could raise FileExistsError
    # if another worker created the directory between the two calls.
    os.makedirs(book, exist_ok=True)
    while not q.empty():
        chapter = await q.get()
        url = chapter.get('chapter_link')
        title = chapter.get('chapter_title')
        url_num = re.findall(r'(\d+).html', url)[0]
        html = await down_html(session, url)
        content = await parse_cpahter(html)
        # Strip the leading whitespace lxml leaves on each text fragment.
        content = ''.join(re.sub(r'^\s+', '', con, flags=re.S) for con in content)
        print(chapter)
        # BUG FIX: write UTF-8 with mode='w'. The original wrote GBK (crashes
        # on characters outside GBK, and the merge script reads these files
        # back as utf8) and appended, duplicating text on every re-run.
        file_path = os.path.join(book, f'{url_num}.txt')
        async with aiofiles.open(file_path, mode='w', encoding='utf-8') as f:
            await f.write(title)
            await f.write('\n\n')
            await f.write(content)
async def parse_cpahter(html):
    """Return the raw text fragments of the chapter body (#content div).

    (Name keeps the original's misspelling of "chapter" — callers use it.)
    """
    return etree.HTML(html).xpath('//div[@id="content"]/text()')
async def main():
    """Download the index, queue every chapter, and run 30 download workers."""
    conn = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        index_html = await down_html(session, index)
        chapter_data = await parse_index(index_html)
        book_name = chapter_data.get('title')
        chapters = chapter_data.get('chapter')
        # Plain loop for the side effect — the original list comprehension
        # built a throwaway list of Nones.
        for chapter in chapters:
            q.put_nowait(chapter)
        tasks = [asyncio.ensure_future(down_chapter(session, book_name)) for _ in range(30)]
        await asyncio.wait(tasks)
if __name__ == '__main__':
    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete()
    # pattern (DeprecationWarning since Python 3.10) and closes the loop cleanly.
    asyncio.run(main())
合并章节
import os
import re
def clean_chapter(chapter):
    """Return the first run of digits in *chapter* as an int.

    E.g. '123.txt' -> 123.  Returns None for a falsy argument (original
    behaviour) and for names containing no digits (the original raised
    IndexError on those).
    """
    if not chapter:
        return None
    # Raw string avoids the invalid-escape-sequence warning of '\d+'.
    match = re.search(r'\d+', chapter)
    return int(match.group()) if match else None
def merge_chapter(book):
    """Concatenate <book>/<num>.txt chapter files into <book>.txt.

    BUG FIX: sorts the chapter numbers numerically. The original iterated
    os.listdir() order, which is arbitrary/lexicographic ('10.txt' sorts
    before '2.txt'), so the merged book came out scrambled.
    """
    path = os.path.join(os.getcwd(), book)
    nums = [clean_chapter(name) for name in os.listdir(path)]
    # mode='w' instead of 'a+': appending duplicated the whole book on re-runs.
    with open(f'{book}.txt', mode='w', encoding='utf8') as f:
        for num in sorted(n for n in nums if n is not None):
            print(num)
            with open(os.path.join(path, f'{num}.txt'), 'r', encoding='utf8') as r:
                f.write(r.read())
            f.write('\n')


merge_chapter('特战狂枭')
|