Scraping a novel: where is the problem in my single-threaded async coroutine code?

This post was last edited by lihu5841314 on 2021-6-16 20:19.

"""
https://www.xbiquge.la/modules/article/waps.php
searchkey: 三寸人间
Referer: https://www.xbiquge.la/xuanhuanxiaoshuo/
"""
import requests, os
from lxml import etree
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession
url1 = "https://www.xbiquge.la"
url = "https://www.xbiquge.la/modules/article/waps.php"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
'Referer':url1
}
def get_book_url(url, name):  # get the page URL for the novel returned by the search
    data = {
        "searchkey": name
    }
    resp = requests.post(url=url, headers=headers, data=data)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")
    book_url = soup.find('td', attrs={'class': 'even'}).find("a").get('href')
    book_name = soup.find('td', attrs={'class': 'even'}).find("a").text
    return book_url
def get_menu_url(url):  # get the chapter names and URLs from the novel's index page
    resp = requests.get(url=url, headers=headers)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")
    dd_list = soup.find('div', attrs={'id': "list"}).find_all('dd')
    dic_list = []
    for dd in dd_list:
        menu_url = url1 + dd.find('a').get("href")  # the href is relative, so prepend the site root
        menu_name = dd.find('a').text
        dic = {
            "menu_url": menu_url,
            "menu_name": menu_name
        }
        dic_list.append(dic)
    return dic_list
async def book_down(dic):  # this part is blocking and slow, so it gets the async treatment
    url = dic["menu_url"]
    menu_name = dic["menu_name"] + ".txt"
    async with ClientSession() as session:
        async with session.get(url) as resp:
            await asyncio.sleep(1)
            resp = await resp.text()  # text() returns the body as a string, read() returns bytes, json() returns a parsed JSON object
            # resp.encoding = resp.apparent_encoding
            tree = etree.HTML(resp)
            content = tree.xpath('//div[@id="content"]//text()')
            content = "".join(content)
            content = content.split("-----")  # text cleanup done
            async with open(menu_name, "w", encoding="utf_8") as f:
                f.write(await content)
            print(menu_name, "下载完成")
def main(name):
    if not os.path.exists(f"./{name}"):
        os.mkdir(f"./{name}")
    book_url = get_book_url(url, name)
    dic_list = get_menu_url(book_url)
    tasks = []
    for dic in dic_list:
        task = asyncio.ensure_future(book_down(dic))
        tasks.append(task)
    loop = asyncio.get_event_loop()  # create the async event loop
    loop.run_until_complete(asyncio.wait(tasks))
    print(name, "下载完成")  # the basic download already works, it is just far too slow; this is practice with single-threaded async coroutines
    loop.close()
if __name__ == '__main__':
    name = input("请输入需要下载的小说或者作者名字:")
    main(name)

kof21411 replied on 2021-6-16 19:50:
Change f.write(await content) to await f.write(content).
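With an async file handle the await belongs on the write call itself. A minimal sketch of that pattern using aiofiles (an extra dependency; the OP's script uses the built-in open(), which does not support async with):

import asyncio
import aiofiles  # async file I/O; not imported in the original script


async def save_chapter(menu_name: str, content: str) -> None:
    # aiofiles file objects support "async with" and awaitable writes
    async with aiofiles.open(menu_name, "w", encoding="utf-8") as f:
        await f.write(content)  # await the write call, not the content


asyncio.get_event_loop().run_until_complete(save_chapter("demo.txt", "chapter text"))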
Reply, quoting kof21411 ("Change f.write(await content) to await f.write(content)"):
That is not where the problem is. The 503 means the server is rejecting you for sending too many requests, which is why multithreading and multiprocessing get you nowhere here. Even single-threaded you need to sleep 1-5 seconds between requests; you are simply hitting the site too fast.
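A minimal sketch of that kind of throttling, assuming aiohttp plus an asyncio.Semaphore (the concurrency limit and delay values are illustrative, not from this thread):

import asyncio
import random
from aiohttp import ClientSession

sem = asyncio.Semaphore(2)  # at most two requests in flight at once; illustrative value


async def polite_get(session: ClientSession, url: str) -> str:
    async with sem:
        await asyncio.sleep(random.uniform(1, 5))  # spread requests out by 1-5 seconds
        async with session.get(url) as resp:
            resp.raise_for_status()  # a 503 surfaces here instead of producing an empty page
            return await resp.text()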
Reply, quoting fanvalen (2021-6-16 21:16: "The 503 means the server is rejecting you for sending too many requests ... even single-threaded, sleep 1-5 seconds between requests"):
That is most likely the cause. Try a different IP; set up a small IP pool for yourself and you will be fine.
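If you go the proxy-pool route, aiohttp lets you pass a proxy per request. A minimal sketch with a hypothetical proxy list (the addresses are placeholders, not working proxies):

import random
from aiohttp import ClientSession

PROXIES = ["http://127.0.0.1:8001", "http://127.0.0.1:8002"]  # placeholder pool; fill with real proxies


async def fetch_via_proxy(session: ClientSession, url: str) -> str:
    proxy = random.choice(PROXIES)  # rotate addresses so no single IP trips the rate limit
    async with session.get(url, proxy=proxy) as resp:
        return await resp.text()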
wanwfy replied on 2021-6-17 08:30 (last edited 08:33):
OP, here is a version I wrote a while back. It borrows an async retry decorator written by a well-known author.
# -*- coding: utf-8 -*-
# @Time   : 2020-03-03 20:05
# @Author : 陈祥安
# @File   : retry_helper.py
# @WeChat official account: Python学习开发
import asyncio
import random
import traceback
from functools import wraps

from loguru import logger


class RetryTimeout(Exception):
    def __init__(self, info):
        self.info = info

    def __str__(self):
        return self.info


def aio_retry(**kwargs):
    max_sleep_time: int = kwargs.pop("max", None)
    min_sleep_time: int = kwargs.pop("min", 0)
    attempts: int = kwargs.pop("attempts", 3)
    error: bool = kwargs.pop("error", False)

    def retry(func):
        @wraps(func)
        async def decorator(*args, **_kwargs):
            retry_count = 1
            error_info = ""
            while True:
                if retry_count > attempts:
                    if error:
                        raise RetryTimeout("Too many errors")
                    else:
                        logger.error(f"After {retry_count - 1} attempts an error still occurred. Details: {error_info}")
                    break
                try:
                    result = await func(*args, **_kwargs)
                    return result
                except Exception:
                    if retry_count == attempts:
                        error_info = f"{traceback.format_exc()}"
                    retry_count += 1  # always advance the counter, otherwise the loop can never exit
                    if max_sleep_time:
                        sleep_time = random.randint(min_sleep_time, max_sleep_time)
                        await asyncio.sleep(sleep_time)
        return decorator
    return retry
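For a quick feel for how the decorator behaves, here is a minimal, self-contained usage sketch placed next to the decorator above (flaky_fetch and its failure rate are made up purely for illustration; note the decorator reads the sleep bound from the keyword "max"):

import asyncio
import random


@aio_retry(attempts=3, max=2)
async def flaky_fetch(url):
    # hypothetical coroutine that fails at random, just to exercise the retry logic
    if random.random() < 0.7:
        raise ConnectionError(f"simulated failure for {url}")
    return f"payload from {url}"


# retries up to 3 times, sleeping 0-2 seconds between attempts, then gives up with a logged error
print(asyncio.get_event_loop().run_until_complete(flaky_fetch("http://example.com")))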
Download all the chapters, saving each chapter to its own file:
import asyncio
import aiohttp
import re
from lxml import etree
from fake_useragent import UserAgent
from tools.retry_helper import aio_retry
from urllib.parse import urljoin
import aiofiles
from asyncio import Queue
from os.path import exists
import os
UA = UserAgent()
q = Queue()
index = 'http://www.xbiquge.la/0/10/'
# index = 'https://www.xbiquge.cc/book/315/'
# index = 'https://www.biquge.com.cn/book/44017/'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '_abcde_qweasd=0; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1606587778; bdshare_firstime=1606587779103; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1606588268',
'Host': 'www.xbiquge.la',
'If-Modified-Since': 'Wed, 23 Sep 2020 23:49:07 GMT',
'If-None-Match': 'W/"5f6bdef3-4bad"',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4331.0 Safari/537.36 Edg/89.0.713.0'
}
@aio_retry(attempts=5, max=15)  # the decorator reads the sleep bound from the keyword "max"
async def down_html(session, url):
    async with session.get(url, headers=headers) as response:
        response.raise_for_status()
        return await response.text(encoding='utf-8')
async def parse_index(html):
    chapters = []
    item = etree.HTML(html)
    title = item.xpath('string(//h1)')
    chapter_list = item.xpath('//div[@id="list"]/dl/dd')
    for chapter in chapter_list:
        chapter_link = chapter.xpath('./a/@href')[0]  # xpath returns a list; take the single href
        chapter_title = chapter.xpath('string(./a)')
        chapters.append({'chapter_link': urljoin(index, chapter_link), 'chapter_title': chapter_title})
    data = {'title': title, 'chapter': chapters}
    return data
async def down_chapter(session, book):
    exists(book) or os.mkdir(book)
    while not q.empty():
        chapter = await q.get()
        url = chapter.get('chapter_link')
        title = chapter.get('chapter_title')
        url_num = re.findall(r'(\d+).html', url)[0]  # chapter number, used as the file name
        # title2 = re.sub('\(.*?\)', '', title)
        html = await down_html(session, url)
        content = await parse_chapter(html)
        content = ''.join(content)
        print(chapter)
        # write utf-8 so merge_chapter below can read the files back with the same encoding
        async with aiofiles.open(f'{book}\\{url_num}.txt', mode='a+', encoding='utf8') as f:
            await f.write(title)
            await f.write('\n\n')
            await f.write(content)
        # await asyncio.sleep(1)
async def parse_chapter(html):
    item = etree.HTML(html)
    return item.xpath('//div[@id="content"]/text()')
async def main():
    conn = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        index_html = await down_html(session, index)
        chapter_data = await parse_index(index_html)
        book_name = chapter_data.get('title')
        chapters = chapter_data.get('chapter')
        for chapter in chapters:
            q.put_nowait(chapter)  # feed the queue that the down_chapter workers drain
        # the worker list was lost to the forum markup; a handful of workers like this is presumably what was meant
        tasks = [asyncio.ensure_future(down_chapter(session, book_name)) for _ in range(5)]
        await asyncio.wait(tasks)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Merge the chapters into a single file:
import os
import re


def clean_chapter(chapter):
    pattern = re.compile(r'\d+')
    if chapter:
        chapter_num = pattern.findall(chapter)
        return int(chapter_num[0])  # findall returns a list; take the first number found


def merge_chapter(book):
    path = os.getcwd() + f'\\{book}'
    chapters = os.listdir(path)
    # sort the chapter files numerically so they are merged in reading order
    nums = sorted(clean_chapter(chapter) for chapter in chapters)
    with open(f'{book}.txt', mode='a+', encoding='utf8') as f:
        for num in nums:
            print(num)
            with open(f"{path}\\{num}.txt", 'r', encoding='utf8') as r:
                f.write(r.read())
                f.write('\n')


merge_chapter('特战狂枭')
lihu5841314 replied on 2021-6-17 09:38, quoting wanwfy (2021-6-17 08:30: "OP, here is a version I wrote a while back; it borrows an async retry decorator written by a well-known author"):
It's pretty complex; it took me a long while to work through it.

Reply, quoting lihu5841314 ("It's pretty complex; it took me a long while to work through it"):
Then just pretend I never replied.