爬小说单线程异步协程问题出在哪里呢

lihu5841314 发表于 2021-6-16 19:33

本帖最后由 lihu5841314 于 2021-6-16 20:19 编辑

"""
https://www.xbiquge.la/modules/article/waps.php
searchkey: 三寸人间
Referer: https://www.xbiquge.la/xuanhuanxiaoshuo/
"""
import requests,os
from lxml importetree
from bs4 import BeautifulSoup
import asyncio
fromaiohttpimportClientSession

url1 = "https://www.xbiquge.la"
url = "https://www.xbiquge.la/modules/article/waps.php"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
'Referer':url1
}

defget_book_url(url,name): #获取搜索的小说下载地址url
data = {
   "searchkey": name
}
resp = requests.post(url=url,headers=headers,data=data)
resp.encoding = resp.apparent_encoding
soup = BeautifulSoup(resp.text,"lxml")
book_url = soup.find('td',attrs={'class':'even'}).find("a").get('href')
book_name = soup.find('td', attrs={'class': 'even'}).find("a").text
returnbook_url

def get_menu_url(url):#获取小说章节页名字和url
resp = requests.get(url=url,headers=headers)
resp.encoding = resp.apparent_encoding
soup = BeautifulSoup(resp.text, "lxml")
dd_list =soup.find('div',attrs={'id':"list"}).find_all('dd')
dic_list = []
for ddin dd_list:
   menu_url =url1 + dd.find('a').get("href") #url不全补齐
   menu_name = dd.find('a').text
   dic ={
         "menu_url":menu_url,
         "menu_name":menu_name
   }
   dic_list.append(dic)
return dic_list

async def book_down(dic): #这里是阻塞且耗时的加异步
   url = dic["menu_url"]
   menu_name = dic["menu_name"] + ".txt"
         async with ClientSession() as session:
            async with session.get(url) asresp:
               await asyncio.sleep(1)
               resp = await resp.text() #text()方法返回字符串形式的响应数据 read（）方法返回二进制响应数据 json()方法返回的就是json对象
         # # resp.encoding = resp.apparent_encoding
               tree =etree.HTML(resp)
               content = tree.xpath('//div[@id="content"]//text()')
               content ="".join()
               content = content.split("-----") # 文本清洗完毕
               asyncwith open(menu_name,"w",encoding="utf_8") as f:
                        f.write(await content)
                        print(menu_name,"下载完成")

defmain(name):
if os.path.exists(f"./{name}"):
      os.mkdir(f"./{name}")
book_url = get_book_url(url, name)
dic_list = get_menu_url(book_url)
tasts = []
for dic indic_list:
      task= asyncio.ensure_future(book_down(dic))
      tasts.append(task)
loop = asyncio.get_event_loop()# 建立异步事件循环
loop.run_until_complete(asyncio.wait(tasts))
print(name, "下载完成") #基本下载功能已经完成就是效率太低了练习单线程异步协程
loop.close()

if __name__ == '__main__':
name = input("请输入需要下载的小说或者作者名字：")
main(name)

kof21411 发表于 2021-6-16 19:50

f.write(await content)改为await f.write(content)

lihu5841314 发表于 2021-6-16 20:13

kof21411 发表于 2021-6-16 19:50
f.write(await content)改为await f.write(content)

不是这里的原因

fanvalen 发表于 2021-6-16 21:16

503是请求数量太多被服务器拒绝了
所以多线多进废了
单线也要加sleep延时个1-5秒

kll545012 发表于 2021-6-16 21:16

访问太快了~~~

lihu5841314 发表于 2021-6-16 21:23

fanvalen 发表于 2021-6-16 21:16
503是请求数量太多被服务器拒绝了
所以多线多进废了
单线也要加sleep延时个1-5秒

应该是这个的原因换个ip试试水

allenmichael89 发表于 2021-6-17 01:02

自己弄个ip池就好了

wanwfy 发表于 2021-6-17 08:30

本帖最后由 wanwfy 于 2021-6-17 08:33 编辑

楼主，给你一个我以前写过的版本

借用了大佬写的异步重试装饰器

# -*- coding: utf-8 -*-
# @时间 : 2020-03-03 20:05
# @作者 : 陈祥安
# @文件名 : retry_helper.py
# @公众号: Python学习开发
import asyncio
import random
import traceback
from functools import wraps

from loguru import logger

class RetryTimeout(Exception):
def __init__(self, info):
   self.info = info

def __str__(self):
   return self.info

def aio_retry(**kwargs):
max_sleep_time: int = kwargs.pop("max", None)
min_sleep_time: int = kwargs.pop("min", 0)
attempts: int = kwargs.pop("attempts", 3)
error: bool = kwargs.pop("error", False)

def retry(func):
   @wraps(func)
   async def decorator(*args, **_kwargs):
         retry_count = 1
         error_info = ""
         while True:
            if retry_count > attempts:
               if error:
                     raise RetryTimeout("Too many errors")
               else:
                     logger.error(f"After retries {retry_count} times, an error occurred.here is detail{error_info}")
                     break
            try:
               result = await func(*args, **_kwargs)
               return result
            except Exception as e:
               if retry_count == attempts:
                     error_info = f"{traceback.format_exc()}"
               else:
                     retry_count += 1
                     if max_sleep_time:
                        sleep_time = random.randint(min_sleep_time, max_sleep_time)
                        await asyncio.sleep(sleep_time)

   return decorator

return retry

下载所有章节，按章节保存

import asyncio
import aiohttp
import re
from lxml import etree
from fake_useragent import UserAgent
from tools.retry_helper import aio_retry
from urllib.parse import urljoin
import aiofiles
from asyncio import Queue
from os.path import exists
import os

UA = UserAgent()
q = Queue()

index = 'http://www.xbiquge.la/0/10/'
# index = 'https://www.xbiquge.cc/book/315/'
# index = 'https://www.biquge.com.cn/book/44017/'

headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '_abcde_qweasd=0; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1606587778; bdshare_firstime=1606587779103; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1606588268',
'Host': 'www.xbiquge.la',
'If-Modified-Since': 'Wed, 23 Sep 2020 23:49:07 GMT',
'If-None-Match': 'W/"5f6bdef3-4bad"',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4331.0 Safari/537.36 Edg/89.0.713.0'

}

@aio_retry(attempts=5, max_sleep_time=15)
async def down_html(session, url):

async with session.get(url, headers=headers) as response:
   response.raise_for_status()
   return await response.text(encoding='utf-8')

async def parse_index(html):
chapters = []
item = etree.HTML(html)
title = item.xpath('string(//h1)')
chapter_list = item.xpath('//div[@id="list"]/dl/dd')
for chapter in chapter_list:
   chapter_link = chapter.xpath('./a/@href')
   chapter_title = chapter.xpath('string(./a)')
   chapters.append({'chapter_link': urljoin(index, chapter_link), 'chapter_title': chapter_title})
data = {'title': title, 'chapter': chapters}
return data

async def down_chapter(session, book):
exists(book) or os.mkdir(book)
while not q.empty():
   chapter = await q.get()
   url = chapter.get('chapter_link')
   title = chapter.get('chapter_title')
   url_num = re.findall('(\\d+).html', url)
   # title2 = re.sub('\(.*?\)','',title)
   html = await down_html(session, url)

   content = await parse_cpahter(html)
   content = ''.join()
   print(chapter)

   async with aiofiles.open(f'{book}\\{url_num}.txt', mode='a+', encoding='gbk') as f:
         await f.write(title)
         await f.write('\n\n')
         await f.write(content)
         # await asyncio.sleep(1)

async def parse_cpahter(html):
item = etree.HTML(html)
return item.xpath('//div[@id="content"]/text()')

async def main():
conn = aiohttp.TCPConnector(ssl=False)
async with aiohttp.ClientSession(connector=conn) as session:
   index_html = await down_html(session, index)
   chapter_data = await parse_index(index_html)
   book_name = chapter_data.get('title')
   chapters = chapter_data.get('chapter')


   tasks =
   await asyncio.wait(tasks)

if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

合并章节

import os
import re

def clean_chapter(chapter):
pattern = re.compile('\d+')
if chapter:
   chapter_num = pattern.findall(chapter)
   return int(chapter_num)

def merge_chapter(book):
path = os.getcwd() + f'\\{book}'
chapters = os.listdir(path)

nums =
with open(f'{book}.txt', mode='a+', encoding='utf8') as f:
   for num in nums:
         print(num)
         with open(f"{path}\\{num}.txt", 'r',encoding='utf8') as r:
            f.write(r.read())
            f.write('\n')

merge_chapter('特战狂枭')

lihu5841314 发表于 2021-6-17 09:38

wanwfy 发表于 2021-6-17 08:30
楼主，给你一个我以前写过的版本

借用了大佬写的异步重试装饰器

太复杂了看了好久才看懂

wanwfy 发表于 2021-6-17 14:22

lihu5841314 发表于 2021-6-17 09:38
太复杂了看了好久才看懂

就当我没回复

页: [1]

吾爱破解 - 52pojie.cn's Archiver

爬小说 单线程异步协程问题出在哪里呢

爬小说单线程异步协程问题出在哪里呢