
[Help] Scraping a novel: where is the problem in my single-threaded async coroutine code?

lihu5841314 posted on 2021-6-16 19:33
Last edited by lihu5841314 on 2021-6-16 20:19

[Python]
"""
https://www.xbiquge.la/modules/article/waps.php
searchkey: 三寸人间
Referer: https://www.xbiquge.la/xuanhuanxiaoshuo/
"""
import requests, os
from lxml import etree
from bs4 import BeautifulSoup
import asyncio
from aiohttp import ClientSession

url1 = "https://www.xbiquge.la"
url = "https://www.xbiquge.la/modules/article/waps.php"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36",
'Referer':url1
}

def get_book_url(url, name):   # get the page URL of the novel found by the search
    data = {
        "searchkey": name
    }
    resp = requests.post(url=url,headers=headers,data=data)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text,"lxml")
    book_url = soup.find('td', attrs={'class': 'even'}).find("a").get('href')
    book_name = soup.find('td', attrs={'class': 'even'}).find("a").text  # parsed but currently unused
    return book_url

def get_menu_url(url):  # get the chapter names and URLs from the novel's index page
    resp = requests.get(url=url,headers=headers)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")
    dd_list =soup.find('div',attrs={'id':"list"}).find_all('dd')
    dic_list = []
    for dd in dd_list:
        menu_url = url1 + dd.find('a').get("href")   # the href is relative, so prepend the site root
        menu_name = dd.find('a').text
        dic ={
            "menu_url":menu_url,
            "menu_name":menu_name
        }
        dic_list.append(dic)
    return dic_list


async def book_down(dic):     # this part is slow and blocking, so make it asynchronous
    url = dic["menu_url"]
    menu_name = dic["menu_name"] + ".txt"
    async with ClientSession() as session:
        async with session.get(url) as resp:
            await asyncio.sleep(1)
            resp = await resp.text()    # text() returns the body as a string, read() returns bytes, json() returns a JSON object
            # resp.encoding = resp.apparent_encoding
            tree = etree.HTML(resp)
            content = tree.xpath('//div[@id="content"]//text()')
            content = "".join([x.strip() for x in content])
            content = content.split("-----")[0]   # text cleanup done
            async with open(menu_name, "w", encoding="utf_8") as f:
                f.write(await content)
                print(menu_name, "download finished")



def main(name):
    if not os.path.exists(f"./{name}"):   # create the folder for this book if it does not exist yet
        os.mkdir(f"./{name}")
    book_url = get_book_url(url, name)
    dic_list = get_menu_url(book_url)
    tasks = []
    for dic in dic_list:
        task = asyncio.ensure_future(book_down(dic))
        tasks.append(task)
    loop = asyncio.get_event_loop()  # create the async event loop
    loop.run_until_complete(asyncio.wait(tasks))
    print(name, "download finished")    # basic downloading works, it is just too slow; practicing single-threaded async coroutines
    loop.close()


if __name__ == '__main__':
    name = input("Enter the name of the novel or author to download: ")
    main(name)

My level-0 English tells me I probably hit some kind of limit and got redirected to an error page.

[screenshot: aa244b0b3f384098e61169c70691552.png]
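
One way to confirm that guess is to check the HTTP status code before parsing, so an error page never reaches the XPath step. A minimal sketch (the helper name fetch_or_report is made up for illustration):

[Python]
from aiohttp import ClientSession

async def fetch_or_report(url, headers):
    # Returns the page text, or None after reporting a rejection such as a 503.
    async with ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            if resp.status != 200:   # e.g. 503 Service Unavailable when throttled
                print("request rejected with HTTP status", resp.status)
                return None
            return await resp.text()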


kof21411 posted on 2021-6-16 19:50
Change f.write(await content) to await f.write(content)
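
Applied to book_down from the first post, the fix might look like the sketch below. The built-in open() does not support async with, so this assumes the aiofiles package; the headers dict is a stand-in for the fuller one above:

[Python]
import aiofiles                      # assumption: pip install aiofiles
from aiohttp import ClientSession
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0"}   # stand-in for the dict in the first post

async def book_down(dic):
    url = dic["menu_url"]
    menu_name = dic["menu_name"] + ".txt"
    async with ClientSession() as session:
        async with session.get(url, headers=headers) as resp:
            text = await resp.text()
    tree = etree.HTML(text)
    content = "".join(x.strip() for x in tree.xpath('//div[@id="content"]//text()'))
    content = content.split("-----")[0]
    async with aiofiles.open(menu_name, "w", encoding="utf-8") as f:
        await f.write(content)       # await the write itself, not the string
    print(menu_name, "download finished")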
OP | lihu5841314 posted on 2021-6-16 20:13
fanvalen posted on 2021-6-16 21:16
A 503 means the server refused you for sending too many requests,
so multithreading and multiprocessing are out.
Even single-threaded, you need to sleep 1-5 seconds between requests.
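
In code, that advice could look like this sketch: a semaphore of one keeps requests strictly sequential, and a random 1-5 second pause spaces them out (headers again stands in for the dict in the first post):

[Python]
import asyncio
import random
from aiohttp import ClientSession

headers = {"User-Agent": "Mozilla/5.0"}   # stand-in for the dict in the first post
sem = asyncio.Semaphore(1)                # only one request in flight at a time

async def polite_get(session, url):
    async with sem:
        await asyncio.sleep(random.uniform(1, 5))   # the suggested 1-5 s delay
        async with session.get(url, headers=headers) as resp:
            return await resp.text()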
kll545012 posted on 2021-6-16 21:16
You are requesting too fast~~~
OP | lihu5841314 posted on 2021-6-16 21:23
fanvalen posted on 2021-6-16 21:16
A 503 means the server refused you for sending too many requests,
so multithreading and multiprocessing are out.
Even single-threaded, you need to sleep 1-5 seconds between requests.

That is probably the cause. I will try a different IP and see.
allenmichael89 posted on 2021-6-17 01:02
Just set up an IP pool and you are fine.
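
For example, aiohttp takes a proxy= argument on each request, so a minimal pool is just a list to pick from. A sketch with placeholder addresses (fill them from a real pool):

[Python]
import random
from aiohttp import ClientSession

headers = {"User-Agent": "Mozilla/5.0"}   # stand-in for the dict in the first post
PROXIES = [                               # placeholder addresses only
    "http://111.111.111.111:8080",
    "http://222.222.222.222:8080",
]

async def fetch_via_pool(url):
    async with ClientSession() as session:
        async with session.get(url, headers=headers, proxy=random.choice(PROXIES)) as resp:
            return await resp.text()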
wanwfy posted on 2021-6-17 08:30
Last edited by wanwfy on 2021-6-17 08:33

OP, here is a version I wrote a while back.

It borrows an async retry decorator written by an expert.

[Python]
# -*- coding: utf-8 -*-
# @Time    : 2020-03-03 20:05
# @Author  : 陈祥安
# @File    : retry_helper.py
# @WeChat official account: Python学习开发
import asyncio
import random
import traceback
from functools import wraps

from loguru import logger


class RetryTimeout(Exception):
    def __init__(self, info):
        self.info = info

    def __str__(self):
        return self.info


def aio_retry(**kwargs):
    max_sleep_time: int = kwargs.pop("max_sleep_time", None)  # keyword names match the call site below
    min_sleep_time: int = kwargs.pop("min_sleep_time", 0)
    attempts: int = kwargs.pop("attempts", 3)
    error: bool = kwargs.pop("error", False)

    def retry(func):
        @wraps(func)
        async def decorator(*args, **_kwargs):
            retry_count = 1
            error_info = ""
            while True:
                if retry_count > attempts:
                    if error:
                        raise RetryTimeout("Too many errors")
                    else:
                        logger.error(f"Giving up after {attempts} attempts. Details: {error_info}")
                        break
                try:
                    result = await func(*args, **_kwargs)
                    return result
                except Exception:
                    if retry_count == attempts:
                        error_info = f"{traceback.format_exc()}"
                    retry_count += 1  # advance on every failure; otherwise the final attempt would loop forever
                    if max_sleep_time:
                        sleep_time = random.randint(min_sleep_time, max_sleep_time)
                        await asyncio.sleep(sleep_time)

        return decorator

    return retry





Download all chapters, saving each chapter to its own file

[Python]
import asyncio
import aiohttp
import re
from lxml import etree
from fake_useragent import UserAgent
from tools.retry_helper import aio_retry
from urllib.parse import urljoin
import aiofiles
from asyncio import Queue
from os.path import exists
import os

UA = UserAgent()
q = Queue()

index = 'http://www.xbiquge.la/0/10/'
# index = 'https://www.xbiquge.cc/book/315/'
# index = 'https://www.biquge.com.cn/book/44017/'

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '_abcde_qweasd=0; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1606587778; bdshare_firstime=1606587779103; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1606588268',
    'Host': 'www.xbiquge.la',
    'If-Modified-Since': 'Wed, 23 Sep 2020 23:49:07 GMT',
    'If-None-Match': 'W/"5f6bdef3-4bad"',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4331.0 Safari/537.36 Edg/89.0.713.0'

}


@aio_retry(attempts=5, max_sleep_time=15)
async def down_html(session, url):

    async with session.get(url, headers=headers) as response:
        response.raise_for_status()
        return await response.text(encoding='utf-8')


async def parse_index(html):
    chapters = []
    item = etree.HTML(html)
    title = item.xpath('string(//h1)')
    chapter_list = item.xpath('//div[@id="list"]/dl/dd')
    for chapter in chapter_list:
        chapter_link = chapter.xpath('./a/@href')
        chapter_title = chapter.xpath('string(./a)')
        chapters.append({'chapter_link': urljoin(index, chapter_link[0]), 'chapter_title': chapter_title})
    data = {'title': title, 'chapter': chapters}
    return data


async def down_chapter(session, book):
    exists(book) or os.mkdir(book)
    while not q.empty():
        chapter = await q.get()
        url = chapter.get('chapter_link')
        title = chapter.get('chapter_title')
        url_num = re.findall('(\\d+).html', url)[0]
        # title2 = re.sub('\(.*?\)','',title)
        html = await down_html(session, url)

        content = await parse_chapter(html)
        content = ''.join([re.sub('^\\s+', '', con, flags=re.S) for con in content])
        print(chapter)

        async with aiofiles.open(f'{book}\\{url_num}.txt', mode='a+', encoding='utf-8') as f:  # utf-8 so merge_chapter below can read the files back
            await f.write(title)
            await f.write('\n\n')
            await f.write(content)
            # await asyncio.sleep(1)


async def parse_chapter(html):
    item = etree.HTML(html)
    return item.xpath('//div[@id="content"]/text()')


async def main():
    conn = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        index_html = await down_html(session, index)
        chapter_data = await parse_index(index_html)
        book_name = chapter_data.get('title')
        chapters = chapter_data.get('chapter')
        for chapter in chapters:
            q.put_nowait(chapter)

        tasks = [asyncio.ensure_future(down_chapter(session, book_name)) for _ in range(30)]
        await asyncio.wait(tasks)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())


Merge chapters

[Python]
import os
import re


def clean_chapter(chapter):
    pattern = re.compile(r'\d+')
    if chapter:
        chapter_num = pattern.findall(chapter)[0]
        return int(chapter_num)


def merge_chapter(book):
    path = os.getcwd() + f'\\{book}'
    chapters = os.listdir(path)

    nums = sorted(clean_chapter(chapter) for chapter in chapters)  # numeric sort so chapters merge in order
    with open(f'{book}.txt', mode='a+', encoding='utf8') as f:
        for num in nums:
            print(num)
            with open(f"{path}\\{num}.txt", 'r',encoding='utf8') as r:
                f.write(r.read())
                f.write('\n')


merge_chapter('特战狂枭')
OP | lihu5841314 posted on 2021-6-17 09:38
wanwfy posted on 2021-6-17 08:30
OP, here is a version I wrote a while back.

It borrows an async retry decorator written by an expert.

Too complex. I had to stare at it for a long time before I understood it.
wanwfy posted on 2021-6-17 14:22
lihu5841314 posted on 2021-6-17 09:38
Too complex. I had to stare at it for a long time before I understood it.

Then just pretend I never replied.