吾爱破解 - 52pojie.cn

Views: 1931 | Replies: 14

[Discussion] Async novel scraper sometimes throws 503, how to fix? (Solved, thanks everyone) Feel the speed

lihu5841314 posted on 2021-7-7 19:37
Last edited by lihu5841314 on 2021-7-7 20:26

[Python]
"""
目标网站https://www.xbiquge.la/
#tqdm   进度条库
#parsel  解析库可以使用css 和xpath
"""
import  requests,os,datetime,random
import parsel
from  tqdm import tqdm
import pandas as pd
import aiohttp
import  aiofiles
import asyncio



headers= {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
'Referer': 'https: // www.xbiquge.la /'
}

def get_response(html_url):
    resp = requests.get(url=html_url,headers=headers,timeout=200)
    resp.encoding = resp.apparent_encoding
    return resp

async  def Chapter_content(dic): # 提取单页的内容
        html_url = dic['chapter_url']
        index = str(dic['index'])
        async with  aiohttp.ClientSession() as  session:
            for  i  in  range(4):
                async with await  session.get(url=html_url,headers=headers) as resp:
                    if  resp.status!=200:
                        await asyncio.sleep(random.randrange(5))
                        continue
                    else:
                        resp = await  resp.text(encoding="utf-8")
                        break
        if len(index) < 5:
            index = "0" * (5 - len(index)) + index
        # 实例化parsel对象
        selector = parsel.Selector(resp)
        # css解析
        content = selector.css('#content::text').extract()
        chapter_name = selector.xpath('//div[@class="box_con"]/div[2]/h1/text()').extract_first()
        chapter_content = "".join([x.strip() + '\n' for x in content])
        path = novel_name1 + "/" + index + ".txt"
        async with  aiofiles.open(path, 'w', encoding="utf-8") as f:
              if chapter_name != None:
                  await f.write(chapter_name)
              await  f.write("\n")
              await  f.write(chapter_content)
              await  f.write("\n")



def marge():
    path = os.getcwd()
    os.chdir(path+"/" + novel_name1 )
    lis = os.listdir()
    s = "+".join(lis)
    os.system(fr'copy /b  {s} {novel_name1}.txt')
    print(s)
    for  file  in  os.listdir():
         if not file.startswith(novel_name1):
                os.remove(file)





def novel_chapter_memu(html_url): #分析章节的来源  小说目录
    resp = get_response(html_url)
    # 实例化parsel对象
    selector = parsel.Selector(resp.text)
    # css解析
    lis_href = selector.css('#list dd a::attr(href)').getall()
    dics = []
    index = 0
    for chapter_url in lis_href:
        index +=1
        chapter_url = 'https://www.xbiquge.la' + chapter_url
        dic ={
            'index':index,
            'chapter_url':chapter_url
        }
        dics.append(dic)
    return dics

def  search():
    while True:
         serch_url = 'https://www.xbiquge.la/modules/article/waps.php'  # Request Method: POST  Referer: https://www.xbiquge.la/
         name = str(input('请输入需要下载的小说名字或者作者名字:'))
         print(f'正在查询小说{name.center(int(50),"-")}')
         data = {
             'searchkey': name
         }
         resp = requests.post(url=serch_url,headers=headers,data=data)
         resp.encoding = resp.apparent_encoding
         selector = parsel.Selector(resp.text)
         tr_lis = selector.xpath('//div[@id="content"]//tr')[1:]
         dics =[]
         for tr in tr_lis:
             novel_url = tr.xpath('./td[@class="even"]/a/@href').extract_first()
             novel_name = tr.xpath('./td[@class="even"]/a/text()').extract_first()
             author_name = tr.xpath('./td[3]/text()').extract_first()
             dic = {
                 'novel_name':novel_name,
                 'author_name':author_name,
                 'novel_url':novel_url,
             }
             dics.append(dic)
         df = pd.DataFrame(dics)
         if df.empty:
             print("没有找到,请重新输入")
             continue
         print(df)
         num = int(input("请选择需要下载的小说:"))
         url = df.iloc[num][2]
         novel_name1 = df.iloc[num][0]
         return  url,novel_name1



def  main():
    while True:
        start = datetime.datetime.now()
        global  novel_name1
        memu_url,novel_name1 = search()
        if not os.path.exists(novel_name1):
            os.makedirs(novel_name1)
        dics = novel_chapter_memu(memu_url)
        tasks = []
        for dic in tqdm(dics):
            task = asyncio.ensure_future(Chapter_content(dic))
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        marge()
        print('耗时',datetime.datetime.now()-start,"秒")
        select = input("是否需要继续下载Y/N:")
        if  select == 'Y':
            continue
        else:
            break


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    main()
Fixed by adding a retry loop and a random delay. The original version, for comparison:
[Python]
"""
目标网站https://www.xbiquge.la/
#tqdm   进度条库
#parsel  解析库可以使用css 和xpath
"""
import  requests,os,datetime
import parsel
from  tqdm import tqdm
import pandas as pd
import aiohttp
import  aiofiles
import asyncio



headers= {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
'Referer': 'https: // www.xbiquge.la /'
}

def get_response(html_url):
    resp = requests.get(url=html_url,headers=headers,timeout=200)
    resp.encoding = resp.apparent_encoding
    return resp

async  def Chapter_content(dic): # 提取单页的内容
        html_url = dic['chapter_url']
        index = str(dic['index'])
        async with  aiohttp.ClientSession() as  session:
            async with await  session.get(url=html_url,headers=headers) as resp:
                    resp = await  resp.text(encoding="utf-8")
                    if len(index) < 5:
                        index = "0" * (5 - len(index)) + index
                    # 实例化parsel对象
                    selector = parsel.Selector(resp)
                    # css解析
                    content = selector.css('#content::text').extract()
                    chapter_name = selector.xpath('//div[@class="box_con"]/div[2]/h1/text()').extract_first()
                    chapter_content = "".join([x.strip() + '\n' for x in content])
                    path = novel_name1 + "/" + index + ".txt"
                    async with  aiofiles.open(path, 'w', encoding="utf-8") as f:
                          if chapter_name != None:
                              await f.write(chapter_name)
                          await  f.write("\n")
                          await  f.write(chapter_content)
                          await  f.write("\n")
                    await asyncio.sleep(1)


def marge():
    path = os.getcwd()
    os.chdir(path+"/" + novel_name1 )
    lis = os.listdir()
    s = "+".join(lis)
    os.system(fr'copy /b  {s} {novel_name1}.txt')
    print(s)
    for  file  in  os.listdir():
         if not file.startswith(novel_name1):
                os.remove(file)







def novel_chapter_memu(html_url): #分析章节的来源  小说目录
    resp = get_response(html_url)
    # 实例化parsel对象
    selector = parsel.Selector(resp.text)
    # css解析
    lis_href = selector.css('#list dd a::attr(href)').getall()
    dics = []
    index = 0
    for chapter_url in lis_href:
        index +=1
        chapter_url = 'https://www.xbiquge.la' + chapter_url
        dic ={
            'index':index,
            'chapter_url':chapter_url
        }
        dics.append(dic)
    return dics

def  search():
    while True:
         serch_url = 'https://www.xbiquge.la/modules/article/waps.php'  # Request Method: POST  Referer: https://www.xbiquge.la/
         name = str(input('请输入需要下载的小说名字或者作者名字:'))
         print(f'正在查询小说{name.center(int(50),"-")}')
         data = {
             'searchkey': name
         }
         resp = requests.post(url=serch_url,headers=headers,data=data)
         resp.encoding = resp.apparent_encoding
         selector = parsel.Selector(resp.text)
         tr_lis = selector.xpath('//div[@id="content"]//tr')[1:]
         dics =[]
         for tr in tr_lis:
             novel_url = tr.xpath('./td[@class="even"]/a/@href').extract_first()
             novel_name = tr.xpath('./td[@class="even"]/a/text()').extract_first()
             author_name = tr.xpath('./td[3]/text()').extract_first()
             dic = {
                 'novel_name':novel_name,
                 'author_name':author_name,
                 'novel_url':novel_url,
             }
             dics.append(dic)
         df = pd.DataFrame(dics)
         if df.empty:
             print("没有找到,请重新输入")
             continue
         print(df)
         num = int(input("请选择需要下载的小说:"))
         url = df.iloc[num][2]
         novel_name1 = df.iloc[num][0]
         return  url,novel_name1



def  main():
    while True:
        start = datetime.datetime.now()
        global  novel_name1
        memu_url,novel_name1 = search()
        if not os.path.exists(novel_name1):
            os.makedirs(novel_name1)
        dics = novel_chapter_memu(memu_url)
        tasks = []
        for dic in tqdm(dics):
            task = asyncio.ensure_future(Chapter_content(dic))
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        marge()
        print('耗时',datetime.datetime.now()-start,"秒")
        select = input("是否需要继续下载Y/N:")
        if  select == 'Y':
            continue
        else:
            break


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    main()

Ratings (1 member): 吾爱币 +2, 热心值 +1
李玉风我爱你 +2 +1: You need to throttle the concurrency of your async code too; no server can take every chapter fired off at once.

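That rating comment is the core fix: cap how many chapters are in flight at once. A minimal sketch of one way to do it, wrapping the chapter_content coroutine above in an asyncio.Semaphore (the wrapper name and the limit of 10 are assumptions, not from the thread):

[Python]
import asyncio

sem = asyncio.Semaphore(10)  # at most 10 chapters downloading at once; 10 is an arbitrary cap

async def chapter_content_limited(dic):
    # excess tasks wait here instead of all hitting the server simultaneously
    async with sem:
        await chapter_content(dic)

In main(), the tasks would then be created from chapter_content_limited instead of chapter_content.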


lihu5841314 (OP) replied on 2021-7-7 20:14
Quote (细水流长, 2021-7-7 20:10): Retry whenever the status isn't 200; you can cap the retry count and fall back to an empty value if it still fails after that many tries.

A 503 means the server refused service, so it's probably not the request itself; most likely the async requests are just coming in too fast.
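细水流长's suggestion, pulled out as a standalone helper and roughly what the edited code above now does (a sketch; the name fetch_with_retry, the four-try cap, and the growing back-off are assumptions layered on the suggestion):

[Python]
async def fetch_with_retry(session, url, max_tries=4):
    for attempt in range(max_tries):
        async with session.get(url, headers=headers) as resp:
            if resp.status == 200:
                return await resp.text(encoding="utf-8")
        await asyncio.sleep(2 ** attempt)  # wait 1, 2, 4, 8 s between tries
    return None  # still failing after max_tries: give up, as suggested

The caller then has to handle the None (the edited code above simply skips that chapter).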
小苏打饼干 replied on 2021-7-7 20:41
[Python]
time.sleep(1)  # sleep 1 s after each download so the IP doesn't get banned
Add this line after every request to the site.
lihu5841314 (OP) replied on 2021-7-7 21:52
Quote (小苏打饼干, 2021-7-7 20:41): Add this line after every request ...

In async code that doesn't really help.
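The reason it doesn't help: every coroutine sleeps concurrently, so the delay shifts the whole burst without thinning it. A toy demonstration (illustrative only):

[Python]
import asyncio, datetime

async def task(i):
    await asyncio.sleep(1)  # all 100 tasks sleep in parallel...
    print(i, datetime.datetime.now())  # ...then all "fire" at nearly the same instant

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait([asyncio.ensure_future(task(i)) for i in range(100)]))

A plain time.sleep(1) inside a coroutine behaves differently: it blocks the entire event loop, which does slow things down, but by serializing every task rather than pacing them.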
jjjzw replied on 2021-7-7 22:12
The requests are just too fast. I've tried two fixes: one is to time.sleep after every request; the other is to check every response, since the failures all come back with that "503 ..." string in the body, and simply re-request whenever you see it; it recovers after a moment.
Download speed comes out about the same either way; sometimes time.sleep is even a bit faster (taking time.sleep(1) as the example).
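jjjzw's second method as a sketch (the exact failure text is assumed to contain "503", per the description above; checking resp.status would be stricter, since a chapter could legitimately contain that string):

[Python]
async def fetch_until_ok(session, url):
    while True:
        async with session.get(url, headers=headers) as resp:
            text = await resp.text(encoding="utf-8")
        if "503" not in text:  # failures reportedly come back as a "503 ..." page
            return text
        await asyncio.sleep(1)  # brief pause; "it recovers after a moment"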
xiaoniba1028 replied on 2021-7-7 22:42
Good post; bookmarked to read later.
hxh-linux replied on 2021-7-8 01:00
A correction: 403 is the "access forbidden" status; anything 500 and up is a server-side problem, probably frequent requests hitting a synchronous, blocking server. I'd suggest lowering the number of concurrent requests.
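Besides a semaphore (sketched earlier), aiohttp can enforce a connection cap directly through its connection pool. A sketch under the assumption that all chapters share one session (the function name and the limit of 10 are arbitrary):

[Python]
import asyncio
import aiohttp

async def download_all(urls):
    connector = aiohttp.TCPConnector(limit=10)  # at most 10 simultaneous connections
    async with aiohttp.ClientSession(connector=connector) as session:
        async def fetch(url):
            async with session.get(url) as resp:
                return await resp.text()
        return await asyncio.gather(*(fetch(u) for u in urls))

Note that the script above opens a fresh ClientSession per chapter, so each session gets its own pool; the cap only bites if every request goes through the same session.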