Async novel crawler sometimes returns 503 errors, how do I fix it? (Solved, thanks everyone) Enjoy the speed
This post was last edited by lihu5841314 on 2021-7-7 20:26
"""
Target site: https://www.xbiquge.la/
# tqdm   - progress bar library
# parsel - parsing library, supports both CSS and XPath selectors
"""
import requests, os, datetime, random
import parsel
from tqdm import tqdm
import pandas as pd
import aiohttp
import aiofiles
import asyncio
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    'Referer': 'https://www.xbiquge.la/'
}
def get_response(html_url):
    resp = requests.get(url=html_url, headers=headers, timeout=200)
    resp.encoding = resp.apparent_encoding
    return resp
async def Chapter_content(dic):  # fetch and save a single chapter page
    html_url = dic['chapter_url']
    index = str(dic['index'])
    async with aiohttp.ClientSession() as session:
        for i in range(4):  # retry up to 4 times on non-200 responses
            async with session.get(url=html_url, headers=headers) as resp:
                if resp.status != 200:
                    await asyncio.sleep(random.randrange(5))  # random back-off before retrying
                    continue
                else:
                    resp = await resp.text(encoding="utf-8")
                    break
    if len(index) < 5:
        index = "0" * (5 - len(index)) + index  # zero-pad so the files sort in chapter order
    # instantiate a parsel Selector
    selector = parsel.Selector(resp)
    # CSS selector for the chapter body
    content = selector.css('#content::text').extract()
    chapter_name = selector.xpath('//div[@class="box_con"]/div/h1/text()').extract_first()
    chapter_content = "".join(content)
    path = novel_name1 + "/" + index + ".txt"
    async with aiofiles.open(path, 'w', encoding="utf-8") as f:
        if chapter_name is not None:
            await f.write(chapter_name)
        await f.write("\n")
        await f.write(chapter_content)
        await f.write("\n")
def marge():  # merge the per-chapter txt files into a single book file
    path = os.getcwd()
    os.chdir(path + "/" + novel_name1)
    lis = os.listdir()
    s = "+".join(lis)
    os.system(fr'copy /b {s} {novel_name1}.txt')  # Windows-only: copy /b concatenates the chapter files
    print(s)
    for file in os.listdir():
        if not file.startswith(novel_name1):
            os.remove(file)
    os.chdir(path)  # return to the original working directory for the next download
def novel_chapter_memu(html_url):  # parse the novel's table of contents into chapter links
    resp = get_response(html_url)
    # instantiate a parsel Selector
    selector = parsel.Selector(resp.text)
    # CSS selector for the chapter list
    lis_href = selector.css('#list dd a::attr(href)').getall()
    dics = []
    index = 0
    for chapter_url in lis_href:
        index += 1
        chapter_url = 'https://www.xbiquge.la' + chapter_url
        dic = {
            'index': index,
            'chapter_url': chapter_url
        }
        dics.append(dic)
    return dics
def search():
    while True:
        serch_url = 'https://www.xbiquge.la/modules/article/waps.php'  # Request Method: POST, Referer: https://www.xbiquge.la/
        name = str(input('Enter the title or author of the novel to download: '))
        print(f'Searching for {name.center(50, "-")}')
        data = {
            'searchkey': name
        }
        resp = requests.post(url=serch_url, headers=headers, data=data)
        resp.encoding = resp.apparent_encoding
        selector = parsel.Selector(resp.text)
        tr_lis = selector.xpath('//div[@id="content"]//tr')
        dics = []
        for tr in tr_lis:
            novel_url = tr.xpath('./td[@class="even"]/a/@href').extract_first()
            novel_name = tr.xpath('./td[@class="even"]/a/text()').extract_first()
            author_name = tr.xpath('./td/text()').extract_first()
            dic = {
                'novel_name': novel_name,
                'author_name': author_name,
                'novel_url': novel_url,
            }
            dics.append(dic)
        df = pd.DataFrame(dics)
        if df.empty:
            print("Nothing found, please try again")
            continue
        print(df)
        num = int(input("Enter the row number of the novel to download: "))
        url = df.iloc[num]['novel_url']
        novel_name1 = df.iloc[num]['novel_name']
        return url, novel_name1
def main():
    while True:
        start = datetime.datetime.now()
        global novel_name1
        memu_url, novel_name1 = search()
        if not os.path.exists(novel_name1):
            os.makedirs(novel_name1)
        dics = novel_chapter_memu(memu_url)
        tasks = []
        for dic in tqdm(dics):
            task = asyncio.ensure_future(Chapter_content(dic))
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        marge()
        print('Elapsed:', datetime.datetime.now() - start)
        select = input("Download another? Y/N: ")
        if select == 'Y':
            continue
        else:
            break

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    main()
# Adding the retry loop and the random delay solved the problem.

The original code that was getting the 503 errors:
"""
Target site: https://www.xbiquge.la/
# tqdm   - progress bar library
# parsel - parsing library, supports both CSS and XPath selectors
"""
import requests, os, datetime
import parsel
from tqdm import tqdm
import pandas as pd
import aiohttp
import aiofiles
import asyncio
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    'Referer': 'https://www.xbiquge.la/'
}
def get_response(html_url):
    resp = requests.get(url=html_url, headers=headers, timeout=200)
    resp.encoding = resp.apparent_encoding
    return resp
async def Chapter_content(dic):  # fetch and save a single chapter page
    html_url = dic['chapter_url']
    index = str(dic['index'])
    async with aiohttp.ClientSession() as session:
        async with session.get(url=html_url, headers=headers) as resp:
            resp = await resp.text(encoding="utf-8")
    if len(index) < 5:
        index = "0" * (5 - len(index)) + index  # zero-pad so the files sort in chapter order
    # instantiate a parsel Selector
    selector = parsel.Selector(resp)
    # CSS selector for the chapter body
    content = selector.css('#content::text').extract()
    chapter_name = selector.xpath('//div[@class="box_con"]/div/h1/text()').extract_first()
    chapter_content = "".join(content)
    path = novel_name1 + "/" + index + ".txt"
    async with aiofiles.open(path, 'w', encoding="utf-8") as f:
        if chapter_name is not None:
            await f.write(chapter_name)
        await f.write("\n")
        await f.write(chapter_content)
        await f.write("\n")
    await asyncio.sleep(1)
def marge():  # merge the per-chapter txt files into a single book file
    path = os.getcwd()
    os.chdir(path + "/" + novel_name1)
    lis = os.listdir()
    s = "+".join(lis)
    os.system(fr'copy /b {s} {novel_name1}.txt')  # Windows-only: copy /b concatenates the chapter files
    print(s)
    for file in os.listdir():
        if not file.startswith(novel_name1):
            os.remove(file)
    os.chdir(path)  # return to the original working directory for the next download
def novel_chapter_memu(html_url):  # parse the novel's table of contents into chapter links
    resp = get_response(html_url)
    # instantiate a parsel Selector
    selector = parsel.Selector(resp.text)
    # CSS selector for the chapter list
    lis_href = selector.css('#list dd a::attr(href)').getall()
    dics = []
    index = 0
    for chapter_url in lis_href:
        index += 1
        chapter_url = 'https://www.xbiquge.la' + chapter_url
        dic = {
            'index': index,
            'chapter_url': chapter_url
        }
        dics.append(dic)
    return dics
def search():
    while True:
        serch_url = 'https://www.xbiquge.la/modules/article/waps.php'  # Request Method: POST, Referer: https://www.xbiquge.la/
        name = str(input('Enter the title or author of the novel to download: '))
        print(f'Searching for {name.center(50, "-")}')
        data = {
            'searchkey': name
        }
        resp = requests.post(url=serch_url, headers=headers, data=data)
        resp.encoding = resp.apparent_encoding
        selector = parsel.Selector(resp.text)
        tr_lis = selector.xpath('//div[@id="content"]//tr')
        dics = []
        for tr in tr_lis:
            novel_url = tr.xpath('./td[@class="even"]/a/@href').extract_first()
            novel_name = tr.xpath('./td[@class="even"]/a/text()').extract_first()
            author_name = tr.xpath('./td/text()').extract_first()
            dic = {
                'novel_name': novel_name,
                'author_name': author_name,
                'novel_url': novel_url,
            }
            dics.append(dic)
        df = pd.DataFrame(dics)
        if df.empty:
            print("Nothing found, please try again")
            continue
        print(df)
        num = int(input("Enter the row number of the novel to download: "))
        url = df.iloc[num]['novel_url']
        novel_name1 = df.iloc[num]['novel_name']
        return url, novel_name1
def main():
    while True:
        start = datetime.datetime.now()
        global novel_name1
        memu_url, novel_name1 = search()
        if not os.path.exists(novel_name1):
            os.makedirs(novel_name1)
        dics = novel_chapter_memu(memu_url)
        tasks = []
        for dic in tqdm(dics):
            task = asyncio.ensure_future(Chapter_content(dic))
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        marge()
        print('Elapsed:', datetime.datetime.now() - start)
        select = input("Download another? Y/N: ")
        if select == 'Y':
            continue
        else:
            break

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    main()
Add a time delay, and fire another request after catching the error.

细水流长 posted on 2021-7-7 20:10
Retry when the status isn't 200. You can set a maximum retry count, and if it still fails after that many attempts, just let the value be empty.
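A minimal sketch of that idea, assuming an aiohttp session and the same headers dict as in the code above; the helper name fetch_with_retry, the retry limit of 4, and the back-off delay are illustrative choices, not part of the original post:

import asyncio
import random

async def fetch_with_retry(session, url, headers, max_retries=4):
    """session is an aiohttp.ClientSession; returns '' after max_retries failed attempts."""
    for attempt in range(max_retries):
        async with session.get(url, headers=headers) as resp:
            if resp.status == 200:
                return await resp.text(encoding="utf-8")
        # non-200 (e.g. 503): back off briefly before the next attempt
        await asyncio.sleep(random.uniform(1, 5))
    return ""  # caller can treat this as "chapter missing" and skip it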
A 503 is probably not caused by the request itself; it's most likely the async requests coming in too fast. time.sleep(1)  # sleep 1 s after the download so the IP doesn't get banned - add that line after every request to the site.
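One caveat with that suggestion: inside a coroutine, time.sleep(1) freezes the whole event loop, so the async equivalent await asyncio.sleep(1) is what you actually want (and what the fixed code above uses). A tiny sketch with a hypothetical helper, assuming an aiohttp.ClientSession is passed in:

import asyncio

async def polite_get(session, url, headers):
    # session is an aiohttp.ClientSession; fetch one page, then pause before returning
    async with session.get(url, headers=headers) as resp:
        text = await resp.text(encoding="utf-8")
    await asyncio.sleep(1)  # yields to the other tasks; time.sleep(1) would block them all
    return text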
小苏打饼干 posted on 2021-7-7 20:41
Add after every request to the site ...

Adding that in the async version doesn't help much either, the requests are just too fast.

The fixes I've tried come down to two: one is a time.sleep after every request; the other is to check every response, because the failures all come back as that "503 ..." string, and if you get it, just keep re-requesting - after a little while it goes back to normal. In my tests the download speed is about the same either way; sometimes time.sleep is even a bit faster (taking time.sleep(1) as the example).

Good post, bookmarked for later.

A correction: 403 is "access forbidden"; anything 500 and up is a server-side problem, probably caused by frequent requests hitting a server that is a synchronous, blocking container. I'd suggest lowering the number of concurrent requests.
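Following that last suggestion, one way to cap how many requests run at once is an asyncio.Semaphore; a minimal sketch under the assumption that the chapter URLs are collected first - the helper names and the limit of 10 are arbitrary, not from the original code:

import asyncio
import aiohttp

async def download_all(urls, headers, limit=10):
    sem = asyncio.Semaphore(limit)  # at most `limit` requests in flight at any moment

    async def limited_get(session, url):
        async with sem:  # wait here if the limit is already reached
            async with session.get(url, headers=headers) as resp:
                return await resp.text(encoding="utf-8")

    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(limited_get(session, u) for u in urls))

Reusing one ClientSession for all the chapters also avoids opening a new connection per request, which helps about as much as the rate limiting itself.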