How to add multithreading while keeping the crawl order unchanged
Each chapter has several sub-sections, and I can only extract the sub-section links from within each chapter's page. How should I add multithreading here? Any pointers would be appreciated.
import requests
from lxml import etree
import re
import urllib.parse
import time
import os
success = False
headers = {
    'user-agent': 'Mozilla/5.0',
}

# Helper that URL-encodes/decodes the search keyword in a given charset.
class Urlchuli():
    def __init__(self, can, mazhi='utf-8'):
        self.can = can
        self.mazhi = mazhi

    def url_bm(self):
        # percent-encode the keyword bytes for the POST body
        quma = str(self.can).encode(self.mazhi)
        bianma = urllib.parse.quote(quma)
        return bianma

    def url_jm(self):
        # decode a percent-encoded string back to text
        quma = str(self.can)
        jiema = urllib.parse.unquote(quma, self.mazhi)
        return jiema

name = Urlchuli(input('请输入书名/作者:\n'), 'gbk')
name = name.url_bm()

def search_book(name):
    cookies = {
        'PHPSESSID': '5u2161egtijt1b5dr6qdtch3ll',
        'jq_Obj': '1',
        '__51cke__': '',
        '__tins__18946369': '%7B%22sid%22%3A%201684375113921%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201684377145674%7D',
        '__51laig__': '6',
    }
    headers = {
        'authority': 'www.z555.net',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'no-cache',
        'content-type': 'application/x-www-form-urlencoded',
        # 'cookie': 'PHPSESSID=5u2161egtijt1b5dr6qdtch3ll; jq_Obj=1; __51cke__=; __tins__18946369=%7B%22sid%22%3A%201684375113921%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201684377145674%7D; __51laig__=6',
        'origin': 'https://www.z555.net',
        'pragma': 'no-cache',
        'referer': 'https://www.z555.net/search.php',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0',
    }
    data = f's={name}&searchtype=articlename'
    response = requests.post('https://www.z555.net/search.php', cookies=cookies, headers=headers, data=data)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    books_time = html.xpath('//*[@id="nr"]/td/text()')
    books_name = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/text()')
    books_writer = html.xpath('//*[@id="nr"]/td/text()')
    books_link = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/@href')
    # list02 = map(list, zip(books_name, books_link, books_writer, books_time))
    length = len(books_link)
    print(length)
    # one [name, full link, author, update time] entry per search result
    list01 = [[books_name[i], 'https://www.z555.net' + books_link[i], books_writer[i], books_time[i]]
              for i in range(length)]
    for i in list01:
        print(i)
    print(f'------------当前共查询到{length}种小说-------------')
    # print(list02)
    return list01

def choose_name(list01):
    while True:
        try:
            name = input('请输入你要获取其中的小说:')
            for i in list01:
                if name in i:
                    print(i)
                    return i
                else:
                    continue
        except ValueError as f:
            print(f'输入程序出错:{f},重新输入!')

def get_find(list01):  # list01: [book name, book url, author, update time]
    get_choose = requests.get(list01[1])
    get_choose.encoding = get_choose.apparent_encoding
    books_writer = list01[2]
    books_name = list01[0]
    html = etree.HTML(get_choose.text)
    catalogue_name = html.xpath('//*[@id="chapterlist"]/ul/li/a/text()')
    catalogue_link = html.xpath('//*[@id="chapterlist"]/ul/li/a/@href')
    counts = len(catalogue_name)
    # one [chapter url, chapter title, book name, author] entry per chapter
    list_data = [['https://www.z555.net' + catalogue_link[i], catalogue_name[i], books_name, books_writer]
                 for i in range(counts)]
    return list_data

def next_ye(nurl):
    book_next_link = ''
    # print(nurl)
    next_url = nurl[0][0]
    title_url = next_url.split('1.html')[0]
    book_name = nurl[0][2]
    book_author = nurl[0][3]
    while book_next_link != './':
        # time.sleep(0.4)
        res = requests.get(url=next_url, headers=headers)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        book_next_link = html.xpath('//*[@id="container"]/div/a/@href')[0]
        # print(book_next_link)
        book_chapter_name = html.xpath('//div[@class="title"]/h1/text()')[0]
        book_chapter_name = named(book_chapter_name)
        obj_name = re.compile(r'<div id="content">(?P<content>.*?)</div>', re.S)
        if not os.path.exists(f'小说存放/{book_author}/《{book_name}》/'):
            os.makedirs(f'小说存放/{book_author}/《{book_name}》/')
        with open(f'小说存放/{book_author}/《{book_name}》/《{book_chapter_name}》.txt', 'w',
                  encoding='utf-8') as f:
            for i in obj_name.finditer(res.text):
                chapter_content = i.group('content').replace("<br/><br/>", '\n')
                f.write('【' + book_chapter_name + '】' + '\n\n' + chapter_content + '\n\n')
                print(f'------------【{book_chapter_name}下载完成】------------------')
        next_url = title_url + book_next_link
    else:
        print('--------------------小说下载完毕---------------------')

def named(title):
    # strip characters that are not allowed in file names
    return re.sub(r'[?\\/:!<>|"\s]', '_', title)

def main():
    text = get_find(choose_name(search_book(name)))
    next_ye(text)


if __name__ == '__main__':
    t1 = time.time()
    main()
    t2 = time.time()
    print('耗时:', t2 - t1)

Reply (yuxuechao, 2023-5-20 21:21):
thread + define a step size for each thread.

Reply (YZH1028, quoting yuxuechao's "thread + define a step size for each thread"):
Could you explain in a bit more detail? With thread, my scraper only ever crawls the first chapter; the sub-section links never move on to the next one.

Reply (llacjj, 2023-5-21 05:05):
There is probably no need to handle the order in the crawl stage at all. Give each thread a sequence number when you hand out the work, and process the results by that number once they come back, rather than by whichever thread happens to finish first. The logic is much easier to handle that way; trying to keep things sequential while crawling defeats the purpose of multithreading in the first place.

Reply:
Assign each thread an id, split the task pool evenly to start with, fine-tune the split according to speed, and finally sort by id.

Reply (YZH1028):
OK, thanks for the advice.
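For illustration, a minimal sketch of the "sequence number" idea from the replies above, using plain threading (fetch_chapter, download_in_order and chapter_urls are made-up names, not part of the original script): each worker is handed the index of its task, files its result under that index, and the caller reads the results back strictly in index order, so the output order no longer depends on which thread finishes first.

import threading

import requests

def fetch_chapter(index, url, results):
    # hypothetical per-chapter worker: download one page and file it under its sequence number
    res = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    res.encoding = 'gbk'
    results[index] = res.text

def download_in_order(chapter_urls):
    results = {}
    threads = []
    for index, url in enumerate(chapter_urls):
        t = threading.Thread(target=fetch_chapter, args=(index, url, results))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    # crawling finished in arbitrary order, but the results are consumed strictly by index
    return [results[i] for i in range(len(chapter_urls))]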
Reply (YZH1028, quoting llacjj's "There is probably no need to handle the order in the crawl stage; give each thread a sequence number and process the results by that number ..."):
Thanks, understood.

Reply (pjy612, 2023-5-21 18:09):
If you know the request's id, you can pass it along as a parameter and use it as the sort key for the results. If the results themselves contain a field that can be sorted on, you can also use that for the final ordering.
It depends on how strict your timeliness requirements are.
If you need to save things the moment they arrive, you can name the file, or the result, after something in the result or after the input id, and so on...
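A small sketch of the "name the file by its id" idea: prefix each chapter file with a zero-padded sequence number so the files sort correctly on disk no matter which thread finished first (save_chapter and its arguments are illustrative names, not part of the original script).

import os

def save_chapter(index, chapter_title, content, folder='小说存放'):
    # hypothetical helper: 0001_xxx.txt, 0002_xxx.txt, ... keeps the on-disk order equal to the chapter order
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f'{index:04d}_{chapter_title}.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)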
Reply (YZH1028, quoting pjy612's "If you know the request's id, you can pass it along as a parameter and use it as the sort key for the results. If the results contain a sortable field ..."):
Got it, thanks for the advice.

Reply:
You can use ThreadPoolExecutor together with its map method; there is no need to call submit beforehand. map here has the same meaning as map in the Python standard library: it applies the same function to every element of a sequence. The code below runs get_html on each element of urls and hands the calls to the thread pool.
import time
from concurrent.futures import ThreadPoolExecutor

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

urls = [3, 2, 4]  # example input: each number doubles as the sleep time for that "page"
executor = ThreadPoolExecutor(max_workers=2)
# collect the values of the completed tasks through the executor's map
for data in executor.map(get_html, urls):
    print("get {} page".format(data))