How to add multithreading while keeping the crawl order unchanged
Each chapter has several sub-sections, and I can only extract the sub-section links from within each chapter's page. How should I add multithreading here? Any pointers would be appreciated.
import requests
from lxml import etree
import re
import urllib.parse
import time
import os
success = False
headers = {
    'user-agent': 'Mozilla/5.0',
}

# Helper that URL-encodes/decodes the search keyword in a given charset.
class Urlchuli():
    def __init__(self, can, mazhi='utf-8'):
        self.can = can
        self.mazhi = mazhi

    def url_bm(self):
        # percent-encode the keyword bytes for the POST body
        quma = str(self.can).encode(self.mazhi)
        bianma = urllib.parse.quote(quma)
        return bianma

    def url_jm(self):
        # decode a percent-encoded string back to text
        quma = str(self.can)
        jiema = urllib.parse.unquote(quma, self.mazhi)
        return jiema

name = Urlchuli(input('请输入书名/作者:\n'), 'gbk')
name = name.url_bm()

def search_book(name):
    cookies = {
        'PHPSESSID': '5u2161egtijt1b5dr6qdtch3ll',
        'jq_Obj': '1',
        '__51cke__': '',
        '__tins__18946369': '%7B%22sid%22%3A%201684375113921%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201684377145674%7D',
        '__51laig__': '6',
    }
    headers = {
        'authority': 'www.z555.net',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'no-cache',
        'content-type': 'application/x-www-form-urlencoded',
        # 'cookie': 'PHPSESSID=5u2161egtijt1b5dr6qdtch3ll; jq_Obj=1; __51cke__=; __tins__18946369=%7B%22sid%22%3A%201684375113921%2C%20%22vd%22%3A%206%2C%20%22expires%22%3A%201684377145674%7D; __51laig__=6',
        'origin': 'https://www.z555.net',
        'pragma': 'no-cache',
        'referer': 'https://www.z555.net/search.php',
        'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0',
    }
    data = f's={name}&searchtype=articlename'
    response = requests.post('https://www.z555.net/search.php', cookies=cookies, headers=headers, data=data)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    books_time = html.xpath('//*[@id="nr"]/td/text()')
    books_name = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/text()')
    books_writer = html.xpath('//*[@id="nr"]/td/text()')
    books_link = html.xpath('//tr[@id="nr"]/td[@class="odd"]/a/@href')
    # list02 = map(list, zip(books_name, books_link, books_writer, books_time))
    length = len(books_link)
    print(length)
    # one [name, full link, author, update time] entry per search result
    list01 = [[books_name[i], 'https://www.z555.net' + books_link[i], books_writer[i], books_time[i]]
              for i in range(length)]
    for i in list01:
        print(i)
    print(f'------------当前共查询到{length}种小说-------------')
    # print(list02)
    return list01

def choose_name(list01):
    while True:
        try:
            name = input('请输入你要获取其中的小说:')
            for i in list01:
                if name in i:
                    print(i)
                    return i
                else:
                    continue
        except ValueError as f:
            print(f'输入程序出错:{f},重新输入!')

def get_find(list01):  # list01: [book name, book url, author, update time]
    get_choose = requests.get(list01[1])
    get_choose.encoding = get_choose.apparent_encoding
    books_writer = list01[2]
    books_name = list01[0]
    html = etree.HTML(get_choose.text)
    catalogue_name = html.xpath('//*[@id="chapterlist"]/ul/li/a/text()')
    catalogue_link = html.xpath('//*[@id="chapterlist"]/ul/li/a/@href')
    counts = len(catalogue_name)
    # one [chapter url, chapter title, book name, author] entry per chapter
    list_data = [['https://www.z555.net' + catalogue_link[i], catalogue_name[i], books_name, books_writer]
                 for i in range(counts)]
    return list_data

def next_ye(nurl):
    book_next_link = ''
    # print(nurl)
    next_url = nurl[0][0]
    title_url = next_url.split('1.html')[0]
    book_name = nurl[0][2]
    book_author = nurl[0][3]
    while book_next_link != './':
        # time.sleep(0.4)
        res = requests.get(url=next_url, headers=headers)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        book_next_link = html.xpath('//*[@id="container"]/div/a/@href')[0]
        # print(book_next_link)
        book_chapter_name = html.xpath('//div[@class="title"]/h1/text()')[0]
        book_chapter_name = named(book_chapter_name)
        obj_name = re.compile(r'<div id="content">(?P<content>.*?)</div>', re.S)
        if not os.path.exists(f'小说存放/{book_author}/《{book_name}》/'):
            os.makedirs(f'小说存放/{book_author}/《{book_name}》/')
        with open(f'小说存放/{book_author}/《{book_name}》/《{book_chapter_name}》.txt', 'w',
                  encoding='utf-8') as f:
            for i in obj_name.finditer(res.text):
                chapter_content = i.group('content').replace("<br/><br/>", '\n')
                f.write('【' + book_chapter_name + '】' + '\n\n' + chapter_content + '\n\n')
                print(f'------------【{book_chapter_name}下载完成】------------------')
        next_url = title_url + book_next_link
    else:
        print('--------------------小说下载完毕---------------------')

def named(title):
    # strip characters that are not allowed in file names
    return re.sub(r'[?\\/:!<>|"\s]', '_', title)

def main():
    text = get_find(choose_name(search_book(name)))
    next_ye(text)


if __name__ == '__main__':
    t1 = time.time()
    main()
    t2 = time.time()
    print('耗时:', t2 - t1)

Reply (yuxuechao, 2023-5-20 21:21):
thread + define a step size for each thread.

Reply (YZH1028, quoting yuxuechao's "thread + define a step size for each thread"):
Could you explain in a bit more detail? With thread, my scraper only ever crawls the first chapter; the sub-section links never move on to the next one.

Reply (llacjj, 2023-5-21 05:05):
There is probably no need to handle the order in the crawl stage at all. Give each thread a sequence number when you hand out the work, and process the results by that number once they come back, rather than by whichever thread happens to finish first. The logic is much easier to handle that way; trying to keep things sequential while crawling defeats the purpose of multithreading in the first place.

Reply:
Assign each thread an id, split the task pool evenly to start with, fine-tune the split according to speed, and finally sort by id.

Reply (YZH1028):
OK, thanks for the advice.
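For illustration, a minimal sketch of the "sequence number" idea from the replies above, using plain threading (fetch_chapter, download_in_order and chapter_urls are made-up names, not part of the original script): each worker is handed the index of its task, files its result under that index, and the caller reads the results back strictly in index order, so the output order no longer depends on which thread finishes first.

import threading

import requests

def fetch_chapter(index, url, results):
    # hypothetical per-chapter worker: download one page and file it under its sequence number
    res = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
    res.encoding = 'gbk'
    results[index] = res.text

def download_in_order(chapter_urls):
    results = {}
    threads = []
    for index, url in enumerate(chapter_urls):
        t = threading.Thread(target=fetch_chapter, args=(index, url, results))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    # crawling finished in arbitrary order, but the results are consumed strictly by index
    return [results[i] for i in range(len(chapter_urls))]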
Reply (YZH1028, quoting llacjj's "There is probably no need to handle the order in the crawl stage; give each thread a sequence number and process the results by that number ..."):
Thanks, understood.

Reply (pjy612, 2023-5-21 18:09):
If you know the request's id, you can pass it along as a parameter and use it as the sort key for the results. If the results themselves contain a field that can be sorted on, you can also use that for the final ordering.
It depends on how strict your timeliness requirements are.
If you need to save things the moment they arrive, you can name the file, or the result, after something in the result or after the input id, and so on...
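A small sketch of the "name the file by its id" idea: prefix each chapter file with a zero-padded sequence number so the files sort correctly on disk no matter which thread finished first (save_chapter and its arguments are illustrative names, not part of the original script).

import os

def save_chapter(index, chapter_title, content, folder='小说存放'):
    # hypothetical helper: 0001_xxx.txt, 0002_xxx.txt, ... keeps the on-disk order equal to the chapter order
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f'{index:04d}_{chapter_title}.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)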
Reply (YZH1028, quoting pjy612's "If you know the request's id, you can pass it along as a parameter and use it as the sort key for the results. If the results contain a sortable field ..."):
Got it, thanks for the advice.

Reply:
You can use ThreadPoolExecutor together with its map method; there is no need to call submit beforehand. map here has the same meaning as map in the Python standard library: it applies the same function to every element of a sequence. The code below runs get_html on each element of urls and hands the calls to the thread pool.
import time
from concurrent.futures import ThreadPoolExecutor

def get_html(times):
    time.sleep(times)
    print("get page {} success".format(times))
    return times

urls = [3, 2, 4]  # example input: each number doubles as the sleep time for that "page"
executor = ThreadPoolExecutor(max_workers=2)
# collect the values of the completed tasks through the executor's map
for data in executor.map(get_html, urls):
    print("get {} page".format(data))