python 线程池、进程池执行结果疑问?
本帖最后由 double07 于 2021-12-23 20:58 编辑针对同一列表中的链接,采取线程池、进程池及单线程方式爬取,运行结果:多线程时间慢于单线程,多进程下载的数据不完整,少很多,没找到原因,请大佬指点一下
def download(url):
data_list = []
global p
try:
html_detail = gethtml(url)
html = etree.HTML(html_detail)
b = html.xpath('//li[@class="clear"]')
for i in range(len(b)):
lst = {}
lst['索引'] = ''
lst['主城九区'] = re.findall(r'<a class="selected CLICKDATA" .*?\s+.*?>(.*?)</a>', html_detail)
lst['小区名称'] = b.xpath('./div/div/div/div/a/text()')
lst['商圈'] = re.findall(r'.*?共找到.*?<a.*?>(.*?)二手房</a>', html_detail)
data_list.append(lst)
df = pd.DataFrame(data_list)
for i in df.index:
df['索引'].at = i + 1
df.to_excel("./二手房源" + ".xlsx", index=False)
print(url, '数据提取完毕')
except Exception:
return
if __name__ == '__main__':
df = pd.read_excel("./贝壳区域子链接.xlsx", sheet_name='Sheet1')
id_list = df['子链接'].tolist() #有6000多条链接,每个链接格式“https://cq.ke.com/ershoufang/beibinlu/pg37p7/”
#多线程
start=time.time()
with ProcessPoolExecutor(max_workers=10) as t:
for i in id_list:
t.submit(download,i)
t.shutdown()
end=time.time()
print('全部数据提取完毕,multi_thread cost:',end-start,'s')
#单线程
start=time.time()
for i in id_list:
download(i)
end=time.time()
print('全部数据提取完毕,sigle_thread cost:',end-start,'s')
#多进程
start = time.time()
t = ProcessPoolExecutor(max_workers=16)
for id in id_list:
t.submit(download, id)
t.shutdown()
end = time.time()
print('全部数据提取完毕,multi_thread cost:', end - start, 's') 你这多线程和多进程是一样的,ProcessPoolExecutor是进程池,线程池是这个 ThreadPoolExecutor 建议用协程
页:
[1]