python线程池使用问题?
用线程池多线程爬取数据(6000多条URL),但速度还是很慢,代码哪里写得不对?def download(url):
global p
try:
html_detail = gethtml(url)
html = etree.HTML(html_detail)
b = html.xpath('//li[@class="clear"]')
for i in range(len(b)):
lst = {}
lst['索引'] = ''
lst['小区名称'] = b.xpath('./div/div/div/div/a/text()')
data_list.append(lst)
df = pd.DataFrame(data_list)
for i in df.index:
df['索引'].at = i + 1
df.to_excel("./二手房源" + ".xlsx", index=False)
p = p + 1
print(url,'数据提取完毕')
if __name__ == '__main__':
    # The link file holds comma-separated URLs. Read it as one string:
    # readlines() returns a list, which has no split() method.
    with open('./子链接.txt', 'r') as f:
        raw_links = f.read()

    # Trim surrounding whitespace/newlines, drop empty entries, and remove
    # duplicates while keeping first-seen order (dict keys preserve order).
    candidates = (link.strip() for link in raw_links.split(','))
    new_lst = list(dict.fromkeys(link for link in candidates if link))

    start = time.time()
    with ThreadPoolExecutor(50) as t:
        for link in new_lst:
            t.submit(download, link)
    # Leaving the `with` block waits for every submitted task, so the end
    # timestamp must be taken here for the measurement to cover the work.
    end = time.time()
    print('全部数据提取完毕,multi_thread cost:', end - start, 's')
完整源代码与源文件在此下载:链接:https://pan.baidu.com/s/1tWnTTx2O6akmRhUtmVJS-Q
提取码:2pgc
# Quoted from the question: rebuilds the table from data_list, tries to number
# the rows, and writes the workbook.
df = pd.DataFrame(data_list)
for i in df.index:
# NOTE(review): this assigns to the `.at` attribute instead of indexing with
# it, so no cell is written; the indexer form is df.at[i, '索引'] = i + 1.
df['索引'].at = i + 1
df.to_excel("./二手房源" + ".xlsx", index=False)
你这段代码缩进错误,应该放在 for i in range(len(b)): 这段循环之外。你了解一下 Python 的多线程和多进程的区别,就明白为什么会慢了。
页:
[1]