用线程池多线程爬取数据(6000多条URL),但速度还是很慢,代码哪里写得不对?
[Python] 纯文本查看 复制代码
def download(url):
global p
try:
html_detail = gethtml(url)
html = etree.HTML(html_detail)
b = html.xpath('//li[@class="clear"]')
for i in range(len(b)):
lst = {}
lst['索引'] = ''
lst['小区名称'] = b[i].xpath('./div/div[2]/div[1]/div/a/text()')[0]
data_list.append(lst)
df = pd.DataFrame(data_list)
for i in df.index:
df['索引'].at[i] = i + 1
df.to_excel("./二手房源" + ".xlsx", index=False)
p = p + 1
print(url,'数据提取完毕')
if __name__ == '__main__':
f = open('./子链接.txt', 'r')
sl = f.readlines()
old_lst = sl[0].split(',')
new_lst = []
for i in old_lst:
if i not in new_lst:
new_lst.append(i)
start=time.time()
with ThreadPoolExecutor(50) as t:
for i in new_lst:
t.submit(download,i)
end=time.time()
print('全部数据提取完毕,multi_thread cost:',end-start,'s')
完整源代码与源文件在此下载:链接:https://pan.baidu.com/s/1tWnTTx2O6akmRhUtmVJS-Q
提取码:2pgc
|