本帖最后由 double07 于 2021-10-22 23:53 编辑
需要抓取的数据有十多万条，单线程肯定太慢，所以多线程应该是最佳方案。但通过实验发现，多线程代码并未起作用，耗时与单线程差不多，问题出在哪儿？
def single_thread():
    """Call download() on each base listing URL, one after another."""
    base_urls = (
        'https://cq.ke.com/ershoufang/chaotianmen/',
        'https://cq.ke.com/ershoufang/lianglukou/',
    )
    for base_url in base_urls:
        download(base_url)
def multi_thread():
    """Run download() for each base listing URL in its own thread.

    Exactly one thread is created per URL, so at most ``len(base_urls)``
    downloads overlap; the work inside each download() call stays
    sequential. Returns only after every thread has finished.
    """
    base_urls = [
        'https://cq.ke.com/ershoufang/chaotianmen/',
        'https://cq.ke.com/ershoufang/lianglukou/',
    ]
    workers = [
        threading.Thread(target=download, args=(base_url,))
        for base_url in base_urls
    ]
    # Start everything first, then wait, so the threads actually overlap.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Guards the shared progress counter ``p``: download() is run concurrently
# from several threads and ``p = p + 1`` is a non-atomic read-modify-write.
_p_lock = threading.Lock()


def download(ls):
    """Scrape all listings under one base URL and append them to data_list.

    For each price band suffix ('p1'..'p7') the band's first page is fetched
    to read the total number of listings, the number of result pages is
    derived from it (30 listings per page), and every result page is then
    fetched and parsed. One dict per listing is appended to the module-level
    ``data_list`` and the module-level counter ``p`` is incremented.

    Args:
        ls: Base listing URL ending in '/'.

    Any error while processing a price band is reported and the remaining
    pages of that band are skipped; the other bands are still processed.
    """
    global p
    price_bands = ('p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7')
    for pr in price_bands:
        url = ls + pr + '/'
        try:
            html = gethtml(url)
            # int() instead of eval(): the text comes from a remote page and
            # must never be executed as code.
            mount = int(re.findall(r'共找到<span> (.*?) </span>套', html)[0].strip())
            # Ceiling division: an exact multiple of 30 must not produce an
            # extra, empty trailing page.
            page = (mount + 29) // 30
            p_lst = ls + "pg{}" + pr + "/"
            for page_no in range(1, page + 1):
                html_detail = gethtml(p_lst.format(page_no))
                tree = etree.HTML(html_detail)
                items = tree.xpath('//li[@class="clear"]')
                if not items:
                    continue
                # These two values are page-wide, so extract them once per
                # page instead of once per listing.
                district = re.findall(
                    r'<a class="selected CLICKDATA" .*?\s+.*?>(.*?)</a>', html_detail)[0]
                area = re.findall(
                    r'.*?共找到.*?<a.*?>(.*?)二手房</a>', html_detail)[0]
                for item in items:
                    # A fresh dict per listing: appending one shared dict
                    # makes every entry of data_list alias the same object,
                    # so all rows would show the last listing's values.
                    # NOTE(review): the original wrote into a global ``lst``;
                    # if it carried extra preset keys, copy them in here.
                    row = {
                        '主城九区': district,
                        '小区名称': item.xpath('./div/div[2]/div[1]/div/a/text()')[0],
                        '商圈': area,
                    }
                    with _p_lock:
                        data_list.append(row)
                        p = p + 1
                        saved = p
                    print('第%s条数据已保存' % saved)
        except Exception as exc:
            # Best-effort scraping: report the failure and move on to the
            # next price band instead of silently swallowing it.
            print('skipping %s: %r' % (url, exc))
            continue
if __name__ == '__main__':
    # Time one complete multi-threaded run.
    start = time.time()
    multi_thread()
    end = time.time()
    # The label names the function that was actually timed; it previously
    # said 'single_thread' although multi_thread() is what runs here.
    print('multi_thread cost:', end - start, 's')