请问用多线程为什么数据抓不完整?数据共有649条,实际只抓取了619条,问题在哪里?[Python] 纯文本查看 复制代码
from concurrent.futures import ThreadPoolExecutor
def down_data(i):
html = gethtml(i)
llist = parse_url(html)
for u in llist:
try:
html_detail = gethtml_detail(u)
u = re.findall('https://susong-item.taobao.com/auction/\d{1,}.htm', html_detail, re.S)[0].strip()
html_detail = gethtml_detail(u)
parse = parse_url_detail(html_detail)
df = pd.DataFrame(parse)
for i in df.index:
df['索引'].at[i] = i + 1
df2 = cpca.transform(df['地址'])
df['区'] = df2.loc[:, ['区']]
df['地址'] = df2.loc[:, ['地址']]
# result = pd.concat([df, df1], axis=0)
df.to_excel("C:/Users/Administrator/Desktop/Python/AL-SF/1-retail" + st + ".xlsx", index=False)
print('第%s条数据已保存' % str(i + 1))
except Exception:
try:
html_detail = gethtml_detail(u)
u = re.findall('https://zc-item.taobao.com/auction/\d{1,}.htm', html_detail, re.S)[0].strip()
parse = parse_url_detail(u)
df = pd.DataFrame(parse)
for i in df.index:
df['索引'].at[i] = i + 1
df2 = cpca.transform(df['地址'])
df['区'] = df2.loc[:, ['区']]
df['地址'] = df2.loc[:, ['地址']]
# result = pd.concat([df, df1], axis=0)
df.to_excel("C:/Users/Administrator/Desktop/Python/AL-SF/1-retail" + st + ".xlsx", index=False)
print('第%s条数据已保存' % str(i + 1))
except Exception:
html_detail = gethtml_detail(u)
parse = parse_url_detail(html_detail)
df = pd.DataFrame(parse)
for i in df.index:
df['索引'].at[i] = i + 1
df2 = cpca.transform(df['地址'])
df['区'] = df2.loc[:, ['区']]
df['地址'] = df2.loc[:, ['地址']]
# result = pd.concat([df, df1], axis=0)
df.to_excel("C:/Users/Administrator/Desktop/Python/AL-SF/1-retail" + st + ".xlsx", index=False)
print('第%s条数据已保存' % str(i + 1))
# 主程序
def main():
global p
page = next_page()
time_start = time.time()
with ThreadPoolExecutor(curPage) as t:
for i in page:
p += 1
t.submit(down_data, i)
time_end = time.time()
print('第%s页数据已保存!====用时%.1f秒' % (p, time_end - time_start)) |