学 Python 一个多月啦，发现 Python 代码真的简单，
全站数据爬取，精简一下估计用不了 30 行代码。
import csv
import re
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
# Browser-like User-Agent header sent with every page request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}

# The output file is opened once, in append mode, and shared by every worker.
# newline='' hands line-ending control to the csv module; without it each row
# is followed by an extra blank line on platforms that translate '\n'.
# NOTE(review): characters that GBK cannot encode will raise
# UnicodeEncodeError when a row is written - confirm GBK is really required
# by whatever consumes this file.
f = open('qianyudata.csv', 'a', encoding='gbk', newline='')
wr = csv.writer(f)
# Patterns used on every page; compiled once instead of on each call.
_TITLE_RE = re.compile('<title>(.*?)</title>', re.S)
_CELL_RE = re.compile('<td class="table-tdcon" width="35%">(.*?)</td>', re.S)
_WIDE_CELL_RE = re.compile(
    '<td class="table-tdcon" width="35%" colspan="3">(.*?)</td>', re.S)

# qycompany is run from many worker threads that all share the module-level
# csv writer; text-mode file objects are not thread-safe, so row writes are
# serialized through this lock.
_WRITE_LOCK = threading.Lock()


def qycompany(url, timeout=10):
    """Fetch one company page and append its details as a row to the shared CSV.

    The row consists of the contents of every matching table cell (normal
    cells first, then the ``colspan="3"`` cells), followed by the company
    introduction text and finally the page URL. The row is written through
    the module-level ``wr`` writer and the request uses the module-level
    ``headers``.

    Args:
        url: Address of the company page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker thread forever.

    Returns:
        None. The outcome is reported on stdout; request failures and
        unusable pages are reported and skipped instead of raising.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout).text
    except requests.RequestException as exc:
        print('请求失败' + url + ' ' + str(exc))
        return

    titles = _TITLE_RE.findall(response)
    if not titles:
        # No <title> at all: not a page this scraper knows how to read.
        print('页面异常' + url)
        return
    if titles[0] == "404":
        # The site signals a missing shop through the page title.
        print('无此店铺' + url)
        return

    tree = etree.HTML(response)
    intro_nodes = tree.xpath('//*[@class="shop-comp-ins"]//text()')
    # Only the first text node is kept; a page without an introduction gets
    # the same 'null' placeholder as an empty table cell rather than losing
    # the whole row.
    cp_introduce = str(intro_nodes[0]) if intro_nodes else 'null'

    cells = _CELL_RE.findall(response) + _WIDE_CELL_RE.findall(response)
    # Empty cells are replaced with 'null' for the later database import.
    row = ['null' if cell == '' else cell for cell in cells]
    row.append(cp_introduce)
    row.append(url)

    with _WRITE_LOCK:
        wr.writerow(row)
    print('采集成功' + url)
if __name__ == '__main__':
    try:
        # Multithreaded crawl; tune the worker count to the machine. The
        # site currently has no anti-scraping limits.
        with ThreadPoolExecutor(200) as t:
            # Shop link ids start at 1280; there are gaps in the sequence.
            jobs = {}
            for i in range(1280, 60000):
                url = f'https://{i}.qianyuwang.com/company.html'
                jobs[t.submit(qycompany, url)] = url
        # Leaving the `with` block waits for every task to finish. An
        # exception raised inside a worker is stored on its future and would
        # otherwise go unnoticed, so report each one here.
        for job, url in jobs.items():
            error = job.exception()
            if error is not None:
                print('采集失败' + url + ' ' + repr(error))
    finally:
        # Close the shared CSV file even if the crawl is interrupted.
        f.close()
    print('采集完成')