本帖最后由 hj170520 于 2020-11-21 23:45 编辑
一般来说,我的爬虫代码会写成这样。
你们会怎么评价我以及我的代码?
根据各位的意见,我简化了一下,后续简化还在进行,谢谢大家的帮助。
[Python] 纯文本查看 复制代码
import requests
import csv
import re
# Request headers sent with every detail-page request: a desktop Safari
# user-agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/11.1.2 Safari/605.1.15'
    ),
}

# Local HTTP proxy endpoint. NOTE(review): nothing visible in this file passes
# it to requests -- confirm whether it is still needed.
proxy = {'http': '127.0.0.1:1086'}
# Fixed output column order (the order in which the parser below looks for
# each label). Writing every row with exactly these columns keeps the CSV
# aligned even when a page lacks some of the fields.
_COLUMNS = (
    "个人姓名",
    "单位名称",
    "法定代表人(主要负责人)姓名",
    "主要违法违规事实",
    "行政处罚依据",
    "行政处罚决定",
    "作出处罚决定的机关名称",
    "作出处罚决定的日期",
)


def _extract_cells(text):
    """Return the cell texts found in *text* that contain no nested markup."""
    cells = []
    for cell in re.findall(r'0000pt; \\" >(.*?)</span>', text):
        # A cell containing '>' + label is collapsed to the bare label
        # (presumably the label is wrapped in extra tags there), so the
        # parser can match it by equality.
        if '>作出处罚决定的日期' in cell:
            cell = "作出处罚决定的日期"
        elif '>作出处罚决定的机关名称' in cell:
            cell = "作出处罚决定的机关名称"
        # Anything still holding a '<' is leftover markup, not a value.
        if '<' not in cell:
            cells.append(cell)
            print(cell)
    return cells


def _cell_after(cells, index):
    """Return the cell following *index*, or "" when *index* is the last cell."""
    return cells[index + 1] if index + 1 < len(cells) else ""


def _join_until(cells, start, stop_marker=None):
    """Concatenate cells from *start* up to the first one containing *stop_marker*.

    Runs to the end of *cells* when the marker never occurs or is None, so a
    missing terminating label can no longer push the scan past the list end.
    """
    collected = []
    for cell in cells[start:]:
        if stop_marker is not None and stop_marker in cell:
            break
        collected.append(cell)
    return "".join(collected)


def _parse_info(cells):
    """Map the label cells in *cells* to their values; later matches overwrite earlier ones."""
    info = {}
    for i, cell in enumerate(cells):
        if cell == "个人姓名":
            name = _cell_after(cells, i)
            # "——" or the next label ("单位") means no individual is named.
            info["个人姓名"] = " " if name in ("——", "单位") else name
        if cell == "名称":
            info["单位名称"] = _cell_after(cells, i)
        # NOTE(review): this substring test also fires on any later cell that
        # mentions 负责人, and the last match wins -- confirm that is intended.
        if "法定代表人" in cell or "负责人" in cell:
            info["法定代表人(主要负责人)姓名"] = _cell_after(cells, i)
        if "主要违法违规事实" in cell:
            info["主要违法违规事实"] = _join_until(cells, i + 1, "行政处罚依据")
        if cell == "行政处罚依据":
            info["行政处罚依据"] = _join_until(cells, i + 1, "行政处罚决定")
        if cell == "行政处罚决定":
            info["行政处罚决定"] = _join_until(cells, i + 1, "作出处罚决定的机关名称")
        if cell == "作出处罚决定的机关名称":
            info["作出处罚决定的机关名称"] = _cell_after(cells, i)
        if cell == "作出处罚决定的日期":
            # The date may be split over several cells; take everything left.
            info["作出处罚决定的日期"] = _join_until(cells, i + 1)
    return info


def table(order, timeout=30):
    """Fetch one document's detail JSON, parse it and append a row to the CSV.

    Args:
        order: Document id substituted into the detail-page URL.
        timeout: Seconds to wait for the server before requests gives up.

    The parsed fields are printed, then appended to '银保监会详情页.csv' as one
    tab-delimited row with the columns of ``_COLUMNS``; fields that were not
    found on the page are written as empty strings.

    Raises:
        Whatever ``requests.get`` raises (for example on a timeout) and
        OSError if the output file cannot be opened.
    """
    url = "https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/data_docId=%s.json" % order
    req = requests.get(url, headers=headers, timeout=timeout)
    info = _parse_info(_extract_cells(req.text))
    print(info)
    # newline='' lets the csv module control line endings itself (otherwise
    # Windows output gets an extra blank line per row). Encoding is left at
    # the platform default so new rows match rows already in the file.
    with open('银保监会详情页.csv', 'a', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow([info.get(column, "") for column in _COLUMNS])
# Driver: read the index CSV and scrape the detail page for every row's
# "docid" column. The context manager closes the input file even if a
# request or parse step raises; newline="" is what the csv module expects
# for files it reads.
with open("银保监会1.csv", "r", newline="") as csvFile:
    rows = csv.DictReader(csvFile)
    for row in rows:
        table(row["docid"])