请客观地评价我的“爬虫代码”
本帖最后由 hj170520 于 2020-11-21 23:45 编辑
一般来说,我的爬虫代码会写成这样,
你们会怎么评价我以及我的代码?
根据各位的意见,我简化了一下,后续简化还在进行,谢谢大家的帮助。
import requests
import csv
import re
# Request headers handed to requests.get() by table(): a desktop Safari
# User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/11.1.2 Safari/605.1.15"
    ),
}
# Local HTTP proxy endpoint. NOTE(review): nothing in the visible code passes
# this mapping to requests, so it currently has no effect.
proxy = dict(http="127.0.0.1:1086")
# Output columns, in the order they are written to the detail CSV. Writing a
# fixed set of columns keeps rows aligned even when a notice lacks a field.
_DETAIL_FIELDS = (
    "个人姓名",
    "单位名称",
    "法定代表人(主要负责人)姓名",
    "主要违法违规事实",
    "行政处罚依据",
    "行政处罚决定",
    "作出处罚决定的机关名称",
    "作出处罚决定的日期",
)


def _clean_fragments(detail_list):
    """Return the text fragments that contain no leftover HTML markup.

    Fragments whose markup wraps the "decision date" or "deciding authority"
    label are first collapsed to the bare label so they survive the filter.
    """
    cleaned = []
    for fragment in detail_list:
        if re.search(r'>作出处罚决定的日期', fragment):
            fragment = "作出处罚决定的日期"
        elif re.search(r'>作出处罚决定的机关名称', fragment):
            fragment = "作出处罚决定的机关名称"
        # Anything still containing '<' is markup residue, not cell text.
        if not re.search(r'<', fragment):
            cleaned.append(fragment)
    return cleaned


def _collect_until(fragments, start, stop_pattern=None):
    """Join fragments from index ``start`` up to the first one matching ``stop_pattern``.

    With no ``stop_pattern``, or when nothing matches, everything up to the
    end of the list is joined, so a missing terminator cannot run off the end.
    """
    parts = []
    for fragment in fragments[start:]:
        if stop_pattern is not None and re.search(stop_pattern, fragment):
            break
        parts.append(fragment)
    return "".join(parts)


def _extract_fields(fragments):
    """Map the label fragments of one notice to their values.

    NOTE(review): the posted code had lost its list subscripts; pairing each
    label with the fragment(s) that follow it is a reconstruction and should
    be confirmed against a real response.
    """
    info = {}
    for i, label in enumerate(fragments):
        # Value cell that follows the label; empty when the label is last.
        following = fragments[i + 1] if i + 1 < len(fragments) else ""
        if label == "个人姓名":
            # A dash or the next header ("单位") means no individual is named.
            info["个人姓名"] = " " if following in ("——", "单位") else following
        if label == "名称":
            info["单位名称"] = following
        if re.search(r'(法定代表人|负责人)', label):
            info["法定代表人(主要负责人)姓名"] = following
        # Long cells are split across several fragments: gather them up to
        # the next section label.
        if re.search(r'主要违法违规事实', label):
            info["主要违法违规事实"] = _collect_until(fragments, i + 1, r'行政处罚依据')
        if label == "行政处罚依据":
            info["行政处罚依据"] = _collect_until(fragments, i + 1, r'行政处罚决定')
        if label == "行政处罚决定":
            info["行政处罚决定"] = _collect_until(
                fragments, i + 1, r'作出处罚决定的机关名称')
        if label == "作出处罚决定的机关名称":
            info["作出处罚决定的机关名称"] = following
        if label == "作出处罚决定的日期":
            # The date is the last section: take everything that remains.
            info["作出处罚决定的日期"] = _collect_until(fragments, i + 1)
    return info


def table(order):
    """Fetch one notice by document id and append its fields to the detail CSV.

    Args:
        order: Document id substituted into the ``data_docId=%s.json`` URL.

    Side effects:
        Performs an HTTP GET, prints the extracted mapping, and appends one
        tab-delimited row to ``银保监会详情页.csv``. Fields missing from the
        notice are written as empty strings.
    """
    url = "https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/data_docId=%s.json" % order
    req = requests.get(url, headers=headers)
    # The pattern targets span text inside the HTML embedded in the raw JSON
    # body (note the escaped quote).
    detail_list = re.findall(r'0000pt; \\" >(.*?)</span>', req.text)
    info = _extract_fields(_clean_fragments(detail_list))
    print(info)
    # newline='' lets the csv module control line endings itself.
    with open('银保监会详情页.csv', 'a', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow([info.get(field, "") for field in _DETAIL_FIELDS])
# Walk the listing CSV and scrape the detail page for every document id in it.
# The context manager closes the listing file once all rows are processed, and
# newline='' is how the csv module expects its input files to be opened.
with open("银保监会1.csv", "r", newline="") as csvFile:
    rows = csv.DictReader(csvFile)
    for row in rows:
        table(row["docid"])
# Example of the detail URL being scraped (docid 879960):
# https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectByDocId/data_docId=879960.json
可以试试打开一下 优秀,学习了,楼主这个好找工作不,,,, yzqhj 发表于 2020-11-21 20:41
优秀,学习了,楼主这个好找工作不,,,,
按坛友的说法,进号子是迟早的事 不是爬虫,没看出来虫子在哪 hj170520 发表于 2020-11-21 20:44
按坛友的说法,进号子是迟早的事
这么严重么? JSON 不是可以转成 Python 字典吗?然后用 for...in... 就可以取值了。 有意思,学习了。 怎么说呢……
看不明白,有很多重复的地方,应该还可以再改改。 还可以吧