python爬取网页并将数据导入excel的代码
各位大佬,我在做一个项目,需要经销商数据来做分析{:1_923:}。我不太懂Python,自己捣鼓一下能够提取一些数据,但数据太多,搞不定,就来求助一下。我的诉求:爬取五菱官网发布的五菱宏光miniEV车型的经销商和服务商数据(只要这款车的),并将其整理成省、市、网点名称、具体地址四列。希望代码最好带上注释,我也能学习一下。
我的思考:我虽然获取了网页地址,但获取的内容是一个动态区域,分条目加载的,我获取的可能不是我想要的车型“五菱宏光MINI EV”的经销商地址,麻烦大佬们帮忙分析哈
官网地址:https://www.sgmw.com.cn/dealer_search.html
我目前的进度:
抓取了数据响应值,Python请求得到一些网页数据。
import requests
if __name__ == '__main__':
    # Download one page and save the raw response body as "<keyword>.html".
    #
    # NOTE(review): the posted snippet never defined `url`; the dealer search
    # page named in the post is used here -- confirm it is the endpoint that
    # actually accepts the `query` parameter.
    url = 'https://www.sgmw.com.cn/dealer_search.html'
    # The poster redacted the original User-Agent value; substitute the
    # User-Agent string of your own browser.
    headers = {'User-Agent': 'Mozilla/5.0'}
    kw = input('enter a word')
    param = {
        'query': kw
    }
    response = requests.get(url=url, params=param, headers=headers)
    page__text = response.text
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page__text)
print(fileName,'保存成功')
本帖最后由 jdk11223344 于 2022-3-17 00:19 编辑
#网站:https://www.sgmw.com.cn/dealer_search.html
import requests
import execjs
import re
if __name__ == '__main__':
keyword='五菱宏光miniev'
file='sgmw.js'
node=execjs.get()
ctx=node.compile(open(file,encoding="utf-8").read())
js=f"CarNameFromPage('{keyword}')"
result=ctx.eval(js)
url='https://www.sgmw.com.cn/'+result
headers={
#写自己的
}
text=requests.get(url=url,headers=headers).text
aa = re.split('var dealersData =',text)
t=aa.split(r';',1)
list=t.split('},{')
with open('keyword.text','w',encoding='utf-8') as fp:
for list_1 in list:
list_2=list_1.replace('[{','').replace('}]','').split(',')
for list_3 in list_2:
list_4=list_3.split(':')
for list_5 in list_4:
list_6=list_5.replace('"','')
if list_6=='dealerID':
fp.write('\n'+list_6+' ')
else:
fp.write(list_6+' ')
/**
 * Resolve the dealer-data script path for a car model or page name.
 *
 * The lookup key is `name` (or the result of pageTitle() when `name` is
 * falsy), lower-cased with all whitespace removed.  Unknown keys resolve to
 * the generic "js/dealer.js".  A random "?v=" query value is appended to the
 * returned path.
 *
 * Keys are compared after lower-casing, so only lower-case spellings are
 * listed in the table.
 * NOTE(review): "宝骏360" resolves to the 530 dealer file, exactly as in the
 * original switch -- confirm this is intended.
 */
function CarNameFromPage(name) {
    var dealerScripts = {
        "五菱宏光miniev": "js/hgevdealer.js",
        "e50.html": "js/hgevdealer.js",

        "宝骏e100": "js/e100dealer.js",
        "e100.html": "js/e100dealer.js",
        "e100-m.html": "js/e100dealer.js",

        "宝骏e200": "js/e200dealer.js",
        "e200.html": "js/e200dealer.js",
        "e200-m.html": "js/e200dealer.js",

        "新宝骏e300": "js/e300dealer.js",
        "e300.html": "js/e300dealer.js",

        "五菱荣光电动车": "js/rgddcdealer.js",
        "rongguangdiandongche.html": "js/rgddcdealer.js",

        "宝骏310": "js/310dealer.js",
        "310.html": "js/310dealer.js",
        "310iamt.html": "js/310dealer.js",

        "宝骏310w": "js/310Wdealer.js",
        "310w.html": "js/310Wdealer.js",

        "宝骏360": "js/530dealer.js",
        "360.html": "js/530dealer.js",
        "360cvt.html": "js/530dealer.js",

        "宝骏530(18款)": "js/530dealer.js",
        "宝骏530(19款)": "js/530dealer.js",
        "宝骏530(七座)": "js/530dealer.js",
        "宝骏530(20款)": "js/530dealer.js",
        "宝骏530": "js/530dealer.js",
        "530.html": "js/530dealer.js",

        "宝骏510": "js/510dealer_data.js",
        "510.html": "js/510dealer_data.js",
        "510-m.html": "js/510dealer_data.js",
        "510iamt.html": "js/510dealer_data.js",
        "510special.html": "js/510dealer_data.js",

        "宝骏560": "js/560dealer_hy.js",
        "560.html": "js/560dealer_hy.js",
        "560dct.html": "js/560dealer_hy.js",

        "宝骏730": "js/730dealer.js",
        "730.html": "js/730dealer.js",
        "7302016.html": "js/730dealer.js",

        "新宝骏rs-5": "js/rs5dealer.js",
        "rs5.html": "js/rs5dealer.js",
        "新宝骏rc-6": "js/rs5dealer.js",
        "rc-6.html": "js/rs5dealer.js",
        "新宝骏rm-5": "js/rs5dealer.js",
        "rm-5.html": "js/rs5dealer.js",
        "rc-5.html": "js/rs5dealer.js",
        "rc-5w.html": "js/rs5dealer.js",
        "新宝骏rc-5": "js/rs5dealer.js",
        "新宝骏rc-5w": "js/rs5dealer.js",

        "五菱宏光s3": "js/hgs3dealer.js",
        "五菱宏光s3自动离合版": "js/hgs3dealer.js",
        "hongguangs3.html": "js/hgs3dealer.js",

        "五菱宏光v": "js/wldealer.js",
        "经典款五菱宏光s": "js/wldealer.js",
        "五菱宏光s1": "js/wldealer.js",
        "五菱荣光": "js/wldealer.js",
        "五菱荣光s": "js/wldealer.js",
        "五菱荣光v": "js/wldealer.js",
        "五菱荣光单排": "js/wldealer.js",
        "五菱荣光加长版": "js/wldealer.js",
        "五菱荣光双排": "js/wldealer.js",
        "五菱荣光新卡单排": "js/wldealer.js",
        "五菱荣光新卡双排": "js/wldealer.js",
        "五菱之光": "js/wldealer.js",
        "新五菱宏光s": "js/wldealer.js",
        "货车": "js/wldealer.js",
        "hongguangv.html": "js/wldealer.js",
        "hongguang_s.html": "js/wldealer.js",
        "xinhongguang_s.html": "js/wldealer.js",
        "hongguang_s1.html": "js/wldealer.js",
        "lechi.html": "js/wldealer.js",
        "rongguang.html": "js/wldealer.js",
        "zhengcheng.html": "js/wldealer.js",
        "zhiguang.html": "js/wldealer.js",
        "weihuo.html": "js/wldealer.js"
    };

    var lookupKey = (name || pageTitle()).toLocaleLowerCase().replace(/\s+/g, "");

    // hasOwnProperty keeps inherited names such as "constructor" from
    // matching; anything not in the table uses the generic dealer file.
    var scriptPath = "js/dealer.js";
    if (Object.prototype.hasOwnProperty.call(dealerScripts, lookupKey)) {
        scriptPath = dealerScripts[lookupKey];
    }

    return scriptPath + "?v=" + Math.random();
}
Python代码中请求头换成自己的
js代码复制到js文件并以sgmw.js命名和python代码文件放一起
部分效果
dealerID 1784 dealerCode 9450019 company 五菱汽车柳州双恒销售中心 address 柳州市柳南区西环路环卫处南侧柳州双恒展场 saleTel 0772-3710866 province 29 city 331 point 109.384163 24.3313 victoryTag 1
dealerID 1751 dealerCode 9440031 company 五菱汽车潮州粤胜销售中心 address 潮州市潮安区浮洋镇乌洋村干渠东 saleTel 0768-5225899 province 20 city 231 point 116.615467 23.652577 victoryTag 1
dealerID 1564 dealerCode 9330018 company 五菱浙江台州元通销售中心 address 台州市黄岩区南城街道山前村 saleTel 0576-89177333 province 12 city 108 point 121.274496 28.604761 victoryTag 1
dealerID 265 dealerCode 9330077 company 五菱汽车义乌九洲销售中心 address 浙江省义乌市汽车城6号场地 saleTel 0579-85431677 province 12 city 105 point 120.063351 29.319347 victoryTag 1
dealerID 1622 dealerCode 9340039 company 五菱汽车蚌埠润鸿销售中心 address 安徽省蚌埠市高新区迎宾大道中段国际汽车城五菱4S店 saleTel 0552-3713293 province 13 city 112 point 117.322484 32.87257 victoryTag 1
dealerID 917 dealerCode 9360067 company 五菱汽车萍乡骏菱销售中心 address 萍乡市319国道与韶井路交汇处 saleTel 0799-6679998 province 15 city 138 point 113.809283 27.890656 victoryTag 1
dealerID 546 dealerCode 9420099 company 五菱湖北威马楚通销售中心 address 武汉市江夏经济开发区阳光大道西 saleTel 027-59235050 province 18 city 182 point 114.389425 30.441791 victoryTag 1
dealerID 1801 dealerCode 9450098 company 五菱柳州新事业销售中心 address 柳州市河西路18号(河西路口店) saleTel 0772-3750630 province 29 city 331 point 109.377626 24.331603 victoryTag 1
dealerID 810 dealerCode 945008827 company 五菱汽车合浦双诚店 address 广西北海市合浦县廉州镇外东环大道东乪岭(距新公安局2公里) saleTel 0779-7199050 province 29 city 334 point 109.207055 21.666417 victoryTag 1
dealerID 1771 dealerCode 9440087 company 五菱汽车揭阳鹏发销售中心 address 广东省揭阳市榕城区环市北路以南新河路以西(机电大厦隔壁揭阳鹏发五菱宝骏4S店) saleTel 0663-8237838 province 20 city 232 point 116.388198 23.560205 victoryTag 1
dealerID 2003 dealerCode 9320033 company 五菱汽车南京长铃销售中心 address 南京市江宁区天临路18号 saleTel 025-52633930 province 11 city 86 point 118.810526 31.996985 victoryTag 1
dealerID 613 dealerCode 9440023 company 五菱佛山三水合力销售中心 address 广东省佛山市三水区云东海街道南丰大道81号 saleTel 0757-87829705 province 20 city 218 point 112.921942 23.200263 victoryTag 1
dealerID 320 dealerCode 9340043 company 五菱汽车六安五菱销售中心 address 安徽省六安市金安经济开发区东方汽车城五菱4S店 saleTel 0564-3845050 province 13 city 123 point 116.624863 31.766225 victoryTag 1
dealerID 3214 dealerCode 9410073 company 五菱焦作申科宏业销售中心 address 河南省焦作市示范区迎宾路与南海路交口南200米路西 saleTel 15163770530 province 17 city 171 point 113.259753 35.181634 victoryTag 1
python我不懂,但是楼主的代码放在C:\USERS\,这是心有多大
何必那么麻烦。https://www.sgmw.com.cn/ashx/dealerInfo.ashx?r=0.9171321211977566&cname=南昌市 请求这个,只要更换市。把所有市区名字放在一个集合里,请求这条就行了
nuxingxp 发表于 2022-3-16 15:50
python我不懂,但是楼主的代码放在C:\USERS\,这是心有多大
我是小白,不太懂,请多多指正{:1_918:} Prozacs 发表于 2022-3-16 17:09
何必那么麻烦。https://www.sgmw.com.cn/ashx/dealerInfo.ashx?r=0.9171321211977566&cname=南昌市请求这 ...
大佬啊,我就是不会合计这些啊,帮帮孩吧:'(weeqw 你这基础都没有,难搞 cflying 发表于 2022-3-16 19:58
你这基础都没有,难搞
还请大佬不吝赐教 本帖最后由 jdk11223344 于 2022-3-17 09:38 编辑
这里是结果图片,间隔可以自己换我用的是一个空格
请求完就是该车所有的经销商信息
第一次回帖完全不会用这回复功能,楼主见谅,正文正在审核
好格式代码
#网站:https://www.sgmw.com.cn/dealer_search.html
import requests
import execjs
if __name__ == '__main__':
keyword='宝骏360'
file='sgmw.js'
node=execjs.get()
ctx=node.compile(open(file,encoding="utf-8").read())
js=f"CarNameFromPage('{keyword}')"
result=ctx.eval(js)
url='https://www.sgmw.com.cn/'+result
headers={
#写自己的,不写也可以
}
text=requests.get(url=url,headers=headers).text
with open('zancun.js','w',encoding='utf-8') as f:
f.write(text)
file = 'zancun.js'
node = execjs.get()
ctx = node.compile(open(file, encoding="utf-8").read())
js = f"provinceData"
js_2=f"cityData"
js_3=f"dealersData"
provinceData = ctx.eval(js)
cityData= ctx.eval(js_2)
dealersData= ctx.eval(js_3)
provinceData_dict_1={}
cityData_dict_1={}
for provinceData_dict in provinceData:
word=provinceData_dict["proname"]
key=provinceData_dict['proid']
provinceData_dict_1=word
for cityData_dict in cityData:
word=cityData_dict["cityName"]
key=cityData_dict["cityID"]
cityData_dict_1=word
for dealersData_dict in dealersData:
key=dealersData_dict["province"]
dealersData_dict["province"]=provinceData_dict_1
key=dealersData_dict["city"]
dealersData_dict["city"]=cityData_dict_1
with open('keyword.text','w',encoding="utf-8") as f:
for dealersData_dict in dealersData:
f.write(dealersData_dict["province"]+''+dealersData_dict["city"]+''+dealersData_dict["company"]+''+dealersData_dict['address']+'\n')
获取数据简单,主要是处理数据。
import requests
import json
import xlwt
def get_datas():
url = 'https://www.sgmw.com.cn/js/hgevdealer.js'
res = requests.get(url).text
res = res.split(';')
pre = res.split(' ')[-1]
pre = json.loads(pre)
pro = {}
for i in range(len(pre)):
pro['proid']] = pre['proname']
cre = res.split(' ')[-1]
cre = json.loads(cre)
cro = {}
for i in range(len(cre)):
cro['cityID']] = cre['cityName']
dre = res
dre = json.loads(dre)
s1 = []
for i in range(len(dre)):
s = ['province']], cro['city']], dre['company'], dre['address']]
s1.append(s)
ide = ["省市区", "市", "网点名称", "具体地址"]
index = len(ide)
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet("五菱宏光miniEV")
for i in range(0, len(dre)):
for j in range(0, index):
worksheet.write(i, j, s1)
workbook.save('test.xls')
if __name__ == '__main__':
get_datas()