爬虫之百度地图获取所在城市
朋友要处理70万条数据,通过单位名称获取所在城市。简单分析了一下,这年头工作中不会点自动化,真是伤不起啊。
唯一的难点是找到可用的接口:接口中表示搜索范围的参数 b(坐标边界)需要改成覆盖全国的范围,才能有效搜索。
直接上代码:只是一个读取 Excel 文档的简单爬虫(同步请求),异步版本没有调试成功。
有会异步的大神请多多指导。
import csv
import requests
import json
import openpyxl
import random
import time
# Pool of User-Agent strings; one is picked at random for each request.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
# Build request headers with a randomly chosen User-Agent.
def get_request_headers():
    """Return a fresh dict of HTTP request headers.

    The ``User-Agent`` value is drawn at random from the module-level
    ``USER_AGENTS`` list on every call; all other headers are fixed.

    Returns:
        dict: header name -> header value, suitable for ``requests.get``.
    """
    return {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-language': 'zh-CN,zh;q=0.9',
        # 'br' is deliberately not advertised: Requests can only decode
        # Brotli when an optional brotli package is installed, and an
        # undecoded body would break the JSON parsing done by the caller.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Content-Type': 'text/javascript;charset=utf-8',
    }
def getdata(word, timeout=10):
    """Query the Baidu Maps suggestion endpoint for ``word``.

    Args:
        word: Search keyword (a unit name). It is percent-encoded before
            being placed in the query string, so characters such as ``&``
            or ``#`` cannot corrupt the URL.
        timeout: Seconds to wait for the HTTP request before
            ``requests`` raises a timeout error. Defaults to 10.

    Returns:
        The value of the ``'s'`` field of the decoded JSON response, or
        ``None`` when the response has no such field. Presumably a list of
        ``'$'``-separated suggestion strings -- verify against a live
        response.

    Raises:
        requests.RequestException: on connection failure or timeout.
        json.JSONDecodeError: when the response body is not JSON.
    """
    from urllib.parse import quote

    # The ``b`` parameter is the search bounding box; per the author's
    # note it was widened to cover the whole country.
    url = (
        'https://map.baidu.com/su?wd=%s&cid=0&type=0'
        '&b=(10568083.325118167,3270172.585196372;14464559.879448874,4228056.404802672)'
        % quote(str(word), safe='')
    )
    response = requests.get(url=url, headers=get_request_headers(), timeout=timeout)
    response.encoding = response.apparent_encoding
    # The body contains backslashes that are not valid JSON escapes, so
    # every backslash is doubled before parsing (kept from the original).
    text = response.text.replace('\\', '\\\\')
    payload = json.loads(text, strict=False)
    if not isinstance(payload, dict):
        return None
    return payload.get('s')
def _build_query(name):
    """Shorten a unit name into the keyword sent to the map search.

    Long names sometimes return no result, so the name is cut at the first
    matching marker: text up to and including '集团' or '医院', or the text
    before '卫生'. Names with none of these markers get '医院' appended.
    """
    for marker in ('集团', '医院'):
        if marker in name:
            return name.split(marker)[0] + marker
    if '卫生' in name:
        return name.split('卫生')[0]
    return name + '医院'


def main():
    """Read unit names from column A of input.xlsx and write their city to CSV.

    Prompts for a starting index, then walks column A of the 'input' sheet
    from that 0-based index to the last row. Each name is looked up with
    ``getdata``; a name equal to the one in the previous row reuses the
    previous result instead of issuing a new request. One CSV row (original
    unit name, city) is written per spreadsheet row.

    The output file is opened in 'w' mode, so re-running from a later
    starting index overwrites earlier output.
    """
    before = int(input('请输入开始的行数:'))
    table = openpyxl.load_workbook("input.xlsx")
    sheet = table['input']
    nrows = sheet.max_row
    # Fetch the column once; indexing sheet['A'] inside the loop would
    # rebuild the whole column tuple on every iteration.
    column_a = sheet['A']
    lists_city = ''
    lastword = None

    # The context manager guarantees the CSV is flushed and closed even if
    # a request fails part-way through the run.
    with open('单位地市数据.csv', mode='w', encoding='gbk', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=['单位', '地市'])
        csv_writer.writeheader()

        for i in range(before, nrows):
            start = time.time()
            word = column_a[i].value

            # Rows must stay in their original order, so duplicates are
            # only detected against the immediately preceding row.
            if i == before or word != lastword:
                if word in (None, ''):
                    # Empty cell: nothing to search for.
                    lists_city = '没找到'
                else:
                    data = getdata(_build_query(str(word)))
                    if data:
                        # Assumes each suggestion is '$'-separated with the
                        # city as its first field -- TODO confirm.
                        lists_city = data[0].split('$')[0]
                    else:
                        lists_city = '没找到'
            lastword = word

            # Always record the original unit name, not the shortened
            # search keyword, so every output row matches its input row.
            csv_writer.writerow({'单位': word, '地市': lists_city})
            end = time.time()
            print('共%d条数据已完成第%d条,用时%.2f秒:%s获取成功,位于%s'%(nrows-1,i,end - start,word,lists_city))


if __name__ == '__main__':
    main()
不会玩这个 看着很厉害,就是看不懂 哈哈哈,看不懂 能爬地图吗
衫青水袖 发表于 2022-3-28 19:12
能爬地图吗
这个接口是爬所在城市的,要爬地图得研究 地图这东西太敏感搞不好要喝茶的{:1_907:} 尹铭 发表于 2022-4-1 11:29
地图这东西太敏感搞不好要喝茶的
没事,就写了个地图接口数据查询的,自己不用,要抓抓用的人 能爬取所在城市的POI吗
页:
[1]