txq0211 发表于 2022-3-28 15:35

爬虫之百度地图获取所在城市

朋友要处理70万条数据,通过单位名称获取所在城市。
简单分析了下,这年头不懂工作,不会点自动化的伤不起啊。
唯一的难点就是获取接口,接口参数需要把经纬度改成全国的,才能有效搜索。

直接上代码,只是简单的爬虫读取Excel文档,异步调试失败。
有会异步的大神请多多指导。





import csv
import requests
import json
import openpyxl
import random
import time
# Pool of User-Agent strings; get_request_headers() picks one at random
# for each set of headers it builds.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]


def get_request_headers():
    """Build a set of HTTP request headers with a random User-Agent.

    Returns:
        dict: Header names mapped to values. ``User-Agent`` is drawn at
        random from ``USER_AGENTS`` on every call; the remaining headers
        are fixed.
    """
    # Headers that are the same for every request.
    fixed_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-language': 'zh-CN,zh;q=0.9',
        'Accept-Encoding': 'gzip, deflate,br',
        'Connection': 'keep-alive',
        'Content-Type': 'text/javascript;charset=utf-8',
    }
    # Start with the rotating User-Agent so it stays the first key.
    result = {'User-Agent': random.choice(USER_AGENTS)}
    result.update(fixed_headers)
    return result

def getdata(word, timeout=10):
    """Query the map.baidu.com ``su`` endpoint for *word*.

    Args:
        word: Search keyword (e.g. an organisation name). It is
            percent-encoded before being placed in the query string.
        timeout: Seconds to wait for the server before ``requests``
            raises a timeout error. Without it a stalled connection
            would block the whole run indefinitely.

    Returns:
        The value stored under the ``'s'`` key of the JSON response, or
        ``None`` when the response has no such key.
        NOTE(review): this looks like a list of '$'-separated suggestion
        strings - confirm against a live response.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body cannot be parsed as JSON even
            after the backslash workaround.
    """
    from urllib.parse import quote

    # The b=(...) bounding box is kept exactly as the author tuned it;
    # presumably it spans the whole country so searches are not limited
    # to one city.
    url = (
        'https://map.baidu.com/su?wd=%s&cid=0&type=0'
        '&b=(10568083.325118167,3270172.585196372;14464559.879448874,4228056.404802672)'
    ) % quote(str(word), safe='')
    response = requests.get(url=url, headers=get_request_headers(), timeout=timeout)
    response.encoding = response.apparent_encoding
    text = response.text
    try:
        # Well-formed JSON is parsed as-is so valid escapes stay intact.
        payload = json.loads(text, strict=False)
    except ValueError:
        # Some responses contain stray backslashes that are not valid JSON
        # escapes; doubling every backslash makes them parseable.
        payload = json.loads(text.replace('\\', '\\\\'), strict=False)
    return payload.get('s')

def shorten_name(word):
    """Reduce an organisation name to a shorter search keyword.

    Long names sometimes return no result from the map search, so the
    name is cut at the first recognised keyword:

    * contains '集团' -> text before it, plus '集团'
    * contains '医院' -> text before it, plus '医院'
    * contains '卫生' -> text before it (or the full name if nothing
      precedes the keyword)
    * otherwise       -> the name with '医院' appended

    Args:
        word: Organisation name as a string.

    Returns:
        The keyword string to search for.
    """
    for keyword in ('集团', '医院'):
        if keyword in word:
            return word.split(keyword)[0] + keyword
    if '卫生' in word:
        # NOTE(review): the original line lost its subscript; taking the
        # part before '卫生' is the presumed intent - confirm.
        return word.split('卫生')[0] or word
    return word + '医院'


def extract_city(data):
    """Pull the city out of a ``getdata`` result.

    Args:
        data: Either a '$'-separated string or a list/tuple whose first
            element is such a string; anything falsy means "no result".

    Returns:
        The text before the first '$', or ``None`` when there is no
        result or that text is empty.
    """
    if not data:
        return None
    if isinstance(data, (list, tuple)):
        data = data[0]
    if not isinstance(data, str):
        return None
    return data.split('$')[0] or None


if __name__ == '__main__':
    before = int(input('请输入开始的行数:'))
    table = openpyxl.load_workbook("input.xlsx")
    sheet = table['input']
    nrows = sheet.max_row
    # sheet['A'] is a tuple of the cells in column A, indexed from 0.
    # Index 0 is presumably the header row, so `before` would normally
    # be 1 - confirm against the workbook.
    column_a = sheet['A']
    lists_city = ''
    # NOTE(review): mode='w' truncates any earlier output, so resuming
    # from a later start row discards previously written rows.
    with open('单位地市数据.csv', mode='w', encoding='gbk', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=['单位', '地市'])  # column names
        csv_writer.writeheader()

        for i in range(before, nrows):
            start = time.time()
            raw = column_a[i].value
            word = '' if raw is None else str(raw)
            # Row order must be kept, so repeated names are detected only
            # by comparing with the immediately preceding row.
            lastword = column_a[i - 1].value if i > before else None
            if i == before or raw != lastword:
                if word:
                    # Very long names sometimes yield no result, so search
                    # for a shortened keyword instead.
                    word = shorten_name(word)
                    city = extract_city(getdata(word))
                else:
                    # Empty cell: nothing to search for.
                    city = None
                lists_city = '没找到' if city is None else city
            # For a repeated name the city from the previous row is reused.
            dit = {
                '单位': word,
                '地市': lists_city
            }
            csv_writer.writerow(dit)
            end = time.time()
            print('共%d条数据已完成第%d条,用时%.2f秒:%s获取成功,位于%s'%(nrows-1,i,end - start,word,lists_city))

mb181 发表于 2022-3-28 17:08

不会玩这个

海尔波普彗星 发表于 2022-3-28 17:35

看着很厉害,就是看不懂

szy4444 发表于 2022-3-28 17:40

哈哈哈,看不懂

衫青水袖 发表于 2022-3-28 19:12

能爬地图吗

txq0211 发表于 2022-3-28 19:14

衫青水袖 发表于 2022-3-28 19:12
能爬地图吗

这个接口是爬所在城市的,要爬地图得研究

尹铭 发表于 2022-4-1 11:29

地图这东西太敏感,搞不好要喝茶的

txq0211 发表于 2022-4-1 14:51

尹铭 发表于 2022-4-1 11:29
地图这东西太敏感搞不好要喝茶的

没事,就写了个地图接口数据查询的,自己不用,要抓抓用的人

disidi 发表于 2022-4-2 22:07

能爬取所在城市的POI吗
页: [1]
查看完整版本: 爬虫之百度地图获取所在城市