【Python】利用requests和selenium爬取企业竞争公司信息

liujw1007 发表于 2019-11-19 10:32

新手python一枚，在上个月的工作中需要在某查查上爬取公司的竞品信息，于是想到采用python的爬虫来做，整体上来讲难度不是很大，主要也是为了分享给大家，自己也做个笔记，适合已经知道怎么安装python以及相应库文件的初学者，具体代码如下：首先是库文件的导入；
# -*-coding:utf8-*-
import pandas as pd
import requests
import re
import time
import random
import io
import sys
from selenium import webdriver
这里用到了requests和selenium是因为开始写的时候使用到了requests库，对于某查查的js动态网页很有用，但是之后在xx财富网查询市值信息时遇到了瓶颈，于是采用selenium这种粗暴简单的方式。
第二，公司信息的收集；
在某查查上面，每个公司由两部分信息来标识，即keyno以及companyname，随便搜索一个公司，在其headers中可以看到如下的请求链接：
https://www.qichacha.com/companyown_rightpromote?keyno=47e8760382f7462a4ad889f3c0e2729a&companyname=%E6%9D%AD%E5%B7%9E%E6%B5%B7%E5%BA%B7%E5%A8%81%E8%A7%86%E6%95%B0%E5%AD%97%E6%8A%80%E6%9C%AF%E8%82%A1%E4%BB%BD%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8
因此当我们要查询某一公司的竞品信息，首先就需要获取这两个值，可以在一个函数里实现：
def get_id(url):
    """Fetch a qichacha search page and extract the top hit's identifiers.

    The page is requested once; both return values are derived from that
    single response.

    Args:
        url: Company search URL, e.g.
            ``https://www.qichacha.com/search?key=<company name>``.

    Returns:
        A ``(id, company)`` tuple:

        * ``id`` -- the list produced by ``re.findall``: the text captured
          after ``firm_`` between the "第1个" and "第2个" markers of the
          page (an empty list when the pattern does not match).
        * ``company`` -- the final response URL split on ``'key='``, i.e. a
          list whose last element is whatever followed ``key=`` in that URL.
    """
    # These headers (the cookie in particular) were copied from the author's
    # own browser session; every user has to replace them with the values
    # shown for an XHR request in the browser's network panel.
    headers = {
        'authority': "www.qichacha.com",
        'Cache-Control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36",
        'accept': "text/html, */*; q=0.01",
        'referer': "https://www.qichacha.com/",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'cookie': "UM_distinctid=16b8d471703471-04b73bb45b00cd-e343166-1fa400-16b8d471704733; _uab_collina=156144338470291571211012; zg_did=%7B%22did%22%3A%20%2216b8d4741a338c-0e6c00e586f46e-e343166-1fa400-16b8d4741a46d3%22%7D; acw_tc=0ed7383b15710374952277287e45801d4bb57f63357b8b828c585f272b; QCCSESSID=664m77pitdvqjrbafqqfclk503; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1571190689,1571725534,1572501150,1572505932; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1572506182; CNZZDATA1254842228=1479407757-1561443156-https%253A%252F%252Fwww.baidu.com%252F%7C1572506419; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201572503838881%2C%22updated%22%3A%201572506434729%2C%22info%22%3A%201572501149040%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qichacha.com%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22cuid%22%3A%20%2240bf06a6ef10edd4d62374275ca354b4%22%7D",
    }

    # One request is enough: the original fetched the same URL twice (once
    # for the final URL, once for the body), doubling the load on the site.
    response = requests.get(url, headers=headers)

    # Pieces of the final URL around 'key='.
    company = response.url.split('key=')

    # re.S lets '.' span newlines, so the match may cross several HTML lines.
    firm_ids = re.findall(
        '第1个.*?firm_(.*?).html" target="_blank.*?第2个',
        response.text,
        re.S,
    )
    return firm_ids, company
当获得了公司的keyno和companyname（即代码中的id和company）信息之后，就可以得到该公司的竞品信息链接为：
url = 'https://www.qichacha.com/company_getinfos?unique='+id + '&companyname=' + company +'&p='+ '&tab=report'
同样，通过该链接可以利用requests得到具体的竞品信息，这一部分也可以通过函数实现：
def get_message(url):
    """Download a competitor-report page and extract its table rows.

    Args:
        url: Address of the ``company_getinfos`` report page to download.

    Returns:
        The list produced by ``re.findall``: one 6-tuple of strings per
        table row that matches the row pattern below (one string for each of
        its six capture groups). The list is empty when nothing matches.
    """
    # Browser headers (including the session cookie) copied from the
    # author's own session; replace them with your own values.
    headers = {
        'authority': "www.qichacha.com",
        'Cache-Control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36",
        'accept': "text/html, */*; q=0.01",
        'referer': "https://www.qichacha.com/",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'cookie': "UM_distinctid=16b8d471703471-04b73bb45b00cd-e343166-1fa400-16b8d471704733; _uab_collina=156144338470291571211012; zg_did=%7B%22did%22%3A%20%2216b8d4741a338c-0e6c00e586f46e-e343166-1fa400-16b8d4741a46d3%22%7D; acw_tc=0ed7383b15710374952277287e45801d4bb57f63357b8b828c585f272b; QCCSESSID=664m77pitdvqjrbafqqfclk503; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1571190689,1571725534,1572501150,1572505932; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1572506182; CNZZDATA1254842228=1479407757-1561443156-https%253A%252F%252Fwww.baidu.com%252F%7C1572506419; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201572503838881%2C%22updated%22%3A%201572506434729%2C%22info%22%3A%201572501149040%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qichacha.com%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22cuid%22%3A%20%2240bf06a6ef10edd4d62374275ca354b4%22%7D",
    }

    # Pattern for one <tr> of the report table; it has six capture groups.
    row_pattern = '<tr> <td class="tx.*?width="90"><img style=.*?class="text-center.*?html">(.*?)</a></td> <td width="10%" class="text-center">(.*?)</td> <td width="95" class="text-center">(.*?)</td> <td width="80" class="text-center">(.*?)</td> <td><div style="word-break.*?class="line-clamp">(.*?)</div></td>.*?target="_blank">(.*?)</a> </td> </tr>'

    page = requests.request('GET', url, headers=headers).text
    # re.S lets '.' span newlines so a row may cover several HTML lines.
    return re.findall(row_pattern, page, re.S)
通过上述定义的函数，我们基本获得了竞品公司在某查查上面的所有信息，但是我们还需要查询这些公司的市值（假设已经上市了的话），这里先定义一个Class来封装这些信息：
class jingpin():
    """Record holding the collected details of one competitor company.

    Every attribute starts out as ``None`` and is filled in by the caller.
    """

    def __init__(self):
        self.name = None  # competitor name
        self.date = None  # date the competitor company was founded
        self.location = None  # location of the competitor company
        self.introduction = None  # description of the competitor company
        self.comname = None  # full name of the competitor company
        self.status = None  # financing status of the competitor company
        self.value = None  # market value of the competitor company
公司的市值信息在xx财富网上爬取获得，爬取链接为：
url1 = 'http://so.eastmoney.com/web/s?keyword='+self.name
这一部分也放在函数中，利用selenium库：
def get_shizhi(url, driverpath='C:/Users/wxx/Downloads/Compressed/chromedriver_win32/chromedriver.exe'):
    """Load a search-result page in Chrome and extract the market value.

    Args:
        url: Address of the search page to open.
        driverpath: Location of the chromedriver executable. A browser
            driver has to be downloaded separately; it can live anywhere.

    Returns:
        The list produced by ``re.findall``: the text of the first
        ``<span>`` following the "总市值：" label inside an ``otherinfo``
        block, one entry per match (empty when the page has no such block).
    """
    # Drive a real browser so the page is captured after it has been loaded
    # by Chrome rather than as the raw HTTP response.
    driver = webdriver.Chrome(executable_path=driverpath)
    try:
        driver.get(url)
        time.sleep(3)  # fixed pause before reading the page source
        page = driver.page_source
    finally:
        # Always end the WebDriver session, even when loading fails, so no
        # browser/driver process is left behind. quit() ends the whole
        # session, whereas close() only closes the current window.
        driver.quit()

    # Pattern locating the market-value figure.
    value_pattern = '<div class="otherinfo.*?<label>总市值：</label>.*?<span>(.*?)</span>'
    return re.findall(value_pattern, page, re.S)
如此，就可以获得所有的公司竞品信息了，正文代码如下：
# Read the company names to look up.
path = "C:/Users/wxx/Desktop/公司名称.xlsx"  # the names can be kept in an Excel sheet
df = pd.read_excel(path)
df1 = df['公司名称'].values  # values of the "公司名称" column
print(df1)

# Phase 1: run the search for every company and keep what get_id() returns.
ids = []
companys = []
for key in df1:
    print(key)  # company name
    url = 'https://www.qichacha.com/search?key=' + key
    firm_ids, url_parts = get_id(url)
    ids.append(firm_ids)
    companys.append(url_parts)
    # Throttle the requests so the crawler is less likely to get blocked
    # (using proxy IPs is the alternative).
    time.sleep(3.44 + random.random())
print(companys)
print(ids)  # a list of lists: one get_id() result per company

# Phase 2: download the competitor table of every company and write it to
# a tab-separated text file, one file per company.
#
# NOTE(review): the published listing had lost its subscripts (for example
# "company = companys" and "number.name = message"), which made the string
# concatenations below fail. The indexing used here is a reconstruction --
# confirm it against real responses.
for companyname, firm_ids, url_parts in zip(df1, ids, companys):
    if not firm_ids or len(url_parts) < 2:
        # The search page did not yield an id / 'key=' part for this name.
        print(companyname + '未找到对应公司，已跳过')
        continue

    unique = firm_ids[0]  # assumes the first findall hit is the wanted firm id
    company = url_parts[1]  # text following 'key=' in the final search URL
    url = ('https://www.qichacha.com/company_getinfos?unique=' + unique
           + '&companyname=' + company + '&p=' + '&tab=report')
    message = get_message(url)

    txtpath = 'C:/Users/wxx/Desktop/竞品公司/' + companyname + '竞品.txt'
    # An explicit UTF-8 encoding replaces the original UnicodeEncodeError
    # fallback, which dropped the introduction column whenever the platform
    # default encoding could not represent a scraped character.
    with open(txtpath, "w", encoding="utf-8") as f:
        f.write('\t\t'.join(
            ['竞争公司', '融资状态', '市值', '日期', '地点', '公司名称', '业务介绍']))

        for row in message:
            number = jingpin()
            # Each row is the 6-tuple captured by get_message()'s pattern.
            # The field order follows the assignment order of the original
            # listing -- TODO confirm against the live page.
            (number.name, number.status, number.date,
             number.location, number.introduction, number.comname) = row

            # A market value is only looked up for companies whose
            # financing status is 'IPO' or '新三板'.
            if number.status in ('IPO', '新三板'):
                url1 = 'http://so.eastmoney.com/web/s?keyword=' + number.name
                page = get_shizhi(url1)
                print(page)
                number.value = page
            else:
                number.value = '无'

            f.write('\n' + '\t\t'.join([
                number.name,
                number.status,
                str(number.value),
                number.date,
                number.location,
                number.comname,
                number.introduction,
            ]))

    print(companyname + '竞品公司收集完成！')
    time.sleep(2.56 + random.random())

具体的爬取信息规则就不多说了，各有所需。最后保存为了txt文档，没有保存为excel也是因为excel打开太慢了啊哈哈哈，完结撒花~~~

xing29 发表于 2019-11-19 11:22

学习的好东西，谢谢分享。

PikaQ 发表于 2019-11-19 13:55

太厉害了，谢谢楼主的分享，我也在学习python

DaisyH 发表于 2020-4-28 23:23

学习学习，有实例学起来快

页: [1]

吾爱破解 - 52pojie.cn's Archiver

【Python】利用requests和selenium爬取企业竞争公司信息