python爬不到数据
本帖最后由 D.A. 于 2020-8-7 08:59 编辑。求助求助,想用python爬取企查查的数据,访问是正常的,但为啥就是爬不到数据呢?
import requests
from lxml import etree
from fake_useragent import UserAgent
import os
import urllib
from xlrd import open_workbook
from xlutils.copy import copy
# Set up the request headers
# `ua` is only referenced by the commented-out "User-Agent" entry below; the
# active entry uses a fixed browser User-Agent string instead.
ua = UserAgent(verify_ssl=False)
headers = {
# Alternative: a random User-Agent from fake_useragent (currently disabled).
#"User-Agent": ua.random,
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0',
# NOTE(review): hard-coded cookie string (includes a QCCSESSID session id);
# presumably copied from a logged-in browser session and likely to expire --
# confirm it is still valid before relying on it.
"cookie": r'zg_did=%7B%22did%22%3A%20%2217282b5d071a42-050424cf211674-7d7f582e-1fa400-17282b5d072e51%22%7D; UM_distinctid=17282b5d2c13c0-07187f08117eba-7d7f582e-1fa400-17282b5d2c215c2; _uab_collina=159133085424427690771624; hasShow=1; acw_tc=b702a0a415966961860105836ed561434a05be4c2943c33dcdec970734; QCCSESSID=r2lic1q0l52th9fh6k1feh1e30; CNZZDATA1254842228=1442901653-1591326017-%7C1596695875; Hm_lvt_78f134d5a9ac3f92524914d0247e70cb=1594788502,1594974923,1596682176,1596696190; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201596696188890%2C%22updated%22%3A%201596696217182%2C%22info%22%3A%201596682175518%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22cuid%22%3A%20%22c67ca9a77e965180c3ffef7365944bac%22%7D; Hm_lpvt_78f134d5a9ac3f92524914d0247e70cb=1596696218',
}
# Fetch a URL and parse the response body into an lxml HTML tree
def getxml(url):
    """Fetch *url* and return the response parsed by ``etree.HTML``.

    The module-level ``headers`` dict (User-Agent and cookie) is sent as the
    HTTP request headers, and the request times out after 30 seconds.  The
    HTTP status code is printed as a debugging aid.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get() is ``params`` (the query string), so passing the dict
    # positionally meant the User-Agent and cookie were never sent as headers.
    res = requests.get(url, headers=headers, timeout=30)
    # Decode the body with the encoding detected from its content.
    res.encoding = res.apparent_encoding
    # Print the status first so it is visible even if parsing fails.
    print(res.status_code)
    return etree.HTML(res.text)
# Company codes to look up; each one is inserted into the firm page URL.
codes = ['102ZSAN',]
for code in codes:
    # The loop body must be indented; without indentation the script does not
    # parse at all.
    urlbase = 'https://www.qcc.com/firm_' + code + '.shtml#base'
    print(urlbase)
    xml = getxml(urlbase)

    # Each xpath() call returns a list of matching text nodes; the list is
    # empty when the expected element is absent from the fetched HTML.
    corpname = xml.xpath('//div[@class="row title jk-tip"]/h1/text()')
    tags = xml.xpath('//div[@class="row tags"]/descendant-or-self::text()')
    tag = "、".join(tags)
    jianjie = xml.xpath('//span[@class="cvlu introRetract"]/text()')
    # NOTE(review): hangye and renyuanguimo use the identical XPath, so both
    # receive the same list of table-cell texts -- confirm whether each was
    # meant to select a specific cell.
    hangye = xml.xpath('//section[@id="Cominfo"]//tr/td/text()')
    renyuanguimo = xml.xpath('//section[@id="Cominfo"]//tr/td/text()')

    # Only the company name and the intro text are printed; tag, hangye and
    # renyuanguimo are computed but not shown.
    print(corpname)
    print(jianjie)
返回:
有反爬机制!{:1_926:} 我不知道啊 是不是cookie的r https://www.qcc.com/firm_102ZSAN.shtml#base我这边打开405 有反爬吧 _默默_ 发表于 2020-8-7 09:11
有反爬吧
可是用火车头采集能爬到,不知道python代码为啥不行 有的网站数据是异步加载的,打开网页后才加载数据。你这是拿到了网页,还没有加载完数据,所以数据是空的 renyuanguimo那个xpath我复制到网页搜索不到内容,我修改了下:.//section[@id="sanbanBase"]//tr/td/text() ,你试试更换xpath看下 rosemaryzed 发表于 2020-8-7 09:08
https://www.qcc.com/firm_102ZSAN.shtml#base我这边打开405
欸,我这能正常打开啊