|
吾爱游客
发表于 2022-5-3 01:59
申 请 I D:AUXStar
个人邮箱:468835121@qq.com
原创技术文章:
突破某度搜索引擎反爬
思路
先简单测试
https://www.baidu.com/s?wd=abc 无cookie => 无法爬取
https://www.baidu.com/s?wd=abc 有cookie => 爬取3次后被封禁
换浏览器
-
先规规矩矩访问
发现url变化的参数:
rsv_spt=1
rsv_iqid=0x8c9cb78e0017702b
issp=1
f=8
rsv_bp=1
rsv_idx=2
ie=utf-8
tn=baiduhome_pg
rsv_enter=1
rsv_dl=tb
rsv_sug3=2
rsv_sug1=2
rsv_sug7=101
rsv_sug2=0
rsv_btype=i
prefixsug=s
rsp=5
inputT=931
rsv_sug4=1688
-
尝试选择form附加参数
//form[@id="form"]/input/@name|//form[@id="form"]/input/@value
-
实践
先得到第一个参数
# Bootstrap: fetch one ordinary result page and copy the search form's
# <input> name/value pairs into the base URL used by later requests.
global_list = ['']
first_page = requests('https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=baidu&wd=helloworld&oq=sd&rsv_pq=faa0e68b00091231&rsv_t=69b1RTU5Wl2O08cV%2FBhZEywf2230d9avT2NP8qSNWHkshTDQ4VowDvfIAG0&rqlang=cn&rsv_enter=1&rsv_dl=ts_2&rsv_sug3=4&rsv_sug1=2&rsv_sug7=101&rsv_sug2=0&rsv_btype=t&prefixsug=hello&rsp=2&inputT=12495&rsv_sug4=12787')
# NOTE(review): the result is consumed as a flat [name, value, name, value, ...]
# list, which assumes every <input> carries both attributes, name first.
key_val = etree.HTML(first_page).xpath(
    '//form[@id="form"]/input/@name|//form[@id="form"]/input/@value')
global_list[0] = 'https://www.baidu.com/s?'
for i in range(len(key_val) // 2):
    # Even index = parameter name, following odd index = its value.
    if key_val[i * 2 + 1] == "":
        continue  # parameters with an empty value are left out
    global_list[0] += key_val[i * 2] + '=' + \
        parse.quote(key_val[i * 2 + 1], encoding='UTF-8') + '&'
每次调用函数时更新参数
调用时附加其他参数
-
程序
# -*- coding: utf-8 -*-
from lxml import etree
from urllib import request, parse
def requests(url):
    """Fetch *url* with a fixed set of browser-like headers and return text.

    The response body is decoded as UTF-8; if that raises
    ``UnicodeDecodeError`` it is decoded as GB18030 instead.

    Args:
        url: The URL to request.

    Returns:
        The decoded response body as ``str``.

    Exceptions raised by ``urlopen`` are not caught here and propagate to
    the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Cookie': 'BAIDUID=8F908D968D324B630497BAA12D206484:FG=1; BIDUPSID=A07303BCB6B723692B6F803A6991C739; PSTM=1616851097; __yjs_duid=1_3a9a8cb5802c8894921aa358c83128811620571006645; MCITY=-117%3A; sugstore=1; BD_UPN=13314352; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ispeed_lsm=12; H_PS_PSSID=35104_31254_35775_34584_35491_35872_35889_35542_35796_35319_26350_35746; BDSFRCVID=EgDOJeC62xCDv66DWAFcjURtCPn-wpTTH6aolS402nQSJLvCBNvREG0PKM8g0KubRDmsogKKW2OTHTAF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF=tbAj_IKXtDK3jt-kKRo_-4kyqlOybTn452n9aJ5nJDoWEpbCj6jU3U_z-U6PQJ3mLI_LL-JmQpP-HJ710T82D4Cu0P6r-fJGBjcNKl0MLpvYbb0xynoD0x0DKUnMBMnUteOnaUbp3fAKftnOM46JehL3346-35543bRTLnLy5KJtMDcnK4-XD6O3DNJP; BDRCVFR[gltLrB7qNCt]=mk3SLVN4HKm; delPer=0; BD_CK_SAM=1; PSINO=1; BD_HOME=1; COOKIE_SESSION=31655_0_9_9_0_32_1_0_8_7_0_6_0_0_0_0_0_0_1644889935%7C9%237828622_5_1625451769%7C3; baikeVisitId=6bef1abd-737f-4d75-a05d-393f235f0810',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
    }
    # The context manager closes the HTTP response (and its socket) even if
    # read() raises, instead of leaving it open until garbage collection.
    with request.urlopen(request.Request(url, headers=headers)) as response:
        res = response.read()
    try:
        return res.decode('UTF-8')
    except UnicodeDecodeError:
        # Fall back to GB18030 when the body is not valid UTF-8.
        return res.decode('GB18030')
# Executed once up front to obtain the extra query parameters.
# global_list is a one-element list so baidu_req() can replace the base URL
# in place; element 0 holds 'https://www.baidu.com/s?' followed by 'name=value&'
# for each form input that was kept.
global_list = ['']
first_page = requests('https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=baidu&wd=helloworld&oq=sd&rsv_pq=faa0e68b00091231&rsv_t=69b1RTU5Wl2O08cV%2FBhZEywf2230d9avT2NP8qSNWHkshTDQ4VowDvfIAG0&rqlang=cn&rsv_enter=1&rsv_dl=ts_2&rsv_sug3=4&rsv_sug1=2&rsv_sug7=101&rsv_sug2=0&rsv_btype=t&prefixsug=hello&rsp=2&inputT=12495&rsv_sug4=12787')
form_params = []
# Read name and value from each <input> element itself rather than from a
# flat list of attribute values: positional pairing goes out of step as soon
# as one input lacks either attribute or lists value before name.
for form_input in etree.HTML(first_page).xpath('//form[@id="form"]/input'):
    input_name = form_input.get('name')
    input_value = form_input.get('value')
    if not input_name or not input_value:
        continue  # inputs without a name or with a missing/empty value are skipped
    form_params.append(
        input_name + '=' + parse.quote(input_value, encoding='UTF-8') + '&')
global_list[0] = 'https://www.baidu.com/s?' + ''.join(form_params)
def baidu_req(keyword):
    """Request a search result page for *keyword* and refresh the base URL.

    The request URL is the base URL stored in ``global_list[0]`` with
    ``wd=<quoted keyword>`` appended. After the page is fetched,
    ``global_list[0]`` is rebuilt from the ``<input>`` children of the
    page's ``form#form`` so the next call sends the parameters taken from
    this response.

    Args:
        keyword: Search term; it is percent-encoded as UTF-8.

    Returns:
        The page text returned by ``requests``.
    """
    # Build the URL once (the original also evaluated this expression as a
    # stand-alone statement and discarded the result).
    search_url = global_list[0] + 'wd=' + parse.quote(keyword, encoding='utf-8')
    res = requests(search_url)

    # Refresh the parameters. Name and value are read per element so that an
    # input missing one attribute, or listing value before name, cannot
    # misalign the remaining pairs.
    form_params = []
    for form_input in etree.HTML(res).xpath('//form[@id="form"]/input'):
        input_name = form_input.get('name')
        input_value = form_input.get('value')
        if not input_name or not input_value:
            continue  # inputs without a name or with a missing/empty value are skipped
        form_params.append(
            input_name + '=' + parse.quote(input_value, encoding='UTF-8') + '&')
    global_list[0] = 'https://www.baidu.com/s?' + ''.join(form_params)
    return res
|
|
发帖前要善用【论坛搜索】功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。 |
|
|
|
|