小小爬虫之突破反爬虫限制重写javascript函数
交流交流爬取网站碰到的坑~废话少说直接上代码
# Crawl cool-proxy.net's HTTP proxy list. The site obfuscates each proxy IP
# client-side as document.write(Base64.decode(str_rot13("..."))); this script
# reverses that encoding in Python.
import requests
from lxml import etree
import base64
# Listing page sorted by proxy score, best first.
url = 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc'
# Raw HTML bytes of the listing page (no timeout/error handling -- best-effort script).
response = requests.get(url).content
root = etree.HTML(response)
# One <tr> element per table row; rows without the IP <script> (e.g. headers)
# are filtered out later in the loop.
proxys = root.xpath('//*[@id="main"]/table/tr')
def func(a):
    """ROT13 one character -- a Python port of the page's JavaScript str_rot13.

    Alphabetic characters are rotated 13 places (case preserved); every other
    character -- digits AND punctuation -- passes through unchanged, exactly
    like the JavaScript original.

    Bug fixed: the old `isdigit()` guard only protected digits, so Base64
    symbols such as the '=' padding were shifted too ('=' -> 'J'), corrupting
    the Base64 payload and producing trailing garbage after decoding (the
    mysterious ' I' suffix the caller strips).
    """
    if not a.isalpha():
        # Digits and punctuation ('=', '+', '/') must be preserved for the
        # Base64 decode to work.
        return a
    if a.lower() < 'n':
        # First half of the alphabet: shift forward 13.
        return chr(ord(a) + 13)
    # Second half: shift back 13.
    return chr(ord(a) - 13)
# Walk every table row; rows that carry no <script> (headers, ads) are skipped.
# Each data row hides its IP as document.write(Base64.decode(str_rot13("...")))
# inside a <script> element, with the port in a plain <td>.
for i in proxys:
    try:
        scripts = i.xpath('./td/script/text()')
        if not scripts:
            continue
        ports = i.xpath('./td/text()')
        if not ports:
            continue
        # NOTE(review): 'port' was never indexed or printed in the original
        # either; kept as the list of text cells to preserve behavior.
        port = ports
        # Strip the JS wrapper so only the rot13'd Base64 payload remains.
        # Bug fixed: the original called .replace() directly on the *list*
        # returned by xpath(), raising AttributeError on every row, so no IP
        # was ever printed -- take the first (only) text node instead.
        payload = scripts[0].replace('document.write(Base64.decode(str_rot13("', '').replace('")))', '')
        # Undo rot13 character by character, then Base64-decode. The author's
        # .strip(' I') works around trailing garbage (likely caused by func()
        # shifting the '=' padding); kept for safety, harmless for dotted IPs.
        ip = base64.b64decode(''.join(map(func, payload))).strip(' I')
        print(ip)
    except Exception as e:
        # Best-effort scrape: report the row's error and keep going.
        print(e)
讲真,用xpath做提取的话,确实比re要方便很多,不过这只是我个人认为的咯。 有Java版吗?
页:
[1]