《Python3网络爬虫开发实战(第二版)》案例SPA4-10代码编写
SPA4:新闻网站索引,无反爬,数据通过 Ajax 加载,无页码翻页,适合 Ajax 分析和动态页面渲染抓取以及智能页面提取分析。思路:访问ajax接口获取json数据。
问题:不是很清楚“智能页面提取分析”具体是什么。
代码:
from requests_html import HTMLSession
import json
def main():
    """Fetch the SPA4 news index from its Ajax endpoint and store it as JSON.

    A single request with ``limit=100&offset=0`` is issued and the decoded
    JSON response is written to ``./爬虫练习/SPA4/data.json``.
    """
    api_url = 'https://spa4.scrape.center/api/news/?limit=100&offset=0'
    client = HTMLSession()
    response = client.get(api_url)
    payload = response.json()
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open("./爬虫练习/SPA4/data.json", "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, ensure_ascii=False)


if __name__ == '__main__':
    main()
SPA5:图书网站,无反爬,数据通过 Ajax 加载,有翻页,适合大批量动态页面渲染抓取。
思路:通过请求ajax接口获取数据
代码:
from requests_html import HTMLSession
import json
import os
def main():
    """Download the SPA5 book listing and the detail record of every listed book.

    The listing JSON is written to ``./爬虫练习/SPA5/dara.json``; each book's
    detail JSON is written to ``./爬虫练习/SPA5/<id>/<id>_dara.json``.
    """
    session = HTMLSession()
    base_dir = "./爬虫练习/SPA5"
    # Create the output directory up front so the first open() cannot fail
    # merely because the directory is missing.
    os.makedirs(base_dir, exist_ok=True)

    # The author noted limit=9040 next to this URL (presumably the value
    # needed to fetch every book in one request -- TODO confirm).
    list_url = 'https://spa5.scrape.center/api/book/?limit=10&offset=0'
    listing = session.get(list_url).json()
    with open(base_dir + "/dara.json", 'w', encoding='utf-8') as f:
        json.dump(listing, f, ensure_ascii=False)

    for book in listing['results']:
        # Normalise the id to str once: it is used both in the URL and in
        # file-system paths, and concatenating a non-str id would raise.
        book_id = str(book['id'])
        detail_url = 'https://spa5.scrape.center/api/book/' + book_id
        detail = session.get(detail_url).json()

        book_dir = base_dir + "/" + book_id
        os.makedirs(book_dir, exist_ok=True)
        detail_path = "{}/{}_dara.json".format(book_dir, book_id)
        with open(detail_path, "w", encoding='utf-8') as f:
            json.dump(detail, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
SPA6: 电影数据网站,数据通过 Ajax 加载,数据接口参数加密且有时间限制,源码经过混淆,适合 JavaScript 逆向分析。
思路:js代码经过混淆,但是断点分析发现和SPA2 加密方法基本相同
代码:
from requests_html import HTMLSession
import base64
import time
import hashlib
import json
def main():
    """Fetch SPA6 movie details for ids 1 and 2 and save them as one JSON list.

    For every movie id two tokens are derived:

    * ``token``   -- Base64 of a fixed secret string followed by the id; it is
      used as the path segment of the detail URL.
    * ``token_2`` -- Base64 of ``"<sha1 hex>,<timestamp>"`` where the SHA-1 is
      taken over ``"/api/movie/<token>,<timestamp>"``; it is sent as the
      ``token`` query parameter.

    A movie whose request or JSON decoding fails is reported and skipped, so
    one bad response does not abort the run. Results are written to
    ``./爬虫练习/SPA6/data.json``.
    """
    session = HTMLSession()
    data_all = []
    for movie_id in range(1, 3):
        secret = "ef34#teuq0btua#(-57w1q5o5--j@98xygimlyfxs*-!i-0-mb" + str(movie_id)
        token = base64.b64encode(secret.encode("utf-8")).decode('utf-8')

        # The same timestamp must appear inside the hash and next to it.
        timestamp = str(int(time.time()))
        signed = ("/api/movie/" + token + ',' + timestamp).encode("utf-8")
        digest = hashlib.sha1(signed).hexdigest()
        token_2 = base64.b64encode((digest + "," + timestamp).encode("utf-8")).decode('utf-8')

        url = "https://spa6.scrape.center/api/movie/{token}/?token={token_2}".format(token=token, token_2=token_2)
        print(url)

        # Only the network call / JSON decoding is expected to fail; keep the
        # best-effort behaviour but report the failure instead of hiding it.
        try:
            movie = session.get(url).json()
        except Exception as exc:
            print("failed to fetch {}: {!r}".format(url, exc))
            continue
        data_all.append(movie)

    with open("./爬虫练习/SPA6/data.json", "w", encoding="utf-8") as f:
        json.dump(data_all, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
SPA7:NBA 球星数据网站,数据纯前端渲染,Token 经过加密处理,适合基础 JavaScript 模拟分析。
思路:分析可得数据嵌套在js中,直接获取即可。直接观察js可得token利用des加密,key:fipFfVsZsTda94hJNKJfLoaqyqMZFFimwLt
问题:为了获得json数据,用了大量replace清洗数据过于繁琐(未解决)
代码:(踩坑:key只需要8字节,key=fipFfVsZ)
from requests_html import HTMLSession
import re,json
import base64
from pyDes import des, PAD_PKCS5, ECB
def des_encrypt(s):
    """Encrypt ``s`` with DES in ECB mode using PKCS5 padding.

    Args:
        s: The plaintext to encrypt (the caller passes UTF-8 encoded bytes).

    Returns:
        Whatever ``pyDes``'s ``encrypt`` returns for the padded input; the
        caller Base64-encodes it.
    """
    secret = b'fipFfVsZ'
    # Build the cipher from the key, the ECB mode and the padding mode.
    cipher = des(secret, ECB, pad=None, padmode=PAD_PKCS5)
    return cipher.encrypt(s, padmode=PAD_PKCS5)
def main():
    """Extract the SPA7 player list from ``main.js`` and attach a token to each player.

    The player data is embedded in the script as a JavaScript array literal.
    It is cut out with a regular expression, rewritten into valid JSON and
    parsed. For every player the token is
    ``Base64(DES(Base64(name) + birthday + height + weight))``, where the DES
    step is delegated to ``des_encrypt``. The enriched list is written to
    ``./爬虫练习/SPA7/data.json``.

    Raises:
        ValueError: If no array literal can be found in the script.
    """
    url = "https://spa7.scrape.center/js/main.js"
    session = HTMLSession()
    script = session.get(url).text

    # Greedy match: from the first '[' to the last ']' in the script.
    match = re.search(r'\[[\s\S]*\]', script)
    if match is None:
        raise ValueError("no player array found in " + url)

    # Turn the JavaScript literal into JSON: double quotes, quoted keys and
    # no trailing comma before the closing bracket.
    text = match.group(0).replace("\'", "\"")
    for field in ("name", "image", "birthday", "height", "weight"):
        text = text.replace(field, "\"" + field + "\"")
    text = text.replace(",\n]", "]")
    players = json.loads(text)

    for player in players:
        base64_name = base64.b64encode(player['name'].encode("utf-8")).decode("utf-8")
        plain = (base64_name + player['birthday'] + player['height'] + player['weight']).encode("utf-8")
        player['token'] = base64.b64encode(des_encrypt(plain)).decode("utf-8")

    with open('./爬虫练习/SPA7/data.json', 'w', encoding="utf-8") as f:
        json.dump(players, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
SPA8:NBA 球星数据网站,数据纯前端渲染,Token 经过加密处理,JavaScript 代码一行混入 HTML 代码,防止直接调试,适合 JavaScript 逆向分析。
思路:利用xpath获取嵌套的js代码,清洗数据
代码:
from requests_html import HTMLSession
import re,json
import base64
from pyDes import des, PAD_PKCS5, ECB
def des_encrypt(s):
    """Encrypt ``s`` with DES in ECB mode using PKCS5 padding.

    Args:
        s: The plaintext to encrypt (the caller passes UTF-8 encoded bytes).

    Returns:
        Whatever ``pyDes``'s ``encrypt`` returns for the padded input; the
        caller Base64-encodes it.
    """
    secret = b'qmqTHChq'
    # Build the cipher from the key, the ECB mode and the padding mode.
    cipher = des(secret, ECB, pad=None, padmode=PAD_PKCS5)
    return cipher.encrypt(s, padmode=PAD_PKCS5)
def main():
    """Extract the SPA8 player list from the page's inline script and add tokens.

    The player data lives in JavaScript embedded in the HTML page. The first
    script text containing an array literal is rewritten into valid JSON and
    parsed. For every player the token is
    ``Base64(DES(Base64(name) + birthday + height + weight))``, where the DES
    step is delegated to ``des_encrypt``. The enriched list is written to
    ``./爬虫练习/SPA8/data.json``.

    Raises:
        ValueError: If no script text contains an array literal.
    """
    url = 'https://spa8.scrape.center/'
    session = HTMLSession()
    req = session.get(url)

    # xpath() yields a list of script texts, so each one is searched in turn.
    # NOTE(review): "./script" only selects <script> elements that are direct
    # children of the context node -- confirm it matches the inline script.
    scripts = req.html.xpath("./script/text()")
    pattern = re.compile(r'\[[\s\S]*\]')
    match = None
    for script_text in scripts:
        match = pattern.search(script_text)
        if match is not None:
            break
    if match is None:
        raise ValueError("no player array found in " + url)

    # Turn the JavaScript literal into JSON: double quotes, quoted keys and
    # no trailing comma before the closing bracket.
    text = match.group(0).replace("\'", "\"")
    for field in ("name", "image", "birthday", "height", "weight"):
        text = text.replace(field, "\"" + field + "\"")
    text = text.replace(",]", "]")
    players = json.loads(text)

    for player in players:
        base64_name = base64.b64encode(player['name'].encode("utf-8")).decode("utf-8")
        plain = (base64_name + player['birthday'] + player['height'] + player['weight']).encode("utf-8")
        player['token'] = base64.b64encode(des_encrypt(plain)).decode("utf-8")

    with open('./爬虫练习/SPA8/data.json', 'w', encoding="utf-8") as f:
        json.dump(players, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
SPA9:NBA 球星数据网站,数据纯前端渲染,Token 经过加密处理,JavaScript 经过 eval 混淆,适合 JavaScript 逆向分析。
分析:百度了解eval混淆 可以直接console.log输出
思路:利用execjs模块直接执行js代码,获取解密后js的代码
代码:
import execjs
from requests_html import HTMLSession
import re,json
import base64
from pyDes import des, PAD_PKCS5, ECB
def des_encrypt(s):
    """Encrypt ``s`` with DES in ECB mode using PKCS5 padding.

    Args:
        s: The plaintext to encrypt (the caller passes UTF-8 encoded bytes).

    Returns:
        Whatever ``pyDes``'s ``encrypt`` returns for the padded input; the
        caller Base64-encodes it.
    """
    secret = b'NAhwcEVL'
    # Build the cipher from the key, the ECB mode and the padding mode.
    cipher = des(secret, ECB, pad=None, padmode=PAD_PKCS5)
    return cipher.encrypt(s, padmode=PAD_PKCS5)
def eval_js(jscode):
    """Evaluate a JavaScript expression with execjs and return its value.

    The expression is substituted for the ``{}`` placeholder so that it is
    assigned to a variable ``a``; a helper function ``test`` returning ``a``
    is then called through the compiled context.

    Args:
        jscode: JavaScript source text to be used as the initialiser of ``a``.

    Returns:
        The value of ``a`` as converted by execjs.
    """
    # Doubled braces are literal braces for str.format; the single "{}" is
    # the slot that receives the caller's code.
    template = '''
var a ={};
function test(){{return a}}
'''
    wrapper_source = template.format(jscode)
    context = execjs.compile(wrapper_source)
    return context.call('test')
def main():
    """Unpack the eval-obfuscated SPA9 script, extract the players and add tokens.

    The page embeds its JavaScript inside an ``eval(...)`` call. The argument
    of that call is evaluated through ``eval_js`` to recover the unpacked
    source, from which the player array literal is cut out, rewritten into
    valid JSON and parsed. For every player the token is
    ``Base64(DES(Base64(name) + birthday + height + weight))``, where the DES
    step is delegated to ``des_encrypt``. The enriched list is written to
    ``./爬虫练习/SPA9/data.json``.

    Raises:
        ValueError: If no eval-packed script or no array literal is found.
    """
    url = 'https://spa9.scrape.center/'
    session = HTMLSession()
    req = session.get(url)

    # xpath() yields a list of script texts; pick the one holding the eval call.
    # NOTE(review): "./script" only selects <script> elements that are direct
    # children of the context node -- confirm it matches the inline script.
    scripts = req.html.xpath("./script/text()")
    packed = next((text for text in scripts if "eval(" in text), None)
    if packed is None:
        raise ValueError("no eval-packed script found in " + url)

    # Replace only the outer "eval(" with "(" so the parentheses stay balanced
    # and the argument becomes a plain expression eval_js can assign.
    jscode = packed.replace("eval(", "(", 1)
    # Presumably the unpacked source comes back as a string -- TODO confirm.
    unpacked = eval_js(jscode)

    # Greedy match: from the first '[' to the last ']' in the unpacked code.
    match = re.search(r'\[[\s\S]*\]', unpacked)
    if match is None:
        raise ValueError("no player array found in the unpacked script")

    # Turn the JavaScript literal into JSON: double quotes, quoted keys and
    # no trailing comma before the closing bracket.
    text = match.group(0).replace("\'", "\"")
    for field in ("name", "image", "birthday", "height", "weight"):
        text = text.replace(field, "\"" + field + "\"")
    text = text.replace(",]", "]")
    players = json.loads(text)

    for player in players:
        base64_name = base64.b64encode(player['name'].encode("utf-8")).decode("utf-8")
        plain = (base64_name + player['birthday'] + player['height'] + player['weight']).encode("utf-8")
        player['token'] = base64.b64encode(des_encrypt(plain)).decode("utf-8")

    with open('./爬虫练习/SPA9/data.json', 'w', encoding="utf-8") as f:
        json.dump(players, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
SPA10:NBA 球星数据网站,数据纯前端渲染,Token 经过加密处理,JavaScript 经过 JJEncode 混淆,适合 JavaScript 逆向分析。
分析:JJEncode 混淆后main.js已基本不可读,但是jjencode可以直接还原
问题:目前 python的jjencode解密代码编写 暂无思路
思路: 通过百度上jjencode解密工具接口还原代码,同上清洗数据,计算token
电子书能不能分享一下 小飞虫 发表于 2021-10-28 14:52
电子书能不能分享一下
同问电子书 小飞虫 发表于 2021-10-28 14:52
电子书能不能分享一下
我也同问个啊哈哈,好像书还没出版 还没出版,这代码从哪来的 行人2019 发表于 2021-10-31 13:09
还没出版,这代码从哪来的
自己写的{:1_890:} 求第二版电子书
页:
[1]