对某图书馆搜索功能的 Web 逆向分析
今天无意中看到某知名 BBS 上有人出价 3k,征集 2500 篇文章的论文作者信息
作为专业人士,手就闲不住了,想看看能不能自动化采集,于是就有了这篇文章
分析
首先查看网页请求,发现 F12 打不开开发者工具,看来这也是一种反调试手段,于是改用 Charles 抓包,发现网页经过两次请求才返回数据,如图所示
第一次内容如下
<html>
<head>
<script language="javascript">setTimeout("location.replace(location.href.split(\"#\")[0])",2000);</script>
<script type="text/javascript" src="http://1.1.1.3:89/cookie/flash.js"></script>
<script language="javascript">setURL("1.1.1.3");supFlash("2028132836");</script>
</head>
<body>
<object classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" codebase="http://fpdownload.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=7,0,0,0" width="0" height="0" id="m" align="center"><param name="allowScriptAccess" value="always" />
<param name="movie" value="http://1.1.1.3:89/cookie/flashcookie.swf" />
<param name="quality" value="high" />
<param name="FlashVars" value="srv=1.1.1.3" />
<embed src="http://1.1.1.3:89/cookie/flashcookie.swf"FlashVars="srv=1.1.1.3" quality="high" width="0" height="0" name="m" align="center" allowScriptAccess="always" type="application/x-shockwave-flash"pluginspage="http://www.macromedia.com/go/getflashplayer" />
</object>
</body></html>
第二次返回结果如下
<!DOCTYPE html>
<script type="text/javascript">
/**
 * Percent-encode a fixed set of reserved characters in a parameter string.
 *
 * Only these characters are rewritten: % # & + = space / \ ? . :
 * Everything else (including non-ASCII text) is passed through untouched.
 *
 * @param {string} param - raw parameter value; must be a string.
 * @returns {string} the value with the characters above replaced by %XX.
 */
function encodeParam(param) {
    // One entry per character handled; the replacements never contain a
    // character from this set except '%', which is consumed in the same pass.
    var escapes = {
        "%": "%25",
        "#": "%23",
        "&": "%26",
        "+": "%2B",
        "=": "%3D",
        " ": "%20",
        "/": "%2F",
        "\\": "%5C",
        "?": "%3F",
        ".": "%2E",
        ":": "%3A"
    };
    return param.replace(/[%#&+= \/\\?.:]/g, function (ch) {
        return escapes[ch];
    });
}
</script><html>
<head>
<script type="text/javascript">
/**
 * Percent-encode a fixed set of reserved characters in a parameter string.
 *
 * Only these characters are rewritten: % # & + = space / \ ? . :
 * Everything else (including non-ASCII text) is passed through untouched.
 *
 * @param {string} param - raw parameter value; must be a string.
 * @returns {string} the value with the characters above replaced by %XX.
 */
function encodeParam(param) {
    // One entry per character handled; the replacements never contain a
    // character from this set except '%', which is consumed in the same pass.
    var escapes = {
        "%": "%25",
        "#": "%23",
        "&": "%26",
        "+": "%2B",
        "=": "%3D",
        " ": "%20",
        "/": "%2F",
        "\\": "%5C",
        "?": "%3F",
        ".": "%2E",
        ":": "%3A"
    };
    return param.replace(/[%#&+= \/\\?.:]/g, function (ch) {
        return escapes[ch];
    });
}
</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta http-equiv="X-UA-Compatible" content="edge" />
<link rel="shortcut icon" href="/static/style/images/gutu_logo.jpg" />
<link rel="stylesheet" type="text/css" href="/static/js/jquery-easyui/themes/gray/easyui.css">
<link rel="stylesheet" type="text/css" href="/static/js/jquery-easyui/themes/icon.css">
<link rel="stylesheet" type="text/css" href="/static/js/jquery-easyui/themes/color.css">
<link rel="stylesheet" type="text/css" href="/static/style/css/common.css?v=201807091100">
<link rel="stylesheet" type="text/css" href="/static/style/css/icon.css">
<script type="text/javascript" src="/static/js/My97DatePicker/WdatePicker.js"></script>
<script type="text/javascript" src="/static/js/jquery-easyui/jquery.min.js"></script>
<script type="text/javascript" src="/static/js/jquery-easyui/jquery.easyui.min.js"></script>
<script type="text/javascript" src="/static/js/jquery-easyui/locale/easyui-lang-zh_CN.js"></script>
<script type="text/javascript" src="/static/js/arrayToTree.js"></script>
<script type="text/javascript" src="/static/js/extJs.js?v=201807091100"></script>
<script type="text/javascript">
// Page-level globals. basePath is an empty string in this response.
var basePath = "";
// NOTE(review): presumably read by the UEditor rich-text editor scripts to
// locate their static assets and backend endpoint — confirm against those scripts.
window.UEDITOR_HOME_URL = "/static/ueditor/";
window.UEDITOR_SERVER_URL = "/ueditor";
</script>
<script type="text/javascript">
// Mouse-down handler: calls mAlert() when the pressed button code is 2 or 3.
// Button 2 is the right mouse button. mAlert is not defined in this snippet.
// NOTE(review): button 3 is presumably the legacy-IE "left+right" code — confirm.
document.onmousedown = function mdClick(event) {
// Fall back to window.event, then to the caller's first argument, when no
// event object is passed in (looks like a legacy-browser shim).
var e = event || window.event || arguments.callee.caller.arguments[0];
if (e.button == 2 || e.button == 3) {
mAlert();
}
}
// Disable the browser's default context (right-click) menu.
document.oncontextmenu = new Function("return false;");
// Listen for keyboard events (down, up and press share one handler).
document.onkeydown = document.onkeyup = document.onkeypress = function(event) {
var e = event || window.event || arguments.callee.caller.arguments[0];
// keyCode 123 is F12: cancel the event so the key does not open developer tools.
if (e && e.keyCode == 123) {
e.returnValue = false;
return (false);
}
}
</script>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=0"/>
<meta name="renderer" content="webkit">
<script type="text/javascript" src="/static/js/mainjs/initsso.js"></script>
<script type="text/javascript" src="/static/js/mainjs/jquery-1.11.1.min.js"></script>
<script type="text/javascript" src="/static/js/mainjs/tools.tabs-1.0.4.min.js"></script>
<link rel="stylesheet" href="/static/style/css/Ymenhu/main.css" type="text/css"/>
<title>读者云门户</title>
...
第一次请求 flash.js 如下
// Endpoint that jsSetCookie() loads as a script; stays empty until setURL() runs.
var url = "";

/**
 * Point the module-level `url` at the flashcookie page of the given host.
 *
 * @param {string} ip - host name or IP address (no scheme, no port).
 */
function setURL(ip) {
    var schemeAndHost = "http://" + ip;
    url = schemeAndHost + ":89/cookie/flashcookie.html";
}
/**
 * Reload the current document without its "#fragment" part, replacing the
 * current history entry (location.replace) rather than adding a new one.
 */
function loadPage() {
    var hrefWithoutHash = location.href.split("#")[0];
    location.replace(hrefWithoutHash);
}
////add by yxf@2014/08/27
/**
 * User-agent gate: excludes mobile terminals from reporting cookie values
 * and time-interval values.
 *
 * Reporting is allowed only when the lower-cased user agent contains
 * "windows nt" and none of the mobile / excluded-app markers below.
 *
 * @returns {boolean} true when reporting is allowed, false otherwise.
 */
function IsCanReport2Ac() {
    var agent = navigator.userAgent.toLowerCase();

    // Anything that is not Windows NT is rejected outright.
    if (agent.indexOf("windows nt") === -1) {
        return false;
    }

    // Plain substring markers: generic mobile, Android, Symbian, iPhone, iPad, iPod.
    var mobileMarkers = ["mobile", "android", "symbian", "iphone", "ipad", "ipod"];
    for (var idx = 0; idx < mobileMarkers.length; idx += 1) {
        if (agent.indexOf(mobileMarkers[idx]) > -1) {
            return false;
        }
    }

    // iOS-style platform token, e.g. "(iphone; cpu iphone os ... like mac os x".
    if (/\(i[^;]+;( u;)? cpu.+mac os x/.test(agent)) {
        return false;
    }

    // Exclude an app signature that is otherwise misdetected (Baidu client).
    return agent.search(/ baidubrowser\/\d/) === -1;
}
////end by yxf
/**
 * Write a cookie that expires 30 days from now.
 *
 * The value is encoded with escape(); no path or domain attribute is set.
 *
 * @param {string} name - cookie name (written as-is).
 * @param {string} value - cookie value (escape()-encoded before writing).
 */
function setCookie(name, value) {
    var THIRTY_DAYS_MS = 30 * 24 * 60 * 60 * 1000;
    var expiry = new Date(Date.now() + THIRTY_DAYS_MS);
    var pair = name + "=" + escape(value);
    // toUTCString is the same function as the legacy toGMTString alias.
    document.cookie = pair + ";expires=" + expiry.toUTCString();
}
/**
 * Read a cookie by name from document.cookie.
 *
 * The name is matched literally (regex metacharacters in it are escaped),
 * and the stored value is decoded with unescape() so that it round-trips
 * with values written through escape().
 *
 * @param {string} name - cookie name to look up.
 * @returns {string|null} the decoded cookie value, or null when absent.
 */
function getCookie(name) {
    // Escape regex metacharacters so e.g. "x.y" cannot match a cookie "xay".
    var literalName = String(name).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    var pattern = new RegExp("(^| )" + literalName + "=([^;]*)(;|$)");
    var found = document.cookie.match(pattern);
    if (!found) {
        return null;
    }
    // unescape() is the inverse of escape() and never throws on malformed input.
    return unescape(found[2]);
}
/**
 * Entry point called by the challenge page with the server-issued cookie value.
 *
 * Reloads the page immediately (via loadPage) when the client is not allowed
 * to report, when the td_cookie already equals `cookie`, when the browser is
 * TaoBrowser/LBBrowser, or when no Flash plugin is detected. Otherwise it
 * only stores td_cookie and returns without reloading.
 *
 * @param {string} cookie - value to store in the td_cookie cookie.
 */
function supFlash(cookie) {
    if (IsCanReport2Ac() === false) {
        loadPage();
        return;
    }

    // Already holding the expected cookie: nothing to set, just reload.
    if (getCookie("td_cookie") == cookie) {
        loadPage();
        return;
    }
    setCookie("td_cookie", cookie);

    // Legacy-IE probe: evaluates to true only where [1, ] stringifies to "1,".
    var legacyIE = !-[1, ];
    var agent = navigator.userAgent.toLowerCase();
    if (agent.indexOf("taobrowser") > 0 || agent.indexOf("lbbrowser") > 0) {
        loadPage();
        return;
    }

    var hasFlash;
    if (legacyIE || agent.indexOf("msie") > 0 || agent.indexOf("trident/7.0") > 0) {
        // IE exposes Flash through ActiveX; construction throws when it is missing.
        try {
            new ActiveXObject('ShockwaveFlash.ShockwaveFlash');
            hasFlash = true;
        } catch (e) {
            hasFlash = false;
        }
    } else {
        try {
            hasFlash = navigator.plugins['Shockwave Flash'] != undefined;
        } catch (e) {
            hasFlash = false;
        }
    }

    if (!hasFlash) {
        loadPage();
    }
}
// Configured exclusion list: font names dropped before the font checksum is computed.
var excludeList = [
    "ADMUI3Lg",
    "ADMUI3Sm",
    "Photoshop Large",
    "Photoshop Small"
];
/**
 * Build the 256-entry lookup table for the reflected CRC-32 polynomial
 * 0xEDB88320.
 *
 * Entries whose last step was an XOR are signed 32-bit values (they can be
 * negative), because JavaScript's ^ operator yields an int32.
 *
 * @returns {number[]} table indexed by byte value 0..255.
 */
var makeCRCTable = function () {
    var POLYNOMIAL = 0xEDB88320;
    var table = [];
    for (var byteValue = 0; byteValue < 256; byteValue += 1) {
        var remainder = byteValue;
        var bitsLeft = 8;
        while (bitsLeft-- > 0) {
            var shifted = remainder >>> 1;
            remainder = (remainder & 1) ? (POLYNOMIAL ^ shifted) : shifted;
        }
        table.push(remainder);
    }
    return table;
}
/**
 * Compute the CRC-32 checksum of a string, one UTF-16 code unit at a time.
 *
 * Only the low 8 bits of each code unit take part in the checksum. The
 * lookup table is built once and cached on window.crcTable.
 *
 * @param {string} str - input text.
 * @returns {number} unsigned 32-bit checksum.
 */
var crc32 = function (str) {
    if (!window.crcTable) {
        window.crcTable = makeCRCTable();
    }
    var lookup = window.crcTable;
    var acc = -1;
    for (var pos = 0; pos < str.length; pos++) {
        var slot = (acc ^ str.charCodeAt(pos)) & 0xFF;
        acc = (acc >>> 8) ^ lookup[slot];
    }
    // Final inversion, then coerce to an unsigned 32-bit number.
    return ~acc >>> 0;
};
/**
 * Duck-typed array check: an object with a numeric, non-enumerable `length`.
 *
 * Falsy inputs are returned unchanged (null stays null, 0 stays 0), so the
 * result is only guaranteed to be truthy/falsy, not a strict boolean.
 *
 * @param {*} value - candidate value.
 * @returns {*} true for array-like objects, otherwise a falsy value.
 */
function isArray(value) {
    if (!value) {
        return value;
    }
    if (typeof value !== 'object') {
        return false;
    }
    if (typeof value.length !== 'number') {
        return false;
    }
    return !value.propertyIsEnumerable('length');
}
/**
 * Filter a font list and return the survivors sorted.
 *
 * A font is dropped when it loosely equals an entry of `excludeList` or when
 * its name contains ".tmp". When `excludeList` is empty the input array is
 * returned as-is (same reference, unfiltered and unsorted).
 *
 * @param {string[]} fontArr - font names to filter.
 * @param {string[]} excludeList - names to remove.
 * @returns {string[]} new sorted array, or `fontArr` itself for an empty list.
 */
function removeExcludeFont(fontArr, excludeList) {
    if (!excludeList.length) {
        return fontArr;
    }

    var kept = [];
    for (var fontIdx = 0; fontIdx < fontArr.length; fontIdx += 1) {
        var font = fontArr[fontIdx];
        var rejected = false;
        // Stops at the first exclusion hit; the ".tmp" test runs per entry
        // only after the equality test for that entry has failed.
        for (var exIdx = 0; exIdx < excludeList.length && !rejected; exIdx += 1) {
            rejected = font == excludeList[exIdx] || Boolean(font.match(/\.tmp/));
        }
        if (!rejected) {
            kept.push(font);
        }
    }
    kept.sort();
    return kept;
}
/**
 * Report the font/manufacturer fingerprint by loading `url` as a script,
 * then reload the page once that script has loaded.
 *
 * Reloads immediately (no report) when `manu` is empty, `fontArr` is not
 * array-like, or `url` has not been set.
 *
 * @param {string[]} fontArr - font names; filtered through removeExcludeFont.
 * @param {string} manu - manufacturer text, sent raw and as a CRC-32.
 * @param {string} vers - version string, sent raw.
 * @param {string} os - OS string, sent raw.
 */
function jsSetCookie(fontArr, manu, vers, os) {
    if (manu == "" || !isArray(fontArr) || url == "") {
        loadPage();
        return;
    }

    var joinedFonts = removeExcludeFont(fontArr, excludeList).join("|\n");
    var query = "manu_txt=" + manu;
    query += "&manu_crc=" + crc32(manu).toString();
    query += "&version=" + vers;
    query += "&font_crc=" + crc32(joinedFonts).toString();
    query += "&os=" + os;

    var tag = document.createElement("script");
    tag.type = "text/javascript";

    var fired = false;
    // Shared by onload and onreadystatechange; runs loadPage() at most once.
    function onScriptReady() {
        if (fired) {
            return;
        }
        var state = this.readyState;
        if (state && state !== "loaded" && state !== "complete") {
            return;
        }
        fired = true;
        this.onload = this.onreadystatechange = null;
        loadPage();
    }
    tag.onload = tag.onreadystatechange = onScriptReady;

    // The trailing random number acts as a cache buster.
    tag.src = url + "?" + query + "&" + Math.random();
    document.getElementsByTagName("head")[0].appendChild(tag);
}
代码写得很规整,一看就知道在里面设置了 cookie。查看源码后发现,页面通过 supFlash("2028132836") 这条语句设置 cookie,所以只需要从返回的 js 里提取出
该值并赋给 td_cookie,就能得到正确的 cookie。接下来就是写代码了
源码
# coding: utf-8
import requests
import execjs
import re
from urllib.request import quote
session = requests.session()
def load_page(kw):
    """
    Request the first search-result page for ``kw`` and return its HTML.

    Cookies are taken from the shared module-level ``session``; no ``Cookie``
    header is hard-coded here, because requests does not regenerate the
    header from the session's cookie jar when one is already present, which
    would make any ``session.cookies.set(...)`` done by the caller a no-op.

    :param kw: search keyword (URL-quoted before being placed in the query).
    :return: response body as text.
    """
    search_url = (
        "http://read.nlc.cn/allSearch/searchList"
        f"?searchType=65&showType=1&pageNo=1&searchWord={quote(kw)}"
        "&classification=&regional="
    )
    headers = {
        'Host': 'read.nlc.cn',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.77 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
        'Referer': 'http://read.nlc.cn/allSearch/searchList?searchType=65&showType=1&pageNo=1&searchWord=%E7%88%AC%E8'
                   '%99%AB&classification=&regional=',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = session.get(search_url, headers=headers, timeout=10)
    print(response.text)
    return response.text
if __name__ == '__main__':
    # Two-step flow: the first response is a challenge page that carries the
    # cookie value in a supFlash("...") call; replaying the request with that
    # value stored as td_cookie yields the real result page.
    kw = '数据分析'
    response = load_page(kw)
    challenge = re.search(r'supFlash\("(.*?)"\)', response)
    if challenge is None:
        # No challenge in the response, so there is no cookie to extract;
        # keep the first response as the result.
        result = response
    else:
        td_cookie = challenge.group(1)
        print(td_cookie)
        session.cookies.set('td_cookie', td_cookie)
        print(session.cookies.get_dict())
        result = load_page(kw)
谢谢阅读