某二手车网站爬虫分析
对于爬虫, 想到的首先是查看所用的接口, 我们打开[待分析网站](https://www.guazi.com/), 然后打开调试, 刷新一下, 然后找一下, 就会发现返回数据的接口.
然后, 我们直接写代码, 获取页面试试.
```python
def run(url):
    """Fetch ``url`` with browser-like request headers and print the raw body.

    This is the first, naive attempt: no cookie is sent with the request.

    Args:
        url: Address of the page to download.
    """
    headers = {
        "Host": "www.guazi.com",
        "Referer": "https://www.guazi.com/jn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    }
    # Bound the wait so a stalled connection cannot hang the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    print(resp.content)
```
然后, 很惊喜的发现, 我们并没有得到数据, 而是得到了一堆代码.
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<script type="text/javascript">
// Unpacker bootstrap: p is the packed source text, k the word dictionary and c the
// number of dictionary slots; the string returned here is what eval() executes.
eval(function (p, a, c, k, e, r) {
// e(c) encodes the integer c as a base-62 token: 0-9 and a-z via toString(36),
// then A-Z for digit values 36..61 (char code = value + 29).
e = function (c) {
return (c < 62 ? '' : e(parseInt(c / 62))) + ((c = c % 62) > 35 ? String.fromCharCode(c + 29) : c.toString(36))
};
// NOTE(review): looks like a feature test for String.prototype.replace accepting a
// function as the replacement argument — confirm.
if ('0'.replace(0, e) == 0) {
// NOTE(review): this line, `r || e`, `'(|1\\w)'` and the `if (k)` / `k` uses below look
// like they lost their bracketed parts (index expressions, a character class) when
// the listing was published; compare with the live response before relying on them.
while (c--) r = k;
k = [function (e) {
return r || e
}];
e = function () {
return '(|1\\w)'
};
c = 1
}
// Substitute tokens in p, matching each one on word boundaries.
while (c--) if (k) p = p.replace(new RegExp('\\b' + e(c) + '\\b', 'g'), k);
console.log(p); // Added for the analysis: print the source text that eval() is about to run
return p
}('f u(x,y){e M=(x&N)+(y&N);e 1f=(x>>16)+(y>>16)+(M>>16);h(1f<<16)|(M&N)}f 1g(O,P){h(O<<P)|(O>>>(32-P))}f C(q,a,b,x,s,t){h u(1g(u(u(a,q),u(x,t)),s),b)}f j(a,b,c,d,x,s,t){h C((b&c)|((~b)&d),a,b,x,s,t)}f k(a,b,c,d,x,s,t){h C((b&d)|(c&(~d)),a,b,x,s,t)}f l(a,b,c,d,x,s,t){h C(b^c^d,a,b,x,s,t)}f m(a,b,c,d,x,s,t){h C(c^(b|(~d)),a,b,x,s,t)}f D(x,w){x|=0x80<<(w%32);x[(((w+64)>>>9)<<4)+14]=w;e i;e Q;e R;e S;e T;e a=1732584193;e b=-271733879;e c=-1732584194;e d=271733878;v(i=0;i<x.n;i+=16){Q=a;R=b;S=c;T=d;a=j(a,b,c,d,x,7,-680876936);d=j(d,a,b,c,x,12,-389564586);c=j(c,d,a,b,x,17,606105819);b=j(b,c,d,a,x,22,-1044525330);a=j(a,b,c,d,x,7,-176418897);d=j(d,a,b,c,x,12,1200080426);c=j(c,d,a,b,x,17,-1473231341);b=j(b,c,d,a,x,22,-45705983);a=j(a,b,c,d,x,7,1770035416);d=j(d,a,b,c,x,12,-1958414417);c=j(c,d,a,b,x,17,-42063);b=j(b,c,d,a,x,22,-1990404162);a=j(a,b,c,d,x,7,1804603682);d=j(d,a,b,c,x,12,-40341101);c=j(c,d,a,b,x,17,-1502002290);b=j(b,c,d,a,x,22,1236535329);a=k(a,b,c,d,x,5,-165796510);d=k(d,a,b,c,x,9,-1069501632);c=k(c,d,a,b,x,14,643717713);b=k(b,c,d,a,x,20,-373897302);a=k(a,b,c,d,x,5,-701558691);d=k(d,a,b,c,x,9,38016083);c=k(c,d,a,b,x,14,-660478335);b=k(b,c,d,a,x,20,-405537848);a=k(a,b,c,d,x,5,568446438);d=k(d,a,b,c,x,9,-1019803690);c=k(c,d,a,b,x,14,-187363961);b=k(b,c,d,a,x,20,1163531501);a=k(a,b,c,d,x,5,-1444681467);d=k(d,a,b,c,x,9,-51403784);c=k(c,d,a,b,x,14,1735328473);b=k(b,c,d,a,x,20,-1926607734);a=l(a,b,c,d,x,4,-378558);d=l(d,a,b,c,x,11,-2022574463);c=l(c,d,a,b,x,16,1839030562);b=l(b,c,d,a,x,23,-35309556);a=l(a,b,c,d,x,4,-1530992060);d=l(d,a,b,c,x,11,1272893353);c=l(c,d,a,b,x,16,-155497632);b=l(b,c,d,a,x,23,-1094730640);a=l(a,b,c,d,x,4,681279174);d=l(d,a,b,c,x,11,-358537222);c=l(c,d,a,b,x,16,-722521979);b=l(b,c,d,a,x,23,76029189);a=l(a,b,c,d,x,4,-640364487);d=l(d,a,b,c,x,11,-421815835);c=l(c,d,a,b,x,16,530742520);b=l(b,c,d,a,x,23,-995338651);a=m(a,b,c,d,x,6,-198630844);d=m(d,a,b,c,x,10,1126891415);c=m(c,d,a,b,x,15,-1416354905);b=m(b,c,d,a,x,21,-57434055);a=m(a,b,c,d,x,6,1
700485571);d=m(d,a,b,c,x,10,-1894986606);c=m(c,d,a,b,x,15,-1051523);b=m(b,c,d,a,x,21,-2054922799);a=m(a,b,c,d,x,6,1873313359);d=m(d,a,b,c,x,10,-30611744);c=m(c,d,a,b,x,15,-1560198380);b=m(b,c,d,a,x,21,1309151649);a=m(a,b,c,d,x,6,-145523070);d=m(d,a,b,c,x,10,-1120210379);c=m(c,d,a,b,x,15,718787259);b=m(b,c,d,a,x,21,-343485551);a=u(a,Q);b=u(b,R);c=u(c,S);d=u(d,T)}h}f U(o){e i;e p=\'\';e 1h=o.n*32;v(i=0;i<1h;i+=8){p+=String.fromCharCode((o>>>(i%32))&1i)}h p}f F(o){e i;e p=[];p[(o.n>>2)-1]=1j;v(i=0;i<p.n;i+=1){p=0}e 1k=o.n*8;v(i=0;i<1k;i+=8){p|=(o.1l(i/8)&1i)<<(i%32)}h p}f 1m(s){h U(D(F(s),s.n*8))}f rstrHMAC(G,V){e i;e A=F(G);e H=[];e I=[];e W;H=I=1j;z(A.n>16){A=D(A,G.n*8)}v(i=0;i<16;i+=1){H=A^0x36363636;I=A^0x5C5C5C5C}W=D(H.1n(F(V)),1o+V.n*8);h U(D(I.1n(W),1o+128))}f 1p(o){e X=\'0123456789abcdef\';e p=\'\';e x;e i;v(i=0;i<o.n;i+=1){x=o.1l(i);p+=X.Y((x>>>4)&1q)+X.Y(x&1q)}h p}f 1r(o){h unescape(encodeURIComponent(o))}f 1s(s){h 1m(1r(s))}f 1t(s){h 1p(1s(s))}f 1u(){e 18="";e 19="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";e w=J.1v(J.1w()*2);v(e i=0;i<w;i++){18+=19.Y(J.1v(J.1w()*19.n))}h 18}f 1x(s){s=s.1y(//g,\'#\');e E=s.split(\'\');v(e i=0;i<E.n;i++){z(E==\'#\'){E=1u()}}h E.join(\'\')}f anti(1z,G){e 1A=1t(1z);h 1x(1A)}f xredirect(1a,1B,r){e K=new Date();K.setTime(K.getTime()+2592000000);e 1b="; 1b="+K.toUTCString();1C.1c=1a+"="+1B+1b+"; path=/";z(1C.1c.1D(1a)===-1&&navigator.cookieEnabled){alert(\'请修改浏览器设置,允许1c缓存\')}1E{z(r==\'\'){e r=B.1F;z(B.1d!=\'L:\'){r=\'L:\'+1G.B.1F.1H(1G.B.1d.n)}}1E{z(B.1d!=\'L:\'){r=\'L:\'+r}}e 1e=r.1D(\'#\');z(1e!==-1){r=r.1H(0,1e)}B.1y(r)}}', [], 106, 
'||||||||||||||var|function||return||ff|gg|hh|ii|length|input|output||url|||safeAdd|for|len|||if|bkey|location|cmn|binl|arr|rstr2binl|key|ipad|opad|Math|date|https|lsw|0xFFFF|num|cnt|olda|oldb|oldc|oldd|binl2rstr|data|hash|hexTab|charAt||||||||||text|possible|name|expires|cookie|protocol|ulen|msw|bitRotateLeft|length32|0xFF|undefined|length8|charCodeAt|rstr|concat|512|rstr2hex|0x0F|str2rstrUTF8|raw|hex|uid|floor|random|charRun|replace|string|estring|value|document|indexOf|else|href|window|substring'.split('|'), 0, {}));
// var value = anti('****', '****');
// var name = 'antipas';
// var url = '';
// xredirect(name, value, url, 'https://');
</script>
</head>
<body>
<p>正在打开中,请稍后...
<e style='float:right'>some time</e>
<p>
</body>
</html>
```
看到这些代码, 显然不会直接去静态分析它干了什么, 复制出来, 然后放到一个新页面里面打开, 因为重点在那个`eval`函数中, 因此后面的代码我们先注释掉, 防止他有什么跳转或者是什么别的彩蛋在里面. 根据eval的特点, 执行的必然是字符串, 然后我们直接看里面那个自执行函数, 直接打印返回值, 见上文代码注释, 就可以直接得到`eval`函数执行的内容了. 添加完成之后我们直接打开页面, 看看具体内容是什么.
我们可以直接在控制台复制出代码, 然后进行分析, 可以比较容易地找到上面提到的两个函数`anti`和`xredirect`
```js
/**
 * Derive a token from `string`: hex-digest it with hex(), then pass the digest
 * through charRun().
 *
 * @param {string} string - Seed text to digest.
 * @param {string} key - Accepted for call compatibility; never read.
 * @returns {string} The digest after charRun() has processed it.
 */
function anti(string, key) {
    return charRun(hex(string));
}
/**
 * Store `name=value` as a cookie (path "/", expiring 2592000000 ms = 30 days
 * from now) and then navigate to an https address with any "#fragment" removed.
 *
 * If the cookie cannot be read back although the browser reports cookies as
 * enabled, an alert is shown and no navigation happens.
 *
 * @param {string} name - Cookie name.
 * @param {string} value - Cookie value.
 * @param {string} url - Navigation target; when it compares equal to '' the
 *     current page address is used instead.
 */
function xredirect(name, value, url) {
    var expiry = new Date();
    expiry.setTime(expiry.getTime() + 2592000000);
    document.cookie = name + "=" + value + "; expires=" + expiry.toUTCString() + "; path=/";

    // The cookie did not stick even though cookies are reported as enabled.
    if (document.cookie.indexOf(name) === -1 && navigator.cookieEnabled) {
        alert('请修改浏览器设置,允许cookie缓存');
        return;
    }

    var target = url;
    if (target == '') {
        // No explicit target: reload the current page, forcing the https scheme.
        target = location.href;
        if (location.protocol != 'https:') {
            target = 'https:' + window.location.href.substring(window.location.protocol.length);
        }
    } else if (location.protocol != 'https:') {
        // The scheme is simply prefixed to the target that was passed in.
        target = 'https:' + target;
    }

    var hashAt = target.indexOf('#');
    location.replace(hashAt !== -1 ? target.substring(0, hashAt) : target);
}
```
这个代码都没混淆, 读起来感觉可谓是太爽了, 我们可以发现第二个函数是重定向函数, 在设置完cookie之后, 便会重定向到正常获取数据的页面. 接下来我们来看一下生成cookie的函数`anti(string, key)` 可以清楚的发现, 后面的那个`key`实际上是用不到的.
## `HEX` 函数分析
```js
/**
 * Add two integers with 32-bit wrap-around, working on 16-bit halves so the
 * intermediate sums stay small.
 */
function safeAdd(x, y) {
    var lsw = (x & 0xFFFF) + (y & 0xFFFF);
    var msw = (x >> 16) + (y >> 16) + (lsw >> 16);
    return (msw << 16) | (lsw & 0xFFFF);
}

/** Rotate the 32-bit value `num` left by `cnt` bits. */
function bitRotateLeft(num, cnt) {
    return (num << cnt) | (num >>> (32 - cnt));
}

/**
 * Shared step of every MD5 round: add the round function result `q`, the
 * message word `x` and the constant `t` to `a`, rotate by `s`, then add `b`.
 */
function cmn(q, a, b, x, s, t) {
    return safeAdd(bitRotateLeft(safeAdd(safeAdd(a, q), safeAdd(x, t)), s), b);
}

/** Round 1 step. */
function ff(a, b, c, d, x, s, t) {
    return cmn((b & c) | ((~b) & d), a, b, x, s, t);
}

/** Round 2 step. */
function gg(a, b, c, d, x, s, t) {
    return cmn((b & d) | (c & (~d)), a, b, x, s, t);
}

/** Round 3 step. */
function hh(a, b, c, d, x, s, t) {
    return cmn(b ^ c ^ d, a, b, x, s, t);
}

/** Round 4 step. */
function ii(a, b, c, d, x, s, t) {
    return cmn(c ^ (b | (~d)), a, b, x, s, t);
}

/**
 * Compute the MD5 state of a message given as little-endian 32-bit words.
 *
 * @param {number[]} x - Message words; padded in place.
 * @param {number} len - Message length in bits.
 * @returns {number[]} The four 32-bit state words [a, b, c, d].
 */
function binl(x, len) {
    // Padding: a single 1 bit right after the message, and the bit length
    // stored in the second-to-last word of the final 16-word block.
    x[len >> 5] |= 0x80 << (len % 32);
    x[(((len + 64) >>> 9) << 4) + 14] = len;

    var i;
    var olda;
    var oldb;
    var oldc;
    var oldd;
    var a = 1732584193;
    var b = -271733879;
    var c = -1732584194;
    var d = 271733878;

    // One iteration per 16-word (512-bit) block. Words past the end of the
    // array read as undefined, which the bitwise operations treat as 0.
    for (i = 0; i < x.length; i += 16) {
        olda = a;
        oldb = b;
        oldc = c;
        oldd = d;

        a = ff(a, b, c, d, x[i], 7, -680876936);
        d = ff(d, a, b, c, x[i + 1], 12, -389564586);
        c = ff(c, d, a, b, x[i + 2], 17, 606105819);
        b = ff(b, c, d, a, x[i + 3], 22, -1044525330);
        a = ff(a, b, c, d, x[i + 4], 7, -176418897);
        d = ff(d, a, b, c, x[i + 5], 12, 1200080426);
        c = ff(c, d, a, b, x[i + 6], 17, -1473231341);
        b = ff(b, c, d, a, x[i + 7], 22, -45705983);
        a = ff(a, b, c, d, x[i + 8], 7, 1770035416);
        d = ff(d, a, b, c, x[i + 9], 12, -1958414417);
        c = ff(c, d, a, b, x[i + 10], 17, -42063);
        b = ff(b, c, d, a, x[i + 11], 22, -1990404162);
        a = ff(a, b, c, d, x[i + 12], 7, 1804603682);
        d = ff(d, a, b, c, x[i + 13], 12, -40341101);
        c = ff(c, d, a, b, x[i + 14], 17, -1502002290);
        b = ff(b, c, d, a, x[i + 15], 22, 1236535329);

        a = gg(a, b, c, d, x[i + 1], 5, -165796510);
        d = gg(d, a, b, c, x[i + 6], 9, -1069501632);
        c = gg(c, d, a, b, x[i + 11], 14, 643717713);
        b = gg(b, c, d, a, x[i], 20, -373897302);
        a = gg(a, b, c, d, x[i + 5], 5, -701558691);
        d = gg(d, a, b, c, x[i + 10], 9, 38016083);
        c = gg(c, d, a, b, x[i + 15], 14, -660478335);
        b = gg(b, c, d, a, x[i + 4], 20, -405537848);
        a = gg(a, b, c, d, x[i + 9], 5, 568446438);
        d = gg(d, a, b, c, x[i + 14], 9, -1019803690);
        c = gg(c, d, a, b, x[i + 3], 14, -187363961);
        b = gg(b, c, d, a, x[i + 8], 20, 1163531501);
        a = gg(a, b, c, d, x[i + 13], 5, -1444681467);
        d = gg(d, a, b, c, x[i + 2], 9, -51403784);
        c = gg(c, d, a, b, x[i + 7], 14, 1735328473);
        b = gg(b, c, d, a, x[i + 12], 20, -1926607734);

        a = hh(a, b, c, d, x[i + 5], 4, -378558);
        d = hh(d, a, b, c, x[i + 8], 11, -2022574463);
        c = hh(c, d, a, b, x[i + 11], 16, 1839030562);
        b = hh(b, c, d, a, x[i + 14], 23, -35309556);
        a = hh(a, b, c, d, x[i + 1], 4, -1530992060);
        d = hh(d, a, b, c, x[i + 4], 11, 1272893353);
        c = hh(c, d, a, b, x[i + 7], 16, -155497632);
        b = hh(b, c, d, a, x[i + 10], 23, -1094730640);
        a = hh(a, b, c, d, x[i + 13], 4, 681279174);
        d = hh(d, a, b, c, x[i], 11, -358537222);
        c = hh(c, d, a, b, x[i + 3], 16, -722521979);
        b = hh(b, c, d, a, x[i + 6], 23, 76029189);
        a = hh(a, b, c, d, x[i + 9], 4, -640364487);
        d = hh(d, a, b, c, x[i + 12], 11, -421815835);
        c = hh(c, d, a, b, x[i + 15], 16, 530742520);
        b = hh(b, c, d, a, x[i + 2], 23, -995338651);

        a = ii(a, b, c, d, x[i], 6, -198630844);
        d = ii(d, a, b, c, x[i + 7], 10, 1126891415);
        c = ii(c, d, a, b, x[i + 14], 15, -1416354905);
        b = ii(b, c, d, a, x[i + 5], 21, -57434055);
        a = ii(a, b, c, d, x[i + 12], 6, 1700485571);
        d = ii(d, a, b, c, x[i + 3], 10, -1894986606);
        c = ii(c, d, a, b, x[i + 10], 15, -1051523);
        b = ii(b, c, d, a, x[i + 1], 21, -2054922799);
        a = ii(a, b, c, d, x[i + 8], 6, 1873313359);
        d = ii(d, a, b, c, x[i + 15], 10, -30611744);
        c = ii(c, d, a, b, x[i + 6], 15, -1560198380);
        b = ii(b, c, d, a, x[i + 13], 21, 1309151649);
        a = ii(a, b, c, d, x[i + 4], 6, -145523070);
        d = ii(d, a, b, c, x[i + 11], 10, -1120210379);
        c = ii(c, d, a, b, x[i + 2], 15, 718787259);
        b = ii(b, c, d, a, x[i + 9], 21, -343485551);

        a = safeAdd(a, olda);
        b = safeAdd(b, oldb);
        c = safeAdd(c, oldc);
        d = safeAdd(d, oldd);
    }
    return [a, b, c, d];
}

/**
 * Convert an array of little-endian 32-bit words to a raw string
 * (one character per byte).
 */
function binl2rstr(input) {
    var i;
    var output = '';
    var length32 = input.length * 32;
    for (i = 0; i < length32; i += 8) {
        output += String.fromCharCode((input[i >> 5] >>> (i % 32)) & 0xFF);
    }
    return output;
}

/**
 * Convert a raw string (one character per byte) to an array of little-endian
 * 32-bit words. Characters above 255 have their high byte dropped.
 */
function rstr2binl(input) {
    var i;
    var output = [];
    // Pre-size the array so every word can be zeroed before bits are OR-ed in.
    output[(input.length >> 2) - 1] = undefined;
    for (i = 0; i < output.length; i += 1) {
        output[i] = 0;
    }
    var length8 = input.length * 8;
    for (i = 0; i < length8; i += 8) {
        output[i >> 5] |= (input.charCodeAt(i / 8) & 0xFF) << (i % 32);
    }
    return output;
}

/** MD5 of a raw string, returned as a 16-character raw string. */
function rstr(s) {
    return binl2rstr(binl(rstr2binl(s), s.length * 8));
}

/**
 * HMAC-MD5 of raw string `data` under raw string `key`, returned as a raw
 * string. Not used by hex(); kept because the page script defines it.
 */
function rstrHMAC(key, data) {
    var i;
    var bkey = rstr2binl(key);
    var ipad = [];
    var opad = [];
    var hash;
    ipad[15] = opad[15] = undefined;
    // Keys longer than one 64-byte block are replaced by their digest.
    if (bkey.length > 16) {
        bkey = binl(bkey, key.length * 8);
    }
    for (i = 0; i < 16; i += 1) {
        ipad[i] = bkey[i] ^ 0x36363636;
        opad[i] = bkey[i] ^ 0x5C5C5C5C;
    }
    hash = binl(ipad.concat(rstr2binl(data)), 512 + data.length * 8);
    return binl2rstr(binl(opad.concat(hash), 512 + 128));
}

/** Encode a raw string as lowercase hexadecimal, two digits per character. */
function rstr2hex(input) {
    var hexTab = '0123456789abcdef';
    var output = '';
    var x;
    var i;
    for (i = 0; i < input.length; i += 1) {
        x = input.charCodeAt(i);
        output += hexTab.charAt((x >>> 4) & 0x0F) + hexTab.charAt(x & 0x0F);
    }
    return output;
}

/** Encode a JavaScript string as UTF-8, one character per byte. */
function str2rstrUTF8(input) {
    return unescape(encodeURIComponent(input));
}

/** MD5 of a JavaScript string (UTF-8 encoded first), as a raw string. */
function raw(s) {
    return rstr(str2rstrUTF8(s));
}

/** MD5 of a JavaScript string as a 32-character lowercase hex digest. */
function hex(s) {
    return rstr2hex(raw(s));
}
```
我们可以发现, 这个好像就是`MD5`, 简单验证一下, 可以发现, 确实是这样的.
然后我们再看看`charRun`这个函数
```js
/**
 * Return either an empty string or a single random ASCII letter.
 * The first random draw picks a length of 0 or 1; only a length of 1 triggers
 * the second draw that selects the letter.
 *
 * @returns {string} '' or one character from A-Z / a-z.
 */
function uid() {
    var letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    if (Math.floor(Math.random() * 2) === 0) {
        return "";
    }
    return letters.charAt(Math.floor(Math.random() * letters.length));
}
/**
 * Scramble the letters of `s`: every ASCII letter is first masked as '#', then
 * each '#' is replaced by the result of uid(). All other characters keep their
 * position relative to each other.
 *
 * @param {string} s - Text to scramble (a hex digest in the page script).
 * @returns {string} The scrambled text.
 */
function charRun(s) {
    // The character class was lost from the published listing; the article
    // states that all letters are the ones being replaced.
    var arr = s.replace(/[a-zA-Z]/g, '#').split('');
    for (var i = 0; i < arr.length; i++) {
        if (arr[i] === '#') {
            arr[i] = uid();
        }
    }
    return arr.join('');
}
```
这个函数看起来也比较简单: 先把上文得到的`md5`十六进制字符串中的所有字母替换为占位符`#`, 再把每个`#`随机替换为空字符串`""`或者是大小写字母中的任意一个.
分析出来了这个`anti`具体返回了什么, 我们就可以生成`cookie`中`antipas`的值了, 出于安全考虑, 我隐藏了实际md5的值, 如果有需要, 自行尝试吧.
```py
def uid():
    """Return either an empty string or one random ASCII letter.

    Returns:
        ``""`` or a single character from ``A-Z`` / ``a-z``, each case being
        equally likely.
    """
    if randint(0, 1):
        return ""
    return choice('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz')


def char_run(s):
    """Scramble the letters of ``s``.

    Every ASCII letter is masked as ``#`` and each ``#`` is then replaced by
    the result of :func:`uid`; all other characters are kept in order.

    Args:
        s: Text to scramble (a hex digest when called from :func:`anti`).

    Returns:
        The scrambled text.
    """
    # An empty pattern would insert '#' between every character, so the
    # letter class has to be spelled out.
    masked = re.sub("[a-zA-Z]", "#", s)
    return "".join(uid() if ch == "#" else ch for ch in masked)


def anti(seed=b'cQLZ****UB4='):
    """Build a token: the MD5 hex digest of ``seed`` passed through ``char_run``.

    Args:
        seed: Value to digest, as ``bytes`` or ``str`` (``str`` is encoded as
            UTF-8). The default is the article's redacted placeholder, not a
            working value.

    Returns:
        The scrambled hex digest.
    """
    if isinstance(seed, str):
        seed = seed.encode("utf-8")
    return char_run(hashlib.md5(seed).hexdigest())
```
接着, 我们完善一下我们的爬虫代码
```python
def run(url):
    """Fetch ``url`` with the ``antipas`` cookie attached and print the raw body.

    Args:
        url: Address of the page to download.
    """
    headers = {
        "Host": "www.guazi.com",
        "Referer": "https://www.guazi.com/jn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    }
    # Only the name=value pair belongs in a request; `expires` and `path` are
    # Set-Cookie attributes and would be sent as two extra cookies.
    cookies = {"antipas": anti()}
    # Bound the wait so a stalled connection cannot hang the script forever.
    resp = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    print(resp.content)
```
这样就可以正常获取到内容了.
# 总结
相对而言, 这个反爬的机制还是比较简单的, 就是在检测是否有`cookie`, 如果有的话, 就返回正常的数据, 没有的话, 先添加到`cookie`中, 然后再跳转到正常数据, 因为这个代码几乎算是没有混淆, 没有反调试, 因此分析起来相对来说是比较容易的.
# 声明
> 本文仅限用于学习和研究目的, 不得将上述内容用于商业或者非法用途, 否则, 一切后果请用户自负, 与本文作者无关. 属在本网站发表的文章,版权归原作者所有, 转载请注明出处. cenoser795 发表于 2020-2-18 09:36
用chromedriver直接搞就行也可以吧
嗯嗯 可以 我主要想分析一下, 具体它的cookie是怎么算出来的, 没打算实际爬它的数据:lol 感谢楼主分享,最近在研究Python,很值得学习! 膜拜大神 学习学习,最近学python 非常经典的栗子,收藏学习。 学习力 膜拜大神 学习了,感谢分享! 用chromedriver直接搞就行也可以吧 感谢分享