本帖最后由 smallchen 于 2021-1-16 15:57 编辑
以下如有违规请删帖处理
第一次学Python时写的,网上随便找了个磁链开始爬取,不巧,还正好遇到个值得细细研究的
问题如下:
1、网站实际地址和post查询的地址不是同一个
2、POST的地址跟获取详情的地址也不是同一个
3、刚开始学Python还不会去重
代码如下
[Python] 纯文本查看 复制代码 import time
import requests
import json
from lxml import etree
def main():
    """Ask for the search and exclusion keywords, then start the crawl.

    Two lines are read from standard input: the keyword to search for and
    a comma-joined list of keywords to exclude. Each value is echoed back,
    then both are passed to ``bus12`` along with a browser-like
    ``User-Agent`` header.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3861.400 QQBrowser/10.7.4313.400'
    )

    keyword = input("请输入汇总关键字:")
    print('您输入的关键字为:', keyword)

    excluded = input("请输入需要排除的关键字,以逗号拼接:")
    print('您需要排除的关键字为:', excluded)

    bus12(name=keyword, no_has=excluded, headers={'User-Agent': user_agent})
def bus12(name, no_has, headers):
    """Search for ``name`` and append every magnet link found to a text file.

    The search API (``post_url``) is queried page by page. For each result
    whose title contains none of the excluded keywords, the detail page
    under ``base_url`` is fetched and the value of its ``mag-link`` input is
    read. Each distinct magnet link is appended as one
    ``title,size,magnet`` line to ``<name>-<YYYYmmddHHMMSS>.txt`` in the
    current directory (UTF-8 encoded).

    Args:
        name: Search keyword; also the prefix of the output file name.
        no_has: Keywords to exclude, joined by commas (ASCII ``,`` or
            full-width ``,``). Empty entries and surrounding whitespace
            are ignored. An empty string disables filtering.
        headers: HTTP headers sent with every request.

    Raises:
        requests.RequestException: If a search (POST) request fails.
            Failures of individual detail pages are reported and skipped.
    """
    base_url = 'http://www.bus12.xyz'
    post_url = 'https://dht.jbib.com/api/bt/ssbc'
    param = {
        'key': name,
        'from': 1,
        'type': 'all',
    }
    excluded = _split_keywords(no_has)
    out_path = name + '-' + time.strftime('%Y%m%d%H%M%S') + '.txt'
    seen_magnets = set()  # magnet links already written, for de-duplication
    already = 1

    page_res = requests.post(url=post_url, headers=headers, timeout=30, data=param).json()
    if page_res['code'] != 0:
        print('|========查询失败, code:', page_res['code'])
        return

    # The API wraps its payload as a JSON string inside the 'msg' field.
    msg = json.loads(page_res['msg'])
    page_count = msg['page']
    count = msg['sum']

    for page in range(1, page_count + 1):
        # Page 1 was already fetched above; only request the later pages.
        if page > 1:
            param['from'] = page
            page_res = requests.post(url=post_url, headers=headers, timeout=30, data=param).json()
            msg = json.loads(page_res['msg'])

        for data in msg['data']:
            id_key = data['id_IK']
            title = data['name_simple']
            print('(', already, '/', count, ')|========查询到:', title)
            already += 1

            hits = [keyword for keyword in excluded if keyword in title]
            for keyword in hits:
                print('|========存在关键字:', keyword)
            if hits:
                continue

            detail_url = base_url + '/Info?id=' + str(id_key)
            try:
                magnet_data = _fetch_magnet(detail_url, headers)
            except requests.RequestException as err:
                # One unreachable detail page must not abort the whole crawl.
                print('|========', title, '详情获取失败:', err)
                continue

            if magnet_data is None:
                # Detail page has no "box_line" container; nothing to save.
                continue
            if 'magnet' not in magnet_data:
                print('|========', title, '链接已屏蔽')
                continue
            if magnet_data in seen_magnets:
                continue
            seen_magnets.add(magnet_data)

            size = data.get('size', '未知')
            # Explicit UTF-8: titles often contain characters that the
            # platform default encoding (e.g. GBK on Windows) cannot encode.
            with open(out_path, 'a', encoding='utf-8') as f:
                f.write(','.join((title, str(size), magnet_data)) + '\n')


def _split_keywords(raw):
    """Split a comma-joined string into a list of non-empty, stripped keywords."""
    parts = raw.replace(',', ',').split(',')
    return [part.strip() for part in parts if part.strip()]


def _fetch_magnet(detail_url, headers):
    """Fetch a detail page and return the value of its ``mag-link`` input.

    Returns ``None`` when the page has no ``div.box_line`` container, and
    ``''`` when the container exists but holds no ``mag-link`` input.
    """
    detail_res = requests.get(url=detail_url, headers=headers, timeout=30).text
    html = etree.HTML(detail_res)
    if html is None:
        return None
    boxes = html.xpath('//div[@class="box_line"]')
    if not boxes:
        return None
    values = boxes[0].xpath('./input[@id="mag-link"]/@value')
    return values[0] if values else ''
# Script entry point: run the interactive crawl, then print a completion banner.
if __name__ == "__main__":
    main()
    print('|=================完成=================|')
|