爬虫:人人影视的 JSON 嵌套层数很多,有没有好方法精准定位到 MP4 的磁力链接?
本帖最后由 d8349565 于 2020-11-14 11:53 编辑
如题,求指导!
只想要这里的名称和链接
from lxml import etree
import re
import requests
import jsonpath
UA伪装 = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}
def bianhao(keyword):
    """Search the site for *keyword* and map result titles to resource ids.

    Args:
        keyword: Search term sent to the site's search page.

    Returns:
        dict pairing each result title with the id taken from its link,
        in page order. Titles and links are paired positionally with zip,
        so the shorter of the two lists decides the length.
    """
    # Let requests build the query string so non-ASCII keywords are encoded.
    response = requests.get(
        url='http://www.rrys2020.com/search',
        params={'keyword': keyword},
        headers=UA伪装,
        timeout=10,
    ).text
    tree = etree.HTML(response)
    name = tree.xpath('//strong[@class="list_title"]//text()')
    a = tree.xpath('//div[@class="t f14"]//@href')
    # NOTE(review): the original expression on this line was lost when the
    # post was extracted. This assumes each href ends with the resource id
    # (e.g. ".../resource/12345") -- confirm against the live page.
    编号 = [i.rstrip('/').rsplit('/', 1)[-1] for i in a]
    输出 = dict(zip(name, 编号))
    return 输出
def daima(编号):
    """Fetch the resource index script for one resource id and return its code.

    Args:
        编号: A single resource id (not a list); it is formatted into the URL.

    Returns:
        The text after the last "=" of the chosen link, with escaped quotes
        (backslash + double quote) removed.

    Raises:
        ValueError: If the response contains no link carrying a "code=" query.
    """
    url = f'http://www.rrys2020.com/resource/index_json/rid/{编号}/channel/movie'
    response = requests.get(url=url, headers=UA伪装, timeout=10).text
    # The endpoint answers with a JS assignment; drop the prefix so only the
    # payload is handed to the HTML parser.
    response = response.replace('var index_info=', '')
    tree = etree.HTML(response)
    hrefs = tree.xpath('//a/@href')
    # xpath() returns a list, so a single href has to be selected before any
    # string method is used.
    # NOTE(review): the original subscripts were lost in extraction; the first
    # link containing "code=" is assumed to be the wanted one -- confirm.
    for href in hrefs:
        if 'code=' in href:
            return href.split('=')[-1].replace('\\"', '')
    raise ValueError(f'no link with a code= parameter found for resource {编号}')
def url_get(daima):
    """Print every magnet link found in the detail JSON for resource code *daima*.

    Args:
        daima: Resource code placed in the detail API's "code" query parameter.

    Returns:
        None. Matching addresses are printed, one per line.
    """
    url = f'http://got002.com/api/v1/static/resource/detail?code={daima}'
    response = requests.get(url=url, headers=UA伪装, timeout=10).json()
    # "$..address" collects every "address" value at any nesting depth.
    # jsonpath.jsonpath returns False (not an empty list) when nothing matches,
    # so fall back to [] to keep the loop safe.
    输出 = jsonpath.jsonpath(response, '$..address') or []
    for i in 输出:
        # Skip non-string values (e.g. null) instead of failing on the "in" test.
        if isinstance(i, str) and 'magnet:' in i:
            print(i)
# Demo run: search for one show, then print the magnet links of a result.
a = bianhao('黑袍纠察队')
print(a)
b = list(a.values())
if b:
    # daima() formats a single id into its URL, so pass one id, not the list.
    # NOTE(review): the subscript was presumably lost when the post was
    # extracted; the first search hit is assumed -- confirm.
    c = daima(b[0])
    url_get(c)
谢谢各位,我早上想到了好的方法了,代码分享如下:
from lxml import etree
import requests
UA伪装 = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}
def bianhao(keyword):
    """Search the site for *keyword* and map result titles to resource ids.

    Args:
        keyword: Search term sent to the site's search page.

    Returns:
        dict pairing each result title with the id taken from its link,
        in page order. Titles and links are paired positionally with zip,
        so the shorter of the two lists decides the length.
    """
    # Let requests build the query string so non-ASCII keywords are encoded.
    response = requests.get(
        url='http://www.rrys2020.com/search',
        params={'keyword': keyword},
        headers=UA伪装,
        timeout=10,
    ).text
    tree = etree.HTML(response)
    name = tree.xpath('//strong[@class="list_title"]//text()')
    a = tree.xpath('//div[@class="t f14"]//@href')
    # NOTE(review): the original expression on this line was lost when the
    # post was extracted. This assumes each href ends with the resource id
    # (e.g. ".../resource/12345") -- confirm against the live page.
    编号 = [i.rstrip('/').rsplit('/', 1)[-1] for i in a]
    输出 = dict(zip(name, 编号))
    return 输出
def daima(编号):
    """Fetch the resource index script for one resource id and return its code.

    Args:
        编号: A single resource id (not a list); it is formatted into the URL.

    Returns:
        The text after the last "=" of the chosen link, with escaped quotes
        (backslash + double quote) removed.

    Raises:
        ValueError: If the response contains no link carrying a "code=" query.
    """
    url = f'http://www.rrys2020.com/resource/index_json/rid/{编号}/channel/movie'
    response = requests.get(url=url, headers=UA伪装, timeout=10).text
    # The endpoint answers with a JS assignment; drop the prefix so only the
    # payload is handed to the HTML parser.
    response = response.replace('var index_info=', '')
    tree = etree.HTML(response)
    hrefs = tree.xpath('//a/@href')
    # xpath() returns a list, so a single href has to be selected before any
    # string method is used.
    # NOTE(review): the original subscripts were lost in extraction; the first
    # link containing "code=" is assumed to be the wanted one -- confirm.
    for href in hrefs:
        if 'code=' in href:
            return href.split('=')[-1].replace('\\"', '')
    raise ValueError(f'no link with a code= parameter found for resource {编号}')
def _magnet_address(files):
    """Return the first magnet link among an episode's file entries, or None."""
    if isinstance(files, dict):
        files = [files]
    for entry in files or []:
        addr = entry.get('address') if isinstance(entry, dict) else None
        if isinstance(addr, str) and 'magnet:' in addr:
            return addr
    return None


def url_get(daima, fmt='MP4'):
    """Print, season by season, episode names and magnet links for a resource.

    Args:
        daima: Resource code placed in the detail API's "code" query parameter.
        fmt: Key looked up under each season's "items" mapping. Defaults to
            'MP4'; the original author notes APP, HDTV, WEB-720P and
            WEB-1080P as alternatives.

    Returns:
        None. For every season this prints the season label, the list of
        episode names, a dashed rule, the list of addresses and a starred rule.
    """
    url = f'http://got002.com/api/v1/static/resource/detail?code={daima}'
    payload = requests.get(url=url, headers=UA伪装, timeout=10).json()
    # Path followed below: data -> list[] -> items -> <fmt>[] -> name / files.
    seasons = (payload.get('data') or {}).get('list') or []
    for season in seasons:
        print(season.get('season_cn'))
        # A season may not offer the requested format; treat that as empty.
        episodes = (season.get('items') or {}).get(fmt) or []
        name = [ep.get('name') for ep in episodes]
        # NOTE(review): the original subscripts on "files" were lost when the
        # post was extracted. "files" is presumed to be a list of entries that
        # each carry an "address"; the magnet entry is selected -- confirm.
        address = [_magnet_address(ep.get('files')) for ep in episodes]
        print(name)
        print('-' * 120)
        print(address)
        print('*' * 120)
if __name__ == '__main__':
    # Interactive run: list the search hits, let the user pick one by its
    # 1-based number, then print that resource's episodes and links.
    a = bianhao('无垠的太空')
    for 序号, i in enumerate(a, start=1):
        print(f'{序号}、{i}')
    num = int(input('请输入要搜索的序号'))
    b = list(a.values())
    # The menu is numbered from 1, so the chosen id sits at index num - 1.
    if not 1 <= num <= len(b):
        raise SystemExit(f'序号必须在 1 到 {len(b)} 之间')
    # daima() formats a single id into its URL, so pass the chosen id only.
    c = daima(b[num - 1])
    url_get(c)
用正则,例如:磁力","address":"(.*?)","passwd"
在 VS Code 或者 PyCharm 里都可以查看变量并生成 JSON 路径。
用 sed 命令匹配。
反正结构又不会变,就第一次复杂一点,后面用循环不就行了吗?
{:1_909:} 直接用 json 取不就行啦。
JSON 可以直接提取;如果是字符串,可以用正则。
如果能保证在页面中唯一,直接用 XPath 全局搜索定位。
用正则吧:magnet_text = re.findall('address:"magnet(.*?)"', text)
找到之后遍历,再和 "magnet" 拼接,应该就可以了吧。
用正则表达式和 for 循环吧。
页:
[1]
2