d8349565 发表于 2020-11-14 01:41

爬虫-人人影视JSON嵌套那么多层,有没有好方法精准定位?MP4的磁力

本帖最后由 d8349565 于 2020-11-14 11:53 编辑

如题,求指导!

只想要这里的名称和链接


from lxml import etree
import re
import requests
import jsonpath


UA伪装 = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}

def bianhao(keyword):
    """Search the site for *keyword* and map result titles to resource ids.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        dict pairing each result title with the id taken from the result
        link at the same position (titles and links are combined with
        ``zip``, so extra entries on either side are dropped). Empty when
        the response cannot be parsed.
    """
    url='http://www.rrys2020.com/search'
    # Let requests build the query string so special characters in the
    # keyword are percent-encoded; the timeout keeps a dead host from hanging.
    response = requests.get(url=url,params={'keyword': keyword},headers=UA伪装,timeout=10).text
    tree = etree.HTML(response)
    if tree is None:
        # etree.HTML yields None for an empty document.
        return {}
    name = tree.xpath('//strong[@class="list_title"]//text()')
    a=tree.xpath('//div[@class="t f14"]//@href')
    # NOTE(review): the body of this comprehension was lost when the snippet
    # was posted. This assumes each href ends with the resource id
    # (e.g. ".../<category>/<id>") -- confirm against the live page.
    编号=[i.rstrip('/').rsplit('/', 1)[-1] for i in a]
    输出=dict(zip(name,编号))
    return 输出

def daima(编号):
    """Fetch the resource index for one id and return its ``code`` value.

    Args:
        编号: A single resource id (not a list) interpolated into the
            ``index_json`` URL.

    Returns:
        The text following the first ``=`` of the first link found in the
        response, with escaped quotes (``\\"``) removed.

    Raises:
        ValueError: If the response contains no link, or the link carries
            no ``=``.
    """
    url=f'http://www.rrys2020.com/resource/index_json/rid/{编号}/channel/movie'
    response = requests.get(url=url,headers=UA伪装,timeout=10).text
    # Drop the JavaScript assignment prefix so only the payload is parsed.
    response = response.replace('var index_info=','')
    tree = etree.HTML(response)
    # xpath() returns a list of attribute values, never a single string.
    hrefs = tree.xpath('//a/@href') if tree is not None else []
    if not hrefs:
        raise ValueError(f'no link found for resource id {编号!r}')
    # NOTE(review): the original subscripts were lost when the snippet was
    # posted; this assumes the first link is the one carrying the code.
    _, sep, code = hrefs[0].partition('=')
    if not sep:
        raise ValueError(f'link for resource id {编号!r} has no "=": {hrefs[0]!r}')
    return code.replace('\\"', '')

def url_get(daima):
    """Print every magnet link found in the resource detail for *daima*.

    Args:
        daima: Resource code sent as the ``code`` query parameter.

    Returns:
        None. Each ``address`` value containing ``'magnet:'`` is printed.
    """
    url='http://got002.com/api/v1/static/resource/detail'
    response = requests.get(url=url, params={'code': daima}, headers=UA伪装, timeout=10).json()
    # jsonpath.jsonpath returns False (not an empty list) when nothing
    # matches, so fall back to an empty list before iterating.
    输出=jsonpath.jsonpath(response,'$..address') or []
    for i in 输出:
        # Skip non-string addresses (e.g. null) instead of raising TypeError.
        if isinstance(i, str) and 'magnet:' in i:
            print(i)


# Driver: search for a title, then print the magnet links of one result.
a=bianhao('黑袍纠察队')
print(a)
b=list(a.values())
if b:
    # daima() builds a URL from a single id, so pass one element rather than
    # the whole list.
    # NOTE(review): the original subscript was lost when the snippet was
    # posted; the first search result is assumed here.
    c=daima(b[0])
    url_get(c)

d8349565 发表于 2020-11-14 11:52

谢谢各位,我早上想到了好的方法了,代码分享如下:
from lxml import etree
import requests


UA伪装 = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'}

def bianhao(keyword):
    """Search the site for *keyword* and map result titles to resource ids.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        dict pairing each result title with the id taken from the result
        link at the same position (titles and links are combined with
        ``zip``, so extra entries on either side are dropped). Empty when
        the response cannot be parsed.
    """
    url='http://www.rrys2020.com/search'
    # Let requests build the query string so special characters in the
    # keyword are percent-encoded; the timeout keeps a dead host from hanging.
    response = requests.get(url=url,params={'keyword': keyword},headers=UA伪装,timeout=10).text
    tree = etree.HTML(response)
    if tree is None:
        # etree.HTML yields None for an empty document.
        return {}
    name = tree.xpath('//strong[@class="list_title"]//text()')
    a=tree.xpath('//div[@class="t f14"]//@href')
    # NOTE(review): the body of this comprehension was lost when the snippet
    # was posted. This assumes each href ends with the resource id
    # (e.g. ".../<category>/<id>") -- confirm against the live page.
    编号=[i.rstrip('/').rsplit('/', 1)[-1] for i in a]
    输出=dict(zip(name,编号))
    return 输出

def daima(编号):
    """Fetch the resource index for one id and return its ``code`` value.

    Args:
        编号: A single resource id (not a list) interpolated into the
            ``index_json`` URL.

    Returns:
        The text following the first ``=`` of the first link found in the
        response, with escaped quotes (``\\"``) removed.

    Raises:
        ValueError: If the response contains no link, or the link carries
            no ``=``.
    """
    url=f'http://www.rrys2020.com/resource/index_json/rid/{编号}/channel/movie'
    response = requests.get(url=url,headers=UA伪装,timeout=10).text
    # Drop the JavaScript assignment prefix so only the payload is parsed.
    response = response.replace('var index_info=','')
    tree = etree.HTML(response)
    # xpath() returns a list of attribute values, never a single string.
    hrefs = tree.xpath('//a/@href') if tree is not None else []
    if not hrefs:
        raise ValueError(f'no link found for resource id {编号!r}')
    # NOTE(review): the original subscripts were lost when the snippet was
    # posted; this assumes the first link is the one carrying the code.
    _, sep, code = hrefs[0].partition('=')
    if not sep:
        raise ValueError(f'link for resource id {编号!r} has no "=": {hrefs[0]!r}')
    return code.replace('\\"', '')

def _episode_address(files):
    """Return the download address stored in one episode's ``files`` value.

    A dict is read directly via its ``address`` key. For a list, the first
    entry whose address contains ``'magnet:'`` is returned. None when
    nothing usable is found.
    """
    if isinstance(files, dict):
        return files.get('address')
    # NOTE(review): the subscript used on ``files`` was lost when the snippet
    # was posted; picking the magnet entry is an assumption based on the
    # thread's stated goal -- confirm against a real response.
    for entry in files or []:
        addr = entry.get('address') if isinstance(entry, dict) else None
        if isinstance(addr, str) and 'magnet:' in addr:
            return addr
    return None


def url_get(daima):
    """Print, season by season, the MP4 episode names and their addresses.

    Reads ``data.list`` from the detail response; for every season it prints
    ``season_cn``, the list of ``items.MP4[*].name``, a dashed separator, the
    list of addresses from ``items.MP4[*].files``, and a starred separator.

    Args:
        daima: Resource code sent as the ``code`` query parameter.

    Returns:
        None.
    """
    url='http://got002.com/api/v1/static/resource/detail'
    payload = requests.get(url=url, params={'code': daima}, headers=UA伪装, timeout=10).json()
    # Tolerate a missing "data" or "list" key instead of raising on None.
    season = (payload.get('data') or {}).get('list') or []
    for 当前季 in season:
        第几季 = 当前季.get('season_cn')
        print(第几季)
        # 'MP4' may be replaced with APP, HDTV, WEB-720P or WEB-1080P.
        集 = (当前季.get('items') or {}).get('MP4') or []
        name = [单集.get('name') for 单集 in 集]
        address = [_episode_address(单集.get('files')) for 单集 in 集]

        print(name)
        print('-' * 120)
        print(address)
        print('*'*120)

if __name__ == '__main__':
    # List the search results with 1-based numbers, let the user pick one,
    # then print the download data for that choice.
    a=bianhao('无垠的太空')
    for 序号, i in enumerate(a, start=1):
        print(f'{序号}、{i}')

    num=int(input('请输入要搜索的序号'))
    b=list(a.values())
    if not 1 <= num <= len(b):
        raise SystemExit(f'invalid selection: {num} (expected 1-{len(b)})')
    # The displayed numbering is 1-based; daima() takes exactly one id.
    c=daima(b[num-1])
    url_get(c)

莫丶莫欺少年穷 发表于 2020-11-14 03:56

用正则,磁力","address":"(.*?)","passwd"

chen4321 发表于 2020-11-14 07:10

在vscode或者pycharm里都可以查看变量生成json路径

Eaglecad 发表于 2020-11-14 08:13

用sed命令,匹配

kidneyissource 发表于 2020-11-14 08:38

反正结构又不会变,就第一次复杂一点,后面用循环不就行了吗

E飞翔 发表于 2020-11-14 08:52

{:1_909:}直接json取不就行啦。

Hangjau 发表于 2020-11-14 09:07

JSON 可以直接提取;如果是字符串,可以用正则;如果能保证在页面中唯一,直接用 XPath 全局搜索定位。

oudaidai 发表于 2020-11-14 09:13

用正则吧

super谦 发表于 2020-11-14 09:25

magnet_text =re.findall('address:"magnet(.*?)"', text)
找到,然后遍历和magnet拼接,应该就可以了吧

realgreenhand 发表于 2020-11-14 10:09

正则表达式和for循环吧
页: [1] 2
查看完整版本: 爬虫-人人影视JSON嵌套那么多层,有没有好方法精准定位?MP4的磁力