电影爬取下载

fAp · 发表于 2021-1-21 16:53

本帖最后由 fAp 于 2021-1-21 17:10 编辑

从百度搜索电影,然后通过在线解析(https://jx.618g.com)获得解析后的url,再进行下载
但是有的电影会跳到另外一个解析网站(https://jx.147g.cc/)这个网站有robots就不提供下载,直接在线观看
爱奇艺、腾讯视频、优酷、PPTV、芒果亲测都可以在线观看,下载就不一定了

[Python] 纯文本查看 复制代码

import asyncio
import base64
import os
import re
import shutil
import time
import webbrowser
from urllib.parse import quote, unquote, urljoin

import aiohttp
import requests
from Crypto.Cipher import AES
from lxml import etree


class FilmDownloader:
    """Find a film through Baidu, resolve it with an online parser and download it.

    Workflow (see ``FindFilmAndDownload``): search Baidu for a result that
    points at one of the supported video sites, prefix the resolved page URL
    with the parser site, read the HLS playlist behind the parser's player
    iframe, download (and decrypt, when the playlist declares a key) every
    segment into ``downDir`` and finally concatenate the segments into
    ``<name>.mp4`` in the current working directory.
    """

    # Seconds any single blocking HTTP request may take before it is abandoned.
    REQUEST_TIMEOUT = 15
    # How many times one segment download is attempted before giving up.
    MAX_RETRIES = 5
    # Upper bound on segment downloads running at the same time.
    MAX_CONCURRENCY = 16

    def __init__(self):
        """Set up endpoints and per-film state, and start with an empty temp dir."""
        # Search endpoint; the film name is appended as the query value.
        self.searchUrl = 'https://www.baidu.com/s?wd='
        # Online parser endpoints; the video page URL is appended.
        self.parseUrl = 'https://jx.618g.com/?url='
        self.parseUrl_147 = 'https://jx.147g.cc/?url='
        # Legacy download host. Playlist entries are now resolved against the
        # playlist's own URL, so this is kept only for backward compatibility.
        self.downloadHead = 'https://video.dious.cc'
        # User-Agent sent with every request.
        self.userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
        # Directory for temporary segment files (always ends with a separator).
        self.downDir = os.path.join(os.getcwd(), 'temp') + os.sep
        # Site names accepted in search results:
        # iQiyi, Tencent Video, Youku, PP Video, Mango TV.
        self.webList = ['爱奇艺', '腾讯视频', '优酷', 'PP视频', '芒果TV']
        # Film name.
        self.name = ''
        # URL for watching online (parser URL + video page URL).
        self.onlineUrl = ''
        # Search hit first, later the full parser URL of the film.
        self.finalUrl = ''
        # Elements returned by the most recent search XPath query.
        self.searchResultList = []
        # URL of the media playlist that lists every segment.
        self.allListUrl = ''
        # URL of the most recently fetched decryption key.
        self.keyUrl = ''
        # Text form of the most recently fetched decryption key.
        self.key = ''
        # One {'index': int, 'url': str} dict per segment.
        self.allList = []
        # Legacy cipher attribute, kept for backward compatibility. Segments
        # are decrypted with a fresh cipher each, because a CBC cipher object
        # is stateful and cannot be shared between segments.
        self.aes = AES.new(b'0000000000000000', AES.MODE_CBC)
        # Number of segments to download.
        self.total = 0
        # Number of segments downloaded so far.
        self.cur = 0
        # URL of the playlist found in the parser page's player iframe.
        self.indexUrl = ''
        # Segment index -> (key bytes or None, IV bytes or None).
        self._segmentCrypto = {}
        # Concurrency limiter; only set while DownloadFilm is running.
        self._semaphore = None

        self._resetDownDir()

    def _resetDownDir(self):
        """Delete the temp directory (if any) and recreate it empty."""
        if os.path.isdir(self.downDir):
            shutil.rmtree(self.downDir, ignore_errors=True)
            # The original code paused here after deleting; presumably this
            # gives Windows time to finish the removal before recreating.
            time.sleep(0.2)
        os.makedirs(self.downDir, exist_ok=True)

    def _get(self, url, head, strict=False):
        """GET ``url`` and return the response, or None on any request error.

        With ``strict`` set, an HTTP error status also counts as a failure.
        """
        try:
            res = requests.get(url, headers=head, timeout=self.REQUEST_TIMEOUT)
            if strict:
                res.raise_for_status()
        except requests.RequestException:
            return None
        res.encoding = 'utf-8'
        return res

    @staticmethod
    def _parseHtml(text):
        """Parse HTML text; return the root element or None if unparsable."""
        try:
            return etree.HTML(text)
        except (etree.LxmlError, ValueError):
            return None

    @staticmethod
    def _parseMediaPlaylist(text, baseUrl):
        """Extract the segments of an HLS media playlist.

        Returns a list of dicts with keys ``url`` (absolute segment URL),
        ``keyUrl`` (absolute key URL, '' when the segment is not encrypted)
        and ``iv`` (16 bytes, or None when not encrypted). Relative URIs are
        resolved against ``baseUrl``. When the key tag carries no IV, the
        segment's media sequence number is used, as the HLS spec prescribes.
        """
        segments = []
        keyUrl = ''
        explicitIv = None
        sequence = 0
        for raw in text.splitlines():
            line = raw.strip()
            if not line:
                continue
            if line.startswith('#EXT-X-MEDIA-SEQUENCE:'):
                try:
                    sequence = int(line.split(':', 1)[1].strip())
                except ValueError:
                    sequence = 0
            elif line.startswith('#EXT-X-KEY:'):
                attrs = line.split(':', 1)[1]
                method = re.search(r'METHOD=([^,]+)', attrs)
                uri = re.search(r'URI="([^"]*)"', attrs)
                iv = re.search(r'IV=0[xX]([0-9a-fA-F]+)', attrs)
                unencrypted = (method is not None
                               and method.group(1).strip().upper() == 'NONE')
                if unencrypted or uri is None:
                    keyUrl, explicitIv = '', None
                else:
                    keyUrl = urljoin(baseUrl, uri.group(1))
                    explicitIv = None
                    if iv is not None:
                        # Normalize to exactly 128 bits.
                        explicitIv = bytes.fromhex(iv.group(1).zfill(32)[-32:])
            elif not line.startswith('#'):
                segmentIv = None
                if keyUrl:
                    segmentIv = explicitIv
                    if segmentIv is None:
                        number = sequence + len(segments)
                        segmentIv = number.to_bytes(16, 'big')
                segments.append({
                    'url': urljoin(baseUrl, line),
                    'keyUrl': keyUrl,
                    'iv': segmentIv
                })
        return segments

    @staticmethod
    def _stripPkcs7(data):
        """Remove a well-formed PKCS#7 padding tail; return other data as is."""
        if not data:
            return data
        pad = data[-1]
        if 1 <= pad <= 16 and len(data) >= pad and data.endswith(bytes([pad]) * pad):
            return data[:-pad]
        return data

    def _decryptSegment(self, data, key, iv):
        """Decrypt one segment with AES-CBC; pass it through when ``key`` is None.

        Raises ValueError (from the cipher) when ``data`` is not block-aligned.
        """
        if key is None:
            return data
        cipher = AES.new(key, AES.MODE_CBC, iv)
        return self._stripPkcs7(cipher.decrypt(data))

    @staticmethod
    def _safeFileName(name):
        """Replace characters that are not allowed in file names with '_'."""
        return re.sub(r'[\\/:*?"<>|]', '_', name)

    def _segmentPath(self, index):
        """Return the temp file path used for segment ``index``."""
        return self.downDir + '{:0>5d}.mp4'.format(index)

    @staticmethod
    def _mergeFiles(paths, target):
        """Concatenate the files in ``paths``, in order, into ``target``."""
        with open(target, 'wb') as merged:
            for path in paths:
                with open(path, 'rb') as part:
                    shutil.copyfileobj(part, merged)

    def isFileCanDownload(self) -> bool:
        """Return True when an online URL and a non-empty segment list exist."""
        return bool(self.onlineUrl) and bool(self.allList)

    def getOnlineUrl(self) -> str:
        """Return the online-viewing URL ('' when none has been resolved)."""
        return self.onlineUrl

    def SearchFilm(self, name: str) -> bool:
        """Search Baidu for ``name`` and store the first supported-site hit.

        Returns True when ``finalUrl`` was set, False for an empty name, a
        failed request or no matching result.
        """
        if not name:
            print('电影名不能为空...')
            return False

        self.name = name
        # Forget the hit of any previous search on this instance.
        self.finalUrl = ''
        head = {
            'Host': 'www.baidu.com',
            'User-Agent': self.userAgent
        }

        print('正在搜索 {} 资源...'.format(name))
        time.sleep(0.2)
        # Quote the name so characters like '&' or '#' stay part of the query.
        res = self._get(self.searchUrl + quote(name), head)
        if res is None:
            return False
        html = self._parseHtml(res.text)
        if html is None:
            return False

        # First choice: source links whose text is exactly a supported site.
        condition = '//a[@target="_blank"][@data-visited="off"][@class="dis-line-block c-gap-right dis-no-line c-blocka"]'
        self.searchResultList = html.xpath(condition)
        for item in self.searchResultList:
            if item.text in self.webList and item.get('href'):
                self.finalUrl = item.get('href')
                break

        # Fallback: any result title that mentions a supported site.
        if not self.finalUrl:
            self.searchResultList = html.xpath('//div//h3//a')
            for item in self.searchResultList:
                title = ''.join(item.itertext())
                if item.get('href') and any(site in title for site in self.webList):
                    self.finalUrl = item.get('href')
                    break

        return bool(self.finalUrl)

    def ParseFilmAndGetURL(self) -> bool:
        """Follow the search hit and build the parser URL for the film.

        Sets ``finalUrl`` and ``onlineUrl`` to the parser endpoint followed by
        the URL the search hit resolved to. Returns False when there is no
        search hit or the request fails.
        """
        if not self.finalUrl:
            return False

        head = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'User-Agent': self.userAgent
        }
        time.sleep(0.2)
        res = self._get(self.finalUrl, head)
        if res is None:
            return False
        # res.url is the address after redirects, i.e. the real video page.
        self.finalUrl = self.parseUrl + res.url
        self.onlineUrl = self.finalUrl

        return True

    def getIndexUrl(self) -> bool:
        """Locate the HLS playlist behind the parser page and list its segments.

        Fills ``indexUrl``, ``allListUrl``, ``allList`` and ``total``, and
        fetches the decryption key(s) the playlist refers to. Returns True
        when the segment list was built, and also when the film is only
        available through jx.147g.cc (online viewing only; ``allList`` then
        stays empty). Returns False on any failure.
        """
        if not self.finalUrl:
            return False

        head = {
            'User-Agent': self.userAgent
        }

        # Drop the results of any previous run on this instance.
        self.indexUrl = ''
        self.allListUrl = ''
        self.keyUrl = ''
        self.key = ''
        self.allList = []
        self._segmentCrypto = {}
        self.total = 0
        self.cur = 0

        time.sleep(0.2)
        response = self._get(self.finalUrl, head)
        if response is None:
            return False
        html = self._parseHtml(response.text)
        if html is None:
            return False
        nodes = html.xpath('//iframe[@id="player"]')
        if not nodes:
            return False

        for item in nodes:
            src = item.get('src', '')
            if 'm3u8' in src:
                # The playlist address follows 'url=' when that marker exists.
                marker = src.find('url=')
                target = src[marker + 4:] if marker != -1 else src
                if '://' not in target:
                    target = unquote(target)
                self.indexUrl = target
            elif 'jx.147g.cc' in src:
                self.indexUrl = src
                print('由于robots协议,本视频无法下载...')
                return True
            else:
                self.indexUrl = src
            if self.indexUrl:
                break

        if not self.indexUrl:
            return False
        # Resolve a relative or protocol-relative iframe address.
        self.indexUrl = urljoin(self.finalUrl, self.indexUrl)

        time.sleep(0.2)
        response = self._get(self.indexUrl, head, strict=True)
        if response is None:
            print('{} 没有下载资源...'.format(self.name))
            return False

        playlist = response.text
        self.allListUrl = response.url
        if '#EXTINF' not in playlist:
            # No segments here, so follow the first entry to the media playlist.
            variant = ''
            for raw in playlist.splitlines():
                line = raw.strip()
                if line and not line.startswith('#'):
                    variant = line
                    break
            if not variant:
                print('{} 没有下载资源...'.format(self.name))
                return False
            response = self._get(urljoin(response.url, variant), head, strict=True)
            if response is None:
                print('{} 没有下载资源...'.format(self.name))
                return False
            playlist = response.text
            self.allListUrl = response.url

        keyCache = {}
        segments = self._parseMediaPlaylist(playlist, self.allListUrl)
        for n, segment in enumerate(segments):
            keyBytes = None
            if segment['keyUrl']:
                if segment['keyUrl'] not in keyCache:
                    keyRes = self._get(segment['keyUrl'], head, strict=True)
                    # Use the raw bytes; decoding a binary key corrupts it.
                    if keyRes is None or len(keyRes.content) not in AES.key_size:
                        print('{} 没有下载资源...'.format(self.name))
                        self.allList = []
                        self._segmentCrypto = {}
                        return False
                    keyCache[segment['keyUrl']] = keyRes.content
                    self.keyUrl = segment['keyUrl']
                    self.key = keyRes.text
                keyBytes = keyCache[segment['keyUrl']]
            self.allList.append({
                'index': n,
                'url': segment['url']
            })
            self._segmentCrypto[n] = (keyBytes, segment['iv'])

        self.total = len(self.allList)
        return True

    async def _fetchSegment(self, index, url):
        """Download, decrypt and store one segment; True on success.

        Makes at most ``MAX_RETRIES`` attempts, pausing one second after each
        failed one.
        """
        # No fixed Host header: aiohttp derives the right one from the URL.
        head = {
            'Connection': 'keep-alive',
            'User-Agent': self.userAgent
        }
        keyBytes, iv = self._segmentCrypto.get(index, (None, None))
        filename = self._segmentPath(index)

        for _ in range(self.MAX_RETRIES):
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, headers=head) as response:
                        response.raise_for_status()
                        payload = await response.read()
                content = self._decryptSegment(payload, keyBytes, iv)
                with open(filename, 'wb') as file:
                    file.write(content)
            except (aiohttp.ClientError, asyncio.TimeoutError, ValueError, OSError):
                await asyncio.sleep(1)
                continue

            self.cur += 1
            percent = float(self.cur) / float(self.total or 1) * 100
            print('\r{} 下载中... {:.2f} %   {:d} / {:d}'.format(self.name, percent, self.cur, self.total), end='')
            return True

        return False

    async def crawler(self, index, url):
        """Download segment ``index`` from ``url`` into the temp directory.

        Returns True when the segment was written, False once all retries are
        used up. Honors the concurrency limit while DownloadFilm is running.
        """
        limiter = self._semaphore
        if limiter is None:
            return await self._fetchSegment(index, url)
        async with limiter:
            return await self._fetchSegment(index, url)

    async def _downloadAll(self):
        """Run ``crawler`` for every segment and return the list of results."""
        # Created inside the coroutine so it belongs to the running loop.
        self._semaphore = asyncio.Semaphore(self.MAX_CONCURRENCY)
        try:
            tasks = [self.crawler(item['index'], item['url']) for item in self.allList]
            return await asyncio.gather(*tasks)
        finally:
            self._semaphore = None

    def DownloadFilm(self) -> bool:
        """Download all segments concurrently and merge them into one file.

        The result is written to ``<name>.mp4`` in the current working
        directory and the temp directory is emptied afterwards. Returns False
        when there is nothing to download or any segment could not be fetched.
        """
        if not self.allList:
            return False

        print('{} 正在下载...'.format(self.name))
        self.cur = 0
        self.total = len(self.allList)
        # A private loop keeps this method callable more than once.
        loop = asyncio.new_event_loop()
        try:
            results = loop.run_until_complete(self._downloadAll())
        finally:
            loop.close()

        if not all(results):
            # Finish the '\r' progress line; the caller reports the failure.
            print()
            return False

        print('\n下载完成,正在合并文件...')
        target = os.path.join(os.getcwd(), self._safeFileName(self.name) + '.mp4')
        parts = [self._segmentPath(item['index']) for item in self.allList]
        try:
            self._mergeFiles(parts, target)
        except OSError:
            return False
        self._resetDownDir()
        print('视频下载完成...')

        return True

    def FindFilmAndDownload(self, name: str) -> bool:
        """Search for ``name``, then open it online or download it.

        Asks on the console whether to watch online; answering 'y' opens the
        online URL in the default browser instead of downloading. Returns
        True when the film was opened or downloaded, False otherwise.
        """
        if not self.SearchFilm(name):
            print('没有搜索到 {} 资源...'.format(name))
            return False
        if not self.ParseFilmAndGetURL():
            print('{} 资源解析失败...'.format(name))
            return False
        if not self.getIndexUrl():
            print('获取 {} 下载资源失败...'.format(name))
            return False

        choice = input('是否在线观看？在线观看则不下载视频！y/n\n')
        if choice.strip().lower() == 'y':
            # Not passed through a shell, so '&' in the URL is harmless.
            webbrowser.open(self.onlineUrl)
            return True
        if not self.allList:
            return False
        if not self.DownloadFilm():
            print('{} 下载失败...'.format(name))
            return False

        return True

        
if __name__ == "__main__":
    # Script entry point: print a separator, ask for a film title, then run
    # the complete search / parse / download pipeline for it.
    separator = '-' * 50
    print(separator)
    title = input('请输入电影名...\n')

    # The downloader is created only after the prompt, so its temp-directory
    # setup happens once the user has entered a title.
    downloader = FilmDownloader()
    downloader.FindFilmAndDownload(title)

fAp · 发表于 2021-1-21 17:08

补充一下:
如果下载时有部分链接请求失败,我就直接重新发起请求了,所以有的视频会卡在最后

fAp · 发表于 2021-1-21 23:46

如果不想下载,就只需要在线观看可以直接找到爱奇艺、腾讯视频、优酷、PPTV、芒果TV上的视频网址
然后前面加上(https://jx.618g.com/?url=)或(https://jx.147g.cc/?url=)就可以直接在线观看了

bsjasd · 发表于 2021-1-22 07:24

多谢楼主

shlboliqiao · 发表于 2021-1-22 12:38

谢谢分享，跑下代码试下

NJZddmm1030 · 发表于 2022-6-17 20:55

谢谢分享

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 电影爬取下载

免费评分