四十九画 posted on 2020-9-24 15:51

Scraping Cantonese anime sites and pushing the streams to 逍遥一仙's downloader

Recently I was bored and wanted to revisit my childhood by watching some Cantonese-dubbed anime, so I wrote two crawlers.
A while back I saw that someone on the forum had already written the code for calling 逍遥一仙's downloader interface, so I took it and used it directly; I forget what the thread was called. My thanks to them first.


This is the first one, for the site where I did find how the anime episodes are named:
#coding=utf-8
import ast
import base64

import requests
from lxml import etree

headers_pc = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
}

def getPlayList(AcgUrl):
    # Fetch the anime detail page and collect every episode link and its label.
    request = requests.get(AcgUrl, headers=headers_pc)
    html = request.content.decode()
    el = etree.HTML(html)
    playList = el.xpath('//div[@class="panel-default"]/ul[@class="dslist-group clearfix"]/li/a/@href')
    setNumbers = el.xpath('//div[@class="panel-default"]/ul[@class="dslist-group clearfix"]/li/a/text()')
    titles = el.xpath('//div[@class="detail-info"]/div[@class="detail-title"]/h2/text()')
    title = titles[0].strip() if titles else ''
    url_prefix = "https://www.ktwz.cc"
    for (playurl, setNumber) in zip(playList, setNumbers):
        getM3u8Url(url_prefix + playurl, title + setNumber)


def getM3u8Url(playurl, setNumber):
    # The play page embeds the stream info as a JS object literal inside a <script> tag.
    request = requests.get(playurl, headers=headers_pc)
    html = request.content.decode()
    el = etree.HTML(html)
    scripts = el.xpath('//div[@class="player"]/script')
    urlJs = scripts[0].text.split('=', 1)          # drop the leading "var xxx" part
    urlDict = ast.literal_eval(urlJs[1])           # evaluate the dict literal
    m3u8Url = urlDict["url"].replace("\\/", "/")   # un-escape the JSON-style "\/" slashes
    print(m3u8Url)
    # setNumber = urlDict["nid"]
    posttom3u8('', setNumber, m3u8Url)

def posttom3u8(key, title, url):
    # Build a "title,url" line (with an optional "#KEY,..." header), GBK-encode it,
    # base64-encode that and POST it to the downloader's local HTTP interface.
    data = '#KEY,{0}\r\n{1},{2}'.format(key, title, url) if key else '{0},{1}'.format(title, url)
    print(data)
    try:
        response = requests.post('http://127.0.0.1:8787/',
                                 data={"data": base64.b64encode(data.encode('GBK')).decode()}).json()
        print('push succeeded') if response['message'] == 'success' else print('push failed')
    except Exception:
        print('push failed')

if __name__ == "__main__":
    getPlayList('https://www.ktwz.cc/detail/6307.html')   # detail-page URL of the anime to download
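
For reference, this is what the <script> parsing in getM3u8Url is doing. A minimal, self-contained sketch, assuming the play page embeds something roughly like the sample below (the variable name player_aaaa and the nid field are assumptions for illustration, not taken from the site):

import ast

# Hypothetical contents of the inline <script> on a play page
script_text = 'var player_aaaa={"url":"https:\\/\\/example.com\\/video\\/index.m3u8","nid":3}'

urlDict = ast.literal_eval(script_text.split('=', 1)[1])   # drop "var player_aaaa", evaluate the dict literal
m3u8Url = urlDict["url"].replace("\\/", "/")               # un-escape the JSON-style "\/" slashes
print(m3u8Url)   # https://example.com/video/index.m3u8

Since the embedded object is plain JSON, json.loads would also work and decodes the "\/" escapes by itself, so the replace step wouldn't be needed.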

四十九画 posted on 2020-9-24 15:58

I'm still working on a third anime site, halihali (http://halihali.li/acg/4111/), but unfortunately it has anti-debugging and nothing I've tried gets around it. If I drive it with selenium, the video never loads. I don't know front-end well enough to solve this; can anyone offer some advice?

四十九画 posted on 2020-9-24 15:53

I forgot to switch the script above to multithreading; if you need that, change it yourself.

This is the second one. I couldn't find a title naming scheme on the site, so the files are simply named by number:
#coding=utf-8
import base64
from queue import Queue
from threading import Thread

import requests
from lxml import etree

headers_pc = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
}
headers_mob = {
    "User-Agent": "Mozilla/5.0 (Linux; U; Android 1.6; ja-jp; generic Build/Donut) AppleWebKit/528.5+ (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1"
}

def getPlayList():
    # Fetch the episode list page (GBK-encoded) and queue every episode's
    # play-page URL together with its chapter label.
    request = requests.get('http://www.fsro.cn/movie_1503.html', headers=headers_pc)
    html = request.content.decode('gbk')
    el = etree.HTML(html)
    playQueue = Queue()
    playList = el.xpath('//div[@class="yddpplaylist"]/a/@href')
    chapterList = el.xpath('//div[@class="yddpplaylist"]/a/@title')
    for (playUrl, chapter) in zip(playList, chapterList):
        playDict = {
            "playUrl": "http://www.fsro.cn/" + playUrl,
            "chapterName": chapter
        }
        playQueue.put(playDict)
    return playQueue


def test():
    # Debug helper: parse a locally saved copy of the episode list page.
    htmlf = open("1.html", "r", encoding="gbk")
    html = htmlf.read()
    print(html)
    el = etree.HTML(html)
    playList = el.xpath('//div[@class="yddpplaylist"]/a/@href')
    chapterList = el.xpath('//div[@class="yddpplaylist"]/a/@title')


def getM3U8Url(playQueue):
    # Worker: pull episodes off the queue, read the m3u8 address from the
    # play page and push it to the downloader.
    while not playQueue.empty():
        item = playQueue.get()
        try:
            chapter = item['chapterName']
            if int(chapter) == 39:   # only episode 39 here; drop this check to grab everything
                url = item['playUrl']
                preq = requests.get(url, headers=headers_pc)
                html = preq.content.decode('gbk')
                el = etree.HTML(html)
                m3u8Url = el.xpath('//div[@id="movieplay"]/div/input/@playurl')
                if m3u8Url:
                    posttom3u8('', chapter, m3u8Url[0])
        finally:
            playQueue.task_done()    # required, otherwise playQueue.join() blocks forever

def posttom3u8(key, title, url):
    # Build a "title,url" line (with an optional "#KEY,..." header), GBK-encode it,
    # base64-encode that and POST it to the downloader's local HTTP interface.
    data = '#KEY,{0}\r\n{1},{2}'.format(key, title, url) if key else '{0},{1}'.format(title, url)
    print(data)
    try:
        response = requests.post('http://127.0.0.1:8787/',
                                 data={"data": base64.b64encode(data.encode('GBK')).decode()}).json()
        print('push succeeded') if response['message'] == 'success' else print('push failed')
    except Exception:
        print('push failed')

if __name__ == '__main__':
    # test()
    playQueue = getPlayList()
    for index in range(5):
        thread = Thread(target=getM3U8Url, args=(playQueue,))
        thread.daemon = True
        thread.start()
    playQueue.join()
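
Both scripts talk to the downloader the same way: posttom3u8 builds one "title,url" line per task (prefixed with a "#KEY,..." line when the stream needs a decryption key), GBK-encodes it, base64-encodes that, and POSTs it as the data form field to http://127.0.0.1:8787/. A minimal sketch of pushing a single task by hand, assuming the downloader is listening locally on that port as in the scripts (the title and m3u8 address below are made up):

import base64
import requests

title = 'Example Anime 01'                        # hypothetical episode name, used as the file name
m3u8 = 'https://example.com/video/index.m3u8'     # hypothetical m3u8 address
payload = '{0},{1}'.format(title, m3u8)           # the same "title,url" line posttom3u8 builds

resp = requests.post('http://127.0.0.1:8787/',
                     data={"data": base64.b64encode(payload.encode('GBK')).decode()})
print(resp.json())   # the scripts treat {"message": "success"} as a successful push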

四十九画 posted on 2020-9-24 16:00

It's probably anti-crawler detection: selenium carries identifying flags that get recognized. I've read online that you can use a proxy to rewrite all those flags, but that looks fairly fiddly and complicated, and honestly I'm too lazy for it.
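
For what it's worth, the flag usually meant here is navigator.webdriver (plus the "Chrome is being controlled by automated test software" infobar). A minimal sketch of hiding those with plain selenium and Chrome before reaching for a proxy; this is only the commonly suggested starting point, not something verified against halihali:

from selenium import webdriver

options = webdriver.ChromeOptions()
# drop the "enable-automation" switch and the automation extension Chrome normally adds
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(options=options)
# overwrite navigator.webdriver before any page script can read it
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})
driver.get("http://halihali.li/acg/4111/")

Sites can still fingerprint plenty of other things, so if this isn't enough, rewriting responses through a proxy or using a patched chromedriver would be the next step.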