粤语动漫网站爬取,推送到逍遥一仙的下载器下载
最近无聊想回顾下童年，看下粤语动漫，就写了两个爬虫。之前看到论坛有大佬写好了调用逍遥一仙下载器的接口，就直接拿过来用了，忘了叫啥名了，先感谢。
这是第一个,有找到了动漫章节命名规则的
#coding=utf-8
import base64
import json
import re
import requests
from lxml import etree
import ast
# Desktop (Chrome on macOS) User-Agent so the site serves its normal PC pages.
headers_pc = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
}
def getPlayList(AcgUrl):
    """Scrape the episode list from an anime detail page and push every
    episode's m3u8 URL to the local downloader.

    AcgUrl -- detail-page URL of the anime.  Falls back to the original
              hard-coded page when empty, so the existing ``getPlayList('')``
              call keeps working.
    """
    # BUG FIX: the original ignored AcgUrl entirely and always fetched the
    # hard-coded URL; now the parameter is honored, with the old URL as default.
    detail_url = AcgUrl or "https://www.ktwz.cc/detail/6307.html"
    request = requests.get(detail_url, headers=headers_pc)
    html = request.content.decode()
    el = etree.HTML(html)
    play_list = el.xpath('//div[@class="panel-default"]/ul[@class="dslist-group clearfix"]/li/a/@href')
    set_numbers = el.xpath('//div[@class="panel-default"]/ul[@class="dslist-group clearfix"]/li/a/text()')
    titles = el.xpath('//div[@class="detail-info"]/div[@class="detail-title"]/h2/text()')
    # BUG FIX: xpath() returns a list; the original concatenated the whole
    # list with a string (TypeError).  Use the first match (empty if none).
    title = titles[0] if titles else ''
    url_prefix = "https://www.ktwz.cc"
    for play_url, set_number in zip(play_list, set_numbers):
        getM3u8Url(url_prefix + play_url, title + set_number)
def getM3u8Url(playurl, setNumber):
    """Fetch one episode's player page, extract the m3u8 stream URL from the
    inline player <script>, and push it to the local downloader.

    playurl   -- absolute URL of the episode play page.
    setNumber -- display title passed through to the downloader.
    """
    request = requests.get(playurl, headers=headers_pc)
    html = request.content.decode()
    el = etree.HTML(html)
    # BUG FIX: xpath() returns a LIST of <script> elements; the original
    # called .text on the list itself, which raises AttributeError.
    scripts = el.xpath('//div[@class="player"]/script')
    if not scripts:
        return  # no player script found on this page
    # The script body is a JS assignment like `var player_xx={...}`; the part
    # after the first '=' is assumed to be a dict-shaped object literal that
    # literal_eval can parse -- TODO confirm against a live page.
    # BUG FIX: the original passed the whole split() LIST to literal_eval
    # (TypeError); split once and take the right-hand side instead.
    obj_literal = scripts[0].text.split('=', 1)[1].strip().rstrip(';')
    urlDict = ast.literal_eval(obj_literal)
    # The page JSON-escapes slashes ("\/"); unescape them.  The original's
    # "\/" literal only worked because Python leaves unknown escape sequences
    # intact (a DeprecationWarning); "\\/" is the explicit form.
    m3u8Url = urlDict["url"].replace("\\/", "/")
    print(m3u8Url)
    posttom3u8('', setNumber, m3u8Url)
def posttom3u8(key, title, url):
    """Push one (title, url) pair to the local m3u8 downloader service.

    key   -- optional decryption key; when given, a '#KEY,<key>' header line
             is prepended to the payload.
    title -- display name the downloader uses for the saved file.
    url   -- the m3u8 stream URL.

    Prints a success/failure message; never raises.
    """
    if key:
        data = '#KEY,{0}\r\n{1},{2}'.format(key, title, url)
    else:
        data = '{0},{1}'.format(title, url)
    print(data)
    try:
        # timeout added so a dead downloader service cannot hang the scraper.
        response = requests.post(
            'http://127.0.0.1:8787/',
            data={"data": base64.b64encode(data.encode('GBK')).decode()},
            timeout=10,
        ).json()
        print('推送成功' if response.get('message') == 'success' else '推送失败')
    except Exception:
        # BUG FIX: narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt / SystemExit.
        print('推送失败')
if __name__ == "__main__":
    getPlayList('')  # detail-page URL of the anime to download
还在写第三个动漫网站的爬虫——哈里哈里（http://halihali.li/acg/4111/），可惜遭遇反调试了，试过很多方法都不行；用 selenium 操作的话，打开视频也无法加载出来。不熟悉前端，没有解决办法，有没有大佬可以提供下意见？另外，上面那个忘记改多线程了，需要的可以自己改一下。
这是第二个,没有在网址里找到标题命名,直接用数字命名文件了
#coding=utf-8
from queue import Queue
from threading import Thread
import requests
from lxml import etree
import base64
# Desktop User-Agent for normal page requests.
headers_pc = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
}
# Mobile User-Agent — not referenced anywhere in this script; presumably kept
# for experimenting with the site's mobile pages (TODO confirm / remove).
headers_mob = {
    "User-Agent": "Mozilla/5.0 (Linux; U; Android 1.6; ja-jp; generic Build/Donut) AppleWebKit/528.5+ (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1"
}
def getPlayList():
    """Scrape every episode link from the hard-coded series page and return
    a Queue of {"playUrl", "chapterName"} dicts for the worker threads.
    """
    request = requests.get('http://www.fsro.cn/movie_1503.html', headers=headers_pc)
    # The site serves GBK-encoded pages.
    html = request.content.decode('gbk')
    el = etree.HTML(html)
    playQueue = Queue()
    playList = el.xpath('//div[@class="yddpplaylist"]/a/@href')
    chapterList = el.xpath('//div[@class="yddpplaylist"]/a/@title')
    # (removed the dead `playDict = {}` pre-initialization from the original)
    for playUrl, chapter in zip(playList, chapterList):
        # hrefs are site-relative; prepend the host to make them absolute.
        playQueue.put({
            "playUrl": "http://www.fsro.cn/" + playUrl,
            "chapterName": chapter,
        })
    return playQueue
def test():
    """Offline debug helper: parse a locally saved copy of the listing page
    (``1.html``) with the same XPaths used by getPlayList().

    NOTE(review): the extracted lists are never used — this looks like an
    unfinished scratch function kept for debugging.
    """
    # BUG FIX: use a context manager so the file handle is closed
    # (the original opened the file and never closed it).
    with open("1.html", "r", encoding="gbk") as htmlf:
        html = htmlf.read()
    print(html)
    el = etree.HTML(html)
    playQueue = Queue()
    playList = el.xpath('//div[@class="yddpplaylist"]/a/@href')
    chapterList = el.xpath('//div[@class="yddpplaylist"]/a/@title')
def getM3U8Url(playQueue):
    """Worker: drain playQueue, fetching the m3u8 URL for each queued episode
    and pushing it to the local downloader.

    NOTE(review): only chapter 39 is processed — this looks like a leftover
    resume/debug filter; kept as-is to preserve behavior.
    """
    while not playQueue.empty():
        # renamed from `dict`, which shadowed the builtin
        task = playQueue.get()
        try:
            chapter = task['chapterName']
            if int(chapter) == 39:
                preq = requests.get(task['playUrl'], headers=headers_pc)
                html = preq.content.decode('gbk')
                el = etree.HTML(html)
                # BUG FIX: xpath() returns a list; the original pushed the
                # whole Python list repr to the downloader.  Take the first
                # match and skip pages with no playurl attribute.
                urls = el.xpath('//div[@id="movieplay"]/div/input/@playurl')
                if urls:
                    posttom3u8('', chapter, urls[0])
        finally:
            # BUG FIX: without task_done() the playQueue.join() in __main__
            # blocks forever once the workers finish.
            playQueue.task_done()
def posttom3u8(key, title, url):
    """Push one (title, url) pair to the local m3u8 downloader service.

    key   -- optional decryption key; when given, a '#KEY,<key>' header line
             is prepended to the payload.
    title -- display name the downloader uses for the saved file.
    url   -- the m3u8 stream URL.

    Prints a success/failure message; never raises.
    """
    if key:
        data = '#KEY,{0}\r\n{1},{2}'.format(key, title, url)
    else:
        data = '{0},{1}'.format(title, url)
    print(data)
    try:
        # timeout added so a dead downloader service cannot hang a worker thread.
        response = requests.post(
            'http://127.0.0.1:8787/',
            data={"data": base64.b64encode(data.encode('GBK')).decode()},
            timeout=10,
        ).json()
        print('推送成功' if response.get('message') == 'success' else '推送失败')
    except Exception:
        # BUG FIX: narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt / SystemExit.
        print('推送失败')
if __name__ == '__main__':
    # test()  # offline debug path that parses a saved 1.html instead of the site
    playQueue = getPlayList()
    # Five daemon worker threads drain the queue concurrently; daemon=True
    # lets the process exit even if a worker gets stuck on a request.
    for index in range(5):
        thread = Thread(target=getM3U8Url, args=(playQueue,))
        thread.daemon = True
        thread.start()
    playQueue.join()

应该是反爬虫：使用 selenium 会被网站识别出自动化标识。上网查过，有说法是用代理把这些标识都改掉，但看起来有点麻烦和复杂，我可真是个咸鱼啊。
页:
[1]