大噶好,我是过儿,初次来到论坛,作为小白分享一下没事胡乱捣鼓的东西:Python实现爬取www.66s.cc网站的电影资源
代码如下:
www.66s.cc
[Python] 纯文本查看 复制代码 import requests
from lxml import etree
import re
import sys
from gogogo import model
# Request headers sent with every scrape: a session cookie captured from the
# author's browser plus a desktop Chrome user-agent string.
# NOTE(review): the hard-coded Cookie is a point-in-time value and will
# expire — confirm the site still responds correctly without refreshing it.
headers = {
"Cookie": "UM_distinctid=168da7d834b51f-07e1b88232dad6-49423f1f-144000-168da7d834c900; CNZZDATA1273922090=1881870440-1549849873-https%253A%252F%252Fwww.66s.cc%252F%7C1549865717; tinmkcheckplkey=1549871167%2Cd7f8997e266f5fc265bacd8b3a9dfa1c%2Cfbd0b9d6b9ae7c4e83056665281a6f3f; CNZZDATA1273606887=1885359854-1549850294-%7C1549867557",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.6.0.18627",
}
def index(url, num):
    """Return the list of detail-page links for the chosen category.

    Parameters:
        url: site root entered by the user, e.g. "https://www.66s.cc/".
        num: menu choice — 1 = homepage picks, 2 = movies, 3 = TV series.

    Returns:
        The href list extracted by ``model``.

    Exits the program (sys.exit) on any other choice, as the original did.
    """
    # Each menu choice maps to (url suffix, xpath extracting the links).
    # This replaces the original copy-pasted if/elif branches.
    routes = {
        1: ("", "//div[@class='post_hover']/div[@class='thumbnail']/a/@href"),
        2: ("qian50m.html", "//div[@id='tab-content']/ul[@class='']/li/a/@href"),
        3: ("qian50m.html", "//div[@id='tab-content']/ul[@class='hide'][1]/li/a/@href"),
    }
    try:
        suffix, xpath = routes[num]
    except KeyError:
        print("输入有误,程序退出!")
        sys.exit()
    return model(url + suffix, xpath)
def movie(url):
    """Fetch one detail page and return (movie_name, play_page_url_or_None).

    The play-page URL is the first link in the second
    ``div.widget.box.row`` block; ``None`` when the page has no such link.
    """
    response = requests.get(url=url, headers=headers)
    text = response.content.decode("utf-8")
    html = etree.HTML(text)
    # The page carries more than one <h1>; index 1 holds the title.
    movie_name = html.xpath("//h1/text()")[1]
    try:
        movie_urls = html.xpath("//div[@class='widget box row'][2]/a/@href")[0]
    except IndexError:
        # Was a bare `except:` — only a missing link (empty xpath result)
        # should fall back to None, not every possible error.
        return movie_name, None
    return movie_name, movie_urls
def m3u8(name, url):
    """Fetch the play page and return (name, player_url).

    The player URL is scraped from the inline JS config, which embeds it
    as ``a:'...',``.  Raises IndexError when no such pattern is found
    (same as the original).
    """
    response = requests.get(url=url, headers=headers)
    text = response.content.decode("utf-8")
    # Renamed from `m3u8` — the original local shadowed the function itself;
    # also dropped the no-op `name = name`.
    player_url = re.findall(r"a:'(.*?)',", text, re.S)[0]
    return name, player_url
def mp4(name, url):
    """Resolve the final .m3u8 stream URL and return it as {name: stream_url}.

    Scrapes the stream id from a path of the form ``/ppvod/<id>.m3u8`` in
    the player page and rebuilds an absolute URL on the CDN host.
    Raises IndexError when the pattern is absent (same as the original).
    """
    response = requests.get(url=url, headers=headers)
    text = response.text
    stream_id = re.findall(r"/ppvod/(.*?).m3u8", text, re.S)[0]
    stream_url = "https://vip2.pp63.org/ppvod/" + stream_id + ".m3u8"
    print("获取成功: " + name + " ==> " + "爬取完毕,已保存至运行目录!" + "\n")
    # The original built a local named `dict`, shadowing the builtin, and
    # had no-op `url = url` / `name = name` assignments — all removed.
    return {name: stream_url}
if __name__ == '__main__':
    url = input("请输入网址:")
    # Single print; the original streamed the menu character by character,
    # which produced identical output with far more calls.
    print("1.首页推荐" + "\n" + "2.电影" + "\n" + "3.电视剧" + "\n" + "请选择:", end="")
    num = int(input())
    # Resolve the output file once instead of re-testing `num` per movie.
    # An invalid `num` never reaches this dict: index() exits the program.
    out_file = {1: "首页推荐.txt", 2: "电影.txt", 3: "电视剧.txt"}.get(num)
    for page_url in index(url, num):  # renamed: the original reused `url`
        # Fetch each page exactly once — the original called movie() and
        # m3u8() twice apiece, doubling every network request.
        movie_name, movie_url = movie(page_url)
        if movie_url is None:
            continue
        m3u8_name, m3u8_url = m3u8(name=movie_name, url=movie_url)
        for name, stream in mp4(m3u8_name, m3u8_url).items():
            with open(out_file, "a") as f:
                # The original's .encode("gbk").decode("gbk") round-trip was
                # a no-op that would crash on any non-GBK character; write
                # the string directly.
                f.write(name + " ===> " + "https://www.115z.com/video?url=" + stream + "\n\n")
gogogo.py
[Python] 纯文本查看 复制代码 import requests
from lxml import etree
# Same request headers as the main script: captured browser cookie plus a
# desktop Chrome user-agent.
# NOTE(review): duplicated verbatim from the main module — consider keeping
# a single copy here and importing it; also the Cookie value will expire.
headers = {
"Cookie": "UM_distinctid=168da7d834b51f-07e1b88232dad6-49423f1f-144000-168da7d834c900; CNZZDATA1273922090=1881870440-1549849873-https%253A%252F%252Fwww.66s.cc%252F%7C1549865717; tinmkcheckplkey=1549871167%2Cd7f8997e266f5fc265bacd8b3a9dfa1c%2Cfbd0b9d6b9ae7c4e83056665281a6f3f; CNZZDATA1273606887=1885359854-1549850294-%7C1549867557",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.6.0.18627",
}
def model(url, xpath):
    """Download *url* and return the list of results matched by *xpath*."""
    page = requests.get(url=url, headers=headers)
    document = etree.HTML(page.content.decode("utf-8"))
    return document.xpath(xpath)
因为爬取出来的视频是.m3u8格式,另外附一份自己写的视频合并代码,初学者初次尝试,比较简陋,不喜勿喷!
[Python] 纯文本查看 复制代码 import re
import urllib.request
import os
# def myReportHook(count, blockSize, totalSize):
# # print(count, blockSize, totalSize)
if __name__ == '__main__':
    with open("cl9SheFa.m3u8", 'r') as f:
        text = f.read()
    # Absolutize the segment paths (they start with "/") and strip the
    # #EXTINF metadata lines, leaving one segment URL per line.
    parse_text = re.sub(r"\n/", "\nhttps://vip2.pp63.org/", text)
    html = re.sub(r"#.*,\n", "", parse_text)
    # Skip the playlist header (first 4 lines) and the trailing end tags.
    urls = html.split("\n")[4:-2]
    print("共有" + str(len(urls)) + "个ts文件,开始下载!\n\n")
    for index, url in enumerate(urls):
        try:
            urllib.request.urlretrieve(url, str(index + 1) + ".ts")
            print("第" + str(index + 1) + "下载完成!")
        except OSError:
            # Was a bare `except:` that also swallowed Ctrl-C; URLError is
            # a subclass of OSError, so network failures are still caught.
            print("\n\n你没网了,请检查网络!" + " 第" + str(index + 1) + "下载失败!\n")
    # BUG FIX: `copy /b *.ts` concatenates in lexical order, so 10.ts comes
    # before 2.ts and the merged movie is scrambled beyond 9 segments.
    # Build an explicit, numerically ordered file list instead.
    # (Also removed the leftover debug loop that printed every index.)
    ordered = "+".join(str(i + 1) + ".ts" for i in range(len(urls)))
    if ordered:
        os.system("copy /b " + ordered + " 电影.mp4")
        os.system('del /Q *.ts')
附加提示:本次爬虫调用了“115资源网”的视频播放接口,如有侵权,请管理员删除帖子。
|