Selenium 实现 91 视频搜索下载:改进为线程池(聊胜于无)
# Forum note: post last edited by lihu5841314 on 2021-6-6 21:56.
import os
import re
import time
from multiprocessing.dummy import Pool
from urllib.parse import urljoin

import requests
from selenium import webdriver
# Configure a headless Chrome session. The browser is only used to run the
# site's search and reach the playback page; downloads use plain HTTP later.
chrome_options = webdriver.ChromeOptions()
# Chrome overrides its user agent through the ``--user-agent=<value>`` switch;
# a bare "User-Agent: ..." argument is not a switch and has no such effect.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
)
# Run without a visible window (needed on Linux hosts without a display).
chrome_options.add_argument('--headless')
# Disable Chrome's sandbox.
chrome_options.add_argument('--no-sandbox')
# Exclude the 'enable-automation' switch; per the original author's note this
# keeps the page-visible ``webdriver`` property at its normal value.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
driver = webdriver.Chrome(options=chrome_options)
url = "https://www.91kanju.com/"


def _js_click(xpath):
    """Click the first element matching *xpath* through JavaScript."""
    element = driver.find_element("xpath", xpath)
    # Extra execute_script arguments reach the script as the ``arguments``
    # array, so the element to click is ``arguments[0]``.
    driver.execute_script('arguments[0].click();', element)


# NOTE(review): the absolute XPaths below contain no positional indices
# (e.g. ``div[2]``); they may have been lost when the post was extracted --
# verify them against the live page.
try:
    driver.get(url)
    time.sleep(1)
    name = input('请输入需要下载的电视剧电影名称:')
    # Type the title into the search box, then submit via the search button.
    driver.find_element("xpath", '//*[@id="ff-wd"]').send_keys(name)
    time.sleep(1)
    _js_click('//*[@id="header-top"]/div/div/div/div/div/form/button/i')
    time.sleep(1)
    # Open the first search result.
    _js_click('/html/body/div/div/div/div/div/div/ul/li/div/a')
    time.sleep(1)
    # Follow the link on the detail page that leads to the playback page.
    _js_click('/html/body/div/div/div/div/div/div/div/div/div/a')
    time.sleep(1)
    movie_url = driver.current_url
except BaseException:
    # Do not leave an orphaned headless Chrome behind when navigation fails
    # or the user interrupts the prompt.
    driver.quit()
    raise
# Browser-like headers sent with every plain HTTP request made by the script.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}


def page_response(url, timeout=30):
    """Fetch *url* with the shared browser headers.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            caller (and, in the thread pool, a worker) indefinitely.

    Returns:
        The ``requests.Response``. Its ``encoding`` is set to the detected
        ``apparent_encoding`` so that ``.text`` decodes with it; ``.content``
        is unaffected.
    """
    rep = requests.get(url=url, headers=headers, timeout=timeout)
    rep.encoding = rep.apparent_encoding
    return rep
# Folder that receives the downloaded .ts segments.
os.makedirs('./movie', exist_ok=True)

# The browser was only needed to discover ``movie_url``; shut it down before
# any further network work so a failed request cannot leave Chrome running.
driver.quit()
rep = page_response(movie_url)

# The playback page embeds the playlist address as ``url: '<address>',``.
obj = re.compile(r"url: '(?P<url>.*?)',", re.S)
match = obj.search(rep.text)
if match is None:
    raise SystemExit("未能在播放页中找到 m3u8 地址")
m3u8_url = match.group("url")

# Download the m3u8 playlist and keep a local copy.
resp2 = requests.get(m3u8_url, headers=headers, timeout=30)
resp2.raise_for_status()
with open("video.m3u8", mode="wb") as f:
    f.write(resp2.content)

# Parse the playlist: every non-blank line that is not a '#' tag/comment is a
# segment URI.
with open("video.m3u8", mode="r", encoding="utf-8") as f:
    playlist_lines = [line.strip() for line in f]
segment_uris = [
    line for line in playlist_lines if line and not line.startswith("#")
]
# Segments are numbered from 1 in playlist order. Relative URIs are resolved
# against the playlist URL; absolute ones pass through urljoin unchanged.
urls = [
    {"name": str(index), "url": urljoin(m3u8_url, uri)}
    for index, uri in enumerate(segment_uris, start=1)
]
def page_down_data(dic):
    """Download one video segment and save it as ``./movie/<name>.ts``.

    Args:
        dic: Mapping with the keys ``"name"`` (file name without extension)
            and ``"url"`` (address of the segment).
    """
    path = "./movie/" + dic["name"] + ".ts"
    print(path, "正在下载......")
    data = page_response(dic["url"])
    # Segments are binary, so write the raw bytes, not the decoded text.
    with open(path, mode="wb") as pf:
        pf.write(data.content)
    print(dic["name"], "下载完成")
# Fetch the segments with 8 worker threads (multiprocessing.dummy.Pool is a
# thread pool). page_down_data returns nothing, so the map result is not kept.
pool = Pool(8)
try:
    pool.map(page_down_data, urls)
finally:
    # Always stop accepting work and wait for the workers, even when a
    # download raised, so no threads are left dangling.
    pool.close()
    pool.join()
# Second listing from the post: same search/navigation flow, followed by a
# sequential (one segment at a time) download.
import os
import re
import time
from multiprocessing.dummy import Pool

import requests
from selenium import webdriver

# Configure a headless Chrome session. The browser is only used to run the
# site's search and reach the playback page; downloads use plain HTTP later.
chrome_options = webdriver.ChromeOptions()
# Chrome overrides its user agent through the ``--user-agent=<value>`` switch;
# a bare "User-Agent: ..." argument is not a switch and has no such effect.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
)
# Run without a visible window (needed on Linux hosts without a display).
chrome_options.add_argument('--headless')
# Disable Chrome's sandbox.
chrome_options.add_argument('--no-sandbox')
# Exclude the 'enable-automation' switch; per the original author's note this
# keeps the page-visible ``webdriver`` property at its normal value.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
driver = webdriver.Chrome(options=chrome_options)
url = "https://www.91kanju.com/"


def _js_click(xpath):
    """Click the first element matching *xpath* through JavaScript."""
    element = driver.find_element("xpath", xpath)
    # Extra execute_script arguments reach the script as the ``arguments``
    # array, so the element to click is ``arguments[0]``.
    driver.execute_script('arguments[0].click();', element)


# NOTE(review): the absolute XPaths below contain no positional indices
# (e.g. ``div[2]``); they may have been lost when the post was extracted --
# verify them against the live page.
try:
    driver.get(url)
    time.sleep(1)
    name = input('请输入需要下载的电视剧电影名称:')
    # Type the title into the search box, then submit via the search button.
    driver.find_element("xpath", '//*[@id="ff-wd"]').send_keys(name)
    time.sleep(1)
    _js_click('//*[@id="header-top"]/div/div/div/div/div/form/button/i')
    time.sleep(1)
    # Open the first search result.
    _js_click('/html/body/div/div/div/div/div/div/ul/li/div/a')
    time.sleep(1)
    # Follow the link on the detail page that leads to the playback page.
    _js_click('/html/body/div/div/div/div/div/div/div/div/div/a')
    time.sleep(1)
    movie_url = driver.current_url
except BaseException:
    # Do not leave an orphaned headless Chrome behind when navigation fails
    # or the user interrupts the prompt.
    driver.quit()
    raise
# Browser-like headers for plain HTTP requests. This listing used ``headers``
# without ever defining it, so it is defined here next to its only user.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}


def page_response(url, timeout=30):
    """Fetch *url* with the shared browser headers.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The ``requests.Response``. Its ``encoding`` is set to the detected
        ``apparent_encoding`` so that ``.text`` decodes with it; ``.content``
        is unaffected.
    """
    rep = requests.get(url=url, headers=headers, timeout=timeout)
    rep.encoding = rep.apparent_encoding
    return rep
# Imported here so this listing stays self-contained.
from urllib.parse import urljoin

# Folder that receives the downloaded .ts segments.
os.makedirs('./movie', exist_ok=True)

# The browser was only needed to discover ``movie_url``; shut it down before
# any further network work so a failed request cannot leave Chrome running.
driver.quit()
rep = page_response(movie_url)

# The playback page embeds the playlist address as ``url: '<address>',``.
obj = re.compile(r"url: '(?P<url>.*?)',", re.S)
match = obj.search(rep.text)
if match is None:
    raise SystemExit("未能在播放页中找到 m3u8 地址")
m3u8_url = match.group("url")

# Download the m3u8 playlist and keep a local copy.
resp2 = requests.get(m3u8_url, timeout=30)
resp2.raise_for_status()
with open("video.m3u8", mode="wb") as f:
    f.write(resp2.content)

# Walk the playlist and download the segments one after another.
cnt = 1
with open("video.m3u8", mode="r", encoding="utf-8") as f:
    for n in f:
        # Drop surrounding whitespace / the trailing newline.
        n = n.strip()
        # Blank lines and '#' tag/comment lines are not segment URIs.
        if not n or n.startswith("#"):
            continue
        # Relative URIs are resolved against the playlist URL; absolute ones
        # pass through urljoin unchanged.
        resp3 = page_response(urljoin(m3u8_url, n))
        # Save inside the ./movie folder (note the separator).
        path = './movie/' + str(cnt) + ".ts"
        with open(path, mode="wb") as pf:
            pf.write(resp3.content)
        print(cnt, "下载完成")
        cnt = cnt + 1
{:1_937:}
91大神?
原来不是我想的那个91
啊?不是我想象的91啊{:301_971:}
> mosou 发表于 2021-6-6 21:21
> 原来不是我想的那个91
懂得都懂{:1_886:}
> a4299110 发表于 2021-6-6 21:29
> 懂得都懂
那个91? 难道是书店找刘备的那个
> lihu5841314 发表于 2021-6-6 21:49
> 那个91? 难道是书店找刘备的那个
91短视频,你值得拥有!!!!!!!!!!{:1_899:}
我是看91进来的= = 是我想多了。。。
pyppeteer 异步,你值得拥有