lihu5841314 发表于 2021-6-6 20:58

selenium 实现91视频搜索下载 改进线程池 (聊胜于无)

本帖最后由 lihu5841314 于 2021-6-6 21:56 编辑

import os
import re
import time
from multiprocessing.dummy import Pool

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a headless Chrome through the site's search form to reach the player
# page of the first search result; the resulting address ends up in movie_url.
chrome_options = webdriver.ChromeOptions()
# Override the User-Agent reported by the browser. Chrome only honours this
# when it is passed as the --user-agent switch.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
)
# Run without a visible window; on Linux without a display Chrome fails to
# start unless this flag is set.
chrome_options.add_argument('--headless')
# Disable the sandbox (commonly required when running as root).
chrome_options.add_argument('--no-sandbox')
# Drop the "enable-automation" switch so the page sees a regular browser.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=chrome_options)

url = "https://www.91kanju.com/"

rep1 = driver.get(url)
time.sleep(1)
name = input('请输入需要下载的电视剧电影名称:')
# NOTE(review): the XPaths below carry no positional indexes; they may have
# been stripped when the code was posted - verify against the live page.
driver.find_element(By.XPATH, '//*[@id="ff-wd"]').send_keys(name)
time.sleep(1)
rep2 = driver.find_element(By.XPATH, '//*[@id="header-top"]/div/div/div/div/div/form/button/i')
# Click through JavaScript; the element is handed to the script as
# arguments[0].
driver.execute_script('arguments[0].click();', rep2)
time.sleep(1)
# First entry of the search results.
rep3 = driver.find_element(By.XPATH, '/html/body/div/div/div/div/div/div/ul/li/div/a')
driver.execute_script('arguments[0].click();', rep3)
time.sleep(1)
# Link on the detail page that leads to the player page.
rep4 = driver.find_element(By.XPATH, '/html/body/div/div/div/div/div/div/div/div/div/a')
driver.execute_script('arguments[0].click();', rep4)
time.sleep(1)
movie_url = driver.current_url


# Request headers sent with every page_response() call.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}


def page_response(url, timeout=30):
    """Fetch *url* with the shared headers and return the response.

    Args:
        url: address to request.
        timeout: seconds to wait for the server before requests raises a
            timeout error; without it a stalled connection blocks forever.

    Returns:
        The ``requests.Response``; its ``encoding`` is set to the encoding
        detected from the body so that ``.text`` decodes with it.
    """
    rep = requests.get(url=url, headers=headers, timeout=timeout)
    rep.encoding = rep.apparent_encoding
    return rep

# Directory that receives the downloaded .ts segments.
os.makedirs('./movie', exist_ok=True)
try:
    rep = page_response(movie_url)
finally:
    # quit() closes every window and ends the WebDriver session; doing it in
    # "finally" keeps Chrome from lingering when the request fails.
    driver.quit()
# The player page embeds the playlist address as  url: '<address>',
obj = re.compile(r"url: '(?P<url>.*?)',", re.S)
m3u8_match = obj.search(rep.text)
if m3u8_match is None:
    raise RuntimeError("no m3u8 address found in page: " + movie_url)
m3u8_url = m3u8_match.group("url")
# Download the playlist and keep a local copy for parsing.
resp2 = requests.get(m3u8_url, timeout=30)
# Fail here instead of saving an HTTP error page as the playlist.
resp2.raise_for_status()
with open("video.m3u8", mode="wb") as f:
    f.write(resp2.content)
# Parse the saved playlist: every non-blank line that does not start with "#"
# is queued as one download job {"name": <sequence number>, "url": <line>}.
# NOTE(review): lines are used verbatim as URLs; a playlist with relative
# segment paths would need them resolved against m3u8_url - confirm.
urls = []
cnt = 0
with open("video.m3u8", mode="r", encoding="utf-8") as f:
    for n in f:
        # Strip surrounding whitespace, including the line break.
        n = n.strip()
        # Blank lines are not URLs; "#" lines are playlist tags.
        if not n or n.startswith("#"):
            continue
        # Number segments from 1 so the first file is 1.ts.
        cnt = cnt + 1
        urls.append({"name": str(cnt), "url": n})

def page_down_data(dic):
    """Download one video segment and store it as ./movie/<name>.ts.

    Args:
        dic: mapping with the keys "name" (file name without extension) and
            "url" (address of the segment).
    """
    path = "./movie/" + dic["name"] + ".ts"
    print(path, "正在下载......")
    data = page_response(dic["url"])
    with open(path, mode="wb") as pf:
        pf.write(data.content)
    # Report completion only after the file has been closed.
    print(dic["name"], "下载完成")
# Download all queued segments with 8 worker threads. map() blocks until every
# job has finished and re-raises the first exception a worker hit.
pool = Pool(8)
try:
    data = pool.map(page_down_data, urls)
finally:
    # Always shut the workers down, even when a download failed.
    pool.close()
    pool.join()

import os
import re
import time
from multiprocessing.dummy import Pool

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a headless Chrome through the site's search form to reach the player
# page of the first search result; the resulting address ends up in movie_url.
chrome_options = webdriver.ChromeOptions()
# Override the User-Agent reported by the browser. Chrome only honours this
# when it is passed as the --user-agent switch.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
)
# Run without a visible window; on Linux without a display Chrome fails to
# start unless this flag is set.
chrome_options.add_argument('--headless')
# Disable the sandbox (commonly required when running as root).
chrome_options.add_argument('--no-sandbox')
# Drop the "enable-automation" switch so the page sees a regular browser.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=chrome_options)

url = "https://www.91kanju.com/"

rep1 = driver.get(url)
time.sleep(1)
name = input('请输入需要下载的电视剧电影名称:')
# NOTE(review): the XPaths below carry no positional indexes; they may have
# been stripped when the code was posted - verify against the live page.
driver.find_element(By.XPATH, '//*[@id="ff-wd"]').send_keys(name)
time.sleep(1)
rep2 = driver.find_element(By.XPATH, '//*[@id="header-top"]/div/div/div/div/div/form/button/i')
# Click through JavaScript; the element is handed to the script as
# arguments[0].
driver.execute_script('arguments[0].click();', rep2)
time.sleep(1)
# First entry of the search results.
rep3 = driver.find_element(By.XPATH, '/html/body/div/div/div/div/div/div/ul/li/div/a')
driver.execute_script('arguments[0].click();', rep3)
time.sleep(1)
# Link on the detail page that leads to the player page.
rep4 = driver.find_element(By.XPATH, '/html/body/div/div/div/div/div/div/div/div/div/a')
driver.execute_script('arguments[0].click();', rep4)
time.sleep(1)
movie_url = driver.current_url

def page_response(url, timeout=30):
    """Fetch *url* with the module-level ``headers`` and return the response.

    Args:
        url: address to request.
        timeout: seconds to wait for the server before requests raises a
            timeout error; without it a stalled connection blocks forever.

    Returns:
        The ``requests.Response``; its ``encoding`` is set to the encoding
        detected from the body so that ``.text`` decodes with it.
    """
    rep = requests.get(url=url, headers=headers, timeout=timeout)
    rep.encoding = rep.apparent_encoding
    return rep

# Directory that receives the downloaded .ts segments.
os.makedirs('./movie', exist_ok=True)
try:
    rep = page_response(movie_url)
finally:
    # quit() closes every window and ends the WebDriver session; doing it in
    # "finally" keeps Chrome from lingering when the request fails.
    driver.quit()
# The player page embeds the playlist address as  url: '<address>',
obj = re.compile(r"url: '(?P<url>.*?)',", re.S)
m3u8_match = obj.search(rep.text)
if m3u8_match is None:
    raise RuntimeError("no m3u8 address found in page: " + movie_url)
m3u8_url = m3u8_match.group("url")
# Download the playlist and keep a local copy for parsing.
resp2 = requests.get(m3u8_url, timeout=30)
# Fail here instead of saving an HTTP error page as the playlist.
resp2.raise_for_status()
with open("video.m3u8", mode="wb") as f:
    f.write(resp2.content)
# Parse the saved playlist and download the segments one after another into
# ./movie/<n>.ts, numbering them from 1 in playlist order.
cnt = 1
with open("video.m3u8", mode="r", encoding="utf-8") as f:
    for n in f:
        # Strip surrounding whitespace, including the line break.
        n = n.strip()
        # Blank lines are not URLs; "#" lines are playlist tags.
        if not n or n.startswith("#"):
            continue
        # Download the video segment.
        resp3 = page_response(n)
        # The separator is required so the file lands inside ./movie/.
        path = './movie/' + str(cnt) + ".ts"
        with open(path, mode="wb") as pf:
            pf.write(resp3.content)
        print(cnt, "下载完成")
        cnt = cnt + 1
{:1_937:}

aipojie_L 发表于 2021-6-6 21:06

91大神?

mosou 发表于 2021-6-6 21:21

原来不是我想的那个91

罗萨 发表于 2021-6-6 21:28

啊?不是我想象的91啊{:301_971:}

a4299110 发表于 2021-6-6 21:29

mosou 发表于 2021-6-6 21:21
原来不是我想的那个91

懂得都懂{:1_886:}

lihu5841314 发表于 2021-6-6 21:49

a4299110 发表于 2021-6-6 21:29
懂得都懂

哪个91?   难道是书店找刘备的那个

dhluser 发表于 2021-6-6 22:02

lihu5841314 发表于 2021-6-6 21:49
哪个91?   难道是书店找刘备的那个

91短视频,你值得拥有!!!!!!!!!!{:1_899:}

小夜好坏 发表于 2021-6-6 22:12

我是看91进来的= =

loadream 发表于 2021-6-6 22:17

是我想多了。。。

wanwfy 发表于 2021-6-6 22:44

pyppeteer 异步,你值得拥有
页: [1] 2 3
查看完整版本: selenium 实现91视频搜索下载 改进线程池 (聊胜于无)