lixiaoqiang 发表于 2019-12-2 21:51

代码中有问题,请大神看下

import requests
from bs4 import BeautifulSoup
import re
# Listing page of free MVs that the script starts from.
url = "https://www.hd-mv.com/mf?o=download"

# Browser-like request headers.
# NOTE(review): the Cookie value carries a `wordpress_logged_in_*` entry, i.e.
# a logged-in session; it should not be published or committed as-is.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) "
        "Gecko/20100101 Firefox/70.0"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip",
    "Cookie": (
        "_ga=GA1.2.834765233.1575183233; "
        "UM_distinctid=16ec03cb04f1db-0a819d105a32cd-b363e65-1fa400-16ec03cb050518; "
        "_gid=GA1.2.1947770759.1575286137; "
        "PHPSESSID=8porle6ho9st8cc9bfdaukdbm4; "
        "CNZZDATA1261301097=99146113-1575183231-https%253A%252F%252Fwww.baidu.com%252F%7C1575286831; "
        "wordpress_test_cookie=WP+Cookie+check; "
        "wordpress_logged_in_3e4847f2d3806b54c86ce7160083d0b0=lxq1006025203%7C1576500258%7CKFwwEpIYVQSMgTxnBEZtQQloJ4jAOeN6Km4u2Ys3StK%7Ceb68a54e1b1bf1947499ce0bffc6341a1dcedb562c93604191842f37580e739b"
    ),
    "Referer": "https://www.hd-mv.com/mf?o=download",
}
# Fetch the page content at the given URL
def get(url, timeout=10):
    """Return the decoded body of ``url``, requested with the module-level ``headers``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled server
            would hang the script indefinitely.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


# Fetch the list of free MV pages, follow the first one, and print its
# download-info box.
# NOTE(review): the right-hand sides of `links =` and `mv =` were missing in the
# posted code (a syntax error as written). The expressions below are a
# reconstruction based on the sample HTML in this thread -- confirm the
# selectors against the live pages.
soup = BeautifulSoup(get(url), 'lxml')
link_div = soup.find_all('div', class_="img")
# One detail-page URL per thumbnail; skip thumbnails that carry no link.
links = [div.a['href'] for div in link_div
         if div.a is not None and div.a.get('href')]
# get() needs a single URL, not the whole list: follow the first entry.
soup2 = BeautifulSoup(get(links[0]), 'lxml')
mv_url = soup2.find_all('div', class_="erphpdown-box")
# Presumably the first link inside the download box leads to the download page.
mv = mv_url[0].a['href']
soup3 = BeautifulSoup(get(mv), 'lxml')
down_url = soup3.find_all('div', class_="erphpdown-msg")
print(down_url)




运行之后就出现
<div class="erphpdown-msg">
<div class="title"><span>资源名称</span></div>
<p><a target="_blank">G.E.M.邓紫棋 – 句号 官方完整版MV
1080P</a></p>
<div class="title"><span>下载地址</span></div><p>文件1地址:<a href="download.php?postid=108930&amp;key=1" target="_blank">点击下载</a></p><div class="title"><span>隐藏信息</span></div><div class="hidden-content" style="border:2px dashed #ff5f33;padding:15px;">http://16629707.d.yyupload.com/down/16629707/官方MV/G.E.M.鄧紫棋【句號 Full Stop】Official Music Video.mp4</div> </div>


我只想要
"download.php?postid=108930&amp;key=1"

应该怎么做?
大神们最好讲解下原理!!!

坏人。丶 发表于 2019-12-2 23:13

# down_url comes from find_all(), i.e. it is a list of matching <div> tags, so
# search inside each tag rather than on the list itself.
for div in down_url:
    # href=True keeps only <a> tags that actually have an href attribute;
    # indexing i['href'] on a tag without one raises KeyError.
    for i in div.find_all('a', href=True):
        print(i['href'])

这样就可以抓取到a标签里面的href,怎么只抓到第二个搞不懂,萌新

zhaoziqi1995 发表于 2019-12-3 14:17

我也是纯新手,看了你这个改了改 能下到视频,你看看,希望能有帮助
# -*- coding: utf-8 -*-
import sys
# Python 2-only hack: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back before the
# default str/unicode codec is switched to UTF-8. The order of these two lines
# matters. Neither call works on Python 3 (`reload` is not a builtin there).
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
from bs4 import BeautifulSoup
import re
# Template for the paginated listing of free MVs; %s is the page number.
url = "https://www.hd-mv.com/mf/page/%s?o=download"

# Browser-like request headers.
# NOTE(review): the Cookie value carries a `wordpress_logged_in_*` entry, i.e.
# a logged-in session; it should not be published or committed as-is.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) "
        "Gecko/20100101 Firefox/70.0"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip",
    "Cookie": (
        "_ga=GA1.2.834765233.1575183233; "
        "UM_distinctid=16ec03cb04f1db-0a819d105a32cd-b363e65-1fa400-16ec03cb050518; "
        "_gid=GA1.2.1947770759.1575286137; "
        "PHPSESSID=8porle6ho9st8cc9bfdaukdbm4; "
        "CNZZDATA1261301097=99146113-1575183231-https%253A%252F%252Fwww.baidu.com%252F%7C1575286831; "
        "wordpress_test_cookie=WP+Cookie+check; "
        "wordpress_logged_in_3e4847f2d3806b54c86ce7160083d0b0=lxq1006025203%7C1576500258%7CKFwwEpIYVQSMgTxnBEZtQQloJ4jAOeN6Km4u2Ys3StK%7Ceb68a54e1b1bf1947499ce0bffc6341a1dcedb562c93604191842f37580e739b"
    ),
    "Referer": "https://www.hd-mv.com/mf?o=download",
}
# Fetch the page content at the given URL
def get(url, timeout=10):
    """Return the decoded body of ``url``, requested with the module-level ``headers``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled server
            would hang the crawl indefinitely.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

def downMp4(url, name):
    """Stream the video at ``url`` into ``mv/<name>.mp4``.

    Args:
        url: Direct address of the video file.
        name: Title used for the file name. Characters that are not allowed
            in file names (including line breaks) are replaced by spaces.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error
            status, so that an error page is not saved as a video.
        requests.exceptions.Timeout: if the server stops responding.
    """
    import os  # local import: the file's import header does not bring in os

    # Titles scraped from the page may span several lines or contain
    # characters such as / : ? that are illegal in file names.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]+', ' ', name).strip() or 'unnamed'

    # os.path.join yields the same 'mv\<name>.mp4' on Windows as before while
    # also producing a valid path elsewhere; create the folder if it is missing.
    if not os.path.isdir('mv'):
        os.makedirs('mv')
    target = os.path.join('mv', safe_name + '.mp4')

    r = requests.get(url, stream=True, timeout=30)
    try:
        r.raise_for_status()
        with open(target, "wb") as mp4:
            # Write in 1 MiB chunks so the whole video is never held in memory.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    mp4.write(chunk)
    finally:
        # A streamed response keeps its connection open until it is closed.
        r.close()

# Crawl listing pages 1 and 2 and download every free MV whose download page
# exposes a direct file address.
# NOTE(review): in the posted code the right-hand sides of `links =` / `mv =`
# and several index expressions were missing (a syntax error as written). The
# expressions below are reconstructed from the sample HTML in this thread --
# confirm the selectors against the live pages.
for j in range(1, 3):
    # Fetch the list of free MV pages
    soup = BeautifulSoup(get(url % j), 'lxml')
    link_div = soup.find_all('div', class_="img")
    # One detail-page URL per thumbnail; skip thumbnails that carry no link.
    links = [div.a['href'] for div in link_div
             if div.a is not None and div.a.get('href')]
    print(links)
    for link in links:
        soup2 = BeautifulSoup(get(link), 'lxml')
        mv_url = soup2.find_all('div', class_="erphpdown-box")
        if not mv_url or mv_url[0].a is None or not mv_url[0].a.get('href'):
            continue
        # Presumably the first link in the download box is the download page.
        mv = mv_url[0].a['href']
        if mv == "https://www.hd-mv.com/user?action=vip":
            # The box points at the VIP page instead of a download page.
            continue
        soup3 = BeautifulSoup(get(mv), 'lxml')
        down_url = soup3.find_all('div', class_="erphpdown-msg")
        if not down_url or down_url[0].a is None:
            continue
        need = down_url[0].find_all('div')
        # In the sample output the 4th inner <div> ("hidden-content") holds
        # the direct file address; pages without it have fewer divs.
        if len(need) > 3:
            # Use a separate name: assigning to `url` would overwrite the page
            # template and make `url % j` fail on the next listing page.
            mp4_url = need[3].text.strip()
            name = down_url[0].a.text
            print("开始下载")
            downMp4(mp4_url, name)
            print("下载结束")







页: [1]
查看完整版本: 代码中有问题,请大神看下