代码有问题,请大神帮忙看一下:
import re

import requests
from bs4 import BeautifulSoup
# Listing page for the free, downloadable MVs.
url = "https://www.hd-mv.com/mf?o=download"

# HTTP headers sent with every page request made through get().
# NOTE(review): the Cookie value looks like a logged-in browser session
# (it carries a wordpress_logged_in_* entry) — presumably account-specific
# and time-limited; replace it with your own before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip",
    "Cookie": "_ga=GA1.2.834765233.1575183233; UM_distinctid=16ec03cb04f1db-0a819d105a32cd-b363e65-1fa400-16ec03cb050518; _gid=GA1.2.1947770759.1575286137; PHPSESSID=8porle6ho9st8cc9bfdaukdbm4; CNZZDATA1261301097=99146113-1575183231-https%253A%252F%252Fwww.baidu.com%252F%7C1575286831; wordpress_test_cookie=WP+Cookie+check; wordpress_logged_in_3e4847f2d3806b54c86ce7160083d0b0=lxq1006025203%7C1576500258%7CKFwwEpIYVQSMgTxnBEZtQQloJ4jAOeN6Km4u2Ys3StK%7Ceb68a54e1b1bf1947499ce0bffc6341a1dcedb562c93604191842f37580e739b",
    "Referer": "https://www.hd-mv.com/mf?o=download",
}
def get(url):
    """Fetch the page at ``url`` and return its body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not respond within
            30 seconds.
    """
    # requests has no default timeout; without one an unresponsive server
    # blocks the script indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail here rather than handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text
# Get the list of free MV page URLs from the listing page.
soup = BeautifulSoup(get(url), 'lxml')
link_div = soup.find_all('div', class_="img")
# NOTE(review): the right-hand side of this assignment was missing from the
# posted code. Taking the href of the first <a> inside each matched div is an
# assumption about the page layout — confirm against the live site.
links = [
    div.a['href']
    for div in link_div
    if div.a is not None and div.a.has_attr('href')
]
if not links:
    raise SystemExit("no MV links found on the listing page")

# find_all() returns a list of matches, so pick one entry before fetching it.
soup2 = BeautifulSoup(get(links[0]), 'lxml')
mv_url = soup2.find_all('div', class_="erphpdown-box")
# NOTE(review): right-hand side also missing in the post; assumed to be the
# link inside each download box — confirm.
mv = [
    box.a['href']
    for box in mv_url
    if box.a is not None and box.a.has_attr('href')
]
if not mv:
    raise SystemExit("no download-box link found on the MV page")

soup3 = BeautifulSoup(get(mv[0]), 'lxml')
down_url = soup3.find_all('div', class_="erphpdown-msg")
print(down_url)
运行之后就出现
<div class="erphpdown-msg">
<div class="title"><span>资源名称</span></div>
<p><a target="_blank">G.E.M.邓紫棋 – 句号 官方完整版MV
1080P</a></p>
<div class="title"><span>下载地址</span></div><p>文件1地址:<a href="download.php?postid=108930&key=1" target="_blank">点击下载</a></p><div class="title"><span>隐藏信息</span></div><div class="hidden-content" style="border:2px dashed #ff5f33;padding:15px;">http://16629707.d.yyupload.com/down/16629707/官方MV/G.E.M.鄧紫棋【句號 Full Stop】Official Music Video.mp4</div> </div>
我只想要
"download.php?postid=108930&key=1"
应该怎么做?
大神们最好讲解下原理!!!
# down_url comes from find_all(), i.e. it is a list of matched <div> tags,
# so search inside each tag rather than calling find_all() on the list.
for box in down_url:
    # href=True keeps only anchors that actually have an href attribute;
    # the title anchor in the printed output has none, so indexing it with
    # ['href'] would raise KeyError.
    for anchor in box.find_all('a', href=True):
        print(anchor['href'])
这样就可以抓取到 a 标签里面的 href。至于怎么只抓到第二个,我也搞不懂,萌新一枚。
我也是纯新手,看了你这个代码改了改,现在能下载到视频了,你看看,希望能有帮助:
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import requests
from bs4 import BeautifulSoup
import re
# Listing-page URL template; %s is filled with the page number.
url = "https://www.hd-mv.com/mf/page/%s?o=download"

# HTTP headers sent with every page request made through get().
# NOTE(review): the Cookie value looks like a logged-in browser session
# (it carries a wordpress_logged_in_* entry) — presumably account-specific
# and time-limited; replace it with your own before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip",
    "Cookie": "_ga=GA1.2.834765233.1575183233; UM_distinctid=16ec03cb04f1db-0a819d105a32cd-b363e65-1fa400-16ec03cb050518; _gid=GA1.2.1947770759.1575286137; PHPSESSID=8porle6ho9st8cc9bfdaukdbm4; CNZZDATA1261301097=99146113-1575183231-https%253A%252F%252Fwww.baidu.com%252F%7C1575286831; wordpress_test_cookie=WP+Cookie+check; wordpress_logged_in_3e4847f2d3806b54c86ce7160083d0b0=lxq1006025203%7C1576500258%7CKFwwEpIYVQSMgTxnBEZtQQloJ4jAOeN6Km4u2Ys3StK%7Ceb68a54e1b1bf1947499ce0bffc6341a1dcedb562c93604191842f37580e739b",
    "Referer": "https://www.hd-mv.com/mf?o=download",
}
def get(url):
    """Fetch the page at ``url`` and return its body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not respond within
            30 seconds.
    """
    # requests has no default timeout; without one an unresponsive server
    # blocks the script indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail here rather than handing an error page to the HTML parser.
    response.raise_for_status()
    return response.text
def downMp4(url, name):
    """Stream the video at ``url`` into ``mv/<name>.mp4``.

    The ``mv`` directory is created when it does not exist yet. ``name`` is
    cleaned before use as a file name: runs of whitespace (including line
    breaks) collapse to a single space and characters that Windows forbids in
    file names are replaced with ``_``.

    Args:
        url: Direct address of the video file.
        name: Title used for the output file name (without extension).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never saved as an .mp4 file.
    """
    import os
    import re

    # Titles scraped from the page can span several lines and contain
    # characters such as ':' or '?' that are illegal in Windows file names.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', ' '.join(name.split()))

    # open() does not create missing directories.
    if not os.path.isdir('mv'):
        os.makedirs('mv')

    r = requests.get(url, stream=True, timeout=30)
    try:
        r.raise_for_status()
        # os.path.join keeps the path valid on non-Windows systems as well.
        with open(os.path.join('mv', safe_name + '.mp4'), "wb") as mp4:
            # Download in 1 MiB pieces so the whole video is never in memory.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    mp4.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        r.close()
# Walk listing pages 1 and 2 and download every MV that exposes a direct link.
for j in range(1, 3):
    # Get the list of free MV page URLs on listing page j.
    soup = BeautifulSoup(get(url % j), 'lxml')
    link_div = soup.find_all('div', class_="img")
    # NOTE(review): the right-hand side of this assignment was missing from
    # the posted code. Taking the href of the first <a> inside each matched
    # div is an assumption about the page layout — confirm against the site.
    links = [
        div.a['href']
        for div in link_div
        if div.a is not None and div.a.has_attr('href')
    ]
    print(links)
    for link in links:
        soup2 = BeautifulSoup(get(link), 'lxml')
        mv_url = soup2.find_all('div', class_="erphpdown-box")
        # NOTE(review): right-hand side also missing in the post; assumed to
        # be the link inside each download box — confirm.
        mv = [
            box.a['href']
            for box in mv_url
            if box.a is not None and box.a.has_attr('href')
        ]
        # Skip pages without a download box, and pages whose box only links
        # to the VIP purchase page.
        if not mv or mv[0] == "https://www.hd-mv.com/user?action=vip":
            continue
        soup3 = BeautifulSoup(get(mv[0]), 'lxml')
        down_url = soup3.find_all('div', class_="erphpdown-msg")
        if not down_url:
            continue
        # find_all() returns a list of matches; work on the first message box.
        need = down_url[0].find_all('div')
        if len(need) > 3:
            # The fourth inner div is the hidden-content box whose text is
            # the direct video address. A separate name is used on purpose:
            # assigning to ``url`` would overwrite the listing-page template
            # and break ``url % j`` on the next page.
            mp4_url = need[3].text.strip()
            name = down_url[0].a.text
            print("开始下载")
            downMp4(mp4_url, name)
            print("下载结束")
页:
[1]