本帖最后由 hj170520 于 2020-5-25 12:32 编辑
代码如下:
[Python] 纯文本查看 复制代码 from lxml import etree
import requests
import re
import os
# Scrape the podcast listing page and print the episode audio links found
# on it.  Module-level names are kept as in the original script because
# later snippets reuse `headers` and `contain1`.
url = 'http://www.effortlessenglish.libsyn.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Host': 'www.effortlessenglish.libsyn.com'}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page.
Web = requests.get(url, headers=headers, timeout=30)
Web.raise_for_status()
Web_html = etree.HTML(Web.text)

# href values of the <a> elements directly inside each "postDetails" div.
contain1 = Web_html.xpath('//div[@class="postDetails"]/a/@href[1]')
book = []  # not used by the visible code; kept in case other code reads it

# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs('./audio', exist_ok=True)

for i, link in enumerate(contain1):
    # Keep only links that contain "17069" somewhere after "http"
    # (presumably the dest-id query value of the audio URLs - confirm).
    if re.search(r'http.*17069', link):
        url_data = link
        # Alternative that extracts just the matched part; findall returns
        # a list, so [0] takes the first match:
        # url_data = re.findall(r'http.*17069', link)[0]
        print(url_data)
        # Download step, left disabled as in the original script:
        # data = requests.get(url_data, headers=headers)
        # print("正在下载")
        # with open("./audio/" + str(i) + '.mp3', 'wb') as f:
        #     f.write(data.content)
例如爬取的是
http://traffic.libsyn.com/effortlessenglish/Death_To_The_Schools__DESTROY_Limiting_Beliefs_and_Be_HAPPY.mp3?dest-id=17069
打开后页面变成了
http://hwcdn.libsyn.com/p/c/7/6/c76011bf65265504/Death_To_The_Schools__DESTROY_Limiting_Beliefs_and_Be_HAPPY.mp3?c_id=73623335&cs_id=73623335&destination_id=17069&expiration=1590379908&hwt=1834174b745f61f56b0cd6ee70d59831
再拓展一个问题:我用 requests 打开某个网页时,它会自动跳转,我抓取不到跳转后的新网址,该怎么办?
谢谢大佬们,我成功了!
# For every matching episode link, ask the server where it redirects to
# without following the redirect, and print that target URL.
#
# The shared `headers` dict pins Host to the listing site; these requests go
# to a different server, so Host is dropped and requests derives it from
# each URL instead.
redirect_headers = {
    name: value for name, value in headers.items() if name.lower() != 'host'}

for i, link in enumerate(contain1):
    # Keep only links that contain "17069" somewhere after "http".
    if not re.search(r'http.*17069', link):
        continue
    url_data = link
    # Alternative that extracts just the matched part; findall returns a
    # list, so [0] takes the first match:
    # url_data = re.findall(r'http.*17069', link)[0]
    print(url_data)

    # allow_redirects=False returns the redirect response itself, so its
    # Location header can be read.  The timeout prevents an indefinite hang.
    new_url_location = requests.get(
        url_data, headers=redirect_headers, allow_redirects=False, timeout=30)

    if not new_url_location.is_redirect:
        # No Location header / redirect status: there is no target to
        # print, and indexing headers['location'] would raise KeyError.
        print('no redirect, status', new_url_location.status_code)
        continue

    print(new_url_location.headers['location'])
    # Another way: the prepared follow-up request, through the public
    # `next` property rather than the private `_next` attribute.
    print(new_url_location.next.url)

    # Download step, left disabled as in the original script
    # (`data` would have to be fetched first):
    # print("正在下载")
    # with open("./audio/" + str(i) + '.mp3', 'wb') as f:
    #     f.write(data.content)