Here is the revised version. Instead of batch-generating the download URLs, it now crawls each chapter's own page to get them, so there is no longer any need to work out how the download URLs are named.
import requests
import urllib.parse
import re
import os
import time
import random
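
# Overall flow: search for the book by title -> scrape the chapter list from the
# book's page -> open each chapter's /down/ page to read the real mp3 URL
# -> save each file. The site's pages are gbk-encoded, hence the decode('gbk') calls.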
class YsSpider:
    def __init__(self, name):
        self.search_name = name
        self.search_url = "http://www.ting89.com/search.asp?searchword="
        self.home_url = "http://www.ting89.com/books/"
        # Regexes for the search-result entries, the chapter links, and the mp3 URL
        self.index_pattern = r"""<a href="/books/([0-9]+).html" title="(.+?)" target='_blank'>"""
        self.chapter_pattern = r"""<a href='(/down/\?[^-]+-\d+.html)' target="_blank">(.+?)</a>"""
        self.down_pattern = r"""url=(.*)/(.+?)\.mp3"""
        self.book_id = ''
        self.book_name = ''
        self.chapter_list = []
        # Pool of User-Agent strings; one is picked at random for each request
        self.headers_list = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)']
    # Run the search and return the id of the book the user picks
    def searchbook(self):
        file = requests.get(self.search_url + urllib.parse.quote(self.search_name, encoding='gb2312'))
        data = file.content.decode('gbk')
        result = re.findall(self.index_pattern, data)
        if len(result):
            for index, i in enumerate(result):
                print('%d.%s' % (index + 1, i[1]))
            choice = input("Enter the number of the book to download: ")
            self.book_name = result[int(choice) - 1][1]
            self.book_id = result[int(choice) - 1][0]
            return self.book_id
        else:
            print('*** No matching book found; please change the keyword and rerun. ***')
            exit()
    # Fetch the chapter list from the book's page as (chapter_page_url, title) tuples
    def get_chapter_list(self):
        data = requests.get(self.home_url + self.searchbook() + '.html').content.decode('gbk')
        result = re.findall(self.chapter_pattern, data)
        return result
    # Open a chapter's download page and extract the real mp3 URL from it
    def _get_down_url(self, item):
        try:
            tmp_url = []
            user_agent = random.choice(self.headers_list)  # rotate User-Agents
            headers = {'User-Agent': user_agent}
            response = requests.get("http://www.ting89.com" + item, headers=headers, timeout=1)
            data = response.content.decode('gbk')
            result = re.findall(self.down_pattern, data)
            if result:
                tmp_url = result[0][0] + '/' + result[0][1] + ".mp3"
            response.close()
            return tmp_url
        except Exception:
            print('Timed out fetching the download URL, retrying')
            return self._get_down_url(item)
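    # Note: the retry above recurses with no limit, so a chapter page that never
    # loads would eventually hit Python's recursion limit; fine for a quick script.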
    # Save the file at url to path, skipping chapters that already exist on disk
    def save_a_file(self, url, path, chapter):
        try:
            print('Trying to download', self.book_name, chapter)
            if not os.path.exists(path):
                response = requests.get(url, timeout=5)
                with open(path, 'wb') as f:
                    f.write(response.content)
                print(self.book_name, chapter, 'saved')
                response.close()
                time.sleep(1)  # short pause between downloads to go easy on the server
            else:
                print('File already exists')
        except Exception:
            print('Download of', chapter, 'failed, retrying')
            self.save_a_file(url, path, chapter)
    def download_files(self):
        chapter_list = self.get_chapter_list()
        chapter = [x[0] for x in chapter_list]            # chapter page URLs
        self.chapter_list = [x[1] for x in chapter_list]  # chapter titles
        root = os.path.join(os.getcwd(), self.book_name)
        if not os.path.exists(root):
            os.mkdir(root)
        for index, item in enumerate(chapter):
            result = self._get_down_url(item)
            if result:
                path = os.path.join(root, self.chapter_list[index]) + '.mp3'
                self.save_a_file(result, path, self.chapter_list[index])
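
# Usage: create the spider with the title to search for, then download everything.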
aa = YsSpider('凡人修仙传')
aa.download_files()
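
For reference, here is a minimal sketch of what the down_pattern step is doing. The HTML fragment below is made up for illustration (the real markup on ting89.com may differ), but it shows how the regex pulls the host and file name out of the player's url= parameter and how the two groups are glued back into a full mp3 address:

import re

down_pattern = r"""url=(.*)/(.+?)\.mp3"""

# Hypothetical chapter-page fragment, not real ting89.com markup
sample = "<iframe src='/player.swf?url=http://mp3.example.com/fanren/001.mp3'></iframe>"

result = re.findall(down_pattern, sample)
if result:
    print(result[0][0] + '/' + result[0][1] + '.mp3')
    # -> http://mp3.example.com/fanren/001.mp3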