本帖最后由 天空宫阙 于 2021-5-9 23:02 编辑
目标站点
https://www.tingchina.com/
上一个版本链接
更新内容
1. 上个版本只支持有声书;这个版本理论上支持听中国上的所有音频(包括有声书、评书、相声等),但未全部测试,大概率会有个别音频无法下载。
源码
# [Python] 纯文本查看 复制代码
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm
import random
import os
import time
# Prefix prepended to the ``fileUrl`` path extracted from a play page
# (used by get_audio_path to build the audio file address).
HOST = "https://t3344.tingchina.com"
# Site root used to build absolute episode-page URLs (used by get_episodes).
HOST_tingChina = 'https://www.tingchina.com'
def get_episodes(category, id):
    """Fetch the episode list from an album's catalogue page.

    Args:
        category: Site section taken from the catalogue URL (e.g. ``yousheng``).
        id: Album id taken from the ``disp_<id>.htm`` part of the catalogue URL.

    Returns:
        A list of ``{'url': ..., 'name': ...}`` dicts, one per ``div.list a``
        anchor that carries an ``href``. An empty list is returned when the
        page does not answer with HTTP 200, so callers can always take
        ``len()`` of the result or iterate over it.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    response = requests.get(
        f'{HOST_tingChina}/{category}/disp_{id}.htm', headers=headers)
    if response.status_code != 200:
        return []

    # The page is decoded as GBK rather than whatever requests would guess.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    return [
        {
            'url': HOST_tingChina + f'/{category}/' + anchor['href'],
            # get_text() also works when the anchor wraps nested tags, where
            # ``.string`` would be None and later break the output file name.
            'name': anchor.get_text(strip=True),
        }
        for anchor in soup.select('div.list a')
        if anchor.get('href')
    ]
def get_audio_path(url):
    """Extract the audio file address from an episode's play page.

    The address is read from the ``fileUrl= "..."`` assignment inside the last
    ``<script>`` element of the page and prefixed with ``HOST``.

    Args:
        url: Absolute URL of the episode's play page.

    Returns:
        The audio address (without the ``key`` query string), or ``None`` when
        the page does not answer with HTTP 200, has no usable script element,
        or the script contains no ``fileUrl`` assignment.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return None

    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    scripts = soup.select('script')
    if not scripts:
        return None

    # ``.string`` is None for an empty or external (src=...) script tag;
    # passing None to re.search would raise TypeError.
    script_text = scripts[-1].string
    if not script_text:
        return None

    fileUrl_search = re.search('fileUrl= "(.*?)";', script_text, re.S)
    if fileUrl_search:
        return HOST + fileUrl_search.group(1)
    return None
def get_key(referer):
    """Request a download key from the ``h5_jsonp.asp`` endpoint.

    A random float is appended as the query string, e.g.
    ``https://img.tingchina.com/play/h5_jsonp.asp?0.2617541281416145``.

    Args:
        referer: Play-page URL sent as the ``referer`` request header.

    Returns:
        The last 42 characters of the ``key=...`` fragment found in the
        response text, or ``None`` when no such fragment is present.
    """
    endpoint = f"https://img.tingchina.com/play/h5_jsonp.asp?{random.random()}"
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'referer': referer
    }
    reply = requests.get(endpoint, headers=request_headers)
    found = re.search('(key=.*?)";', reply.text, re.S)
    if not found:
        return None
    # Keep only the trailing 42 characters of the matched fragment.
    return found.group(1)[-42:]
def downloadFILE(url, name, Referer):
    """Stream ``url`` into the file ``name`` while showing a tqdm progress bar.

    Args:
        url: Full download address, including the ``key`` query string.
        name: Path of the output file; it is opened in binary write mode.
        Referer: Value sent as the ``Referer`` request header.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never written to disk as if it were audio.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': Referer
    }
    # The context manager releases the streamed connection even on errors.
    with requests.get(url=url, stream=True, headers=headers) as resp:
        resp.raise_for_status()

        # Content-Length may be absent (e.g. chunked transfer); tqdm accepts
        # total=None and then shows a counter without a percentage.
        length = resp.headers.get('Content-Length')
        content_size = int(length) // 1024 if length and length.isdigit() else None

        with open(name, "wb") as f:
            print("Pkg total size is:",
                  content_size if content_size is not None else 'unknown',
                  'k,start...')
            for data in tqdm(iterable=resp.iter_content(1024), total=content_size,
                             unit='k', desc=os.path.basename(name)):
                f.write(data)
    print(name, "download finished!")
def main():
    """Prompt for a catalogue-page URL and download every episode listed on it.

    Files are saved into a folder named ``<category>_<id>`` in the current
    working directory, one file per episode, named after the episode.
    Episodes whose audio address or key cannot be obtained are reported and
    skipped instead of aborting the whole run.
    """
    # Example catalogue pages:
    #   https://www.tingchina.com/yousheng/disp_29924.htm
    #   https://www.tingchina.com/pingshu/disp_1635.htm
    #   https://www.tingchina.com/xiangsheng/disp_10671.htm
    disp_url = input('请输入目录页链接如:https://www.tingchina.com/yousheng/disp_29924.htm:')
    # Raw string so that \w, \d and \. reach the regex engine unchanged.
    matched_category_id = re.search(
        r'tingchina\.com/(\w+)/disp_(\d+)\.htm', disp_url)
    if not matched_category_id:
        print('输入的链接格式错误,正确的链接格式如')
        print('https://www.tingchina.com/yousheng/disp_29924.htm')
        return

    category = matched_category_id.group(1)
    album_id = matched_category_id.group(2)
    print(category, album_id)
    folder = category + "_" + album_id
    os.makedirs(folder, exist_ok=True)

    # Fall back to an empty list so a failed catalogue request does not crash.
    episodes = get_episodes(category, album_id) or []
    print(episodes)
    print('共', len(episodes), '集')
    for episode in episodes:
        audio_path = get_audio_path(episode['url'])
        key = get_key(episode['url'])
        if audio_path and key:
            download_url = audio_path + '?key=' + key
            downloadFILE(download_url, os.path.join(
                folder, episode['name']), episode['url'])
        else:
            # Either helper returns None when its page could not be parsed.
            print('获取音频地址或 key 失败,已跳过:', episode['name'])
        # Pause a few seconds so as not to put heavy load on the server.
        time.sleep(3)


if __name__ == "__main__":
    main()
使用方法
1.输入目录页链接,如https://www.tingchina.com/pingshu/disp_22924.htm
如果觉得有用,欢迎免费评分!
看到自己一年前写的代码真辣眼睛,贴在下面留作纪念吧:
# [Python] 纯文本查看 复制代码
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import time
import os
class TingChina():
    """Sequential downloader for one tingchina.com album.

    Starting from a given episode number, ``run`` resolves each episode's
    play page, builds the audio address plus key, and saves the file as
    ``<book name>/<NNNN>.mp3``. Per-episode failures are appended to
    ``log.txt`` and the loop moves on to the next episode.
    """

    # Browser-like header sent with every page request made by this class.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }

    def __init__(self, category, id, strat_num):
        """Store the album coordinates and the starting position.

        Args:
            category: Site section from the catalogue URL (e.g. ``yousheng``).
            id: Album id from the ``disp_<id>.htm`` part of the catalogue URL.
            strat_num: Episode number to start from (int or numeric string).
        """
        self.base_url = 'https://www.tingchina.com'
        self.category = category
        self.id = id
        # Index used in play-page URLs: one less than the episode number.
        self.num = int(strat_num) - 1
        # Episode number used for the saved file name.
        self.name_num = int(strat_num)
        self.Referer = ''
        self.host1 = "http://t44.tingchina.com"
        self.host2 = "http://t33.tingchina.com"
        self.book_name = ''

    def get_total_episode(self):
        """Read the book title and episode count from the catalogue page.

        Returns:
            A ``(name, total)`` tuple: the text of the page's ``<title>`` and
            the highest play-page index found among the last three list
            entries, plus one.

        Raises:
            RuntimeError: If the page does not answer with HTTP 200, has no
                ``div.list > ul`` element, or no play link can be found.
        """
        url = 'https://www.tingchina.com/{}/disp_{}.htm'.format(self.category, str(self.id))
        print(url)
        response = requests.get(url, headers=self._HEADERS)
        if response.status_code != 200:
            raise RuntimeError('catalogue page returned HTTP {}: {}'.format(response.status_code, url))

        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'lxml')
        lists = soup.select('div.list > ul')
        if not lists:
            raise RuntimeError('no episode list found on {}'.format(url))

        # Look at the last (up to) three entries, newest first; slicing keeps
        # this safe for lists shorter than three items.
        num = None
        for li in reversed(lists[0].select('li')[-3:]):
            matched = re.search(r'play.*?_(\d+)\.htm', str(li))
            if matched:
                num = int(matched.group(1))
                break
        if num is None:
            raise RuntimeError('no play link found on {}'.format(url))

        name = soup.select('title')[0].string
        return name, num + 1

    def get_flash_url(self):
        """Download the play page of the current episode.

        ``/<category>/<id>/play_<id>_<num>.htm`` is tried first and
        ``/<category>/play/play_<id>_<num>.htm`` second.

        Returns:
            A ``(html, url)`` tuple for the first address answering HTTP 200.

        Raises:
            RuntimeError: If neither address answers with HTTP 200.
        """
        second_url = 'https://www.tingchina.com/{}/play/play_{}_{}.htm'.format(self.category, str(self.id), str(self.num))
        url = 'https://www.tingchina.com/{}/{}/play_{}_{}.htm'.format(self.category, str(self.id), str(self.id), str(self.num))
        for candidate in (url, second_url):
            response = requests.get(candidate, headers=self._HEADERS)
            if response.status_code == 200:
                response.encoding = 'gbk'
                return response.text, candidate
        raise RuntimeError('play page not available: {} / {}'.format(url, second_url))

    def _real_address_from_src(self, src):
        """Build the audio file address from the player iframe's ``src``.

        The optional ``flei`` and ``bookname`` parameters and the required
        ``filename`` parameter are joined, in that order, below
        ``host1/<category>/``.

        Raises:
            ValueError: If ``src`` carries no ``filename=...&`` parameter.
        """
        fields = ('flei', 'bookname', 'filename')
        info = {}
        for field in fields:
            matched = re.search(field + '=(.*?)&', src)
            if matched:
                info[field] = matched.group(1)
        if 'filename' not in info:
            raise ValueError('no filename parameter in player URL: {}'.format(src))
        parts = [self.category] + [info[field] for field in fields if field in info]
        return self.host1 + '/' + '/'.join(parts)

    def parse_flash_url(self):
        """Locate the player iframe of the current episode.

        Also stores the play-page URL in ``self.Referer``.

        Returns:
            A ``(src, url, real_address)`` tuple: the iframe ``src``, the
            play-page URL, and the audio address derived from ``src``.
        """
        html, url = self.get_flash_url()
        soup = BeautifulSoup(html, 'lxml')
        src = soup.select('#playdiv')[0].iframe['src']
        self.Referer = url
        real_address = self._real_address_from_src(src)
        return src, url, real_address

    def get_audio(self):
        """Join the key and the real address into a downloadable URL.

        Returns:
            ``real_address + '?' + 'key=...'``, or ``None`` when the player
            page does not answer with HTTP 200 or contains no key.
        """
        temp_url, Referer, real_address = self.parse_flash_url()
        url = self.base_url + temp_url
        headers = dict(self._HEADERS, Referer=Referer)
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        matched = re.search(r'url\[3\]= ".*?(key=.*?)";', response.text, re.S)
        if matched:
            return real_address + '?' + matched.group(1)
        return None

    def download(self):
        """Download the current episode to ``<book_name>/<NNNN>.mp3``."""
        url = self.get_audio()
        print(url)
        if url:
            downloadFILE(url, os.path.join(self.book_name, str(self.name_num).zfill(4) + '.mp3'), self.Referer)

    def run(self):
        """Download every episode from the starting one up to the last."""
        name, total_episode = self.get_total_episode()
        print('书名:', name, '集数:', total_episode)
        self.book_name = name
        os.makedirs(name, exist_ok=True)
        while self.name_num <= total_episode:
            try:
                self.download()
            except Exception as e:
                # Best effort: record the failing episode and keep going.
                print(self.name_num, e)
                with open('log.txt', 'a', encoding='utf-8') as f:
                    f.write(str(self.name_num) + str(e) + '\n')
            self.num += 1
            self.name_num += 1
        print('all assignments done!')
def downloadFILE(url, name, Referer):
    """Stream ``url`` into the file ``name`` while showing a tqdm progress bar.

    Args:
        url: Full download address, including the ``key`` query string.
        name: Path of the output file; it is opened in binary write mode.
        Referer: Value sent as the ``Referer`` request header.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never written to disk as if it were audio.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': Referer
    }
    # The context manager releases the streamed connection even on errors.
    with requests.get(url=url, stream=True, headers=headers) as resp:
        resp.raise_for_status()

        # Content-Length may be absent (e.g. chunked transfer); tqdm accepts
        # total=None and then shows a counter without a percentage.
        length = resp.headers.get('Content-Length')
        content_size = int(length) // 1024 if length and length.isdigit() else None

        with open(name, "wb") as f:
            print("Pkg total size is:",
                  content_size if content_size is not None else 'unknown',
                  'k,start...')
            for data in tqdm(iterable=resp.iter_content(1024), total=content_size,
                             unit='k', desc=name):
                f.write(data)
    print(name, "download finished!")
if __name__ == "__main__":
    # Script entry point: ask for a catalogue-page URL and an optional start
    # episode, then download from there to the end of the album.
    # Example catalogue page: https://www.tingchina.com/yousheng/disp_21501.htm
    disp_url = input('请输入目录页链接如:https://www.tingchina.com/yousheng/disp_21501.htm:')
    # Raw string so that \w, \d and \. reach the regex engine unchanged.
    matched_category_id = re.search(r'tingchina\.com/(\w+)/disp_(\d+)\.htm', disp_url)
    category = matched_category_id.group(1) if matched_category_id else ''
    album_id = int(matched_category_id.group(2)) if matched_category_id else 0
    if album_id and category:
        start_num = input('请输入开始下载的集数(直接回车从第一集开始下载)')
        # An empty answer (just Enter) means "start from the first episode".
        t = TingChina(category, album_id, int(start_num) if start_num.strip() else 1)
        t.run()
    else:
        print('输入的链接无法解析')
# pyinstaller --onefile --windowed --icon=bitbug_favicon.ico tingchina_v0.3.py
# pyinstaller -F -i bitbug_favicon.ico tingchina_v0.3.py
# t = TingChina('yousheng',21501,143)
# t = TingChina('pingshu',1660,126)
# t = TingChina('xiangsheng',12567,1)
# t = TingChina('erge',433,12)
# t = TingChina('xiaohua',233,248)
# t.run()
|