天空宫阙 发表于 2020-1-11 22:27

python听中国有声小说批量下载 v0.3

本帖最后由 天空宫阙 于 2021-5-9 23:02 编辑

目标站点
https://www.tingchina.com/


上一个版本链接
https://www.52pojie.cn/thread-1089351-1-1.html


更新内容
1. 上个版本只支持有声书;这个版本理论上支持听中国上的所有音频(包括有声书、评书、相声等),但尚未全部测试,大概率会有不能下载的情况。
源码
from bs4 import BeautifulSoup
import requests
import re
from tqdm import tqdm
import random
import os
import time

# Base URL that get_audio_path() prepends to the relative audio path it
# extracts from an episode page.
HOST = "https://t3344.tingchina.com"
# Base URL of the main site; get_episodes() prepends it to every episode link.
HOST_tingChina = 'https://www.tingchina.com'


def get_episodes(category, id):
    """Fetch the episode list from an album's directory page.

    Requests ``https://www.tingchina.com/{category}/disp_{id}.htm``, decodes
    the body as GBK and collects every link found under ``div.list``.

    Args:
        category: Site section taken from the directory URL
            (for example ``yousheng``).
        id: Album id taken from the directory URL.

    Returns:
        A list of ``{'url': ..., 'name': ...}`` dicts, one per episode link.
        The list is empty when the page does not answer with HTTP 200, so
        callers can always take ``len()`` of the result or iterate over it.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    # A timeout keeps an unresponsive server from blocking the script forever.
    response = requests.get(
        f'https://www.tingchina.com/{category}/disp_{id}.htm',
        headers=headers, timeout=30)
    if response.status_code != 200:
        return []
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    # "a[href]" skips anchors without a target, which could not be turned
    # into an episode URL anyway.
    return [
        {
            'url': HOST_tingChina + f'/{category}/' + anchor['href'],
            # get_text() still yields a string when the label is wrapped in
            # child tags, a case where .string would be None.
            'name': anchor.get_text(strip=True),
        }
        for anchor in soup.select('div.list a[href]')
    ]


def get_audio_path(url):
    """Extract the audio file address from an episode page.

    The page is decoded as GBK and its inline scripts are searched for a
    ``fileUrl= "...";`` assignment.

    Args:
        url: Episode page URL.

    Returns:
        ``HOST`` followed by the captured ``fileUrl`` value, or None when the
        page does not answer with HTTP 200 or no script contains ``fileUrl``.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        return None
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, 'lxml')
    # The last <script> is checked first; earlier ones are a fallback in
    # case another script gets appended after it.
    for script in reversed(soup.select('script')):
        script_text = script.string
        if not script_text:
            # .string is None for tags without a single text child.
            continue
        fileUrl_search = re.search('fileUrl= "(.*?)";', script_text, re.S)
        if fileUrl_search:
            return HOST + fileUrl_search.group(1)
    return None


def get_key(referer):
    """Request the access key needed to download an episode.

    Args:
        referer: Episode page URL, sent as the ``referer`` request header.

    Returns:
        The matched text, which starts with ``key=`` and ends just before
        the next ``";`` in the response, or None when the request does not
        answer with HTTP 200 or nothing matches.
    """
    # A fresh random number is used as the query string of every request,
    # e.g. https://img.tingchina.com/play/h5_jsonp.asp?0.2617541281416145
    url = f"https://img.tingchina.com/play/h5_jsonp.asp?{random.random()}"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'referer': referer
    }
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        return None
    # NOTE(review): the captured text already begins with "key="; a caller
    # that prepends "?key=" ends up with "?key=key=...". Confirm against a
    # live response before changing either side.
    matched = re.search('(key=.*?)";', response.text, re.S)
    return matched.group(1) if matched else None


def downloadFILE(url, name, Referer):
    """Stream a remote file to disk while showing a progress bar.

    Args:
        url: Direct download URL.
        name: Destination file path; its base name labels the progress bar.
        Referer: Value sent as the ``Referer`` request header.

    Returns:
        None. Nothing is written when the server does not answer with
        HTTP 200; a message is printed instead.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': Referer
    }
    # Using the response as a context manager releases the streamed
    # connection even when writing the file fails half-way.
    with requests.get(url=url, stream=True, headers=headers, timeout=30) as resp:
        if resp.status_code != 200:
            # Saving an error page under the audio file name would look
            # like a successful download, so report and skip instead.
            print(name, "download failed, HTTP status:", resp.status_code)
            return
        # Content-Length can be absent (e.g. chunked transfer); tqdm accepts
        # total=None and then shows a counter without a percentage.
        length = resp.headers.get('Content-Length')
        content_size = int(length) // 1024 if length else None
        with open(name, "wb") as f:
            print("Pkg total size is:", content_size, 'k,start...')
            for data in tqdm(iterable=resp.iter_content(1024), total=content_size, unit='k', desc=os.path.basename(name)):
                f.write(data)
            print(name, "download finished!")


def main():
    """Interactively download every episode listed on a directory page.

    Prompts for a directory URL, extracts the section name and album id from
    it, creates a ``<section>_<id>`` folder and downloads each episode into
    that folder, pausing between episodes. An episode that fails is reported
    and skipped so the rest of the batch still runs.
    """
    # Example directory pages:
    #   https://www.tingchina.com/yousheng/disp_29924.htm
    #   https://www.tingchina.com/pingshu/disp_1635.htm
    #   https://www.tingchina.com/xiangsheng/disp_10671.htm
    disp_url = input('请输入目录页链接如:https://www.tingchina.com/yousheng/disp_29924.htm:')
    # Raw string: "\.", "\w" and "\d" are regex escapes, not string escapes.
    matched_category_id = re.search(
        r'tingchina\.com/(\w+)/disp_(\d+)\.htm', disp_url)
    if not matched_category_id:
        print('输入的链接格式错误,正确的链接格式如')
        print('https://www.tingchina.com/yousheng/disp_29924.htm')
        return

    category = matched_category_id.group(1)
    album_id = matched_category_id.group(2)
    print(category, album_id)
    folder = category + "_" + album_id
    os.makedirs(folder, exist_ok=True)

    episodes = get_episodes(category, album_id)
    print(episodes)
    if not episodes:
        # Covers both an empty list and a None result from a failed request.
        print('未获取到任何剧集,请检查链接或稍后重试')
        return
    print('共', len(episodes), '集')

    for episode in episodes:
        try:
            audio_path = get_audio_path(episode['url'])
            key = get_key(episode['url']) if audio_path else None
            if audio_path and key:
                # NOTE(review): get_key() returns text that already starts
                # with "key=", so this builds "?key=key=..."; kept as-is,
                # confirm against the live endpoint.
                download_url = audio_path + '?key=' + key
                downloadFILE(download_url, os.path.join(
                    folder, episode['name']), episode['url'])
            else:
                print(episode['name'], '获取下载地址失败,已跳过')
        except (requests.RequestException, OSError) as e:
            # One broken episode must not abort the remaining downloads.
            print(episode['name'], '下载失败:', e)
        # Sleep a few seconds to avoid putting heavy load on the server.
        time.sleep(3)


# Start the interactive downloader only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    main()



使用方法
1.输入目录页链接,如https://www.tingchina.com/pingshu/disp_22924.htm



如果觉得可以免费评下分!

看到自己一年前写的代码真辣眼睛,留作纪念吧。
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import time
import os

class TingChina:
    """Sequential downloader for one tingchina.com album.

    Starting from a given episode number, each episode's play page is
    fetched, the audio address and its access key are extracted, and the
    file is saved as ``<book title>/<NNNN>.mp3``. Failures are appended to
    ``log.txt`` and the run moves on to the next episode.
    """

    # Browser-like header shared by every page request made by this class.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }

    def __init__(self, category, id, strat_num):
        """Remember which album to download and where to start.

        Args:
            category: Site section from the directory URL (e.g. ``yousheng``).
            id: Album id from the directory URL.
            strat_num: One-based number of the first episode to download.
        """
        self.base_url = 'https://www.tingchina.com'
        self.category = category
        self.id = id
        # Zero-based index used in play-page URLs.
        self.num = int(strat_num) - 1
        # One-based number used for file names and progress messages.
        self.name_num = int(strat_num)
        self.Referer = ''
        self.host1 = "http://t44.tingchina.com"
        self.host2 = "http://t33.tingchina.com"
        self.book_name = ''

    def get_total_episode(self):
        """Read the book title and the episode count from the directory page.

        Returns:
            ``(title, count)`` where ``count`` is the highest episode index
            found among the last three list entries, plus one.

        Raises:
            RuntimeError: The page did not answer with HTTP 200, or no
                episode link could be found in the list.
        """
        url = 'https://www.tingchina.com/{}/disp_{}.htm'.format(self.category, str(self.id))
        print(url)
        response = requests.get(url, headers=self._HEADERS, timeout=30)
        if response.status_code != 200:
            raise RuntimeError('directory page returned HTTP {}: {}'.format(response.status_code, url))
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'lxml')
        # select() returns a list of matches; select_one() gives the first
        # element itself, which is what the lookups below need.
        ul = soup.select_one('div.list > ul')
        lis = ul.select('li') if ul is not None else []
        last_index = None
        # Look at up to the last three entries, last one first, and inspect
        # each entry on its own so the match comes from that entry only.
        for li in reversed(lis[-3:]):
            matched = re.search(r'play.*?_(\d+)\.htm', str(li))
            if matched:
                last_index = int(matched.group(1))
                break
        if last_index is None:
            raise RuntimeError('no episode link found on ' + url)
        title = soup.select_one('title')
        if title is not None:
            name = title.get_text(strip=True)
        else:
            name = '{}_{}'.format(self.category, self.id)
        # Episode indices start at zero, hence the +1 for the count.
        return name, last_index + 1

    def get_flash_url(self):
        """Fetch the play page of the current episode.

        Two URL layouts are tried in turn; the first one answering with
        HTTP 200 wins.

        Returns:
            ``(html, url)`` for the page that was found.

        Raises:
            RuntimeError: Neither layout answered with HTTP 200.
        """
        url = 'https://www.tingchina.com/{}/{}/play_{}_{}.htm'.format(self.category, str(self.id), str(self.id), str(self.num))
        second_url = 'https://www.tingchina.com/{}/play/play_{}_{}.htm'.format(self.category, str(self.id), str(self.num))
        for candidate in (url, second_url):
            response = requests.get(candidate, headers=self._HEADERS, timeout=30)
            if response.status_code == 200:
                response.encoding = 'gbk'
                return response.text, candidate
        raise RuntimeError('play page not found for episode index {}'.format(self.num))

    def _build_real_address(self, src):
        """Build the audio file URL from the player iframe address.

        The ``flei``, ``bookname`` and ``filename`` query values present in
        ``src`` are appended, in that order, after ``host1`` and the category.

        Raises:
            ValueError: ``src`` carries no ``filename`` value.
        """
        parts = [self.category]
        for field in ('flei', 'bookname', 'filename'):
            # "[^&]*" also captures a value that is the last query parameter
            # and therefore has no trailing "&".
            matched = re.search(field + r'=([^&]*)', src)
            if matched:
                parts.append(matched.group(1))
            elif field == 'filename':
                raise ValueError('no filename in player address: ' + src)
        return self.host1 + '/' + '/'.join(parts)

    def parse_flash_url(self):
        """Locate the player iframe on the play page and derive the audio URL.

        Also stores the play page URL in ``self.Referer``.

        Returns:
            ``(src, url, real_address)``: the iframe ``src``, the play page
            URL, and the audio file URL without its access key.

        Raises:
            RuntimeError: The page has no ``#playdiv`` iframe with a ``src``.
            ValueError: The iframe address has no ``filename`` value.
        """
        html, url = self.get_flash_url()
        soup = BeautifulSoup(html, 'lxml')
        iframe = soup.select_one('#playdiv iframe')
        if iframe is None or not iframe.get('src'):
            raise RuntimeError('player iframe not found on ' + url)
        src = iframe['src']
        self.Referer = url
        return src, url, self._build_real_address(src)

    def get_audio(self):
        """Join the access key and the audio address into a usable URL.

        Returns:
            The audio URL followed by ``?key=...``, or None when the player
            page does not answer with HTTP 200 or contains no key.
        """
        temp_url, Referer, real_address = self.parse_flash_url()
        # assumes the iframe src is a site-relative path -- TODO confirm
        url = self.base_url + temp_url
        headers = dict(self._HEADERS, Referer=Referer)
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        matched = re.search(r'url= ".*?(key=.*?)";', response.text, re.S)
        if matched:
            return real_address + '?' + matched.group(1)
        return None

    def download(self):
        """Download the current episode into the book folder.

        Raises:
            RuntimeError: No download URL could be obtained; raising lets
                run() record the episode in its failure log instead of
                skipping it silently.
        """
        url = self.get_audio()
        print(url)
        if not url:
            raise RuntimeError('no download key found')
        downloadFILE(url, os.path.join(self.book_name, str(self.name_num).zfill(4) + '.mp3'), self.Referer)

    def run(self):
        """Download every episode from the start number to the last one."""
        name, total_episode = self.get_total_episode()
        print('书名:', name, '集数:', total_episode)
        self.book_name = name
        os.makedirs(name, exist_ok=True)
        while self.name_num <= total_episode:
            try:
                self.download()
            except Exception as e:
                # Deliberately broad: any failure of one episode is logged
                # and the batch continues with the next one.
                print(self.name_num, e)
                with open('log.txt', 'a', encoding='utf-8') as f:
                    f.write(str(self.name_num) + str(e) + '\n')
            self.num += 1
            self.name_num += 1
        print('all assignments done!')
      

def downloadFILE(url, name, Referer):
    """Stream a remote file to disk while showing a progress bar.

    Args:
        url: Direct download URL.
        name: Destination file path; also used as the progress bar label.
        Referer: Value sent as the ``Referer`` request header.

    Raises:
        requests.HTTPError: The server answered with an error status. The
            caller's per-episode error log then records the failure instead
            of an error page being saved as an audio file.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        'Referer': Referer
    }
    # Using the response as a context manager releases the streamed
    # connection even when writing the file fails half-way.
    with requests.get(url=url, stream=True, headers=headers, timeout=30) as resp:
        resp.raise_for_status()
        # Content-Length can be absent (e.g. chunked transfer); tqdm accepts
        # total=None and then shows a counter without a percentage.
        length = resp.headers.get('Content-Length')
        content_size = int(length) // 1024 if length else None
        with open(name, "wb") as f:
            print("Pkg total size is:", content_size, 'k,start...')
            for data in tqdm(iterable=resp.iter_content(1024), total=content_size, unit='k', desc=name):
                f.write(data)
            print(name, "download finished!")

if __name__ == "__main__":
    # Example directory page: https://www.tingchina.com/yousheng/disp_21501.htm
    disp_url = input('请输入目录页链接如:https://www.tingchina.com/yousheng/disp_21501.htm:')
    # Raw string: "\.", "\w" and "\d" are regex escapes, not string escapes.
    matched_category_id = re.search(r'tingchina\.com/(\w+)/disp_(\d+)\.htm', disp_url)
    if matched_category_id:
        category = matched_category_id.group(1)
        id = int(matched_category_id.group(2))
        if id and category:
            start_num = input('请输入开始下载的集数(直接回车从第一集开始下载)').strip()
            try:
                # An empty answer means "start from the first episode".
                first_episode = int(start_num) if start_num else 1
            except ValueError:
                first_episode = 0
            if first_episode < 1:
                # Non-numeric input, or a number that would produce a
                # negative episode index in the play-page URL.
                print('集数必须是大于0的整数')
            else:
                t = TingChina(category, id, first_episode)
                t.run()
    else:
        print('输入的链接无法解析')

    # Build commands for a standalone executable:
    # pyinstaller --onefile --windowed --icon=bitbug_favicon.ico tingchina_v0.3.py
    # pyinstaller -F -i bitbug_favicon.ico tingchina_v0.3.py
    # Example direct invocations:
    # t = TingChina('yousheng',21501,143)
    # t = TingChina('pingshu',1660,126)
    # t = TingChina('xiangsheng',12567,1)
    # t = TingChina('erge',433,12)
    # t = TingChina('xiaohua',233,248)
    # t.run()

UTP 发表于 2020-3-30 09:46

chuanyue1981 发表于 2020-1-13 12:42
求大神做个python爬取蜻蜓fm的脚本,能够添加输入账号和密码,这样我可以登录会员账号,下载会员音频的

为什么要下载呢,我看微信读书上面蜻蜓FM把自己所有VIP和付费专辑全部免费分享了

天空宫阙 发表于 2020-1-19 16:23

chuanyue1981 发表于 2020-1-13 12:42
求大神做个python爬取蜻蜓fm的脚本,能够添加输入账号和密码,这样我可以登录会员账号,下载会员音频的
https://www.52pojie.cn/thread-1092929-1-1.html
python代码过几天再发,如果这个帖子的回复多就写详细些

潇湘公子 发表于 2020-1-11 22:28

hshcompass 发表于 2020-1-11 22:39

更新好快。
赞!
很赞!!
非常赞!!!

lcylidong 发表于 2020-1-11 22:50

不错,支持一下了

拉玛西亚 发表于 2020-1-11 23:53

我终于能听侯卫东官场笔记了,谢谢了

cherrypi 发表于 2020-1-12 00:20

厉害了,谢谢分享了。

如海之汐 发表于 2020-1-12 00:40

这个爬虫好

xxscwsrym 发表于 2020-1-12 07:59

谢谢分享

向往的歌 发表于 2020-1-12 11:36

拉玛西亚 发表于 2020-1-11 23:53
我终于能听侯卫东官场笔记了,谢谢了

亲,在哪里听呀?有下载的吗?

向往的歌 发表于 2020-1-12 11:38

楼主能直接下载5楼的那部有声小说吗?
页: [1] 2 3 4 5 6
查看完整版本: python听中国有声小说批量下载 v0.3