# 我是爬虫小白,之前搞的给我老爹下小说的,大佬看了不要喷
# -*- coding: utf-8 -*-
import requests
import re
import os
from lxml import etree
from urllib.parse import quote
# ---------------------------------------------------------------------------
# Interactive downloader for audio books hosted on tingchina.com.
#
# Flow: search by title -> list matching books with their ids -> the user picks
# an id and a chapter range -> every chapter's mp3 is saved to ./<book title>/.
# ---------------------------------------------------------------------------

SITE_ROOT = "https://www.tingchina.com"
# Endpoint that returns the signed query-string suffix needed by the mp3 URL.
# The numeric query string is kept exactly as in the original script
# (presumably a cache-busting value copied from the browser).
TOKEN_URL = "https://img.tingchina.com/play/h5_jsonp.asp?0.9091809774033375"
AUDIO_HOST = "https://t33.tingchina.com"

# Seconds to wait for the server to connect / send data before giving up.
REQUEST_TIMEOUT = 30

SEARCH_HEADERS = {
    "user-agent": "Mozilla/7.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
}
CHAPTER_USER_AGENT = "Mozilla/7.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"

# Path separators, characters Windows forbids in file names, and control
# whitespace; scraped titles are used as file/directory names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def play_page_url(book_id, chapter_index):
    """Return the URL of the player page for one chapter (0-based index)."""
    return "{root}/yousheng/{book}/play_{book}_{chapter}.htm".format(
        root=SITE_ROOT, book=book_id, chapter=chapter_index
    )


def extract_book_id(href):
    """Return the book id from a ``yousheng/disp_<id>.htm`` link, else ``None``."""
    match = re.search(r'yousheng/disp_(.*?)\.htm', href)
    return match.group(1) if match else None


def count_chapters(play_page_html):
    """Return the total number of chapters advertised on a player page.

    The page links to chapters as ``play_<book>_<index>`` with 0-based
    indices. The original script takes the second-to-last match (falling back
    to the only match when there is just one) as the highest index; that
    selection is preserved here.
    NOTE(review): presumably the very last match is a "next chapter" link --
    confirm against the live page before simplifying to ``max(...)``.

    Raises:
        ValueError: if the page contains no chapter links at all.
    """
    indices = re.findall(r'play+_+\d+_(\d+)', play_page_html)
    if not indices:
        raise ValueError("no chapter links found on the player page")
    highest = indices[-2] if len(indices) >= 2 else indices[-1]
    return int(highest) + 1


def parse_chapter_page(page_html):
    """Extract ``(file_path, chapter_title, book_title)`` from a player page.

    Raises:
        ValueError: if any of the three fields cannot be located.
    """
    patterns = (
        r'fileUrl= "(.*?)"',                      # server-side path of the mp3
        r'如果您喜欢的话,请为(.*?)\.mp3投一票',      # chapter title (used as file name)
        r';"><strong>(.*?)</strong>',             # book title (used as directory name)
    )
    fields = []
    for pattern in patterns:
        match = re.search(pattern, page_html)
        if match is None:
            raise ValueError("player page is missing expected field: " + pattern)
        fields.append(match.group(1))
    return tuple(fields)


def parse_token(jsonp_text):
    """Return the URL suffix contained in the token endpoint's response.

    The response holds a quoted string, possibly split into pieces joined by
    ``" + "``; the pieces are glued back together.

    Raises:
        ValueError: if no quoted string terminated by ``";`` is present.
    """
    match = re.search(r'"(.*?)";', jsonp_text)
    if match is None:
        raise ValueError("token response has an unexpected format")
    return match.group(1).replace('" + "', "")


def safe_filename(name):
    """Return *name* with characters that are unsafe in file names replaced.

    Titles come from scraped HTML, so path separators and reserved characters
    are replaced with ``_`` to keep the download inside the target directory.
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", name).strip().rstrip(".")
    return cleaned or "_"


def fetch_gbk(url, headers, verify=True):
    """GET *url* and return the body decoded as GBK.

    Decoding the raw bytes avoids depending on which charset ``requests``
    guesses for ``.text``. Raises ``requests.HTTPError`` on a 4xx/5xx reply.
    """
    response = requests.get(
        url=url, headers=headers, verify=verify, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response.content.decode("gbk", errors="replace")


def download_chapter(book_id, chapter_index):
    """Download one chapter's mp3 into ``./<book title>/`` and return its title.

    Raises ``requests.HTTPError`` if any request fails, so an error page is
    never written to disk as an mp3.
    """
    page_url = play_page_url(book_id, chapter_index)
    headers = {
        "User-Agent": CHAPTER_USER_AGENT,
        # Send the page actually being played, as a browser would.
        "Referer": page_url,
    }
    # NOTE(review): verify=False disables TLS certificate checks. It is kept
    # because the original relied on it for these requests; re-enable
    # verification if the hosts present valid certificates.
    page_html = fetch_gbk(page_url, headers, verify=False)
    file_path, chapter_title, book_title = parse_chapter_page(page_html)

    token_response = requests.get(
        url=TOKEN_URL, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
    )
    token_response.raise_for_status()
    audio_url = AUDIO_HOST + file_path + parse_token(token_response.text)

    audio_response = requests.get(
        url=audio_url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
    )
    audio_response.raise_for_status()

    target_dir = os.path.join(".", safe_filename(book_title))
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(
        target_dir, "{}.mp3".format(safe_filename(chapter_title))
    )
    with open(target_path, "wb") as f:
        f.write(audio_response.content)
    return chapter_title


def main():
    """Run the interactive search / select / download session."""
    keyword = input("请输入书籍名:")
    # The site expects the keyword percent-encoded from GB-family bytes; GBK
    # is a superset of GB2312, so more titles can be encoded.
    search_url = SITE_ROOT + "/search1.asp?keyword=" + quote(
        keyword, encoding="gbk", errors="replace"
    )
    tree = etree.HTML(fetch_gbk(search_url, SEARCH_HEADERS))

    # Read title and link from the same <a> element so they cannot get out of
    # step with each other.
    for anchor in tree.xpath('/html/body/div[2]/div[2]/dl/dd/ul/li/a'):
        book_id = extract_book_id(anchor.get("href") or "")
        if book_id is None:
            continue
        print("书名:" + "".join(anchor.itertext()))
        print("编号:" + book_id)

    chosen_id = input("请输入书籍编号:").strip()
    first_page = fetch_gbk(play_page_url(chosen_id, 0), SEARCH_HEADERS)
    print("总章节数:" + str(count_chapters(first_page)))

    first_chapter = int(input("请输入开始下载章数"))
    last_chapter = int(input("请输入结束下载章数"))
    # Users count chapters from 1; the site's indices start at 0.
    for chapter_index in range(first_chapter - 1, last_chapter):
        chapter_title = download_chapter(chosen_id, chapter_index)
        print(chapter_title + "-----------下载成功!")


if __name__ == '__main__':
    main()