# [Python] "View as plain text / Copy code" -- forum paste header, not part of the program.
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import time
# Browser-like User-Agent string sent by get_html() and get_index().
_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'

# Default HTTP request headers for the page fetches.
headers = {'User-Agent': _USER_AGENT}
def remove_span_tag(tag):
    """Return *tag* re-parsed with its ``<span ...>...span>`` runs removed.

    The tag is serialised with ``str()``; every stretch that starts at
    ``<span`` and ends at the nearest following ``span>`` is deleted
    (case-insensitively, and across newlines), and what is left is parsed
    again with the lxml parser.

    Returns:
        A new ``BeautifulSoup`` object built from the stripped markup.
    """
    span_pattern = re.compile('<span.*?span>', flags=re.S+re.I)
    markup = span_pattern.sub('', str(tag))
    return BeautifulSoup(markup, 'lxml')
def remove_strong_tag(tag):
    """Return *tag* re-parsed with its ``<strong>`` markup removed.

    Only the tags themselves are dropped; the text between them is kept.

    Args:
        tag: Any object whose ``str()`` is HTML markup (e.g. a bs4 tag).

    Returns:
        A new ``BeautifulSoup`` object built from the stripped markup.
    """
    content = str(tag)
    # The previous pattern was '<strong>|<strong/>', which never matched the
    # real closing tag '</strong>' nor an opening tag carrying attributes.
    # This pattern covers '<strong>', '<strong ...>', '</strong>' and
    # '<strong/>'.
    treated_content = re.sub(r'</?strong\b[^>]*>', '', content, flags=re.S | re.I)
    return BeautifulSoup(treated_content, 'lxml')
def remove2next1(string):
    """Replace each pair of consecutive newlines in *string* with one newline.

    Pairs are consumed left to right without overlap, so a run of three
    newlines becomes two and a run of four becomes two.
    """
    return string.replace('\n\n', '\n')
def change_br2next(tag):
    """Return *tag* re-parsed with every ``<br/>`` turned into a newline.

    The tag is serialised with ``str()``, each ``<br/>`` (matched
    case-insensitively) is replaced by ``'\\n'``, and the result is parsed
    again with the lxml parser.

    Returns:
        A new ``BeautifulSoup`` object built from the converted markup.
    """
    markup = re.sub('<br/>', '\n', str(tag), flags=re.S+re.I)
    return BeautifulSoup(markup, 'lxml')
def get_html(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one a stalled server would block the script forever.

    Returns:
        The response text, or ``None`` when the HTTP status is not 200.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Callers must check for None before parsing.
        return None
    # Force UTF-8 instead of relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    return response.text
def parse_audio_text(html):
    """Extract the title, audio URL and transcript text from a lesson page.

    Args:
        html: Markup of a detail page.

    Returns:
        A ``(title, audio, result_text)`` tuple: the ``.string`` of the first
        ``div.f-title``, the ``src`` of the ``<source>`` inside
        ``#show_mp3 > audio``, and the text of every ``<p>`` in the content
        container after span/strong removal, ``<br/>`` conversion and
        double-newline replacement.

    Raises:
        IndexError: When one of the selectors matches nothing.
    """
    page = BeautifulSoup(html, 'lxml')
    title = page.select('div.f-title')[0].string
    audio = page.select('#show_mp3 > audio')[0].source['src']
    body = page.select('#content > div > div.infoMain > div.f-y.w.hauto')[0]

    # Clean each paragraph on its own, then glue the plain-text pieces.
    pieces = [
        change_br2next(remove_strong_tag(remove_span_tag(paragraph))).get_text()
        for paragraph in body.select('p')
    ]
    return title, audio, remove2next1(''.join(pieces))
def parse_index(html):
    """Return the absolute detail-page URLs listed on an index page.

    For every element matching ``.listItem`` the ``href`` of its first
    ``<a>`` is taken and prefixed with ``http://m.kekenet.com``.

    Args:
        html: Markup of an index page.

    Returns:
        A list of URL strings in document order.
    """
    page = BeautifulSoup(html, 'lxml')
    return [
        'http://m.kekenet.com' + item.select('a')[0]['href']
        for item in page.select('.listItem')
    ]
def get_index(url):
    """Download an index page and return its body decoded as UTF-8.

    This used to be a line-for-line copy of ``get_html``; it now delegates
    to it so the fetch logic lives in one place.

    Args:
        url: Address of the index page to fetch.

    Returns:
        The response text, or ``None`` when the HTTP status is not 200.
    """
    return get_html(url)
def save_text(title, content):
    """Append *content* to the UTF-8 text file ``<title>.txt``.

    Args:
        title: File name without extension; ``'.txt'`` is appended.
        content: Text to write.

    The file is opened in append mode, so repeated calls with the same
    title accumulate text rather than overwrite it.
    """
    # The ``with`` block closes the file; the explicit close() was redundant.
    with open(title + '.txt', 'a', encoding='utf-8') as f:
        f.write(content)
def downloadFILE(url, name, timeout=30):
    """Stream *url* to the local file *name*, showing a tqdm progress bar.

    Args:
        url: Address of the file to download.
        name: Destination path; also used as the progress-bar label.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        None. When the server does not answer with status 200 a message is
        printed and nothing is written, mirroring how ``get_html`` treats
        non-200 responses.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    chunk_bytes = 1024  # one progress tick per KiB, matching unit='k'

    # Using the response as a context manager releases the connection even
    # if writing fails part-way through.
    with requests.get(url=url, stream=True, headers=headers, timeout=timeout) as resp:
        if resp.status_code != 200:
            # Do not save an error page under the target name.
            print(name, "download failed, HTTP status:", resp.status_code)
            return

        # Content-Length is optional (e.g. chunked responses); without it
        # tqdm runs with an unknown total instead of raising KeyError.
        length = resp.headers.get('Content-Length')
        if length is not None and length.strip().isdigit():
            # Round up so a final partial chunk is counted in the total.
            content_size = -(-int(length) // chunk_bytes)
        else:
            content_size = None

        with open(name, "wb") as f:
            shown_size = content_size if content_size is not None else 'unknown'
            print("Pkg total size is:", shown_size, 'k,start...')
            for data in tqdm(iterable=resp.iter_content(chunk_bytes), total=content_size, unit='k', desc=name):
                f.write(data)
    print(name, "download finished!")
def main():
    """Crawl listing pages List_1 .. List_23 and save each lesson.

    For every detail link found on a listing page the transcript is appended
    to ``<NNN>.txt`` and the audio is downloaded to ``<NNN>.mp3``, where
    ``NNN`` is the text between '第' and '期' in the page title, zero-padded
    to three characters.
    """
    for i in range(1, 24):
        url = 'http://m.kekenet.com/menu/14439/List_{}.shtml'.format(str(i))
        html = get_index(url)
        if html is None:
            # get_index returns None on a non-200 answer; parsing None
            # would raise inside BeautifulSoup.
            print('list', i, 'could not be fetched, skipping')
            continue
        srcs = parse_index(html)
        print('list', i)
        for src in srcs:
            detail_html = get_html(src)
            if detail_html is None:
                print('detail page could not be fetched, skipping:', src)
                continue
            title, audio, result_text = parse_audio_text(detail_html)
            # ``title`` may be None when the title element has no single
            # string child; search an empty string in that case.
            match = re.search('第(.*?)期', title or '', re.S)
            if not match:
                # Without an issue number there is no file name to use.
                print('no issue number in title, skipping:', src)
                continue
            title = match.group(1).zfill(3)
            print(audio)
            print(result_text)
            save_text(title, result_text)
            downloadFILE(audio, title + '.mp3')


if __name__ == "__main__":
    main()
# Original author's note: link 24 is http://m.kekenet.com/menu/14439/index.shtml
# (it is not covered by the List_N loop above, which stops at List_23).