It runs fine; it's when I add comments that mistakes tend to creep in.
import os
import time

import requests
from lxml import etree
from fake_useragent import UserAgent


def get_html(url):
    # Request the page with a random User-Agent and return the response
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    return response


def be_tree(url):
    # Fetch a page and parse it into an lxml element tree
    r = get_html(url)
    tree = etree.HTML(r.text)
    return tree


def get_mulu_lists(mulu_url):
    # Parse the table-of-contents page: book title, chapter titles, chapter URLs
    tree = be_tree(mulu_url)
    novel_name = tree.xpath('//h1/span[1]/b/text()')[0]
    cha_urls = tree.xpath('//ul/span/a/@href')
    titles = tree.xpath('//ul/span/a/text()')
    return novel_name, titles, cha_urls


def down_onechapter(novel_name, down_url):
    # Download one chapter and append its paragraphs to the novel's txt file
    tree = be_tree(down_url)
    datas = tree.xpath('//div[1]/div/p/text()')
    for data in datas:
        with open(f'./{novel_name}.txt', 'a', encoding='utf-8') as f:
            f.write(data)
    # Write two newlines so chapters are separated in the output file
    with open(f'./{novel_name}.txt', 'a', encoding='utf-8') as f:
        f.write('\n')
        f.write('\n')
    print('Chapter downloaded')


if __name__ == '__main__':
    start = time.time()
    # Table of contents for Journey to the West; replace the link for other books
    url = 'https://so.gushiwen.cn/guwen/book_46653FD803893E4FBF8761BEF60CD7D9.aspx'
    base_url = url.split('/guwen')[0]
    novel_name, titles, cha_urls = get_mulu_lists(url)
    for title, cha_url in zip(titles, cha_urls):
        dow_url = base_url + cha_url
        print(title, dow_url)
        # Write the chapter title before its content
        with open(f'./{novel_name}.txt', 'a', encoding='utf-8') as f:
            f.write(title)
            f.write('\n')
        down_onechapter(novel_name, dow_url)
    print('Whole book downloaded')
    end = time.time()
    use_time = int(end) - int(start)
    print(f'Download took {use_time} seconds')
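For reference, here is a minimal sketch of a slightly more defensive fetch helper. It is not part of the original script: get_html_safe is a hypothetical name, and the timeout, retry count, and delay are arbitrary illustration values. It adds a request timeout, retries failed requests a couple of times, and pauses between attempts so the site is not hit too quickly.

import time
import requests
from fake_useragent import UserAgent

def get_html_safe(url, retries=3, delay=1.0):
    # Hypothetical variant of get_html above: adds a timeout and simple retries
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response
        except requests.RequestException:
            # Give up after the last attempt, otherwise back off briefly and retry
            if attempt == retries - 1:
                raise
            time.sleep(delay)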
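The script also reopens the txt file once per paragraph, which works but is slow for long chapters. Below is a sketch of the same chapter writer that opens the file only once per chapter; it reuses be_tree from the script above and assumes the same XPath still matches the page layout.

def down_onechapter_once(novel_name, down_url):
    # Same logic as down_onechapter, but the file is opened once per chapter
    tree = be_tree(down_url)
    datas = tree.xpath('//div[1]/div/p/text()')
    with open(f'./{novel_name}.txt', 'a', encoding='utf-8') as f:
        for data in datas:
            f.write(data)
        # Two newlines to separate chapters, as in the original
        f.write('\n')
        f.write('\n')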