A forum member asked today for help downloading books from 御书网 (yushubo.net), and I got the itch to implement it myself.
Analysis of the site shows:
1. The (omitted)/list_other_xx.html page exposes the chapter list, including each chapter's title and link.
2. Requesting a chapter link yields the chapter content. Note that some chapters are split across multiple pages.
The libraries used are: requests, os, time, random, and lxml. Of these, requests and lxml are third-party packages and must be installed via pip.
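Assuming a standard Python 3 / pip setup, the install is just:

pip install requests lxml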
The full implementation is in the code below:
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# +-------------------------------------------------------------------
# | ProjectName: 御书网 (www.yushubo.net) novel downloader
# +-------------------------------------------------------------------
# | CreateTime: 2022-11-14 10:32
# +-------------------------------------------------------------------
import os
import random
import time
from urllib.parse import urljoin

import requests
from lxml import etree

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}
session = requests.session()


def request_html(url):
    """Fetch a page and return it as an lxml element tree, or None on failure."""
    res = session.get(url=url, headers=headers)
    if res.status_code == 200:
        return etree.HTML(res.text)
    print(f"Request failed: {url}")
    return None


def parse_chapter(tree):
    """Extract (chapter title, chapter link) pairs and the book title from the list page."""
    chapter_list = tree.xpath('/html/body/div[3]/div[1]/div[2]/div[2]/ul/li/span/a')
    title = tree.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/h1/text()')[0]
    chapter_data = []
    for chapter in chapter_list:
        chapter_title = chapter.xpath('./text()')[0]
        chapter_link = chapter.xpath('./@href')[0]
        chapter_data.append((chapter_title, chapter_link))
    return chapter_data, title


def parse_content(url):
    """Collect a chapter's full text, following '下一页' (next page) links for split chapters."""
    tree = request_html(url)
    book_text = '\n'.join(tree.xpath('//div[@id="BookText"]//text()'))
    next_page = tree.xpath('/html/body/div[3]/div[1]/div[4]/a[4]/text()')[0]
    while next_page == '下一页':  # the site's "next page" link text
        # urljoin resolves the href against the current page URL,
        # so both absolute and relative links work
        url = urljoin(url, tree.xpath('/html/body/div[3]/div[1]/div[4]/a[4]/@href')[0])
        tree = request_html(url)
        book_text += '\n' + '\n'.join(tree.xpath('//div[@id="BookText"]//text()'))
        next_page = tree.xpath('/html/body/div[3]/div[1]/div[4]/a[4]/text()')[0]
    return book_text


def down_file(chapter_data):
    """Download every chapter into a folder named after the book, one .txt file each."""
    path = f"{os.path.dirname(os.path.abspath(__file__))}/{chapter_data[1]}"
    if not os.path.exists(path):
        os.mkdir(path)
    for index, chapter in enumerate(chapter_data[0]):
        # zero-padded index keeps the files sorted; note that chapter titles
        # containing path separators would need extra sanitizing
        name = str(index + 1).rjust(4, '0') + '.' + chapter[0]
        book_readurl = f"https://www.yushubo.net{chapter[1]}"
        book_text = parse_content(book_readurl)
        with open(f'{path}/{name}.txt', mode='w', encoding='utf-8') as f:
            f.write(book_text)
        print(f"{name}.txt downloaded")
        time.sleep(random.random())  # random pause, up to 1 s, between chapters


if __name__ == "__main__":
    book_url = input("Enter the book URL, e.g. https://www.yushubo.net/list_other_100341.html\n")
    page_tree = request_html(book_url)
    chapter_data = parse_chapter(page_tree)
    down_file(chapter_data)
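Assuming you save the script as yushubo_download.py (any filename works), a run looks like this:

python yushubo_download.py
Enter the book URL, e.g. https://www.yushubo.net/list_other_100341.html

Chapters are then written as 0001.<chapter title>.txt, 0002.<chapter title>.txt, and so on, into a folder named after the book next to the script. The time.sleep(random.random()) call pauses up to one second between chapters so the site isn't hammered with back-to-back requests.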