A forum member asked today for help downloading books from 御书网 (yushubo.net), and I got the itch to implement it myself.
Analysis of the site shows:
1. The (omitted)/list_other_xx.html page exposes the chapter list, including each chapter's title and link.
2. Requesting a chapter link yields the chapter content. Note that some chapters are split across multiple pages.
The libraries used are: requests, os, time, random, and lxml. Of these, requests and lxml are third-party packages and must be installed via pip.
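Assuming a standard Python 3 / pip setup, the install is just:

pip install requests lxml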
The full implementation is in the code below:
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# +-------------------------------------------------------------------
# | ProjectName: 御书网 (www.yushubo.net) novel downloader
# +-------------------------------------------------------------------
# | CreateTime: 2022-11-14 10:32
# +-------------------------------------------------------------------
import os
import random
import time
from urllib.parse import urljoin

import requests
from lxml import etree

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}
session = requests.session()


def request_html(url):
    """Fetch a page and return it as an lxml element tree, or None on failure."""
    res = session.get(url=url, headers=headers)
    if res.status_code == 200:
        return etree.HTML(res.text)
    print(f"Request failed: {url}")
    return None


def parse_chapter(tree):
    """Extract (chapter title, chapter link) pairs and the book title from the list page."""
    chapter_list = tree.xpath('/html/body/div[3]/div[1]/div[2]/div[2]/ul/li/span/a')
    title = tree.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/h1/text()')[0]
    chapter_data = []
    for chapter in chapter_list:
        chapter_title = chapter.xpath('./text()')[0]
        chapter_link = chapter.xpath('./@href')[0]
        chapter_data.append((chapter_title, chapter_link))
    return chapter_data, title


def parse_content(url):
    """Collect a chapter's full text, following '下一页' (next page) links for split chapters."""
    tree = request_html(url)
    book_text = '\n'.join(tree.xpath('//div[@id="BookText"]//text()'))
    next_page = tree.xpath('/html/body/div[3]/div[1]/div[4]/a[4]/text()')[0]
    while next_page == '下一页':  # the site's "next page" link text
        # urljoin resolves the href against the current page URL,
        # so both absolute and relative links work
        url = urljoin(url, tree.xpath('/html/body/div[3]/div[1]/div[4]/a[4]/@href')[0])
        tree = request_html(url)
        book_text += '\n' + '\n'.join(tree.xpath('//div[@id="BookText"]//text()'))
        next_page = tree.xpath('/html/body/div[3]/div[1]/div[4]/a[4]/text()')[0]
    return book_text


def down_file(chapter_data):
    """Download every chapter into a folder named after the book, one .txt file each."""
    path = f"{os.path.dirname(os.path.abspath(__file__))}/{chapter_data[1]}"
    if not os.path.exists(path):
        os.mkdir(path)
    for index, chapter in enumerate(chapter_data[0]):
        # zero-padded index keeps the files sorted; note that chapter titles
        # containing path separators would need extra sanitizing
        name = str(index + 1).rjust(4, '0') + '.' + chapter[0]
        book_readurl = f"https://www.yushubo.net{chapter[1]}"
        book_text = parse_content(book_readurl)
        with open(f'{path}/{name}.txt', mode='w', encoding='utf-8') as f:
            f.write(book_text)
        print(f"{name}.txt downloaded")
        time.sleep(random.random())  # random pause, up to 1 s, between chapters


if __name__ == "__main__":
    book_url = input("Enter the book URL, e.g. https://www.yushubo.net/list_other_100341.html\n")
    page_tree = request_html(book_url)
    chapter_data = parse_chapter(page_tree)
    down_file(chapter_data)
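Assuming you save the script as yushubo_download.py (any filename works), a run looks like this:

python yushubo_download.py
Enter the book URL, e.g. https://www.yushubo.net/list_other_100341.html

Chapters are then written as 0001.<chapter title>.txt, 0002.<chapter title>.txt, and so on, into a folder named after the book next to the script. The time.sleep(random.random()) call pauses up to one second between chapters so the site isn't hammered with back-to-back requests.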