
[Python Original] (Updated 3.22: GUI source and exe downloads added) 新笔趣阁 novel scraper

pnnhnjh posted on 2025-3-1 22:26
Last edited by pnnhnjh on 2025-3-22 21:20

The scraper for the 弟子小说 site no longer works, so this version downloads from 新笔趣阁 (https://www.xbqg06.com) instead. You can configure up to 100 download threads, but that is not recommended: you risk getting your IP banned! After starting the program, open the site, pick a novel you like, open its table-of-contents page, copy the URL (e.g. https://www.xbqg06.com/373303/), paste it into the input prompt, and press Enter. Note: pressing Enter without typing anything starts the example novel download.




(Updated 3.22: GUI source code and executable downloads added)


The code below is really a template: anyone with a little Python can scrape other sites just by editing the following lines (see the hypothetical example after them):
default_url = 'https://www.xbqg06.com/373303/'  # first page of the novel's table of contents

book_name_xpath = '//h1/text()'  # book title
chapter_links_xpath = '(//ul[@class="section-list fix"])[2]/li/a/@href'  # chapter links on the TOC page
chapter_links_start_number = 0  # index of the first chapter link to keep
title_elements_xpath = '//h1/text()'  # chapter title on a content page
contents_xpath = '//div[@id="content"]/p/text()'  # chapter body on a content page

directory_pages_xpath = '//option'  # links to the TOC pages; set to '' if the TOC is not paginated
current_page_option_xpath = '//option[@selected="selected"]'  # currently selected TOC page; set to '' if not paginated
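
For example, adapting the template to another site only means swapping these values. A minimal sketch: the site, URL, and every xpath below are hypothetical placeholders for illustration, not a tested configuration.

# Hypothetical example: every value below is a placeholder, adjust to the real site
default_url = 'https://www.example-novels.com/book/12345/'  # that site's first TOC page

book_name_xpath = '//div[@class="book-info"]/h1/text()'  # wherever the title lives
chapter_links_xpath = '//ul[@id="chapter-list"]/li/a/@href'  # that site's chapter links
chapter_links_start_number = 12  # e.g. skip 12 "latest chapter" links repeated at the top
title_elements_xpath = '//h1[@class="chapter-title"]/text()'
contents_xpath = '//div[@class="read-content"]/p/text()'

directory_pages_xpath = ''  # single-page TOC: leave both empty to skip pagination handling
current_page_option_xpath = ''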


[Python] Full source:
import os
import re
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
from lxml import etree
from requests.adapters import HTTPAdapter
import chardet
import threading
 
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
]
 
MAX_RETRIES = 10  # maximum number of retries per request
TIMEOUT = 5  # request timeout in seconds
 
 
def get_random_user_agent():
    """Pick a User-Agent from the pool, rotating on the current timestamp."""
    return USER_AGENTS[int(time.time()) % len(USER_AGENTS)]
 
 
thread_local = threading.local()  # module-level, so each thread keeps exactly one Session


def get_session():
    """Return this thread's Session, creating it with pooling and retries on first use."""
    if not hasattr(thread_local, "session"):
        thread_local.session = requests.Session()
        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=MAX_RETRIES)
        thread_local.session.mount('http://', adapter)
        thread_local.session.mount('https://', adapter)
    return thread_local.session
 
 
def decode_content(response):
    """Decode the response body: try the detected encoding first, then common Chinese encodings."""
    detected = chardet.detect(response.content)
    encodings = ['utf-8', 'gbk', 'gb2312', 'big5', 'gb18030']

    if detected['confidence'] >= 0.7:
        try:
            return response.content.decode(detected['encoding'], errors='strict')
        except UnicodeDecodeError:
            pass

    for enc in encodings:
        try:
            return response.content.decode(enc, errors='strict')
        except UnicodeDecodeError:
            continue

    return response.content.decode(detected['encoding'], errors='replace')
 
 
def fetch_url(url, headers):
    """GET a URL with retries; re-raises the last exception if all attempts fail."""
    session = get_session()
    for attempt in range(MAX_RETRIES):
        try:
            response = session.get(url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()  # raise on non-2xx HTTP status
            return response
        except requests.exceptions.RequestException as e:
            if attempt == MAX_RETRIES - 1:
                raise e
            time.sleep(1)  # back off briefly before retrying
 
 
def get_chaptercontent(chapter_url, index):
    """Fetch one chapter, following in-chapter next-page links until the chapter ends."""
    headers = {
        'User-Agent': get_random_user_agent(),
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Referer': chapter_url
    }

    all_content = []
    title = ""
    while chapter_url:
        try:
            response = fetch_url(chapter_url, headers)
            html = decode_content(response)
            selector = etree.HTML(html)

            # Extract the chapter title and body paragraphs
            title_elements = selector.xpath(title_elements_xpath)
            contents = selector.xpath(contents_xpath)
            all_content.extend([content.strip() for content in contents if content.strip()])
            title = title_elements[0].strip() if title_elements else ""

            # Strip any "X / X" page counter from the title
            title = re.sub(r'(\s*\d+\s*/\s*\d+\s*)', '', title).strip()

            # Follow the in-chapter "下一页" (next page) link, if present
            next_page = selector.xpath('//a[contains(text(), "下一页")]/@href')
            if next_page and next_page[0] != "javascript:":
                chapter_url = urljoin(chapter_url, next_page[0])
            else:
                chapter_url = None

        except Exception as e:
            print(f"Error while fetching chapter {title}: {e}")
            break

    if not title or not all_content:
        print(f"Chapter {index} failed to download")
        return (index, None, "")

    chaptercontent = "\n  ".join(all_content)

    # Optional data cleaning (site-specific ad/header removal; enable as needed)
    # chaptercontent = re.sub(r'一秒记住\s*.*?\s*无弹窗免费阅读!', '', chaptercontent, flags=re.S)
    # chaptercontent = re.sub(r'\(https.*?html\)', '', chaptercontent, flags=re.S)
    # chaptercontent = re.sub(r'[\s ]{0,6}第.{1,10}[部分章节卷页]{1,2}.{0,30}[\s \n]{0,6}', '', chaptercontent)
    # chaptercontent = re.sub(r'[\s ]{0,6}\d{1,5}.{1,30}[\s \n]{0,6}', '', chaptercontent)
    # chaptercontent = re.sub(r'[  ]{1,}', '', chaptercontent)

    return (index, title, chaptercontent.strip())
 
 
def download_chapters(base_url, max_threads):
    """Download every chapter of the novel and write them, in order, to one text file."""
    headers = {'User-Agent': get_random_user_agent()}
    all_chapter_links = []
    book_name = None  # initialize book_name
    first_directory_page = True  # flag: is this the first table-of-contents page?

    while base_url:
        try:
            response = fetch_url(base_url, headers)
            html = decode_content(response)
            selector = etree.HTML(html)

            if first_directory_page:
                book_name = selector.xpath(book_name_xpath)[0].strip()
                print(f'\nDownloading novel: 《{book_name}》\n')
                first_directory_page = False

            # Collect chapter links from this TOC page
            chapter_links = selector.xpath(chapter_links_xpath)[chapter_links_start_number:]
            all_chapter_links.extend(urljoin(base_url, href) for href in chapter_links)

            # Collect links to all TOC pages, if the site paginates its table of contents
            if directory_pages_xpath and current_page_option_xpath:
                directory_pages = [(urljoin(base_url, option.attrib['value']), option.text) for option in
                                   selector.xpath(directory_pages_xpath)]

                # Find the currently selected TOC page
                current_page_option = selector.xpath(current_page_option_xpath)
                if current_page_option:
                    current_page_value = urljoin(base_url, current_page_option[0].attrib['value'])
                    current_page_text = current_page_option[0].text
                    print(f'Current TOC page: {current_page_text}')

                    # Advance to the next TOC page, if there is one
                    current_page_index = [page[0] for page in directory_pages].index(current_page_value)
                    if current_page_index + 1 < len(directory_pages):
                        base_url = directory_pages[current_page_index + 1][0]
                    else:
                        base_url = None
                else:
                    print("Selected TOC page not found; stopping.")
                    break
            else:
                # TOC pagination xpaths are empty, so assume a single TOC page
                break

        except Exception as e:
            print(f"Error while fetching the table of contents: {e}")
            break

    if not book_name:
        print("Could not read the book name; check the URL and the page structure.")
        return False

    save_dir = os.path.join(os.getcwd(), '我的小说')
    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f'{book_name}.txt')

    chapters = []
    failed_chapters = []

    def write_to_file():
        chapters.sort(key=lambda x: x[0])  # restore chapter order after concurrent fetching
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                # input_url is the module-level variable set in __main__
                f.write(f'\n\n书名:{book_name}\n\n网址:{input_url}\n\n\n')
                for idx, title, content in chapters:
                    f.write(f"{title}\n\n{content}\n\n")

            if failed_chapters:
                print(f"\nThese chapters failed to download: {failed_chapters}")

            print(f'\n《{book_name}》download finished')
            return True
        except Exception as e:
            print(f"Error while writing the file: {e}")
            return False

    success = True
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # Map each future back to its chapter index so failures are reported accurately
        future_to_index = {executor.submit(get_chaptercontent, link, idx): idx
                           for idx, link in enumerate(all_chapter_links, 1)}

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                index, title, content = future.result()
                if title and content:
                    chapters.append((index, title, content))
                    print(f"Finished chapter: {title}")
                else:
                    failed_chapters.append(index)
            except Exception as e:
                print(f"Error while processing a chapter: {e}")
                failed_chapters.append(index)
                success = False

    if not write_to_file():
        success = False

    return success
 
 
if __name__ == "__main__":
    default_url = 'https://www.xbqg06.com/373303/'  # first page of the novel's table of contents

    book_name_xpath = '//h1/text()'  # book title
    chapter_links_xpath = '(//ul[@class="section-list fix"])[2]/li/a/@href'  # chapter links on the TOC page
    chapter_links_start_number = 0  # index of the first chapter link to keep
    title_elements_xpath = '//h1/text()'  # chapter title on a content page
    contents_xpath = '//div[@id="content"]/p/text()'  # chapter body on a content page

    directory_pages_xpath = '//option'  # links to the TOC pages; set to '' if the TOC is not paginated
    current_page_option_xpath = '//option[@selected="selected"]'  # currently selected TOC page; set to '' if not paginated

    input_url = input(f"Enter the novel's TOC page URL (default {default_url}): ") or default_url

    while True:
        threads_input = input("Number of worker threads (1-100, default 20): ") or "20"
        if threads_input.isdigit() and 1 <= int(threads_input) <= 100:
            max_threads = int(threads_input)
            break
        print("Invalid input; enter an integer between 1 and 100")

    start_time = time.time()
    success = download_chapters(base_url=input_url, max_threads=max_threads)

    elapsed = time.time() - start_time

    if success:
        print(f"Total time: {elapsed:.2f} s")
    else:
        print("Errors occurred during the download")
    input('Done. The novel is saved in the "我的小说" folder. Press Enter to exit.')


Compiled command-line version download:
Link: https://pan.baidu.com/s/1B00FRJS8yv4SNWRO9tvDEg
Extraction code: 52pj


[Screenshot: 主界面.jpg, the main window of the GUI version]

Attachment: 编译后图形界面升级版链接.7z (link to the compiled, upgraded GUI version)

217 Bytes, 62 downloads, costs 1 吾爱币 (CB)


Ghang posted on 2025-3-3 08:08
wyesheng posted on 2025-3-3 00:10
Sigh, my programming fundamentals are too weak; sometimes I want to build something and just can't pull it off...

Same here: I started from zero and can now write a few simple things. The key is to let interest lead the way. I like manga and novels, so I started out writing novel scrapers for sites I read; whenever I got stuck I searched online or asked an AI. I never managed systematic study, but I picked up plenty of code logic and fundamentals.
zhubaohong02 posted on 2025-4-2 23:49
Last edited by zhubaohong02 on 2025-4-4 22:50

Many thanks!
On www.xbqg06.com the table of contents is paginated, so the download comes out incomplete.
For a site like www.biqvdu.com/book/21610/ with an extra /book/ path segment, what should I change to download from it?
yhzh posted on 2025-3-1 22:41
sanrokalv posted on 2025-3-1 23:46
A whole new angle on this...
wyesheng posted on 2025-3-2 00:03
Python programs seem to be all the rage lately.
Ghang posted on 2025-3-2 00:18
wyesheng posted on 2025-3-2 00:03
Python programs seem to be all the rage lately.

Simple and handy.
Mwowom posted on 2025-3-2 00:42
Code this good makes my rooster spin.
caogd posted on 2025-3-2 07:18
Downloading to keep on hand. Thanks for sharing!
1921688998 posted on 2025-3-2 09:21
I can't figure out how to use it.
Doublevv posted on 2025-3-2 09:50
Too bad the 弟子小说 site won't open anymore.
yuzilin posted on 2025-3-2 11:05
Thanks for sharing; good material to learn from.