[Python Original] Paoshu8 (泡书吧) novel downloader, multi-threaded for fast downloads

pnnhnjh posted on 2024-11-3 21:41
Last edited by pnnhnjh on 2024-11-3 22:15

This is a scraper for novels on the Paoshu8 site, with multi-threaded high-speed downloading. After starting it, open the site in a browser, pick a novel you like, open that novel's table-of-contents page (the chapter index), copy its URL (e.g. http://www.paoshu8.info/224_224190/), paste it into the input prompt, and press Enter. Note: pressing Enter without typing anything starts a download of the sample novel.
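For illustration only, a run looks roughly like this (the prompt and log text come straight from the script; the timestamps, book title, chapter titles and timing shown here are placeholders, not real output):

请输入网站(http://www.paoshu8.info)内选定小说目录页所在页网址:http://www.paoshu8.info/224_224190/
<timestamp> - INFO - 正在下载小说:<book title>
<timestamp> - INFO -     正在下载:<chapter title>
...
<timestamp> - INFO - 总耗时:<seconds>秒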


[Python]
import os
import random
import time
import requests
import threading
from queue import Queue
from lxml import etree
import logging
import colorlog
from requests.adapters import HTTPAdapter

# Configure colored logging
handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter(
    '%(log_color)s%(asctime)s - %(levelname)s - %(message)s',
    log_colors={
        'DEBUG': 'cyan',
        'INFO': 'green',
        'WARNING': 'yellow',
        'ERROR': 'red',
        'CRITICAL': 'bold_red',
    }
))

logger = logging.getLogger()
logger.addHandler(handler)
logger.setLevel(logging.INFO)

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
]

# Enlarge the connection pool
session = requests.Session()
adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)  # set the pool size to 100
session.mount('http://', adapter)
session.mount('https://', adapter)

def get_chaptercontent(chapter_url, temp_file, queue, semaphore, session, max_retries=5):
    semaphore.acquire()  # acquire the semaphore
    try:
        retry_count = 0
        chaptercontent = ''
        while retry_count < max_retries:
            try:
                time.sleep(0.2)
                headers = {
                    'User-Agent': random.choice(user_agents),
                    'Accept-Language': 'en-US,en;q=0.9'
                }
                response = session.get(chapter_url, headers=headers, timeout=60)
                response.close()  # make sure the connection is closed
                if response.status_code == 429:
                    wait_time = int(response.headers.get('Retry-After', 2))  # try to read the wait time from the response headers
                    time.sleep(wait_time)
                    continue

                if response.status_code != 200:
                    if retry_count == max_retries - 1:
                        logger.error(f"未能获取章节: {chapter_url} - 状态码: {response.status_code}")
                    queue.put(None)  # mark as failed
                    return

                # Parse the response text with lxml
                html = response.content.decode(response.apparent_encoding)
                selector = etree.HTML(html)
                title = selector.xpath('//h1/text()')
                contents = selector.xpath('//div[@id="content"]/p/text()')
                for content in contents:  # join the paragraphs together
                    chaptercontent = chaptercontent + '\n  ' + str(content).strip()

                if not title or not contents:
                    if retry_count == max_retries - 1:
                        logger.error(f"未能找到章节内容: {chapter_url}")
                    queue.put(None)  # mark as failed
                    return

                title = title[0]  # assume there is only one title
                logger.info(f"\t正在下载:{title}")
                # Write the chapter title and content to a temporary file
                with open(temp_file, 'w', encoding='utf-8') as f:
                    f.write(title + '\n')
                    f.write(chaptercontent)

                queue.put(temp_file)  # put the file path into the queue
                break  # exit the loop on success
            except requests.exceptions.RequestException as e:
                retry_count += 1
                if retry_count == max_retries:
                    logger.error(f"达到最大重试次数,未能下载章节: {chapter_url} - 错误: {e}")
                    queue.put(None)  # mark as failed
                    return
                else:
                    time.sleep(5)  # wait 5 seconds before retrying

        if retry_count == max_retries:
            queue.put(None)  # mark as failed
    finally:
        semaphore.release()  # release the semaphore whether we succeeded or failed


def download_chapters(base_url):
    retry_count = 0
    max_retries = 5  # maximum number of attempts
    while retry_count < max_retries:
        try:
            response = session.get(base_url, headers={'User-Agent': random.choice(user_agents)}, timeout=60)
            response.close()  # make sure the connection is closed
            if response.status_code != 200:
                if retry_count == max_retries - 1:
                    logger.error(f"未能获取URL: {response.status_code}")
                return

            # Parse the response text with lxml
            html = response.content.decode(response.apparent_encoding)
            selector = etree.HTML(html)
            chapter_links = selector.xpath('//dd/a/@href')

            if not chapter_links:
                if retry_count == max_retries - 1:
                    logger.error("未找到章节链接。")
                return

            # Get the book name
            book_name = selector.xpath('//div[@id="info"]/h1/text()')[0]
            logger.info(f'\n正在下载小说:{book_name}\n')

            save_directory = os.path.join(os.getcwd(), 'downloads')  # "downloads" folder under the current directory
            os.makedirs(save_directory, exist_ok=True)  # create the save directory

            # Create a queue to collect results
            result_queue = Queue()

            # Define the semaphore
            semaphore = threading.BoundedSemaphore(50)  # cap the number of concurrent threads at 50

            threads = []

            # Iterate over the chapter links, skipping the "latest chapters" block so we start from chapter 1
            for index, href in enumerate(chapter_links[9:], start=1):
                chapter_url = f'http://www.paoshu8.info{href}'

                temp_file = os.path.join(save_directory, f'temp_{index:04d}.txt')

                # Create a thread to download the chapter content
                thread = threading.Thread(target=get_chaptercontent,
                                          args=(chapter_url, temp_file, result_queue, semaphore, session, max_retries))
                threads.append(thread)
                thread.start()

            # Wait for all threads to finish
            for thread in threads:
                thread.join()

            # Collect results from the queue
            temp_files = []
            while not result_queue.empty():
                temp_file = result_queue.get()
                if temp_file:  # the file was created successfully
                    temp_files.append(temp_file)

            # Sort the temporary files into chapter order
            temp_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))

            # Merge the temporary files into the main file
            append_temp_files_to_main(temp_files, save_directory, book_name)
            break  # exit the loop on success
        except requests.exceptions.RequestException as e:
            retry_count += 1
            if retry_count == max_retries:
                logger.error(f"达到最大重试次数,未能下载章节列表。 - 错误: {e}")
                return
            else:
                time.sleep(5)  # wait 5 seconds before retrying

    if retry_count == max_retries:
        logger.error(f"达到最大重试次数,未能下载章节列表。")


def append_temp_files_to_main(temp_files, save_directory, book_name):
    book_path = os.path.join(save_directory, f'{book_name}.txt')
    with open(book_path, 'w', encoding='utf-8') as main_file:
        for temp_file in temp_files:
            with open(temp_file, 'r', encoding='utf-8') as tf:
                chapter_text = tf.read().strip()
                if chapter_text:  # make sure the chapter text is not empty
                    main_file.write(chapter_text + '\n\n')
            os.remove(temp_file)  # delete the temporary file


if __name__ == "__main__":
    base_url = 'http://www.paoshu8.info'
    url = input(f"请输入网站({base_url})内选定小说目录页所在页网址:")
    max_threads = ''
    # max_threads = input(f"请输入允许同时打开的线程数(默认为100,过多的线程可能会被网站限制而出错:)")
    if url == '':
        url = 'http://www.paoshu8.info/224_224190/'
    if max_threads == '':
        max_threads = 100
    else:
        max_threads = int(max_threads)
    start_time = time.time()
    download_chapters(url)
    end_time = time.time()
    logger.info(f'\n总耗时:{end_time - start_time:.2f}秒')
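
One detail worth pointing out: the main block reads max_threads (defaulting to 100) but never passes it anywhere, so the real concurrency cap is the threading.BoundedSemaphore(50) hard-coded inside download_chapters. To make the setting take effect, you could give download_chapters a max_threads parameter and build the semaphore from it. Below is a minimal, self-contained sketch of that semaphore pattern only; fetch_chapter and the chapter labels are stand-ins of my own, not part of the original script:

import threading
import time

def fetch_chapter(label, semaphore):
    # the semaphore caps how many downloads run at the same time
    with semaphore:
        time.sleep(0.1)  # stand-in for the real HTTP request and parsing
        print(f"done: {label}")

max_threads = 100  # the value the original script reads but never uses
semaphore = threading.BoundedSemaphore(max_threads)

threads = []
for i in range(1, 301):
    t = threading.Thread(target=fetch_chapter, args=(f"chapter {i:04d}", semaphore))
    threads.append(t)
    t.start()

for t in threads:
    t.join()

An alternative with the same effect is concurrent.futures.ThreadPoolExecutor(max_workers=max_threads), which applies the same cap without creating one Thread object per chapter.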


XiaoLuoSheng posted on 2024-11-3 23:07
Thanks, OP!
lgk posted on 2024-11-13 19:16
Works well, good download speed, and the source site has a fairly complete selection. One problem though: the downloaded novels aren't quite complete; every chapter is missing a bit of text. Not sure whether that's the source site or something else.
zhaomingX posted on 2024-11-3 23:17
lufei2002 posted on 2024-11-4 00:01
Thanks, OP
moka518 posted on 2024-11-4 00:11
The code is written very clearly ❤️
159357ssy posted on 2024-11-4 00:46
Learning from this
Lichenglong1998 posted on 2024-11-4 01:30
Thanks, worth studying
xiaowuzainuli posted on 2024-11-4 03:26
Thanks for sharing, 666
zjtzjt posted on 2024-11-4 06:16
Thanks for sharing, this makes finding novels much easier
zhangxiaoxiao posted on 2024-11-4 06:36
Learned something, thanks to the OP for sharing