本帖最后由 pnnhnjh 于 2024-8-12 17:15 编辑
网站小说爬取程序,多线程极速下载。运行后打开网站,选取你喜欢的小说,打开该小说的目录页,复制网址(如:https://www.88xiaoshuo.net/Partlist/291840/),粘贴到输入提示窗口后回车即可。注:不输入任何内容直接回车,则开始下载示例小说!
8月12日修复了原来章节内容应该分段而不分段的问题!!
[Python] 纯文本查看 复制代码
import os
import re
import time
import requests
import threading
from queue import Queue
from lxml import etree
# Default HTTP headers passed to every requests.get() call in this script.
# The User-Agent value presents the client as a desktop browser on Windows 10.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'
    ),
}
def get_chaptercontent(chapter_url, temp_file, queue, max_retries=5):
    """Download one chapter page and save its title and text to ``temp_file``.

    The outcome is reported through ``queue``: the ``temp_file`` path is put
    on success and ``None`` is put on every handled failure (network errors
    after all retries, a non-200 status, a page without title or paragraphs,
    or a failure to write the temporary file).

    Args:
        chapter_url: Absolute URL of the chapter page.
        temp_file: Path of the temporary file that receives the chapter.
        queue: Queue-like object (needs ``put``) used to report the outcome.
        max_retries: Maximum number of request attempts; 5 seconds are
            waited between attempts. Only request exceptions are retried.
    """
    # Only the network call can raise RequestException, so only it is retried.
    response = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(chapter_url, headers=headers, timeout=60)
            break
        except requests.exceptions.RequestException:
            if attempt < max_retries:
                time.sleep(5)  # wait before the next attempt

    if response is None:
        print(f"达到最大重试次数,未能下载章节: {chapter_url}")
        queue.put(None)  # report failure
        return

    if response.status_code != 200:
        print(f"未能获取章节: {chapter_url} - 状态码: {response.status_code}")
        queue.put(None)  # report failure
        return

    # Encoding detection may return None, and it often reports GB2312 for
    # pages that are really GBK. GB18030 is a superset of both, and
    # errors='replace' keeps a stray byte from killing the worker thread.
    encoding = response.apparent_encoding or 'utf-8'
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    try:
        html = response.content.decode(encoding, errors='replace')
    except LookupError:
        # The detected codec name is unknown to Python.
        html = response.content.decode('utf-8', errors='replace')

    selector = etree.HTML(html)
    if selector is None:
        title, contents = [], []
    else:
        title = selector.xpath('//h1/text()')
        contents = selector.xpath('//div[@id="content"]/p/text()')

    if not title or not contents:
        print(f"未能找到章节内容: {chapter_url}")
        queue.put(None)  # report failure
        return

    title = title[0]  # the first <h1> is taken as the chapter title
    # Each paragraph goes on its own line, prefixed exactly as before.
    chaptercontent = ''.join('\n ' + str(content).strip() for content in contents)
    print(f"\t正在下载:{title}")

    try:
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.write(title + '\n' + chaptercontent)
    except OSError as error:
        print(f"无法写入临时文件: {temp_file} - {error}")
        queue.put(None)  # report failure
        return

    queue.put(temp_file)  # report success with the file path
def _fetch_index_html(url, max_retries=5):
    """Return the decoded HTML of ``url``, or ``None`` after printing the reason."""
    response = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=60)
            break
        except requests.exceptions.RequestException:
            if attempt < max_retries:
                time.sleep(5)  # wait before the next attempt

    if response is None:
        print("达到最大重试次数,未能下载章节列表。")
        return None
    if response.status_code != 200:
        print(f"未能获取URL: {response.status_code}")
        return None

    # Encoding detection may return None, and it often reports GB2312 for
    # pages that are really GBK. GB18030 is a superset of both.
    encoding = response.apparent_encoding or 'utf-8'
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    try:
        return response.content.decode(encoding, errors='replace')
    except LookupError:
        return response.content.decode('utf-8', errors='replace')


def download_chapters(base_url, skip_latest=12, max_workers=32):
    """Download every chapter listed on a novel's index page and merge them.

    The index page at ``base_url`` is fetched (up to 5 attempts), the chapter
    links are read from ``//dd/a/@href``, each chapter is downloaded by
    ``get_chaptercontent`` into ``downloads/temp_NNNN.txt`` under the current
    working directory, and the successful chapters are merged in index-page
    order by ``append_temp_files_to_main``.

    Args:
        base_url: URL of the novel's chapter-list page. Chapter hrefs are
            resolved against it.
        skip_latest: Number of leading links to drop; the index page repeats
            the newest chapters before the real chapter list.
        max_workers: Upper bound on concurrent chapter downloads.
    """
    from concurrent.futures import ThreadPoolExecutor
    from urllib.parse import urljoin

    max_retries = 5
    html = _fetch_index_html(base_url, max_retries)
    if html is None:
        return

    selector = etree.HTML(html)
    chapter_links = selector.xpath('//dd/a/@href') if selector is not None else []
    chapter_links = chapter_links[skip_latest:]
    if not chapter_links:
        print("未找到章节链接。")
        return

    # A missing title must not abort the download, so fall back to a placeholder.
    names = selector.xpath('//div[@id="info"]/a/h1/text()')
    book_name = names[0].strip() if names and names[0].strip() else '未命名小说'
    print(f'\n正在下载小说:{book_name}\n')

    save_directory = os.path.join(os.getcwd(), 'downloads')
    os.makedirs(save_directory, exist_ok=True)

    # One (url, temp file) pair per chapter, in index-page order.
    jobs = [
        (urljoin(base_url, href),
         os.path.join(save_directory, f'temp_{index:04d}.txt'))
        for index, href in enumerate(chapter_links, start=1)
    ]

    result_queue = Queue()
    workers = max(1, min(max_workers, len(jobs)))
    # A bounded pool avoids starting one OS thread per chapter.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [
            pool.submit(get_chaptercontent, chapter_url, temp_file,
                        result_queue, max_retries)
            for chapter_url, temp_file in jobs
        ]

    # Leaving the ``with`` block waits for all workers. Print any unexpected
    # exception, which the pool would otherwise keep silently in the future.
    for (chapter_url, _), future in zip(jobs, futures):
        error = future.exception()
        if error is not None:
            print(f"章节下载出错: {chapter_url} - {error!r}")

    # Workers report a file path on success and None on failure.
    finished = set()
    while not result_queue.empty():
        reported = result_queue.get()
        if reported:
            finished.add(reported)

    # Walking ``jobs`` keeps chapter order without parsing file names.
    temp_files = [temp_file for _, temp_file in jobs if temp_file in finished]

    failed = len(jobs) - len(temp_files)
    if failed:
        print(f"\n有 {failed} 个章节下载失败,未写入合并文件。")

    if temp_files:
        append_temp_files_to_main(temp_files, save_directory, book_name)
def append_temp_files_to_main(temp_files, save_directory, book_name):
    """Merge per-chapter temp files into ``<book_name>.txt`` and delete them.

    Each temp file is read, stripped of surrounding whitespace and, when not
    empty, appended to the book file followed by a blank line. Every temp
    file that was read is removed afterwards, whether or not it had content.

    Args:
        temp_files: Temp file paths, already in the desired chapter order.
        save_directory: Directory in which the merged book file is created.
        book_name: Book title used as the file name. Characters that are not
            valid in file names are replaced with ``_``.
    """
    # A title containing a path separator or other reserved character would
    # make open() fail or write outside save_directory.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', str(book_name)).strip(' .')
    if not safe_name:
        safe_name = 'book'
    book_path = os.path.join(save_directory, f'{safe_name}.txt')

    with open(book_path, 'w', encoding='utf-8') as main_file:
        for temp_file in temp_files:
            try:
                with open(temp_file, 'r', encoding='utf-8') as tf:
                    chapter_text = tf.read().strip()
            except FileNotFoundError:
                # One missing chapter must not abort the rest of the merge.
                print(f"临时文件不存在,已跳过: {temp_file}")
                continue
            if chapter_text:  # skip chapters that turned out empty
                main_file.write(chapter_text + '\n\n')
            os.remove(temp_file)  # remove only after the file is closed
if __name__ == "__main__":
    # Script entry point: ask for a novel's index-page URL, download the
    # whole book and print the elapsed time.
    base_url = 'https://www.88xiaoshuo.net/'
    url = input(f"请输入网站({base_url})内选定小说目录页所在页网址:")
    # Pasted URLs often carry stray whitespace or quotation marks. Without
    # this cleanup they would also defeat the empty-input check below.
    url = url.strip().strip('"\'“”').strip()
    if not url:
        # Nothing entered: download the sample novel.
        url = 'https://www.88xiaoshuo.net/Partlist/317894/'
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    download_chapters(url)
    end_time = time.perf_counter()
    print(f'\n总耗时:{end_time - start_time:.2f}秒')