# Python crawler for the 读万卷 (duwanjuan) novel site.
# Posted by Mly2580 on 2024-03-21; inspired by a 52pojie biquge crawler
# (original thread: https://www.52pojie.cn/thread-1894044-1-1.html, thanks @chenmuting).
# A packaged 读万卷.exe is shared via Quark pan:
# https://pan.quark.cn/s/df492af360ed  (extraction code: xZAm)
from selenium import webdriver
import requests, re, os, time, shutil, threading, queue
from lxml import etree
import pandas as pd
import random
def get_user_agent():
    """Return the pool of User-Agent strings used for request-header rotation."""
    return [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]
# Disabled: free-proxy rotation (the proxies below are likely stale).
# NOTE: the original paste closed this string with "/'''", a syntax error;
# fixed to a proper triple-quoted terminator.
'''
def get_proxy():
    proxy = [
        'http://182.140.244.163:8118',
        'http://113.124.86.180:9999',
        'http://117.64.237.42:9999',
        'http://182.34.102.48:9999',
        'http://183.236.123.242:8060',
        'http://27.192.203.80:9000',
        'http://114.231.8.242:8888',
        'http://36.134.91.82:8888',
        'http://222.132.57.105:9000',
        'http://61.216.156.222:60808',
        'http://182.34.20.110:9999',
        'http://60.205.132.71:80',
    ]
    return proxy
'''
# Shared request headers with a User-Agent picked once per run.
headers = {
    'user-agent': random.choice(get_user_agent()),
}
# Disabled: per-run random proxy selection (needs get_proxy() above).
# NOTE: the original paste closed this string with "/'''", a syntax error; fixed.
'''
proxy = {
    'http': random.choice(get_proxy()),
}
'''
def extract_link_suffix(url):
    """Return the part of *url* after its last '/' (the whole string if no '/').

    The paste had corrupted the slice to ``url1:]``; reconstructed as
    ``url[last_slash_index + 1:]``.
    """
    # Locate the last slash; rfind returns -1 when absent.
    last_slash_index = url.rfind('/')
    if last_slash_index != -1:
        # Everything after the slash is the suffix.
        return url[last_slash_index + 1:]
    # No slash: return the URL unchanged (rare, but safe).
    return url
# Search for a novel and let the user pick which one to download.
def search_novel():
    """Interactively search duwanjuan and return the chosen novel's page URL.

    Opens a headless Chrome, submits the search query typed by the user,
    scrapes up to 10 results, prints them as a table and returns the link
    selected by its 1-based index.

    Reconstructed from a corrupted paste: the comprehensions on the
    link-suffix and numbering lines, the author XPath and the final index
    expression were mangled. Also fixed: the empty-result retry now
    *returns* the recursive call instead of discarding it (the original
    returned None after a retry).
    """
    chrome_options = webdriver.ChromeOptions()
    # Run the browser headless (no visible window).
    chrome_options.add_argument('--headless')
    print('浏览器已打开')
    browser = webdriver.Chrome(options=chrome_options)
    name_input = input('输入小说名或作者:')
    browser.get(f'http://www.duwanjuan.info/modules/article/search.php?q={name_input}')
    time.sleep(6)  # crude wait for the results page to render
    html = browser.page_source
    browser.close()
    html = etree.HTML(html)
    # Scrape at most the first 10 results.
    name = html.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/span[@class='c_subject']/a/text()")[:10]
    chapter = html.xpath("//div[@class='c_tag']/span[@class='c_value']/a/text()")[:10]
    link = html.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/a/@href")[:10]
    # Keep only the suffix of each result link for display.
    link_suffixes = [extract_link_suffix(l) for l in link]
    # NOTE(review): author XPath reconstructed from a garbled line — verify
    # against the live page markup.
    author = html.xpath("//div[@class='c_tag']/span[contains(text(),'作者:')]/following-sibling::span/text()")[:10]
    num = [i + 1 for i in range(0, len(name))]
    data = {'序号': num, '小说': name, '作者': author, '最新章节': chapter, '链接': link_suffixes}
    df = pd.DataFrame(data)
    if df.empty:
        print('搜索数据为空,请重新搜索')
        return search_novel()
    print(df)
    sx_input = int(input('请输入序号选择下载的小说:'))
    # The displayed 序号 column is 1-based.
    novel_link = link[sx_input - 1]
    return novel_link
# Fetch the chapter index of a novel: URLs and chapter names.
def get_chapter_urls(url, visited_urls, value):
    """Download the chapter index page and return new (name, url, seq) tuples.

    Side effects: sets the module globals ``tot_title`` (all chapter titles)
    and ``book_name``; adds every returned chapter URL to *visited_urls*.

    value -- running sequence counter; each new chapter gets value+1, +2, ...
    """
    global tot_title
    global book_name
    response = requests.get(url, headers=headers)
    # Let requests guess the page encoding (site is not UTF-8 declared).
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    chapter_elements = html.xpath("//div[@class='index']//li[@class='chapter']/a")
    # The 11th <a> is presumably a non-chapter link on the index page — drop it.
    # Guarded: the original popped unconditionally and raised IndexError for
    # books with ten or fewer entries.
    if len(chapter_elements) > 10:
        chapter_elements.pop(10)
    tot_title = html.xpath("//div[@class='index']//li[@class='chapter']/a/text()")
    bk = html.xpath("//div[@class='main']/div[@class='headlink cf']/h1/text()")
    # xpath returns a list; take the first element if present.
    if bk:
        text = bk[0]
    else:
        text = ""
    # If the title is wrapped in a "['...']"-shaped string, extract the inside.
    pattern = r"\['(.*?)'\]"
    match = re.search(pattern, text)
    if match:
        book_name = match.group(1)
    else:
        book_name = text  # no wrapper: keep the raw text
    chapter_urls = []
    for element in chapter_elements:
        chapter_name = element.text
        chapter_url = element.get('href')
        if chapter_url not in visited_urls:
            value += 1
            chapter_urls.append((chapter_name, chapter_url, value))
            visited_urls.add(chapter_url)
    return chapter_urls
# Fetch the body text of a single chapter page.
def get_chapter_content(url):
    """Download one chapter page and return its cleaned text fragments.

    Returns a list of strings (one per text node of #acontent) with layout
    whitespace, parentheses and the site watermark stripped; returns [] on
    any request failure.

    Reconstructed from a corrupted paste: the ``re.sub`` comprehension on
    the cleanup line had lost its call expression.
    """
    try:
        # verify=False: the site's TLS chain is apparently broken; keep the
        # original best-effort behavior.
        response = requests.get(url, headers=headers, verify=False, timeout=15)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        content_element = html.xpath("//div[@id='acontent']/text()")
        # Strip indentation NBSPs, whitespace, parentheses and the watermark.
        pattern = r'\r\n \xa0\xa0\xa0\xa0|\s|\(|\)|\读万卷 www.duwanjuan.info'
        content = [re.sub(pattern, '', sub_text) for sub_text in content_element]
        return content
    except requests.RequestException as e:
        print(f"Error occurred while fetching content from {url}: {e}")
        return []
# Worker: crawl chapters from the shared queue and write them to disk.
def process_chapter(chapter_queue):
    """Drain *chapter_queue*, fetching each chapter and saving it as
    <book_name>/<seq>.txt.

    Relies on the module global ``book_name`` set by get_chapter_urls().
    Sets the global ``time_start`` so main() can report elapsed time.
    Intended to run in several threads over the same queue.
    """
    global time_start
    time_start = time.time()
    while not chapter_queue.empty():
        chapter_name, chapter_url, value = chapter_queue.get()
        print("正在爬取章节:", chapter_name)
        try:
            content = get_chapter_content(chapter_url)
        except Exception as e:
            print(f"获取章节内容失败:{e}")
            content = []
        folder_path = f'{book_name}'
        # exist_ok avoids the check-then-create race between worker threads
        # (the original exists()/makedirs pair could raise FileExistsError).
        os.makedirs(folder_path, exist_ok=True)
        with open(f'{book_name}/{value}.txt', 'w', encoding='utf-8') as f:
            f.write('\n' + chapter_name + '\n')
            for data in content:
                f.write(data + '\n')
            f.write('\n\n')
        chapter_queue.task_done()
        time.sleep(6)  # throttle: be gentle with the target server
# Merge the downloaded per-chapter TXT files into one book file.
def merge_txt_files(folder_path, output_file):
    """Concatenate every *.txt file in *folder_path* into *output_file*.

    Files are ordered by their integer stem (1.txt, 2.txt, ..., 10.txt),
    i.e. numerically rather than lexicographically.

    Reconstructed from a corrupted paste: the listing comprehension on the
    first line had lost its expression.
    """
    txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
    # Sort on the numeric filename (strip the 4-char ".txt" suffix).
    txt_files.sort(key=lambda x: int(x[:-4]))
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for txt_file in txt_files:
            with open(os.path.join(folder_path, txt_file), 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
def search_continue():
    """Ask whether to download another novel; restart main() on 'y'."""
    answer = input('请输入y/n选择是否继续下载小说:')
    if answer != 'y':
        return
    main()
def main():
    """Drive one full download: search, crawl chapters concurrently, merge,
    then clean up the per-chapter files.

    Fixed from the original: the thread-count variable was named ``sum``,
    shadowing the builtin.
    """
    directory_url = search_novel()
    # Collect the chapter index (also sets the book_name global).
    visited_urls = set()
    value = 0
    chapter_urls = get_chapter_urls(directory_url, visited_urls, value)
    # Queue every chapter for the worker threads.
    chapter_queue = queue.Queue()
    for chapter_name, chapter_url, value in chapter_urls:
        chapter_queue.put((chapter_name, chapter_url, value))
    print('=' * 64)
    print('线程数建议在10-30之间,避免对目标服务器造成过大压力')
    num_threads = int(input('输入线程数:'))
    threads = []
    for _ in range(num_threads):
        thread = threading.Thread(target=process_chapter, args=(chapter_queue,))
        thread.daemon = False
        thread.start()
        threads.append(thread)
    # Wait until every queued chapter has been task_done()'d ...
    chapter_queue.join()
    # ... then for every worker thread to exit.
    for thread in threads:
        thread.join()
    print("所有章节爬取完成!")
    time_end = time.time()
    # time_start is set by the worker threads in process_chapter().
    print('章节爬取花费时间:', time_end - time_start)
    print('=' * 64)
    print('开始合并所有TXT文件')
    folder_path_1 = f'{book_name}/'
    output_file = f'{book_name}.txt'
    merge_txt_files(folder_path_1, output_file)
    print('合并所有TXT文件成功')
    print(f'{book_name}下载成功')
    # Remove the per-chapter folder; only the merged book file remains.
    shutil.rmtree(book_name)
    print('=' * 64)
    search_continue()
# Script entry point (forum reply text that had fused onto this line removed —
# it made the file unparseable).
if __name__ == "__main__":
    main()