I wrote a crawler for 笔趣阁 (biqg.cc) novels. It can search for a novel and download it; I did not build a GUI.
The search step uses selenium to drive Chrome and open the search results page, so to run this code you need both selenium and Chrome installed.
Selenium installation tutorial: https://blog.csdn.net/m0_57206390/article/details/129327008
I have only been learning web scraping for a short while, so feedback on anything lacking is very welcome.
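Once selenium is installed, a quick check like this confirms the environment works before running the full script (assuming Selenium 4.6+, which downloads a matching chromedriver automatically via Selenium Manager; on older versions you need chromedriver on your PATH yourself):

from selenium import webdriver

# Open Chrome, load the site, print the page title, and quit.
# If this runs cleanly, the script below can drive Chrome too.
browser = webdriver.Chrome()
browser.get('https://www.biqg.cc')
print(browser.title)
browser.quit()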
from selenium import webdriver
import requests, re, os, time, shutil, threading, queue
from lxml import etree
import pandas as pd
# Fetch the chapter list page and return (chapter name, chapter URL, index) tuples
def get_chapter_urls(url, visited_urls, value):
    global tot_title
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    chapter_elements = html.xpath("//div[@class='listmain']//dd/a")
    chapter_elements.pop(10)  # drop the non-chapter placeholder entry the site inserts in the list
    tot_title = html.xpath("//div[@class='info']/h1/text()")  # the novel's title
    chapter_urls = []
    for element in chapter_elements:
        chapter_name = element.text
        chapter_url = 'https://www.biqg.cc' + element.get('href')
        if chapter_url not in visited_urls:
            value += 1  # running index, later used as the per-chapter file name
            chapter_urls.append((chapter_name, chapter_url, value))
            visited_urls.add(chapter_url)
    return chapter_urls
# Fetch the body text of one chapter
def get_chapter_content(url):
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    # Drop the last text node, which is not part of the chapter body
    content_element = html.xpath("//div[@id='chaptercontent']/text()")[:-1]
    pattern = r'\r\n \xa0\xa0\xa0\xa0|\s'
    content = [re.sub(pattern, '', sub_text) for sub_text in content_element]
    return content
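# A note on the pattern above: in Python 3 string patterns, \s also matches the
# non-breaking space '\xa0', so the substitution collapses both the page's
# indentation runs and stray whitespace, e.g.:
#   re.sub(r'\r\n \xa0\xa0\xa0\xa0|\s', '', '\xa0\xa0\xa0\xa0正文')  ->  '正文'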
# Worker: pull chapters off the queue and save each one as <index>.txt
def process_chapter(chapter_queue):
    global time_start
    time_start = time.time()
    while True:
        # get_nowait avoids blocking forever if another thread empties the
        # queue between an empty() check and a blocking get()
        try:
            chapter_name, chapter_url, value = chapter_queue.get_nowait()
        except queue.Empty:
            break
        print('Crawling chapter:', chapter_name)
        content = get_chapter_content(chapter_url)
        # The content is saved to a per-novel folder; process it differently here if needed
        folder_path = f'{tot_title[0]}'
        os.makedirs(folder_path, exist_ok=True)  # exist_ok avoids a race between threads
        with open(f'{tot_title[0]}/{value}.txt', 'w', encoding='utf-8') as f:
            f.write('\n' + chapter_name + '\n')
            for data in content:
                f.write(data + '\n')
            f.write('\n\n')
        chapter_queue.task_done()
# Merge the downloaded per-chapter TXT files into one book
def merge_txt_files(folder_path, output_file):
    txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
    txt_files.sort(key=lambda x: int(x[:-4]))  # numeric sort keeps chapter order
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for txt_file in txt_files:
            with open(os.path.join(folder_path, txt_file), 'r', encoding='utf-8') as infile:
                content = infile.read()
                outfile.write(content)
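# Why the numeric sort above matters: a plain lexicographic sort would put
# '10.txt' before '2.txt'. Sorting on int(x[:-4]) restores chapter order:
#   sorted(['10.txt', '2.txt', '1.txt'], key=lambda x: int(x[:-4]))
#   -> ['1.txt', '2.txt', '10.txt']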
# Search for a novel and pick the one to download
def search_novel():
    # To run Chrome silently in the background instead, build the driver like this:
    # chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # browser = webdriver.Chrome(options=chrome_options)
    browser = webdriver.Chrome()
    name_input = input('Enter a novel title or author: ')
    browser.get(f'https://www.biqg.cc/s?q={name_input}')
    time.sleep(3)  # give the search results time to render
    # Grab the rendered page source
    html = browser.page_source
    browser.close()
    html = etree.HTML(html)
    name = html.xpath("//div[@class='type_show']//div//a/text()")
    link = html.xpath("//div[@class='type_show']//div//h4/a/@href")
    author = html.xpath("//div[@class='type_show']//div[@class='author']/text()")
    num = [i + 1 for i in range(len(name))]
    data = {'No.': num, 'Title': name, 'Author': author, 'Link': link}
    df = pd.DataFrame(data)
    if df.empty:
        print('No results found, please search again')
        # Return the retry's result so the caller always gets a link
        return search_novel()
    else:
        print(df)
        sx_input = int(input('Enter the No. of the novel to download: '))
        novel_link = 'https://www.biqg.cc' + link[sx_input - 1]
        return novel_link
# Ask whether to download another novel
def search_continue():
    input_continue = input('Download another novel? (y/n): ')
    if input_continue == 'y':
        main()
def main():
    directory_url = search_novel()
    # Collect the chapter names, URLs, and indices for the chosen novel
    visited_urls = set()
    value = 0
    chapter_urls = get_chapter_urls(directory_url, visited_urls, value)
    # Queue up every chapter for the worker threads
    chapter_queue = queue.Queue()
    for chapter_name, chapter_url, value in chapter_urls:
        chapter_queue.put((chapter_name, chapter_url, value))
    # Spawn worker threads to download chapters concurrently
    print('=' * 64)
    print('A thread count between 10 and 30 is recommended, to avoid putting too much load on the target server')
    num_threads = int(input('Number of threads: '))  # adjust as needed
    threads = []
    for i in range(num_threads):
        thread = threading.Thread(target=process_chapter, args=(chapter_queue,))
        thread.daemon = False
        thread.start()
        threads.append(thread)
    # Wait for the queue to drain, then for every thread to finish
    chapter_queue.join()
    for thread in threads:
        thread.join()
    print('All chapters downloaded!')
    time_end = time.time()
    print('Time spent downloading chapters:', time_end - time_start)
    print('=' * 64)
    print('Merging all TXT files')
    folder_path_1 = f'{tot_title[0]}/'   # replace with the actual folder path if needed
    output_file = f'{tot_title[0]}.txt'  # output file name
    merge_txt_files(folder_path_1, output_file)
    print('All TXT files merged')
    print(f'{tot_title[0]} downloaded successfully')
    shutil.rmtree(tot_title[0])  # remove the per-chapter folder
    print('=' * 64)
    search_continue()
# Program entry point
if __name__ == "__main__":
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306'}
    main()
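The worker threads hit the site as fast as they can loop, so the thread count is the only throttle. If you want to be gentler on the server, a small wrapper around requests.get is an easy drop-in. This is a sketch of mine, not part of the original script; the name polite_get, the delay range, and the retry count are all my own choices:

import random
import time

import requests

def polite_get(url, headers, max_retries=3):
    # Pause briefly before every request and retry a few times, so a transient
    # failure or rate limit does not kill a worker thread mid-download.
    for attempt in range(max_retries):
        try:
            time.sleep(random.uniform(0.5, 1.5))
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise

Swapping the requests.get calls in get_chapter_urls and get_chapter_content for polite_get(url, headers) would be the only change needed.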
Attachment: selenium小说下载.zip (2.56 KB, 267 downloads)