Mly2580 发表于 2024-3-21 15:09

python 爬虫爬取读万卷小说网站小说

本帖最后由 Mly2580 于 2024-3-21 16:00 编辑

之前看到吾爱破解一位大佬写了一个 python 爬取笔趣阁的代码,最近自己也有了点兴趣,于是尝试爬了一下读万卷小说网站,并且打包了一个可直接执行的 exe 程序。原帖链接:https://www.52pojie.cn/thread-1894044-1-1.html ,感谢 @chenmuting。我用夸克网盘分享了「读万卷.exe」,点击链接即可保存。链接:https://pan.quark.cn/s/df492af360ed 提取码:xZAm。代码如下:

from selenium import webdriver
import requests, re, os, time, shutil, threading, queue
from lxml import etree
import pandas as pd
import random

def get_user_agent():
    """Return the pool of User-Agent strings used to randomize requests.

    The caller picks one at random (see the module-level ``headers`` dict)
    so repeated requests do not all present the same browser identity.
    """
    user_agents = [
      "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
      "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
      "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
      "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
      "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
      "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
      "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
      "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
      "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
      "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
      "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
      "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
      "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
      "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
      "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
      "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
      "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
      "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
      "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
      "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
      "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
      "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]
    return user_agents
# NOTE(review): a commented-out proxy-pool helper (get_proxy) and a matching
# commented-out `proxy` dict lived here as no-op triple-quoted strings with a
# malformed `/'''` terminator; they were dead code and have been removed.
# Re-introduce a real proxy pool here if proxying is ever needed again.

# Module-level request headers: one User-Agent is chosen at random when the
# module is imported, so every request in this session presents the same
# browser identity.
headers = {
    'user-agent': random.choice(get_user_agent()),
}

def extract_link_suffix(url):
    """Return the portion of *url* after its last ``/``.

    Used to shorten result links for display in the search table.
    If *url* contains no slash at all, the whole string is returned
    unchanged (unlikely for real URLs, but safe).
    """
    last_slash_index = url.rfind('/')
    if last_slash_index != -1:
      # Forum paste had corrupted this line (`url1:]`); restored to slice
      # everything after the final slash.
      return url[last_slash_index + 1:]
    else:
      return url

# 搜索小说,并选择所需要下载的小说
def search_novel():
    chrome_options = webdriver.ChromeOptions()
    #后台静默运行
    chrome_options.add_argument('--headless')
    print('浏览器已打开')
    browser = webdriver.Chrome(options=chrome_options)
    #browser = webdriver.Chrome()
    name_input = input('输入小说名或作者:')
    browser.get(f'http://www.duwanjuan.info/modules/article/search.php?q={name_input}')
    time.sleep(6)
    # 输出网页源代码
    html = browser.page_source
    browser.close()
    # print('浏览器已关闭')
    html = etree.HTML(html)
    name = html.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/span[@class='c_subject']/a/text()")[:10]
    chapter = html.xpath("//div[@class='c_tag']/span[@class='c_value']/a/text()")[:10]
    link = html.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/a/@href")[:10]
    # 提取每个链接的后缀部分
    link_suffixes = for l in link]
    author = html.xpath("//div[@class='c_tag']/span作者:')]/following-sibling::span/text()")[:10]
    num = 1 for i in range(0, len(name))]
    data = {'序号': num, '小说': name, '作者': author,'最新章节':chapter,'链接':link_suffixes}
    df = pd.DataFrame(data)
    if df.empty:
      print('搜索数据为空,请重新搜索')
      search_novel()
    else:
      print(df)
      sx_input = int(input('请输入序号选择下载的小说:'))
      novel_link = link1]
      return novel_link

# 定义一个函数来获取小说章节目录的URL和章节名
def get_chapter_urls(url, visited_urls, value):
    global tot_title
    global book_name
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    chapter_elements = html.xpath("//div[@class='index']//li[@class='chapter']/a")
    chapter_elements.pop(10)
    tot_title = html.xpath("//div[@class='index']//li[@class='chapter']/a/text()")
    bk = html.xpath("//div[@class='main']/div[@class='headlink cf']/h1/text()")
    # 从列表中提取字符串
    if bk:# 确保bk不为空
      text = bk[0]# 提取列表中的第一个元素
    else:
      text = ""# 如果bk为空,则设置text为空字符串

    # 正则表达式,匹配方括号及其内容,但使用括号捕获括号内的内容
    pattern = r"\['(.*?)'\]"
    # 使用re.search来查找匹配项,如果找到,则提取捕获组中的内容
    match = re.search(pattern, text)
    if match:
      book_name = match.group(1)# 提取捕获组中的内容
    else:
      book_name = text# 如果没有找到匹配项,则保留原始text值
    chapter_urls = []
    for element in chapter_elements:
      chapter_name = element.text
      chapter_url = element.get('href')
      if chapter_url not in visited_urls:
            value += 1
            chapter_urls.append((chapter_name, chapter_url, value))
            visited_urls.add(chapter_url)
    return chapter_urls


# 定义一个函数来获取小说具体章节的内容
def get_chapter_content(url):
    """Download one chapter page and return its cleaned text fragments.

    Returns a list of strings from ``div#acontent`` with layout whitespace,
    stray parentheses and the site watermark stripped out; returns ``[]``
    when the request fails.  (The forum paste had lost this function's body
    indentation and the ``re.sub`` comprehension; both are restored here.)
    """
    try:
        response = requests.get(url, headers=headers, verify=False, timeout=15)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        content_element = html.xpath("//div[@id='acontent']/text()")
        # Alternation removes indent runs, whitespace, parentheses and the
        # site's watermark string.
        pattern = r'\r\n   \xa0\xa0\xa0\xa0|\s|\(|\)|\读万卷 www.duwanjuan.info'
        content = [re.sub(pattern, '', sub_text) for sub_text in content_element]
        return content
    except requests.RequestException as e:
        print(f"Error occurred while fetching content from {url}: {e}")
        return []



# 定义一个函数来处理每个章节的爬取任务
def process_chapter(chapter_queue):
    global time_start
    time_start = time.time()
    while not chapter_queue.empty():
      chapter_name, chapter_url, value = chapter_queue.get()
      print("正在爬取章节:", chapter_name)
      try:
            content = get_chapter_content(chapter_url)
      except Exception as e:
            print(f"获取章节内容失败:{e}")
            content = []
      # 在这里可以将内容保存到文件或进行其他处理
      folder_path = f'{book_name}'
      if not os.path.exists(folder_path):
            os.makedirs(folder_path)
      with open(f'{book_name}/{value}.txt', 'w', encoding='utf-8') as f:
            f.write('\n' + chapter_name + '\n')
            for data in content:
                f.write(data + '\n')
            f.write('\n\n')
      chapter_queue.task_done()
      time.sleep(6)


# 合并下载的TXT文件
def merge_txt_files(folder_path, output_file):
    txt_files = for f in os.listdir(folder_path) if f.endswith('.txt')]
    txt_files.sort(key=lambda x: int(x[:-4]))

    with open(output_file, 'w', encoding='utf-8') as outfile:
      for txt_file in txt_files:
            with open(os.path.join(folder_path, txt_file), 'r', encoding='utf-8') as infile:
                content = infile.read()
                outfile.write(content)


def search_continue():
    """Ask whether to download another novel; restart ``main`` on 'y'."""
    answer = input('请输入y/n选择是否继续下载小说:')
    if answer != 'y':
      return
    main()

def main():
    """End-to-end driver: search, crawl with worker threads, merge to one TXT.

    Flow: pick a novel interactively, enumerate its chapters, fan the
    downloads out over N user-chosen threads, then concatenate the
    per-chapter files into ``<book_name>.txt`` and delete the temp folder.
    """
    directory_url = search_novel()
    # Collect the chapter list (name, url, sequence number).
    visited_urls = set()
    value = 0
    chapter_urls = get_chapter_urls(directory_url, visited_urls, value)
    # Queue every chapter for the worker threads.
    chapter_queue = queue.Queue()
    for chapter_name, chapter_url, value in chapter_urls:
      chapter_queue.put((chapter_name, chapter_url, value))
    # Spin up worker threads to crawl chapters concurrently.
    print('=' * 64)
    print('线程数建议在10-30之间,避免对目标服务器造成过大压力')
    # Renamed from `sum`, which shadowed the builtin of the same name.
    num_threads = int(input('输入线程数:'))
    threads = []
    for i in range(num_threads):
      thread = threading.Thread(target=process_chapter, args=(chapter_queue,))
      thread.daemon = False
      thread.start()
      threads.append(thread)
    # Wait until every queued chapter has been processed...
    chapter_queue.join()
    # ...and until every worker thread has exited.
    for thread in threads:
      thread.join()
    print("所有章节爬取完成!")
    time_end = time.time()
    # time_start is a global set by the workers in process_chapter.
    print('章节爬取花费时间:', time_end - time_start)
    print('=' * 64)
    print('开始合并所有TXT文件')
    folder_path_1 = f'{book_name}/'  # per-chapter folder created by workers
    output_file = f'{book_name}.txt'  # final merged book file
    merge_txt_files(folder_path_1, output_file)
    print('合并所有TXT文件成功')
    print(f'{book_name}下载成功')
    shutil.rmtree(book_name)  # remove the per-chapter folder once merged
    print('=' * 64)
    search_continue()

# 主程序入口
if __name__ == "__main__":

    main()

5m1le 发表于 2024-4-13 09:29

感谢分享https://www.52pojie.cn/forum.php?mod=viewthread&tid=1903989&extra=page%3D1%26filter%3Dtypeid%26typeid%3D29&page=2

molingxi558 发表于 2024-3-21 18:54

感谢分享

Po6Po6 发表于 2024-3-21 20:55

感谢分享谢谢

owxxd00127 发表于 2024-3-21 23:24

感谢分享,辛苦了!

cool662 发表于 2024-3-22 08:40

感谢分享,辛苦了!

tianmenghuan 发表于 2024-3-22 10:02

感谢大佬分享,学习了{:1_921:}

WSF1314 发表于 2024-3-22 13:16

感谢分享谢谢

wojiushiliu 发表于 2024-3-24 08:18

牛的,小说好评

hj6224310 发表于 2024-3-24 17:51

太厉害了,支持,对python的理解更深刻了

dc1127 发表于 2024-3-25 20:06

感谢感谢
页: [1] 2 3
查看完整版本: python 爬虫爬取读万卷小说网站小说