新手小白爬取小说

Timothys 发表于 2024-6-24 10:00

import requests
from bs4 import BeautifulSoup
import time
import random

def fetch_and_write_content(url, file, max_retries=3, timeout=10):
    """Download one chapter page and append its title and text to *file*.

    The chapter body is read from ``<div class="showtxt">`` and the title
    from the first ``<h1>``. Blank lines are dropped and leading whitespace
    is stripped from every remaining line before writing.

    Args:
        url: URL of the chapter page.
        file: Writable text file object; its encoding is whatever the caller
            chose when opening it.
        max_retries: Maximum number of attempts when the request fails.
        timeout: Value passed to ``requests.get(timeout=...)`` so a stalled
            server counts as a failed attempt instead of hanging forever.

    Returns:
        True if the chapter was written; False if the page has no content
        block or every attempt failed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"抓取失败：{url}，错误信息：{str(e)}")
            if attempt < max_retries:
                # Back off for 2-5 seconds before the next attempt.
                delay = random.uniform(2.0, 5.0)
                print(f"暂停 {delay:.2f} 秒后重试...")
                time.sleep(delay)
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        content_div = soup.find("div", class_="showtxt")
        if not content_div:
            # A page without the content block is not retried.
            print(f"章节内容为空：{url}")
            return False

        # Some pages may lack an <h1>; fall back to an empty title instead
        # of crashing on None.
        title_tag = soup.find("h1")
        chapter_title = title_tag.get_text(strip=True) if title_tag else ""

        content_text = content_div.get_text("\n").strip()
        # Keep only non-blank lines, without their leading indentation.
        filtered_text = "\n".join(
            line.lstrip() for line in content_text.splitlines() if line.strip()
        )

        file.write(f"{chapter_title}\n{filtered_text}\n")
        print(f"已写入章节：{chapter_title}")

        # Random 1-3 second pause between chapters to reduce the chance of
        # being rate-limited.
        delay = random.uniform(1.0, 3.0)
        print(f"暂停 {delay:.2f} 秒...")
        time.sleep(delay)
        return True

    print(f"无法获取链接：{url}，超过最大重试次数。")
    return False

def main():
    """Download every chapter linked from the novel's index page.

    Fetches the index page, collects the chapter links found under
    ``dd a``, and writes each chapter in order to ``result.txt`` (UTF-8)
    via ``fetch_and_write_content``.
    """
    from urllib.parse import urljoin

    url = "http://www.shuquge.io/book/38068/"  # index page of the novel
    output_file = "result.txt"  # file the novel text is saved to

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败：{url}，错误信息：{str(e)}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    dd_elements = soup.select("dd a")

    with open(output_file, "w", encoding='utf-8') as file:
        call_count = 0
        for dd_element in dd_elements:
            text = dd_element.get_text().strip()  # chapter name
            href = dd_element.get("href")  # chapter link, may be missing
            if not href:
                print("抓取章节失败:", text)
                continue

            # urljoin resolves root-relative, relative and absolute hrefs
            # against the index URL.
            full_url = urljoin(url, href.strip())

            print("章节名称:", text)

            try:
                success = fetch_and_write_content(full_url, file)
            except requests.RequestException:
                # One bad chapter should not abort the whole download.
                print("抓取章节失败:", text)
                continue

            if success:
                call_count += 1
                if call_count % 5 == 0:
                    # Extra 1-3 second pause after every fifth chapter.
                    delay = random.uniform(1.0, 3.0)
                    print(f"暂停 {delay:.2f} 秒...")
                    time.sleep(delay)

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

目前存在问题：
1、爬取速度太慢，太快了被封IP，不知道怎么解决，所以加了暂停
2、这个网站貌似不支持搜索，所以固定了URL
3、第一次发帖，有问题请删贴

vaycore 发表于 2024-6-24 11:05

封 IP 可以考虑加个代理池，还有就是加个 User-Agent 列表，然后每次请求随机选一个 User-Agent 值

812290870 发表于 2024-6-24 15:45

import requests
from bs4 import BeautifulSoup
import time
import random
import tkinter as tk
from tkinter import simpledialog
from tkinter import messagebox

# 更大的User-Agent列表
# Pool of browser User-Agent strings; one is picked at random per request.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.64',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:46.0) Gecko/20100101 Firefox/46.0',
    # Add more entries here as needed.
]

def fetch_and_write_content(url, file, proxies, max_retries=3, timeout=10):
    """Download one chapter page and append its title and text to *file*.

    Every attempt uses a random entry of ``USER_AGENTS`` and, when
    *proxies* is non-empty, a random proxy from it. The chapter body is
    read from ``<div class="showtxt">`` and the title from the first
    ``<h1>``. Blank lines are dropped and leading whitespace is stripped
    from every remaining line before writing.

    Args:
        url: URL of the chapter page.
        file: Writable text file object.
        proxies: List of proxy mappings handed to
            ``requests.get(proxies=...)``; an empty list means no proxy.
        max_retries: Maximum number of attempts when the request fails.
        timeout: Value passed to ``requests.get(timeout=...)`` so a stalled
            server or dead proxy counts as a failed attempt instead of
            hanging forever.

    Returns:
        True if the chapter was written; False if the page has no content
        block or every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        # Pick a fresh identity for every attempt.
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        proxy = random.choice(proxies) if proxies else None

        try:
            response = requests.get(
                url, headers=headers, proxies=proxy, timeout=timeout
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"抓取失败：{url}，错误信息：{str(e)}")
            if attempt < max_retries:
                # Back off for 2-5 seconds before the next attempt.
                delay = random.uniform(2.0, 5.0)
                print(f"暂停 {delay:.2f} 秒后重试...")
                time.sleep(delay)
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        content_div = soup.find("div", class_="showtxt")
        if not content_div:
            # A page without the content block is not retried.
            print(f"章节内容为空：{url}")
            return False

        # Some pages may lack an <h1>; fall back to an empty title instead
        # of crashing on None.
        title_tag = soup.find("h1")
        chapter_title = title_tag.get_text(strip=True) if title_tag else ""

        content_text = content_div.get_text("\n").strip()
        # Keep only non-blank lines, without their leading indentation.
        filtered_text = "\n".join(
            line.lstrip() for line in content_text.splitlines() if line.strip()
        )

        file.write(f"{chapter_title}\n{filtered_text}\n")
        print(f"已写入章节：{chapter_title}")

        # Random 1-3 second pause between chapters.
        delay = random.uniform(1.0, 3.0)
        print(f"暂停 {delay:.2f} 秒...")
        time.sleep(delay)
        return True

    print(f"无法获取链接：{url}，超过最大重试次数。")
    return False

def main():
    """Ask for the index URL and proxies, then download the whole novel.

    Two dialogs collect the novel's index URL and an optional
    comma-separated proxy list. Chapter links found under ``dd a`` on the
    index page are fetched in order with ``fetch_and_write_content`` and
    written to ``result.txt`` (UTF-8). The completion dialog is shown only
    when the index page could be fetched.
    """
    from urllib.parse import urljoin

    root = tk.Tk()
    root.withdraw()  # only the dialogs are needed, hide the main window

    url = simpledialog.askstring("输入", "请输入小说主页URL：")
    if not url:
        messagebox.showinfo("提示", "必须输入URL才能继续。")
        return

    proxy_list_str = simpledialog.askstring("输入", "请输入代理列表（用逗号分隔），例如：http://123.123.123.123:8080,http://111.111.111.111:8080")
    # Skip empty entries (e.g. from a trailing comma) so no blank proxy is
    # ever handed to requests.
    proxies = [
        {"http": proxy.strip(), "https": proxy.strip()}
        for proxy in (proxy_list_str or "").split(",")
        if proxy.strip()
    ]

    output_file = "result.txt"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        message = f"请求失败：{url}，错误信息：{str(e)}"
        print(message)
        # Report the failure in the GUI too, instead of claiming success.
        messagebox.showerror("错误", message)
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    dd_elements = soup.select("dd a")

    with open(output_file, "w", encoding='utf-8') as file:
        call_count = 0
        for dd_element in dd_elements:
            text = dd_element.get_text().strip()  # chapter name
            href = dd_element.get("href")  # chapter link, may be missing
            if not href:
                print("抓取章节失败:", text)
                continue

            # Resolve the link against the URL the user entered rather
            # than a hard-coded site, so other index pages work as well.
            full_url = urljoin(url, href.strip())

            print("章节名称:", text)

            try:
                success = fetch_and_write_content(full_url, file, proxies)
            except requests.RequestException:
                # One bad chapter should not abort the whole download.
                print("抓取章节失败:", text)
                continue

            if success:
                call_count += 1
                if call_count % 5 == 0:
                    # Extra 1-3 second pause after every fifth chapter.
                    delay = random.uniform(1.0, 3.0)
                    print(f"暂停 {delay:.2f} 秒...")
                    time.sleep(delay)

    messagebox.showinfo("完成", "小说下载完成。")

# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

liutao0474 发表于 2024-6-24 11:25

非常牛啊，感谢楼主{:1_921:}

xiaoyingang 发表于 2024-6-24 14:26

感谢大佬

lishuichen 发表于 2024-6-24 14:40

有些网站会检测，一封就是十几分钟。可以去爬取一些代理做个代理池，被封了就换 IP。

stldhl 发表于 2024-6-24 15:55

感谢大佬

Timothys 发表于 2024-6-25 09:29

812290870 发表于 2024-6-24 15:45
import requests
from bs4 import BeautifulSoup
import time

大佬牛逼，学习了

Timothys 发表于 2024-6-25 09:34

lishuichen 发表于 2024-6-24 14:40
有些网站会检测，一封就是十几分钟。可以去爬取一些代理做个代理池，被封了就换 IP。

谢谢大佬的建议，我学习下

Timothys 发表于 2024-6-25 09:35

vaycore 发表于 2024-6-24 11:05
封 IP 可以考虑加个代理池，还有就是加个 User-Agent 列表，然后每次请求随机选一个 User-Agent 值

感谢大佬指点，我关注下这两个点

页: [1] 2 3

吾爱破解 - 52pojie.cn's Archiver

新手小白 爬取小说

新手小白爬取小说