import requests
from bs4 import BeautifulSoup
import time
import random
import tkinter as tk
from tkinter import simpledialog
from tkinter import messagebox

# A larger User-Agent pool; one entry is picked at random for each request
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.64',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:46.0) Gecko/20100101 Firefox/46.0'
    # add more as needed
]

def fetch_and_write_content(url, file, proxies, max_retries=3):
    """Fetch one chapter page and append its title and text to the open output file."""
    retry_count = 0
    while retry_count < max_retries:
        try:
            # Rotate the User-Agent and (if any were given) the proxy on every attempt
            headers = {
                'User-Agent': random.choice(USER_AGENTS)
            }
            proxy = random.choice(proxies) if proxies else None
            # timeout keeps a dead proxy from hanging the whole run
            response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            content_div = soup.find("div", class_="showtxt")
            if content_div:
                chapter_title = soup.find("h1").get_text(strip=True)
                content_text = content_div.get_text("\n").strip()
                # Drop blank lines and strip leading whitespace from each remaining line
                filtered_text = "\n".join(line.lstrip() for line in content_text.splitlines() if line.strip())
                file.write(f"{chapter_title}\n{filtered_text}\n")
                print(f"Wrote chapter: {chapter_title}")
                delay = random.uniform(1.0, 3.0)
                print(f"Pausing for {delay:.2f} seconds...")
                time.sleep(delay)
                return True
            else:
                print(f"Chapter content is empty: {url}")
                return False
        except requests.RequestException as e:
            print(f"Fetch failed: {url}, error: {e}")
            retry_count += 1
            if retry_count < max_retries:
                delay = random.uniform(2.0, 5.0)
                print(f"Retrying after {delay:.2f} seconds...")
                time.sleep(delay)
    print(f"Could not fetch {url}: maximum retries exceeded.")
    return False

def main():
    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialogs are needed
    url = simpledialog.askstring("Input", "Enter the novel's index page URL:")
    if not url:
        messagebox.showinfo("Notice", "A URL is required to continue.")
        return
    proxy_list_str = simpledialog.askstring(
        "Input",
        "Enter a proxy list (comma-separated), e.g.: http://123.123.123.123:8080,http://111.111.111.111:8080")
    # requests expects a scheme-to-proxy mapping, so build one dict per proxy
    proxies = [{"http": proxy.strip(), "https": proxy.strip()} for proxy in proxy_list_str.split(",")] if proxy_list_str else []
    output_file = "result.txt"
    try:
        response = requests.get(url, headers={'User-Agent': random.choice(USER_AGENTS)})
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        dd_elements = soup.select("dd a")
        with open(output_file, "w", encoding='utf-8') as file:
            call_count = 0
            # Skip the first five links (the "latest chapters" block on the index page)
            for dd_element in dd_elements[5:]:
                text = dd_element.get_text().strip()
                try:
                    href = dd_element.get("href").strip()
                    full_url = f"http://www.shuquge.io{href}"
                    print("Chapter title:", text)
                    success = fetch_and_write_content(full_url, file, proxies)
                    if success:
                        call_count += 1
                        # Take an extra pause after every five successful chapters
                        if call_count % 5 == 0:
                            delay = random.uniform(1.0, 3.0)
                            print(f"Pausing for {delay:.2f} seconds...")
                            time.sleep(delay)
                except (AttributeError, requests.RequestException) as e:
                    # a missing href attribute raises AttributeError on .strip()
                    print(f"Failed to fetch chapter: {text}, error: {e}")
        messagebox.showinfo("Done", "Novel download finished.")
    except requests.RequestException as e:
        print(f"Request failed: {url}, error: {e}")


if __name__ == "__main__":
    main()
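Dependencies: only requests and beautifulsoup4 are third-party packages (tkinter ships with the standard CPython installer), so a single install command is enough before running the script:

pip install requests beautifulsoup4

If you leave the proxy dialog blank, the script simply makes direct connections and only rotates the User-Agent.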