只写了一点点简陋的代码,只能爬取指定网页内的文本内容,但是我要爬取的内容需要点击标题进入才行,还有翻页这个难题,有大手子帮忙瞅瞅吗?
import requests
from bs4 import BeautifulSoup
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
from datetime import datetime
def get_web_content(url, keywords, start_date, output_dir, timeout=10):
    """Download a page, filter its text, and save the result to a file.

    The page text is extracted with BeautifulSoup, optionally narrowed by
    ``filter_keywords`` and ``filter_by_date``, and written with
    ``save_to_file``. The outcome (success, non-200 status, or any exception)
    is reported to the user through a message box; nothing is returned.

    Args:
        url: Address of the page to fetch.
        keywords: Keywords to filter lines by; a falsy value skips this filter.
        start_date: Earliest accepted date; a falsy value skips this filter.
        output_dir: Path of the output file (a file path, despite the name).
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        # Without a timeout, requests waits indefinitely and a stalled server
        # would freeze the GUI; a timeout error is reported by the handler below.
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            messagebox.showerror("错误", "无法获取网站内容,状态码:" + str(response.status_code))
            return

        # Extract all text from the parsed HTML.
        soup = BeautifulSoup(response.content, 'html.parser')
        text_content = soup.get_text()

        if keywords:
            text_content = filter_keywords(text_content, keywords)
        if start_date:
            text_content = filter_by_date(text_content, start_date)

        save_to_file(text_content, output_dir)
        messagebox.showinfo("成功", "内容获取成功并保存到 " + output_dir)
    except Exception as e:
        # GUI boundary: surface every failure to the user instead of crashing.
        messagebox.showerror("错误", "发生异常:" + str(e))
def filter_keywords(content, keywords):
    """Return the lines of ``content`` that contain at least one keyword.

    Keywords are stripped of surrounding whitespace and blank keywords are
    ignored, so input such as ``"a, b"`` or ``"a,"`` split on commas behaves
    as expected. An empty keyword would otherwise match every line.

    Args:
        content: Text to filter, with lines separated by ``"\\n"``.
        keywords: Iterable of keyword strings.

    Returns:
        The matching lines joined with ``"\\n"``. If no usable keyword
        remains after cleaning, ``content`` is returned unfiltered.
    """
    trimmed = (keyword.strip() for keyword in keywords)
    wanted = [keyword for keyword in trimmed if keyword]
    if not wanted:
        return content
    return "\n".join(
        line
        for line in content.split("\n")
        if any(keyword in line for keyword in wanted)
    )
def filter_by_date(content, start_date):
    """Keep only the lines of ``content`` accepted by ``is_date_after``.

    Args:
        content: Text to filter, with lines separated by ``"\\n"``.
        start_date: Threshold passed through to ``is_date_after``.

    Returns:
        The accepted lines, in their original order, joined with ``"\\n"``.
    """
    kept = [row for row in content.split("\n") if is_date_after(row, start_date)]
    return "\n".join(kept)
def is_date_after(line, start_date):
    """Tell whether ``line`` carries a date on or after ``start_date``.

    The line is split on whitespace and the first token that parses as
    ``YYYY-MM-DD`` is used. The date may appear anywhere in the line, not
    only as its first token.

    Args:
        line: One line of text.
        start_date: ``datetime`` threshold (inclusive).

    Returns:
        True if a date token is found and it is >= ``start_date``. False if
        the line has no date token or the comparison is not possible.
    """
    for token in line.split():
        try:
            found = datetime.strptime(token, '%Y-%m-%d')
        except ValueError:
            # Not a date; keep scanning the remaining tokens.
            continue
        try:
            return found >= start_date
        except TypeError:
            # start_date cannot be compared with a datetime (e.g. None):
            # treat the line as not matching rather than raising.
            return False
    return False
def save_to_file(content, output_dir):
    """Write ``content`` to the file at ``output_dir`` as UTF-8 text.

    Any existing file at that path is overwritten.

    Args:
        content: Text to write.
        output_dir: Path of the target file (a file path, despite the name).
    """
    with open(output_dir, mode='w', encoding='utf-8') as sink:
        sink.write(content)
def main():
    """Build the Tkinter window and run its event loop.

    The window collects a URL, comma-separated keywords, an optional start
    date (``YYYY-MM-DD``) and an output file path, then passes them to
    ``get_web_content`` when the fetch button is pressed.
    """
    window = tk.Tk()
    window.title("网站内容获取工具")

    # URL input.
    url_label = tk.Label(window, text="网址:")
    url_label.pack()
    url_entry = tk.Entry(window, width=50)
    url_entry.pack()

    # Keyword input.
    keywords_label = tk.Label(window, text="关键词(用逗号分隔):")
    keywords_label.pack()
    keywords_entry = tk.Entry(window, width=50)
    keywords_entry.pack()

    # Start-date input.
    date_label = tk.Label(window, text="发布时间(格式:YYYY-MM-DD):")
    date_label.pack()
    date_entry = tk.Entry(window, width=50)
    date_entry.pack()

    # Output path input plus a button that opens a save dialog.
    def choose_dir():
        output_dir = filedialog.asksaveasfilename(defaultextension=".txt", filetypes=[("Text files", "*.txt")])
        # The dialog yields an empty value when cancelled; keep the path the
        # user already entered instead of wiping it.
        if not output_dir:
            return
        output_dir_entry.delete(0, tk.END)
        output_dir_entry.insert(0, output_dir)

    output_dir_label = tk.Label(window, text="存放路径:")
    output_dir_label.pack()
    output_dir_entry = tk.Entry(window, width=50)
    output_dir_entry.pack()
    choose_dir_button = tk.Button(window, text="选择路径", command=choose_dir)
    choose_dir_button.pack()

    # Fetch button: read the form, validate the date, and start the fetch.
    def get_content():
        url = url_entry.get().strip()
        # Trim each keyword and drop blanks so "a, b" and "a," work as intended
        # and an empty field means "no keyword filter".
        raw_keywords = keywords_entry.get().split(",")
        keywords = [kw.strip() for kw in raw_keywords if kw.strip()]
        start_date_str = date_entry.get().strip()
        output_dir = output_dir_entry.get()

        start_date = None
        if start_date_str:
            try:
                start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
            except ValueError:
                # An exception escaping a Tk callback is only printed to
                # stderr, so tell the user about the bad date explicitly.
                messagebox.showerror("错误", "日期格式无效,应为 YYYY-MM-DD")
                return

        get_web_content(url, keywords, start_date, output_dir)

    get_content_button = tk.Button(window, text="获取内容", command=get_content)
    get_content_button.pack()

    # Hand control to Tk until the window is closed.
    window.mainloop()
# Launch the GUI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|