# 说明
# 本代码非常原始,仅用于学习 Python,功能比较简陋,大家如有需求可以自行修改。
import csv
import os
import queue
import threading
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext

import chardet
import pandas as pd
from PyPDF2 import PdfReader
class PDFKeywordDetector:
    """Tkinter application that scans the PDF files of a directory for keywords.

    The user picks a directory and maintains a keyword list (persisted in
    ``self.keywords_file``).  A background thread extracts the text of every
    PDF page and records each (file, page, keyword) hit in an Excel workbook
    inside the scanned directory.  Names of files that were already scanned
    are stored in a CSV next to the PDFs so a later run skips them.

    Threading model: the worker thread never touches a Tk widget.  It pushes
    callables onto ``self._ui_queue`` and the main thread executes them from
    a periodic ``after`` callback.
    """

    # Interval, in milliseconds, at which the main thread drains the UI queue.
    _UI_POLL_MS = 100
    # File (inside the scanned directory) listing the already-scanned PDFs.
    _DETECTED_CSV = "已检测.csv"
    # Header / column name used in that CSV.
    _DETECTED_COLUMN = "已检测文件名"

    def __init__(self, root):
        """Build the UI on *root* (a ``tk.Tk`` or ``tk.Toplevel``)."""
        self.root = root
        self.root.title("PDF 关键字检测器")
        self.keywords = []
        self.keywords_file = "keywords.txt"
        self.is_running = False
        self.excel_file = "检测结果.xlsx"
        # Background scanning thread, or None when no scan was started yet.
        self._worker = None
        # (callable, args, kwargs) tuples posted by the worker for the UI.
        self._ui_queue = queue.Queue()
        self.setup_ui()
        # Ask a running scan to stop before the window goes away.
        self.root.protocol("WM_DELETE_WINDOW", self._on_close)
        self.root.after(self._UI_POLL_MS, self._drain_ui_queue)

    def setup_ui(self):
        """Create all widgets and load the persisted keyword list."""
        # Directory selection row: label, entry showing the path, browse button.
        self.directory_label = tk.Label(self.root, text="选择目录:")
        self.directory_label.grid(row=0, column=0, padx=5, pady=5, sticky='w')
        self.directory_entry = tk.Entry(self.root, width=50)
        self.directory_entry.grid(row=0, column=1, padx=5, pady=5, sticky='w')
        self.browse_button = tk.Button(self.root, text="浏览", command=self.browse_directory)
        self.browse_button.grid(row=0, column=2, padx=5, pady=5, sticky='w')

        # Keyword input row and the list of current keywords.
        self.keyword_label = tk.Label(self.root, text="关键字 (用逗号分隔):")
        self.keyword_label.grid(row=1, column=0, padx=5, pady=5, sticky='w')
        self.keyword_entry = tk.Entry(self.root, width=50)
        self.keyword_entry.grid(row=1, column=1, padx=5, pady=5, sticky='w')
        self.keyword_add_button = tk.Button(self.root, text="添加关键字", command=self.add_keyword)
        self.keyword_add_button.grid(row=1, column=2, padx=5, pady=5, sticky='w')
        self.keyword_listbox = tk.Listbox(self.root, width=50, height=10)
        self.keyword_listbox.grid(row=2, column=0, columnspan=2, padx=5, pady=5, sticky='w')
        self.keyword_remove_button = tk.Button(self.root, text="删除选中关键字", command=self.remove_keyword)
        self.keyword_remove_button.grid(row=2, column=2, padx=5, pady=5, sticky='w')

        # Populate the keyword list from disk (an absent file means "no keywords").
        self._load_keywords()

        # Start / pause buttons and the progress indicator.
        self.start_button = tk.Button(self.root, text="开始检测", command=self.start_detection)
        self.start_button.grid(row=3, column=0, padx=5, pady=5, sticky='w')
        self.pause_button = tk.Button(self.root, text="暂停检测", command=self.pause_detection)
        self.pause_button.grid(row=3, column=1, padx=5, pady=5, sticky='w')
        self.progress_label = tk.Label(self.root, text="进度: 0/0")
        self.progress_label.grid(row=4, column=0, columnspan=3, padx=5, pady=5, sticky='w')

        # Scrolling log area.
        self.log_text = scrolledtext.ScrolledText(self.root, width=70, height=15)
        self.log_text.grid(row=5, column=0, columnspan=3, padx=5, pady=5, sticky='w')

    # ------------------------------------------------------------------
    # Keyword persistence
    # ------------------------------------------------------------------
    @staticmethod
    def _read_keywords_file(path):
        """Return the unique, non-empty keywords stored one per line in *path*.

        A missing file yields an empty list.  The content is decoded as UTF-8
        first; only if that fails is the encoding guessed with ``chardet``.
        """
        if not os.path.exists(path):
            return []
        with open(path, 'rb') as f:
            raw = f.read()
        try:
            text = raw.decode('utf-8-sig')
        except UnicodeDecodeError:
            guessed = chardet.detect(raw).get('encoding') or 'gbk'
            try:
                text = raw.decode(guessed, errors='replace')
            except LookupError:
                # chardet named a codec this Python build does not provide.
                text = raw.decode('gbk', errors='replace')
        keywords = []
        for line in text.splitlines():
            keyword = line.strip()
            # An empty keyword would match every page ('' in s is always True).
            if keyword and keyword not in keywords:
                keywords.append(keyword)
        return keywords

    def _load_keywords(self):
        """Reload ``self.keywords`` from disk and mirror it into the listbox."""
        self.keywords = self._read_keywords_file(self.keywords_file)
        self.keyword_listbox.delete(0, tk.END)
        for keyword in self.keywords:
            self.keyword_listbox.insert(tk.END, keyword)

    def _save_keywords(self):
        """Rewrite the keyword file from ``self.keywords`` as UTF-8."""
        with open(self.keywords_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{keyword}\n" for keyword in self.keywords)

    # ------------------------------------------------------------------
    # UI callbacks (main thread)
    # ------------------------------------------------------------------
    def browse_directory(self):
        """Let the user pick a directory and show it in the directory entry."""
        directory = filedialog.askdirectory()
        if directory:
            # Clear first, otherwise a second pick is glued onto the first path.
            self.directory_entry.delete(0, tk.END)
            self.directory_entry.insert(0, directory)

    def add_keyword(self):
        """Add the comma-separated keywords typed in the entry to the list."""
        added = False
        for keyword in self.keyword_entry.get().split(','):
            keyword = keyword.strip()
            if keyword and keyword not in self.keywords:
                self.keywords.append(keyword)
                self.keyword_listbox.insert(tk.END, keyword)
                added = True
        if added:
            self._save_keywords()
        self.keyword_entry.delete(0, tk.END)

    def remove_keyword(self):
        """Remove the keywords selected in the listbox and persist the change."""
        selected_indices = self.keyword_listbox.curselection()
        if not selected_indices:
            return
        # Delete from the bottom up so earlier indices stay valid.
        for index in reversed(selected_indices):
            keyword = self.keyword_listbox.get(index)
            if keyword in self.keywords:
                self.keywords.remove(keyword)
            self.keyword_listbox.delete(index)
        self._save_keywords()

    def start_detection(self):
        """Validate the inputs and start the background scan."""
        worker = getattr(self, "_worker", None)
        if worker is not None and worker.is_alive():
            # A previous scan is still winding down; never run two at once.
            return

        # Validate BEFORE touching button state so an error leaves the UI usable.
        directory = self.directory_entry.get()
        if not directory:
            messagebox.showerror("错误", "请选择一个目录")
            return
        if not os.path.isdir(directory):
            messagebox.showerror("错误", "所选目录不存在")
            return
        if not self.keywords:
            messagebox.showerror("错误", "请添加至少一个关键字")
            return

        self.directory = directory
        try:
            self.detected_files = self.load_detected_files()
            pending = self._pending_pdfs()
        except OSError as e:
            messagebox.showerror("错误", str(e))
            return

        self.total_files = len(pending)
        self.processed_files = 0
        self.is_running = True
        self.start_button.config(state=tk.DISABLED)
        self.pause_button.config(state=tk.NORMAL)
        self.progress_label.config(text=f"进度: 0/{self.total_files}")
        self._worker = threading.Thread(target=self.detect_keywords)
        self._worker.start()

    def pause_detection(self):
        """Ask the running scan to stop after the file it is working on."""
        self.is_running = False
        self.pause_button.config(state=tk.DISABLED)
        worker = getattr(self, "_worker", None)
        if worker is None or not worker.is_alive():
            self.start_button.config(state=tk.NORMAL)
        # Otherwise the worker re-enables the start button when it has
        # actually stopped, which prevents two overlapping scans.

    def browse_keyword_file(self):
        """Let the user pick a text file and show its absolute path.

        The path is written to ``self.keyword_file_entry`` when such a widget
        exists.  NOTE(review): ``setup_ui`` does not create that widget and
        nothing in this class calls this method, so it is currently inert.
        """
        file_path = filedialog.askopenfilename(filetypes=[("文本文件", "*.txt")])
        if not file_path:
            return
        file_path = os.path.abspath(file_path)
        entry = getattr(self, "keyword_file_entry", None)
        if entry is not None:
            entry.delete(0, tk.END)
            entry.insert(0, file_path)

    def _on_close(self):
        """Stop a running scan and close the window."""
        self.is_running = False
        self.root.destroy()

    # ------------------------------------------------------------------
    # Worker -> UI communication
    # ------------------------------------------------------------------
    def _post_ui(self, func, *args, **kwargs):
        """Queue ``func(*args, **kwargs)`` for execution on the main thread."""
        self._ui_queue.put((func, args, kwargs))

    def _drain_ui_queue(self):
        """Run every queued UI action, then schedule the next drain."""
        try:
            while True:
                try:
                    func, args, kwargs = self._ui_queue.get_nowait()
                except queue.Empty:
                    break
                func(*args, **kwargs)
        finally:
            self.root.after(self._UI_POLL_MS, self._drain_ui_queue)

    def _append_log(self, message):
        """Append *message* to the log widget and scroll to the end."""
        self.log_text.insert(tk.END, message)
        self.log_text.yview(tk.END)

    def _log(self, message):
        """Thread-safe variant of ``_append_log``."""
        self._post_ui(self._append_log, message)

    def _finish_detection(self, message):
        """Restore the buttons after a scan and log its final *message*."""
        self.start_button.config(state=tk.NORMAL)
        self.pause_button.config(state=tk.DISABLED)
        self._append_log(message)

    # ------------------------------------------------------------------
    # Persistence of scan state and results
    # ------------------------------------------------------------------
    def load_detected_files(self):
        """Return the unique file names recorded as already scanned.

        The list is read from the CSV in the directory currently shown in the
        directory entry; the order of first appearance is kept.  Returns an
        empty list when the CSV does not exist.
        """
        current_directory = self.directory_entry.get()
        detected_file = os.path.join(current_directory, self._DETECTED_CSV)
        if not os.path.exists(detected_file):
            return []
        with open(detected_file, 'rb') as f:
            raw = f.read()
        try:
            text = raw.decode('utf-8-sig')
        except UnicodeDecodeError:
            # Earlier versions of this tool wrote the CSV as GBK.
            text = raw.decode('gbk', errors='replace')
        reader = csv.DictReader(text.splitlines())
        names = (row.get(self._DETECTED_COLUMN) for row in reader)
        # dict.fromkeys de-duplicates while preserving order.
        return list(dict.fromkeys(name for name in names if name))

    def _write_detected_files(self):
        """Write ``self.detected_files`` to the CSV in the scanned directory."""
        detected_file = os.path.join(self.directory, self._DETECTED_CSV)
        # UTF-8 (with BOM, so Excel opens it correctly) can represent every
        # file name; GBK raised UnicodeEncodeError on names outside its range.
        with open(detected_file, "w", newline="", encoding="utf-8-sig") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([self._DETECTED_COLUMN])
            writer.writerows([name] for name in self.detected_files)

    def save_detection_result(self, results):
        """Append *results* to the Excel workbook in the scanned directory.

        Args:
            results: list of dicts with the keys "文件名", "检测页码" and
                "关键字".  An empty list is a no-op.
        """
        if not results:
            return
        # Read and write the SAME file; reading from another location made
        # every save overwrite the rows of the previous batches.
        excel_path = os.path.join(self.directory, self.excel_file)
        df = pd.DataFrame(results)
        if os.path.exists(excel_path):
            df = pd.concat([pd.read_excel(excel_path), df], ignore_index=True)
        df.to_excel(excel_path, index=False)

    # ------------------------------------------------------------------
    # Scanning (worker thread)
    # ------------------------------------------------------------------
    def _pending_pdfs(self):
        """Return the sorted PDF names in ``self.directory`` not yet scanned."""
        already_done = set(self.detected_files)
        return sorted(
            name for name in os.listdir(self.directory)
            if name.lower().endswith('.pdf') and name not in already_done
        )

    @staticmethod
    def _scan_pdf(file_path, file, keywords, results):
        """Append one hit per (page, keyword) found in *file_path* to *results*.

        Hits are appended in place so those found before a parsing error on a
        later page are kept.
        """
        with open(file_path, "rb") as f:
            pdf = PdfReader(f)
            for page_num, page in enumerate(pdf.pages, start=1):
                # extract_text() may yield None/"" for pages without text.
                page_text = page.extract_text() or ""
                for keyword in keywords:
                    if keyword in page_text:
                        results.append({"文件名": file, "检测页码": page_num, "关键字": keyword})

    def detect_keywords(self):
        """Scan every pending PDF; runs on the worker thread.

        Results are flushed to the workbook every 100 files and at the end.
        The loop stops early when ``self.is_running`` becomes False.  All UI
        updates are posted to the main thread through ``_post_ui``.
        """
        # Snapshot: the UI may add or remove keywords while the scan runs.
        keywords = list(self.keywords)
        results = []
        final_message = "检测完成\n"
        try:
            for count, file in enumerate(self._pending_pdfs(), start=1):
                if not self.is_running:
                    final_message = "检测已暂停\n"
                    break
                self.processed_files += 1
                self._log(f"检测文件: {file}\n")
                file_path = os.path.join(self.directory, file)
                try:
                    self._scan_pdf(file_path, file, keywords, results)
                except Exception as e:
                    # PDF parsers raise many unrelated exception types on
                    # damaged files; report the file and keep going.
                    self._log(f"错误处理文件 {file}: {e}\n")
                if count % 100 == 0:
                    self.save_detection_result(results)
                    results = []
                self._post_ui(
                    self.progress_label.config,
                    text=f"进度: {self.processed_files}/{self.total_files}",
                )
                self.detected_files.append(file)
            if results:
                self.save_detection_result(results)
            # Only reached when every save succeeded, so no file is marked
            # as scanned while its hits were lost.
            self._write_detected_files()
        except Exception as e:
            # Thread boundary: report instead of dying silently with the
            # buttons stuck in the "running" state.
            final_message = f"检测中断: {e}\n"
        finally:
            self.is_running = False
            self._post_ui(self._finish_detection, final_message)
if __name__ == "__main__":
    # Script entry point: create the Tk root window, mount the detector UI
    # on it, and hand control to the Tk event loop until the window closes.
    window = tk.Tk()
    detector = PDFKeywordDetector(window)
    window.mainloop()