# [Python] listing copied from a forum post (header: "view as plain text / copy code")
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    : pdftablekiller.py
@Time    : 2021/11/21
@Author  : Ljujl
@Version : 1.0
@Contact : mr_liu133299@foxmail.com
'''
# here put the import lib
from tkinter import Label, constants, filedialog, Button, Frame, BROWSE, BOTH, Text, INSERT, EXTENDED
from tkinter import Entry, StringVar, messagebox, Tk, mainloop
from tkinter.ttk import Treeview
import camelot
"""
Install camelot with conda, e.g. `conda install -c conda-forge camelot-py`.
"""
class MainWindow():
    """Tk window that extracts a table from a PDF with camelot and saves it.

    Workflow: "open" picks a PDF, "process" reads the requested page(s) with
    camelot's ``stream`` flavor and shows the first detected table in a
    ``Treeview``, "save" writes that table to a CSV file in the current
    working directory.

    Attributes:
        root: The Tk root window.
        pages: ``StringVar`` bound to the page-number entry.
        row_tol_value: ``StringVar`` bound to the optional ``row_tol`` entry.
        filepath: Path of the chosen PDF, ``""`` until one is chosen.
        df: DataFrame of the last processed table, ``None`` until "process"
            has succeeded.
    """

    # Background colour shared by the descriptive labels.
    _LABEL_BG = "#DCDCDC"

    # Usage notes shown in the read-only text box on the right.
    _README = (
        'Notes:\n'
        '1. click "open" button\n'
        '2. input page number\n'
        '3. click "process" button(maybe you need wait a few seconds)\n'
        '4. click "save" button\n'
        '5. any questions please make contact with me: mr_liu133299@foxmail.com\n'
    )

    def __init__(self) -> None:
        """Create the root window and lay out all controls."""
        # Imported locally because the file-level imports do not provide it.
        from tkinter import TclError

        self.root = Tk()
        self.root.title("Pdf Table Killer")
        try:
            self.root.iconbitmap(r"pdf表格提取\test1.ico")
        except TclError:
            # The icon is purely cosmetic; a missing file (or a platform
            # that cannot load .ico files) must not prevent start-up.
            pass

        # State filled in by the button callbacks. Initialised here so that
        # "process"/"save" can be clicked in any order without AttributeError.
        self.filepath = ""
        self.df = None
        self._tree = None             # Treeview currently showing self.df
        self._processed_pages = ""    # page spec that produced self.df

        frame = Frame(self.root)
        frame.pack(padx=2, pady=2, ipadx=1)
        bg = self._LABEL_BG

        Button(frame, text="open", width=5, height=1, command=self.open_pdf).grid(
            row=0, column=0, padx=5, pady=5, rowspan=2, columnspan=2, sticky='NS')

        Label(frame, text="page number:", width=20, height=1, bg=bg).grid(
            row=0, column=2, padx=5, pady=5, sticky='W')
        self.pages = StringVar()
        Entry(frame, bd=1, textvariable=self.pages).grid(
            row=0, column=3, padx=5, pady=5, sticky='W')

        Label(frame, text="row_tol(optional int):", width=20, height=1, bg=bg).grid(
            row=1, column=2, padx=5, pady=5, sticky='W')
        self.row_tol_value = StringVar()
        Entry(frame, bd=1, textvariable=self.row_tol_value).grid(
            row=1, column=3, padx=5, pady=5, sticky='W')

        Button(frame, text="process", width=10, height=1, command=self.process).grid(
            row=0, column=6, padx=5, pady=5, sticky='W')
        Button(frame, text="save", width=10, height=1, command=self.sava_data).grid(
            row=1, column=6, padx=5, pady=5, sticky='W')

        Label(frame, text="display table:", height=1, bg=bg).grid(
            row=2, columnspan=7, sticky='WE')
        # Empty label used as a spacer row.
        Label(frame, text="", height=1, bg=bg).grid(
            row=3, columnspan=7, sticky='WE')

        readme_text = Text(frame, height=8)
        readme_text.insert(INSERT, self._README)
        # Disable only after inserting, otherwise the insert is ignored.
        readme_text['state'] = "disabled"
        readme_text.grid(row=0, column=8, rowspan=6, sticky='SN')

    @staticmethod
    def _parse_row_tol(text: str) -> float:
        """Convert the ``row_tol`` entry text to a number.

        Returns an ``int`` when the text is an integer literal, otherwise a
        ``float``.

        Raises:
            ValueError: If *text* is not a number.
        """
        # NOTE: this replaces an eval() of the raw entry text, which would
        # execute arbitrary expressions typed into the field.
        try:
            return int(text)
        except ValueError:
            return float(text)

    @staticmethod
    def _default_csv_name(filepath: str, pages: str) -> str:
        """Return ``<pdf name without extension>_page<pages>.csv``."""
        # Imported locally because the file-level imports do not provide it.
        from pathlib import Path

        # .stem drops only the last suffix, so "my.report.pdf" -> "my.report".
        return f"{Path(filepath).stem}_page{pages}.csv"

    def _show_table(self, df) -> None:
        """Display *df* in a Treeview below the controls.

        Any table shown by an earlier call is removed first so repeated
        processing does not stack widgets. Selection mode is ``EXTENDED``
        regardless of whether ``row_tol`` was given.
        """
        if self._tree is not None:
            self._tree.destroy()

        columns = tuple(df.columns)
        tree = Treeview(self.root, show="headings",
                        selectmode=EXTENDED, columns=columns)
        for col in columns:
            tree.column(col, anchor="center")   # centre the cell text
            tree.heading(col, text=col)         # header shows the column label
        for row in df.itertuples(index=False, name=None):
            tree.insert('', 'end', values=row)
        tree.pack(expand=True, fill=BOTH)
        self._tree = tree

    def open_pdf(self):
        """Ask the user for a PDF file and remember its path."""
        chosen = filedialog.askopenfilename(
            title='Please choose a file', filetypes=[('Pdf file', '*.pdf')])
        # An empty result means the dialog was cancelled: keep the old path.
        if chosen:
            self.filepath = chosen

    def process(self):
        """Read the requested page(s) and display the first table found.

        The page number is the viewer's page index, not the number printed
        in the document footer.
        """
        pages = self.pages.get().strip()
        if not self.filepath or not pages:
            messagebox.showwarning(
                'tips', 'please open a pdf file and input a page number first')
            return

        read_options = {}
        row_tol_text = self.row_tol_value.get().strip()
        if row_tol_text:
            try:
                read_options["row_tol"] = self._parse_row_tol(row_tol_text)
            except ValueError:
                messagebox.showerror('tips', 'row_tol must be a number')
                return

        # Enlarge the window to the full screen so the table has room.
        self.root.geometry(
            f"{self.root.winfo_screenwidth()}x{self.root.winfo_screenheight()}+0+0")

        tables = camelot.read_pdf(
            self.filepath, pages=pages, flavor='stream', **read_options)
        try:
            first_table = tables[0]
        except IndexError:
            messagebox.showwarning('tips', 'no table found on the given page')
            return

        self.df = first_table.df
        self._processed_pages = pages
        self._show_table(self.df)

    # Save the processed table to a CSV file.
    def sava_data(self):
        """Write the last processed table to a CSV file in the working directory."""
        if self.df is None:
            messagebox.showwarning(
                'tips', 'nothing to save, please process a page first')
            return

        # Name the file after the pages that were actually processed, not
        # whatever the entry holds now.
        pages = self._processed_pages or self.pages.get()
        file_name = self._default_csv_name(self.filepath, pages)
        # No header row and no index column are written.
        self.df.to_csv(file_name, index=False, header=False)
        messagebox.showinfo('tips', 'save successfully~~')
if __name__ == "__main__":
    # Script entry point: build the window, then hand control to Tk's
    # event loop until the window is closed.
    app = MainWindow()
    mainloop()