【Python】一键提取pdf文件表格数据，真香，还有可视化界面

mrliu133 发表于 2021-11-21 21:08

废话不多说，直接上代码
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file : pdftablekiller.py
@Time : 2021/11/21
@Author: Ljujl
@version : 1.0
@Contact : mr_liu133299@foxmail.com
'''

# here put the import lib
from pathlib import Path
from tkinter import Label, constants, filedialog, Button, Frame, BROWSE, BOTH, Text, INSERT, EXTENDED
from tkinter import Entry, StringVar, messagebox, Tk, mainloop
from tkinter.ttk import Treeview

import camelot
"""
conda 安装 camelot
"""

class MainWindow():
    """Small tkinter front end for extracting a PDF table with camelot.

    Workflow: "open" picks a PDF, "process" reads the requested page(s) with
    camelot's ``stream`` flavor and shows the first detected table in a
    Treeview, "save" writes that table to a CSV file in the working directory.
    """

    # Title-bar icon, resolved relative to the current working directory.
    ICON_PATH = r"pdf表格提取\test1.ico"

    def __init__(self) -> None:
        """Create the root window and lay out the controls and usage notes."""
        # State shared by the button callbacks. Initialised here so that
        # clicking "process" or "save" before "open" cannot hit a missing
        # attribute.
        self.filepath = ""        # path of the PDF chosen via "open"
        self.df = None            # DataFrame of the table currently displayed
        self.tree = None          # Treeview currently showing self.df
        self._pages_shown = ""    # page spec that produced self.df

        self.root = Tk()
        self.root.title("Pdf Table Killer")
        self._set_icon()

        frame = Frame(self.root)
        frame.pack(padx=2, pady=2, ipadx=1)

        bg = "#DCDCDC"
        # .grid() returns None, so the widgets are placed without keeping
        # references; only the StringVars are needed later.
        Button(frame, text="open", width=5, height=1, command=self.open_pdf).grid(
            row=0, column=0, padx=5, pady=5, rowspan=2, columnspan=2, sticky='NS')

        Label(frame, text="page number:", width=20, height=1, bg=bg).grid(
            row=0, column=2, padx=5, pady=5, sticky='W')
        self.pages = StringVar()
        Entry(frame, bd=1, textvariable=self.pages).grid(
            row=0, column=3, padx=5, pady=5, sticky='W')

        Label(frame, text="row_tol(optional int):", width=20, height=1, bg=bg).grid(
            row=1, column=2, padx=5, pady=5, sticky='W')
        self.row_tol_value = StringVar()
        Entry(frame, bd=1, textvariable=self.row_tol_value).grid(
            row=1, column=3, padx=5, pady=5, sticky='W')

        Button(frame, text="process", width=10, height=1, command=self.process).grid(
            row=0, column=6, padx=5, pady=5, sticky='W')
        Button(frame, text="save", width=10, height=1, command=self.sava_data).grid(
            row=1, column=6, padx=5, pady=5, sticky='W')

        Label(frame, text="display table:", height=1, bg=bg).grid(
            row=2, columnspan=7, sticky='WE')
        Label(frame, text="", height=1, bg=bg).grid(
            row=3, columnspan=7, sticky='WE')

        readme = (
            'Notes:\n'
            '1. click "open" button\n'
            '2. input page number\n'
            '3. click "process" button(maybe you need wait a few seconds)\n'
            '4. click "save" button\n'
            '5. any questions please make contact with me: mr_liu133299@foxmail.com\n'
            '   '
        )
        readme_text = Text(frame, height=8)
        readme_text.insert(INSERT, readme)
        # Read-only: the notes are for display, not editing.
        readme_text['state'] = "disabled"
        readme_text.grid(row=0, column=8, rowspan=6, sticky='SN')

    def _set_icon(self) -> None:
        """Apply the window icon; keep the default icon if it cannot be loaded."""
        # TclError is only needed for this optional, cosmetic step.
        from tkinter import TclError
        try:
            self.root.iconbitmap(self.ICON_PATH)
        except TclError:
            # Missing or unsupported icon file must not prevent start-up.
            pass

    @staticmethod
    def _parse_row_tol(text):
        """Convert the row_tol entry text to an int, or None when left blank.

        Raises:
            ValueError: if the text is non-empty and not an integer literal.
        """
        text = text.strip()
        # int() instead of eval(): the entry is free-form user input.
        return int(text) if text else None

    @staticmethod
    def _build_csv_name(filepath, pages):
        """Return ``<pdf name without extension>_page<pages>.csv``."""
        return f"{Path(filepath).stem}_page{pages}.csv"

    def open_pdf(self):
        """Ask the user for a PDF file and remember its path."""
        chosen = filedialog.askopenfilename(
            title='Please choose a file', filetypes=[('Pdf file', '*.pdf')])
        # The dialog yields an empty value when cancelled; keep the previous
        # selection in that case.
        if chosen:
            self.filepath = chosen

    def process(self):
        """Extract the table from the requested page(s) and display it.

        The page number is the viewer's page index, not the number printed in
        the document footer. Only the first table camelot finds is shown.
        """
        pages = self.pages.get().strip()
        if not (self.filepath and pages):
            messagebox.showwarning(
                'tips', 'please open a pdf file and input a page number first')
            return

        try:
            row_tol = self._parse_row_tol(self.row_tol_value.get())
        except ValueError:
            messagebox.showerror('tips', 'row_tol must be an integer')
            return

        # Use the whole screen so that wide tables fit.
        self.root.geometry(
            f"{self.root.winfo_screenwidth()}x{self.root.winfo_screenheight()}+0+0")

        # Pass row_tol only when the user supplied it so camelot's own default
        # applies otherwise.
        options = {} if row_tol is None else {"row_tol": row_tol}
        tables = camelot.read_pdf(
            self.filepath, pages=pages, flavor='stream', **options)
        if tables.n == 0:
            messagebox.showinfo('tips', 'no table found on the given page')
            return

        # read_pdf returns a list of tables; the DataFrame lives on each item.
        self.df = tables[0].df
        self._pages_shown = pages

        # Single-row selection when row_tol is given, multi-row otherwise.
        # NOTE(review): this asymmetry looks accidental — confirm which
        # selection mode is wanted.
        selectmode = EXTENDED if row_tol is None else BROWSE
        self._show_table(selectmode)

    def _show_table(self, selectmode):
        """Replace any table already on screen with a Treeview of ``self.df``."""
        if self.tree is not None:
            # Otherwise every "process" click would stack another table below.
            self.tree.destroy()

        columns = tuple(self.df.columns)
        tree = Treeview(self.root, show="headings",
                        selectmode=selectmode, columns=columns)
        for col in columns:
            tree.column(col, anchor="center")   # centre the cell text
            tree.heading(col, text=col)         # header shows the column label
        for row in self.df.itertuples(index=False, name=None):
            tree.insert('', 'end', values=row)
        tree.pack(expand=True, fill=BOTH)
        self.tree = tree

    def sava_data(self):
        """Write the displayed table to a CSV file (no index, no header row)."""
        if self.df is None:
            messagebox.showwarning(
                'tips', 'nothing to save, please click "process" first')
            return
        # Name the file after the pages that were actually processed, not
        # whatever the entry box holds right now.
        file_name = self._build_csv_name(self.filepath, self._pages_shown)
        self.df.to_csv(file_name, index=False, header=False)
        messagebox.showinfo('tips', 'save successfully~~')

if __name__ == "__main__":
    # Build the window, then hand control to Tk's event loop until the
    # window is closed.
    main = MainWindow()
    mainloop()

咋说吧 发表于 2021-11-26 15:31

看文献的时候会用的，不错！

blackboard 发表于 2021-11-21 21:52

又方便了很多

mrliu133 发表于 2021-11-21 21:23

火焰加鲁鲁发表于 2021-11-21 21:13
感谢楼主，楼主有成品么

今天尝试了打包，打包体积800多M，而且还存在问题，就没弄了。主要是camelot这个模块依赖的包有点多，另外pandas也很大。
成品目前就是代码，运行可以出结果。图标路径可以修改一下或者注释掉。

onetwothreenb 发表于 2021-11-30 13:40

火焰加鲁鲁 发表于 2021-11-21 21:13

感谢楼主，楼主有成品么{:1_918:}

suqingxiao 发表于 2021-11-21 22:00

这个是只能用于那种表格转出来的PDF吗？

aysta 发表于 2021-11-21 22:06

感谢分享，不错

zc777 发表于 2021-11-21 22:20

牛呀，我学习学习了

小能维尼 发表于 2021-11-21 22:24

如果不打包，有没有体积一说？？
库我都装，直接跑代码的那种
库是不是也挺大的？

panchun888 发表于 2021-11-21 22:28

学习了，谢谢楼主

火焰加鲁鲁 发表于 2021-11-21 22:58

mrliu133 发表于 2021-11-21 21:23
今天尝试了打包，打包体积800多M，而且还存在问题，就没弄了。主要是camelot这个模块依赖的包有点多，另外 ...

好的，感谢:lol

页: [1] 2 3 4 5 6 7 8 9 10

吾爱破解 - 52pojie.cn's Archiver

【Python】一键提取pdf文件表格数据，真香，还有可视化界面