python提取PDF图片和表格（带GUI界面）

Eks6666 · 发表于 2024-6-25 23:21

本帖最后由 Eks6666 于 2024-6-25 23:23 编辑

[Python] 纯文本查看 复制代码

001

002

003

004

005

006

007

008

009

010

011

012

013

014

015

016

017

018

019

020

021

022

023

024

025

026

027

028

029

030

031

032

033

034

035

036

037

038

039

040

041

042

043

044

045

046

047

048

049

050

051

052

053

054

055

056

057

058

059

060

061

062

063

064

065

066

067

068

069

070

071

072

073

074

075

076

077

078

079

080

081

082

083

084

085

086

087

088

089

090

091

092

093

094

095

096

097

098

099

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

import tkinter as tk
from tkinter import filedialog, ttk
import threading
import pdfplumber
import openpyxl
from openpyxl import Workbook
import sys
 
from PIL import Image, ImageTk
def extract_tables(pdf, ws, progress_var, total_pages, base_progress=0, progress_step=1):
    for i, page in enumerate(pdf.pages):
        if page.extract_tables():
            for table in page.extract_tables():
                for row in table:
                    ws.append(row)
        current_progress = base_progress + (i + 1) / total_pages * progress_step
        progress_var.set(current_progress)
 
def extract_images(pdf, images_folder, progress_var, total_pages, base_progress=0, progress_step=1, pdf_filename=''):
    for i, page in enumerate(pdf.pages):
        for image_index, image in enumerate(page.images):
            x0, top, x1, bottom = image["x0"], image["top"], image["x1"], image["bottom"]
            cropped_image = page.within_bbox((x0, top, x1, bottom))
            if cropped_image:
                img = cropped_image.to_image(resolution=300)
                img_filename = f"{images_folder}/{pdf_filename}_page_{i+1}_{image_index+1}.png"
 
                img.save(img_filename)
                print(img_filename)
        current_progress = base_progress + (i + 1) / total_pages * progress_step
        print(current_progress)
 
def extract_from_pdf(pdf_paths, excel_path, images_folder, progress_var, status_var, extract_tables_var, extract_images_var):
    try:
        total_pdfs = len(pdf_paths)
        progress_step = 100 / total_pdfs
        for j, pdf_path in enumerate(pdf_paths):
            with pdfplumber.open(pdf_path) as pdf:
                total_pages = len(pdf.pages)
 
                if extract_tables_var.get() == 1:
                    wb = Workbook()
                    ws = wb.active
                    extract_tables(pdf, ws, progress_var, total_pages, base_progress=j*progress_step, progress_step=progress_step)
                    wb.save(excel_path.replace('.xlsx', f'_{j+1}.xlsx'))
 
                if extract_images_var.get() == 1:
                    pdf_filename = pdf_path.split("/")[-1].replace('.pdf', '')
                    extract_images(pdf, images_folder, progress_var, total_pages, base_progress=j*progress_step, progress_step=progress_step, pdf_filename=pdf_filename)
 
        print("提取完成！")
    except Exception as e:
        print(f"发生错误: {str(e)}")
    finally:
        progress_var.set(0)
 
def open_files():
    file_paths = filedialog.askopenfilenames(filetypes=[("PDF files", "*.pdf")])
    if file_paths:
        pdf_paths_var.set(list(file_paths))
        print(f'已选择{file_paths}作为PDF文件来处理！！！')
 
def save_file():
    file_path = filedialog.asksaveasfilename(defaultextension=".xlsx", filetypes=[("Excel files", "*.xlsx")])
    if file_path:
        excel_path_var.set(file_path)
        print(f'已选择{file_path}作为Excel文件的保存表格!!!')
 
def select_folder():
    folder_path = filedialog.askdirectory()
    if folder_path:
        images_folder_var.set(folder_path)
        print(f'已选择{folder_path}作为图片文件保存位置！！！')
def run_extraction():
    pdf_paths = pdf_paths_var.get()
    excel_path = excel_path_var.get()
    images_folder = images_folder_var.get()
    pdf_paths = eval(pdf_paths)
 
    if pdf_paths and (excel_path or images_folder):
        print("正在提取...")
        threading.Thread(target=extract_from_pdf, args=(pdf_paths, excel_path, images_folder, progress_var, status_var, extract_tables_var, extract_images_var)).start()
 
 
def print_to_display(content):
    display_box.insert(tk.END, content + '\n')
    display_box.see(tk.END)
 
# 创建主窗口
root = tk.Tk()
 
root.title("PDF 表格与图片提取器")
 
x_1 = int(root.winfo_screenwidth() / 3 - root.winfo_reqwidth() / 3)
y_1 = int(root.winfo_screenheight() / 3 - root.winfo_reqheight() / 3)
 
root.geometry(f"680x480+{x_1}+{y_1}")
root.configure(bg='#F5F5F5')
 
try:
    background_image = Image.open("background_data_1.png")
    background_image = background_image.resize((700, 480), Image.Resampling.LANCZOS)
    background_image = ImageTk.PhotoImage(background_image)
    canvas_main_bank = tk.Canvas(root, width=700, height=480)
    canvas_main_bank.pack(fill="both", expand=True)
    canvas_main_bank.create_image(0, 0, image=background_image, anchor="nw")
except:
    pass
 
title_label = tk.Label(root, text="PDF 表格与图片提取器", font=("Arial", 18))
title_label.place(x=230, y=5)
 
 
 
pdf_paths_var = tk.StringVar()
excel_path_var = tk.StringVar()
images_folder_var = tk.StringVar()
status_var = tk.StringVar()
progress_var = tk.DoubleVar()
extract_tables_var = tk.IntVar(value=1)
extract_images_var = tk.IntVar(value=1)
 
# 选择PDF文件:
label_pdf = tk.Label(root, text="选择PDF文件:")
label_pdf.place(x=60, y=60)  # 假设 x=10, y=5 作为起始点
 
pdf_entry = tk.Entry(root, textvariable=pdf_paths_var, width=50)
pdf_entry.place(x=180, y=60)  # 假设 x=100 作为输入框的起始点
 
button_browse = tk.Button(root, text="浏览", command=open_files)
button_browse.place(x=540, y=60)  # 假设 x=220 作为按钮的起始点
 
# 保存为Excel文件:
label_excel = tk.Label(root, text="保存为Excel文件:")
label_excel.place(x=60, y=100)  # y 坐标增加，以适应新行
 
excel_entry = tk.Entry(root, textvariable=excel_path_var, width=50)
excel_entry.place(x=180, y=100)
 
button_save = tk.Button(root, text="浏览", command=save_file)
button_save.place(x=540, y=100)
 
# 选择图片保存文件夹:
label_folder = tk.Label(root, text="选择图片保存文件夹:")
label_folder.place(x=60, y=140)
 
images_entry = tk.Entry(root, textvariable=images_folder_var, width=50)
images_entry.place(x=180, y=140)
 
button_select = tk.Button(root, text="浏览", command=select_folder)
button_select.place(x=540,y=140)
 
# 提取选项
check_tables = tk.Checkbutton(root, text="提取表格", variable=extract_tables_var)
check_tables.place(x=160, y=180)
 
check_images = tk.Checkbutton(root, text="提取图片", variable=extract_images_var)
check_images.place(x=260, y=180)
 
# 开始提取按钮
button_start = tk.Button(root, text="开始提取", command=run_extraction, bg="green")
button_start.place(x=400, y=180)  # 根据需要调整位置
 
status_label = tk.Label(root, textvariable=status_var)
status_label.place(x=200, y=220)
 
# 进度条
progress_bar = ttk.Progressbar(root, variable=progress_var, maximum=100)
progress_bar.place(x=200, y=220, width=280)  # 假设宽度为280
 
 
# 滚动条
scrollbar = tk.Scrollbar(root)
scrollbar.place(x=560, y=260, relheight=0.33, anchor=tk.N)
# 文本输出框
display_box = tk.Text(root, yscrollcommand=scrollbar.set, height=12, width=70, bd=0)
display_box.place(x=60, y=260)
 
scrollbar.config(command=display_box.yview)
 
root.mainloop()

lozhuo · 发表于 2024-6-28 09:48

ianlcc 发表于 2024-6-27 16:17
请问您有试过吗？
EXE档还满大的
我找了一个10页的PDF测试，运行後没反应…

exe为什么比较大我不清楚，我只是用pyinstaller把楼主的代码生成了exe文件。
刚才我用一个22页的PDF测试了exe，PDF中有两个表格，其中一个表格跨页了，能正常提取表格，但未提取图片。又用楼主的代码测试了一下，表格和图片都提取成功。

lozhuo · 发表于 2024-6-26 14:48

本帖最后由 lozhuo 于 2024-6-27 09:46 编辑

仅对楼主提供的代码进行打包，若有侵权或违反版规，请及时联系我删除！
python提取PDF图片和表格：
阿里云：https://www.alipan.com/s/pufikUMy8Ti
123云盘：https://www.123pan.com/s/Vg5Jjv-KbKKv.html

zhufuan · 发表于 2024-6-26 02:22

谢谢老师的分享，能有个exe执行程序吗

cxx0515 · 发表于 2024-6-26 07:27

感谢楼主分享，学习一下

wjbg2022 · 发表于 2024-6-26 08:03

界面可以截图一下吗？大佬！

zuxin521 · 发表于 2024-6-26 08:10

GUI的界面有吗？啥样子的？

1045837055lucy · 发表于 2024-6-26 08:42

同求.exe程序，先行感谢。

zpwz · 发表于 2024-6-26 08:47

感谢分享，期待成品

aoxuehanshuang · 发表于 2024-6-26 08:54

同求.exe程序，先行感谢

yaphoo · 发表于 2024-6-26 09:21

感谢分享！

yyrocku2 · 发表于 2024-6-26 11:00

这个牛逼牛逼啊

帐号		自动登录	找回密码
密码			注册[Register]

[Python 原创] python提取PDF图片和表格（带GUI界面）

免费评分

本帖被以下淘专辑推荐:

免费评分

浏览过的版块