吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 3374|回复: 54
收起左侧

[Python 原创] python提取PDF图片和表格(带GUI界面)

  [复制链接]
Eks6666 发表于 2024-6-25 23:21
本帖最后由 Eks6666 于 2024-6-25 23:23 编辑

[Python] 纯文本查看 复制代码
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import tkinter as tk
from tkinter import filedialog, ttk
import threading
import pdfplumber
import openpyxl
from openpyxl import Workbook
import sys
 
from PIL import Image, ImageTk
def extract_tables(pdf, ws, progress_var, total_pages, base_progress=0, progress_step=1):
    """Append every table row found in ``pdf`` to the worksheet ``ws``.

    Args:
        pdf: Object exposing a ``pages`` iterable; each page must provide
            ``extract_tables()`` returning an iterable of tables, where a
            table is an iterable of rows.
        ws: Worksheet-like object; each row is passed to ``ws.append``.
        progress_var: Object with a ``set(value)`` method; it is updated
            once after each page.
        total_pages: Number of pages, used as the progress denominator.
        base_progress: Progress value that corresponds to "no page done".
        progress_step: Share of the progress range covered by this call.
    """
    for i, page in enumerate(pdf.pages):
        # Run the extraction once per page: the result is used both as the
        # "has tables" test and as the data source. ``or ()`` keeps a falsy
        # result (empty list / None) from breaking the loop.
        for table in page.extract_tables() or ():
            for row in table:
                ws.append(row)
        current_progress = base_progress + (i + 1) / total_pages * progress_step
        progress_var.set(current_progress)
 
def extract_images(pdf, images_folder, progress_var, total_pages, base_progress=0, progress_step=1, pdf_filename=''):
    """Save one PNG per image region found on the pages of ``pdf``.

    For every entry in ``page.images`` the page is cropped to the entry's
    bounding box (``x0``, ``top``, ``x1``, ``bottom``), that region is
    rendered with ``to_image(resolution=300)`` and written to
    ``{images_folder}/{pdf_filename}_page_{page}_{index}.png`` (both numbers
    are 1-based). Each written path is printed.

    Args:
        pdf: Object exposing a ``pages`` iterable; each page must provide
            ``images`` and ``within_bbox(bbox)``.
        images_folder: Destination directory for the PNG files.
        progress_var: Object with a ``set(value)`` method; it is updated
            once after each page.
        total_pages: Number of pages, used as the progress denominator.
        base_progress: Progress value that corresponds to "no page done".
        progress_step: Share of the progress range covered by this call.
        pdf_filename: Prefix used in the generated file names.
    """
    for i, page in enumerate(pdf.pages):
        for image_index, image in enumerate(page.images):
            bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
            cropped_image = page.within_bbox(bbox)
            if cropped_image:
                img = cropped_image.to_image(resolution=300)
                img_filename = f"{images_folder}/{pdf_filename}_page_{i+1}_{image_index+1}.png"
                img.save(img_filename)
                print(img_filename)
        current_progress = base_progress + (i + 1) / total_pages * progress_step
        # Report progress the same way extract_tables does; the value used
        # to be printed only, so the progress bar never moved here.
        progress_var.set(current_progress)
 
def extract_from_pdf(pdf_paths, excel_path, images_folder, progress_var, status_var, extract_tables_var, extract_images_var):
    """Extract tables and/or images from each PDF in ``pdf_paths``.

    Tables of the j-th PDF (1-based) are written to a fresh workbook saved
    next to ``excel_path`` with ``_{j}`` inserted before the extension.
    Images are written into ``images_folder`` with the PDF's base name as
    file-name prefix. Any exception is caught and printed; the progress
    variable is always reset to 0 when the function ends.

    Args:
        pdf_paths: Sequence of PDF file paths.
        excel_path: Target workbook path; table extraction is skipped when
            it is empty.
        images_folder: Target directory for images; image extraction is
            skipped when it is empty.
        progress_var: Object with ``set(value)``; receives values in the
            0..100 range.
        status_var: Accepted for interface compatibility; not used here.
        extract_tables_var: Object with ``get()``; tables are extracted when
            it returns 1.
        extract_images_var: Object with ``get()``; images are extracted when
            it returns 1.
    """
    import os  # local import keeps this block self-contained

    try:
        total_pdfs = len(pdf_paths)
        if total_pdfs == 0:
            # Nothing to process; avoids dividing by zero below.
            return
        progress_step = 100 / total_pdfs

        # A task only runs when it is ticked AND has somewhere to write to.
        # Previously a ticked task without a destination raised inside the
        # loop and aborted the other task as well.
        want_tables = extract_tables_var.get() == 1 and bool(excel_path)
        want_images = extract_images_var.get() == 1 and bool(images_folder)
        task_count = int(want_tables) + int(want_images)
        if task_count == 0:
            print("没有可执行的提取任务:请设置Excel保存路径或图片保存文件夹。")
            return

        # Each enabled task gets an equal share of one PDF's progress range
        # so the bar advances monotonically instead of sweeping twice.
        task_step = progress_step / task_count
        excel_root, excel_ext = os.path.splitext(excel_path)

        for j, pdf_path in enumerate(pdf_paths):
            with pdfplumber.open(pdf_path) as pdf:
                total_pages = len(pdf.pages)
                base = j * progress_step

                if want_tables:
                    wb = Workbook()
                    ws = wb.active
                    extract_tables(pdf, ws, progress_var, total_pages, base_progress=base, progress_step=task_step)
                    # One workbook per PDF: "<name>_<j><ext>".
                    wb.save(f"{excel_root}_{j+1}{excel_ext or '.xlsx'}")
                    base += task_step

                if want_images:
                    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
                    extract_images(pdf, images_folder, progress_var, total_pages, base_progress=base, progress_step=task_step, pdf_filename=pdf_filename)

        print("提取完成!")
    except Exception as e:
        # Top-level boundary of the worker: report instead of crashing.
        print(f"发生错误: {str(e)}")
    finally:
        progress_var.set(0)
 
def open_files():
    """Ask for one or more PDF files and store the selection in ``pdf_paths_var``."""
    selected = filedialog.askopenfilenames(filetypes=[("PDF files", "*.pdf")])
    if not selected:
        # Dialog was cancelled: keep the previous selection untouched.
        return
    pdf_paths_var.set(list(selected))
    print(f'已选择{selected}作为PDF文件来处理!!!')
 
def save_file():
    """Ask for the target ``.xlsx`` path and store it in ``excel_path_var``."""
    excel_types = [("Excel files", "*.xlsx")]
    chosen = filedialog.asksaveasfilename(defaultextension=".xlsx", filetypes=excel_types)
    if not chosen:
        # Dialog was cancelled: keep the previous path untouched.
        return
    excel_path_var.set(chosen)
    print(f'已选择{chosen}作为Excel文件的保存表格!!!')
 
def select_folder():
    """Ask for the image output directory and store it in ``images_folder_var``."""
    chosen_dir = filedialog.askdirectory()
    if not chosen_dir:
        # Dialog was cancelled: keep the previous folder untouched.
        return
    images_folder_var.set(chosen_dir)
    print(f'已选择{chosen_dir}作为图片文件保存位置!!!')
def run_extraction():
    """Read the form values and start ``extract_from_pdf`` in a worker thread.

    Nothing is started unless at least one PDF path and at least one output
    destination (Excel path or image folder) are set.
    """
    import ast  # local import keeps this block self-contained

    raw_paths = pdf_paths_var.get()
    excel_path = excel_path_var.get()
    images_folder = images_folder_var.get()

    # open_files() stores a Python list in the variable, and the original
    # code relied on get() yielding the textual form of a Python sequence.
    # That text is also editable through the entry widget, so parse it with
    # literal_eval (literals only) instead of eval, which would execute
    # whatever the user typed.
    text = raw_paths.strip()
    if not text:
        # Nothing selected yet; eval('') used to raise SyntaxError here.
        pdf_paths = []
    else:
        try:
            pdf_paths = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            # Not a Python literal: treat the text as one hand-typed path.
            pdf_paths = [text]
        else:
            if isinstance(pdf_paths, str):
                pdf_paths = [pdf_paths]
            elif not isinstance(pdf_paths, (list, tuple)):
                pdf_paths = [text]

    if pdf_paths and (excel_path or images_folder):
        print("正在提取...")
        threading.Thread(target=extract_from_pdf, args=(pdf_paths, excel_path, images_folder, progress_var, status_var, extract_tables_var, extract_images_var)).start()
 
 
def print_to_display(content):
    """Append ``content`` followed by a newline to ``display_box`` and scroll to the end."""
    line = content + '\n'
    display_box.insert(tk.END, line)
    # Keep the most recent output visible.
    display_box.see(tk.END)
 
# Create the main window
root = tk.Tk()

root.title("PDF 表格与图片提取器")

# Initial window position, derived from one third of the screen size.
x_1 = int(root.winfo_screenwidth() / 3 - root.winfo_reqwidth() / 3)
y_1 = int(root.winfo_screenheight() / 3 - root.winfo_reqheight() / 3)

root.geometry(f"680x480+{x_1}+{y_1}")
root.configure(bg='#F5F5F5')

# Optional background image: the UI must still start when the file is
# missing or cannot be loaded, so failures are reported and then ignored.
try:
    background_image = Image.open("background_data_1.png")
    background_image = background_image.resize((700, 480), Image.Resampling.LANCZOS)
    # Keep the PhotoImage in a module-level name so it is not garbage-collected.
    background_image = ImageTk.PhotoImage(background_image)
    canvas_main_bank = tk.Canvas(root, width=700, height=480)
    canvas_main_bank.pack(fill="both", expand=True)
    canvas_main_bank.create_image(0, 0, image=background_image, anchor="nw")
except Exception as e:
    print(f"背景图片未加载: {e}")

title_label = tk.Label(root, text="PDF 表格与图片提取器", font=("Arial", 18))
title_label.place(x=230, y=5)

# Tk variables shared between the widgets and the callbacks defined above.
pdf_paths_var = tk.StringVar()
excel_path_var = tk.StringVar()
images_folder_var = tk.StringVar()
status_var = tk.StringVar()
progress_var = tk.DoubleVar()
extract_tables_var = tk.IntVar(value=1)
extract_images_var = tk.IntVar(value=1)

# Row 1: choose the PDF files
label_pdf = tk.Label(root, text="选择PDF文件:")
label_pdf.place(x=60, y=60)

pdf_entry = tk.Entry(root, textvariable=pdf_paths_var, width=50)
pdf_entry.place(x=180, y=60)

button_browse = tk.Button(root, text="浏览", command=open_files)
button_browse.place(x=540, y=60)

# Row 2: choose the Excel file to save to
label_excel = tk.Label(root, text="保存为Excel文件:")
label_excel.place(x=60, y=100)

excel_entry = tk.Entry(root, textvariable=excel_path_var, width=50)
excel_entry.place(x=180, y=100)

button_save = tk.Button(root, text="浏览", command=save_file)
button_save.place(x=540, y=100)

# Row 3: choose the folder the images are saved to
label_folder = tk.Label(root, text="选择图片保存文件夹:")
label_folder.place(x=60, y=140)

images_entry = tk.Entry(root, textvariable=images_folder_var, width=50)
images_entry.place(x=180, y=140)

button_select = tk.Button(root, text="浏览", command=select_folder)
button_select.place(x=540, y=140)

# Extraction options
check_tables = tk.Checkbutton(root, text="提取表格", variable=extract_tables_var)
check_tables.place(x=160, y=180)

check_images = tk.Checkbutton(root, text="提取图片", variable=extract_images_var)
check_images.place(x=260, y=180)

# Start button
button_start = tk.Button(root, text="开始提取", command=run_extraction, bg="green")
button_start.place(x=400, y=180)

# NOTE(review): the status label and the progress bar are placed at the same
# coordinates, and nothing visible in this file ever sets status_var.
status_label = tk.Label(root, textvariable=status_var)
status_label.place(x=200, y=220)

# Progress bar
progress_bar = ttk.Progressbar(root, variable=progress_var, maximum=100)
progress_bar.place(x=200, y=220, width=280)

# Scrollbar for the output box
scrollbar = tk.Scrollbar(root)
scrollbar.place(x=560, y=260, relheight=0.33, anchor=tk.N)
# Text output box
display_box = tk.Text(root, yscrollcommand=scrollbar.set, height=12, width=70, bd=0)
display_box.place(x=60, y=260)

scrollbar.config(command=display_box.yview)

root.mainloop()

免费评分

参与人数 3吾爱币 +8 热心值 +2 收起 理由
lyhjh + 1 我很赞同!
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
weidechan + 1 谢谢@Thanks!配个界面运行效果图就更好了

查看全部评分

本帖被以下淘专辑推荐:

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

lozhuo 发表于 2024-6-28 09:48
ianlcc 发表于 2024-6-27 16:17
请问您有试过吗?
EXE档还满大的
我找了一个10页的PDF测试,运行後没反应…

exe为什么比较大我不清楚,我只是用pyinstaller把楼主的代码生成了exe文件。
刚才我用一个22页的PDF测试了exe,PDF中有两个表格,其中一个表格跨页了,能正常提取表格,但未提取图片。又用楼主的代码测试了一下,表格和图片都提取成功。
lozhuo 发表于 2024-6-26 14:48
本帖最后由 lozhuo 于 2024-6-27 09:46 编辑

仅对楼主提供的代码进行打包,若有侵权或违反版规,请及时联系我删除!
python提取PDF图片和表格:
阿里云:https://www.alipan.com/s/pufikUMy8Ti
123云盘:https://www.123pan.com/s/Vg5Jjv-KbKKv.html

免费评分

参与人数 1吾爱币 +1 热心值 +1 收起 理由
alice2wu + 1 + 1 谢谢@Thanks!

查看全部评分

zhufuan 发表于 2024-6-26 02:22
cxx0515 发表于 2024-6-26 07:27
感谢楼主分享,学习一下
wjbg2022 发表于 2024-6-26 08:03
界面可以截图一下吗?大佬!
zuxin521 发表于 2024-6-26 08:10
GUI的界面有吗?啥样子的?
1045837055lucy 发表于 2024-6-26 08:42
同求.exe程序,先行感谢。
zpwz 发表于 2024-6-26 08:47
感谢分享,期待成品
aoxuehanshuang 发表于 2024-6-26 08:54
同求.exe程序,先行感谢
yaphoo 发表于 2024-6-26 09:21
感谢分享!
yyrocku2 发表于 2024-6-26 11:00
这个牛逼牛逼啊
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2025-4-13 16:05

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表