吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 2411|回复: 45
收起左侧

[Python 原创] python提取PDF图片和表格(带GUI界面)

  [复制链接]
Eks6666 发表于 2024-6-25 23:21
本帖最后由 Eks6666 于 2024-6-25 23:23 编辑

[Python] 纯文本查看 复制代码
import tkinter as tk
from tkinter import filedialog, ttk
import threading
import pdfplumber
import openpyxl
from openpyxl import Workbook
import sys

from PIL import Image, ImageTk
def extract_tables(pdf, ws, progress_var, total_pages, base_progress=0, progress_step=1):
    """Append every table row found in ``pdf`` to ``ws`` and report progress per page.

    Args:
        pdf: Object exposing ``pages``; each page must provide
            ``extract_tables()`` returning the tables of that page, each
            table being an iterable of rows.
        ws: Worksheet-like object; every row is handed to ``ws.append``.
        progress_var: Object with a ``set(value)`` method; it receives the
            current progress once per processed page.
        total_pages: Number of pages used as the progress denominator.
        base_progress: Progress value already reached before this call.
        progress_step: Size of the progress range covered by this call.
    """
    for i, page in enumerate(pdf.pages):
        # Extract once per page. The previous version called extract_tables()
        # twice (truthiness check + iteration), doing the detection work twice.
        for table in page.extract_tables() or ():
            for row in table:
                ws.append(row)
        current_progress = base_progress + (i + 1) / total_pages * progress_step
        progress_var.set(current_progress)

def extract_images(pdf, images_folder, progress_var, total_pages, base_progress=0, progress_step=1, pdf_filename=''):
    """Render every image region of ``pdf`` to a PNG file and report progress per page.

    For each entry of ``page.images`` the page is cropped to the entry's
    bounding box (``x0``, ``top``, ``x1``, ``bottom``), rendered at a
    resolution of 300 and saved as
    ``{images_folder}/{pdf_filename}_page_{page}_{index}.png`` (both numbers
    are 1-based). Each saved file name is printed.

    Args:
        pdf: Object exposing ``pages``; each page must provide ``images`` and
            ``within_bbox(bbox)``.
        images_folder: Destination folder prefix for the PNG files.
        progress_var: Object with a ``set(value)`` method; it receives the
            current progress once per processed page.
        total_pages: Number of pages used as the progress denominator.
        base_progress: Progress value already reached before this call.
        progress_step: Size of the progress range covered by this call.
        pdf_filename: Name prefix used for the generated image files.
    """
    for i, page in enumerate(pdf.pages):
        for image_index, image in enumerate(page.images):
            bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
            cropped_image = page.within_bbox(bbox)
            if not cropped_image:
                # Nothing usable was returned for this region; skip it.
                continue
            img = cropped_image.to_image(resolution=300)
            img_filename = f"{images_folder}/{pdf_filename}_page_{i+1}_{image_index+1}.png"
            img.save(img_filename)
            print(img_filename)
        current_progress = base_progress + (i + 1) / total_pages * progress_step
        # Publish the progress like extract_tables() does. The previous
        # version only printed the number, so the progress variable never
        # changed while images were being extracted.
        progress_var.set(current_progress)

def extract_from_pdf(pdf_paths, excel_path, images_folder, progress_var, status_var, extract_tables_var, extract_images_var):
    """Extract tables and/or images from each PDF in ``pdf_paths``.

    Tables of the j-th PDF (1-based) are written to a new workbook saved next
    to ``excel_path`` with ``_{j}`` inserted before the extension. Images are
    written into ``images_folder`` with the PDF's base name as prefix.
    Any exception is caught and printed; ``progress_var`` is always reset to
    0 at the end.

    Args:
        pdf_paths: Sequence of PDF file paths.
        excel_path: Target workbook path; empty disables table extraction.
        images_folder: Target folder; empty disables image extraction.
        progress_var: Object with ``set(value)``; progress runs from 0 to 100.
        status_var: Accepted for interface compatibility; not used here.
        extract_tables_var: Object whose ``get()`` returns 1 to extract tables.
        extract_images_var: Object whose ``get()`` returns 1 to extract images.

    NOTE(review): run_extraction() starts this function in a worker thread,
    so ``progress_var.set`` is called off the Tk main thread — confirm that
    this is acceptable for the Tk build in use.
    """
    import os  # function-scope import: the file's import block has no `os`

    try:
        total_pdfs = len(pdf_paths)
        if total_pdfs == 0:
            # Previously this surfaced as a caught ZeroDivisionError.
            print("发生错误: 未选择PDF文件")
            return
        progress_step = 100 / total_pdfs

        # A task runs only when it is ticked AND has a destination. Before,
        # an empty excel_path made wb.save('') abort the whole run and an
        # empty images_folder produced file names starting with "/".
        want_tables = extract_tables_var.get() == 1 and bool(excel_path)
        want_images = extract_images_var.get() == 1 and bool(images_folder)

        # Each enabled task gets an equal share of the per-PDF progress
        # range so the bar does not jump back when the second task starts.
        task_share = progress_step / max(want_tables + want_images, 1)

        # splitext handles any extension case and paths without extension;
        # str.replace('.xlsx', ...) left such paths unchanged, so every PDF
        # overwrote the same workbook.
        excel_root, excel_ext = os.path.splitext(excel_path)
        if not excel_ext:
            excel_ext = '.xlsx'

        for j, pdf_path in enumerate(pdf_paths):
            base = j * progress_step
            with pdfplumber.open(pdf_path) as pdf:
                total_pages = len(pdf.pages)

                if want_tables:
                    wb = Workbook()
                    ws = wb.active
                    extract_tables(pdf, ws, progress_var, total_pages, base_progress=base, progress_step=task_share)
                    wb.save(f'{excel_root}_{j+1}{excel_ext}')
                    base += task_share

                if want_images:
                    # basename/splitext strips only the directory and the
                    # final extension, whatever its letter case.
                    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
                    extract_images(pdf, images_folder, progress_var, total_pages, base_progress=base, progress_step=task_share, pdf_filename=pdf_filename)

        print("提取完成!")
    except Exception as e:
        # Top-level boundary of the worker: report instead of dying silently.
        print(f"发生错误: {str(e)}")
    finally:
        progress_var.set(0)

def open_files():
    """Let the user pick one or more PDF files and remember the selection.

    The chosen paths are stored (as a list) in ``pdf_paths_var`` and echoed
    with ``print``. Cancelling the dialog leaves everything untouched.
    """
    chosen = filedialog.askopenfilenames(filetypes=[("PDF files", "*.pdf")])
    if not chosen:
        return
    pdf_paths_var.set(list(chosen))
    print(f'已选择{chosen}作为PDF文件来处理!!!')

def save_file():
    """Let the user choose the Excel output file and remember the path.

    The dialog defaults to the ``.xlsx`` extension. The chosen path is stored
    in ``excel_path_var`` and echoed with ``print``; cancelling does nothing.
    """
    target = filedialog.asksaveasfilename(
        defaultextension=".xlsx",
        filetypes=[("Excel files", "*.xlsx")],
    )
    if not target:
        return
    excel_path_var.set(target)
    print(f'已选择{target}作为Excel文件的保存表格!!!')

def select_folder():
    """Let the user choose the image output folder and remember the path.

    The chosen folder is stored in ``images_folder_var`` and echoed with
    ``print``; cancelling the dialog does nothing.
    """
    chosen_dir = filedialog.askdirectory()
    if not chosen_dir:
        return
    images_folder_var.set(chosen_dir)
    print(f'已选择{chosen_dir}作为图片文件保存位置!!!')
def run_extraction():
    """Read the form values and start the extraction in a background thread.

    Nothing is started unless at least one PDF path and at least one
    destination (Excel path or image folder) are present.
    """
    import ast  # function-scope import: the file's import block has no `ast`

    raw_paths = pdf_paths_var.get()
    excel_path = excel_path_var.get()
    images_folder = images_folder_var.get()

    # open_files() stores the selection with pdf_paths_var.set(list(...)) and
    # the value is read back here as text. The same variable backs an
    # editable Entry, so the text may be anything the user typed: parse it as
    # a literal only (never eval(), which would execute arbitrary
    # expressions) and otherwise treat the text as a single path.
    try:
        parsed = ast.literal_eval(raw_paths)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        pdf_paths = [str(path) for path in parsed]
    else:
        single_path = raw_paths.strip()
        pdf_paths = [single_path] if single_path else []

    if pdf_paths and (excel_path or images_folder):
        print("正在提取...")
        threading.Thread(target=extract_from_pdf, args=(pdf_paths, excel_path, images_folder, progress_var, status_var, extract_tables_var, extract_images_var)).start()


def print_to_display(content):
    """Append ``content`` as a new line to ``display_box`` and scroll to the end."""
    line = content + '\n'
    display_box.insert(tk.END, line)
    # Keep the most recent output visible.
    display_box.see(tk.END)

# Create the main window.
root = tk.Tk()
root.title("PDF 表格与图片提取器")

# Window offset: a third of the screen size minus a third of the window's
# requested size, truncated to whole pixels.
screen_w = root.winfo_screenwidth()
screen_h = root.winfo_screenheight()
req_w = root.winfo_reqwidth()
req_h = root.winfo_reqheight()
x_1 = int(screen_w / 3 - req_w / 3)
y_1 = int(screen_h / 3 - req_h / 3)

# Fixed 680x480 window at the computed offset, light grey background.
root.geometry(f"680x480+{x_1}+{y_1}")
root.configure(bg='#F5F5F5')

# Optional decorative background: load "background_data_1.png" from the
# current working directory, scale it to 700x480 and draw it on a canvas
# that fills the window.
try:
    background_image = Image.open("background_data_1.png")
    background_image = background_image.resize((700, 480), Image.Resampling.LANCZOS)
    background_image = ImageTk.PhotoImage(background_image)
    canvas_main_bank = tk.Canvas(root, width=700, height=480)
    canvas_main_bank.pack(fill="both", expand=True)
    canvas_main_bank.create_image(0, 0, image=background_image, anchor="nw")
except Exception:
    # Best effort only: the UI works without the picture, so any failure to
    # load or draw it is ignored. `Exception` (instead of a bare `except:`)
    # lets KeyboardInterrupt and SystemExit propagate.
    pass

# Heading shown at the top of the window.
title_label = tk.Label(root, text="PDF 表格与图片提取器", font=("Arial", 18))
title_label.place(x=230, y=5)



# Tk variables shared between the widgets and the callbacks.
pdf_paths_var = tk.StringVar()  # selected PDF paths; written by open_files(), parsed by run_extraction()
excel_path_var = tk.StringVar()  # Excel output path; written by save_file()
images_folder_var = tk.StringVar()  # image output folder; written by select_folder()
status_var = tk.StringVar()  # bound to status_label; NOTE(review): no visible code ever sets it
progress_var = tk.DoubleVar()  # drives the progress bar (bar maximum is 100)
extract_tables_var = tk.IntVar(value=1)  # checkbox state, ticked by default
extract_images_var = tk.IntVar(value=1)  # checkbox state, ticked by default

# Row 1 (y=60): PDF selection — label, path entry, browse button.
label_pdf = tk.Label(root, text="选择PDF文件:")
label_pdf.place(x=60, y=60)  # label column starts at x=60

pdf_entry = tk.Entry(root, textvariable=pdf_paths_var, width=50)
pdf_entry.place(x=180, y=60)  # entry column starts at x=180

button_browse = tk.Button(root, text="浏览", command=open_files)
button_browse.place(x=540, y=60)  # button column starts at x=540

# Row 2 (y=100): Excel output file.
label_excel = tk.Label(root, text="保存为Excel文件:")
label_excel.place(x=60, y=100)  # rows are spaced 40 px apart

excel_entry = tk.Entry(root, textvariable=excel_path_var, width=50)
excel_entry.place(x=180, y=100)

button_save = tk.Button(root, text="浏览", command=save_file)
button_save.place(x=540, y=100)

# Row 3 (y=140): image output folder.
label_folder = tk.Label(root, text="选择图片保存文件夹:")
label_folder.place(x=60, y=140)

images_entry = tk.Entry(root, textvariable=images_folder_var, width=50)
images_entry.place(x=180, y=140)

button_select = tk.Button(root, text="浏览", command=select_folder)
button_select.place(x=540,y=140)

# Extraction options (both checkboxes and the start button share y=180).
check_tables = tk.Checkbutton(root, text="提取表格", variable=extract_tables_var)
check_tables.place(x=160, y=180)

check_images = tk.Checkbutton(root, text="提取图片", variable=extract_images_var)
check_images.place(x=260, y=180)

# Start button: runs run_extraction().
button_start = tk.Button(root, text="开始提取", command=run_extraction, bg="green")
button_start.place(x=400, y=180)  # adjust the position as needed

# NOTE(review): status_label and progress_bar are placed at the same
# coordinates (x=200, y=220), so they overlap — confirm which one is meant
# to be visible.
status_label = tk.Label(root, textvariable=status_var)
status_label.place(x=200, y=220)

# Progress bar bound to progress_var, range 0..100.
progress_bar = ttk.Progressbar(root, variable=progress_var, maximum=100)
progress_bar.place(x=200, y=220, width=280)  # fixed width of 280 px


# Vertical scrollbar for the output box.
scrollbar = tk.Scrollbar(root)
scrollbar.place(x=560, y=260, relheight=0.33, anchor=tk.N)
# Text output box; print_to_display() appends to it.
# NOTE(review): nothing visible in this file calls print_to_display() or
# redirects print() here, so the box may stay empty — confirm.
display_box = tk.Text(root, yscrollcommand=scrollbar.set, height=12, width=70, bd=0)
display_box.place(x=60, y=260)

# Link the scrollbar back to the text box so dragging it scrolls the text.
scrollbar.config(command=display_box.yview)

# Enter the Tk event loop; this call blocks until the window is closed.
root.mainloop()

免费评分

参与人数 2吾爱币 +7 热心值 +2 收起 理由
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
weidechan + 1 谢谢@Thanks!配个界面运行效果图就更好了

查看全部评分

本帖被以下淘专辑推荐:

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

lozhuo 发表于 2024-6-28 09:48
ianlcc 发表于 2024-6-27 16:17
请问您有试过吗?
EXE档还满大的
我找了一个10页的PDF测试,运行後没反应…

exe为什么比较大我不清楚,我只是用pyinstaller把楼主的代码生成了exe文件。
刚才我用一个22页的PDF测试了exe,PDF中有两个表格,其中一个表格跨页了,能正常提取表格,但未提取图片。又用楼主的代码测试了一下,表格和图片都提取成功。
lozhuo 发表于 2024-6-26 14:48
本帖最后由 lozhuo 于 2024-6-27 09:46 编辑

仅对楼主提供的代码进行打包,若有侵权或违反版规,请及时联系我删除!
python提取PDF图片和表格:
阿里云:https://www.alipan.com/s/pufikUMy8Ti
123云盘:https://www.123pan.com/s/Vg5Jjv-KbKKv.html

免费评分

参与人数 1吾爱币 +1 热心值 +1 收起 理由
alice2wu + 1 + 1 谢谢@Thanks!

查看全部评分

zhufuan 发表于 2024-6-26 02:22
cxx0515 发表于 2024-6-26 07:27
感谢楼主分享,学习一下
wjbg2022 发表于 2024-6-26 08:03
界面可以截图一下吗?大佬!
zuxin521 发表于 2024-6-26 08:10
GUI的界面有吗?啥样子的?
1045837055lucy 发表于 2024-6-26 08:42
同求.exe程序,先行感谢。
zpwz 发表于 2024-6-26 08:47
感谢分享,期待成品
aoxuehanshuang 发表于 2024-6-26 08:54
同求.exe程序,先行感谢
yaphoo 发表于 2024-6-26 09:21
感谢分享!
yyrocku2 发表于 2024-6-26 11:00
这个牛逼牛逼啊
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-11-24 12:38

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表