python提取PDF文档表格数据格式不对

thornjay 发表于 2024-8-17 17:02

本帖最后由 thornjay 于 2024-8-19 09:07 编辑

各位大佬，我是python新手，在这里向各位大佬请教一下：
以下是我写的代码，主要目的是电脑E盘有一个叫jf的文件夹，里面有很多PDF文件，每个PDF文件里都有一个表格，我想用python写一段代码，要求能自动提取E:\JF文件夹里所有PDF文件里面表格中的订单编号、品号、品名、规格、数量等对应的值，并在该文件夹下生成一个叫计划的EXCEL表。我的代码如下：
import os
import pandas as pd
from pdfplumber import open as pdf_open
from openpyxl import load_workbook

# 定义文件夹路径和输出Excel文件的路径
folder_path = r'E:\JF'
output_excel = os.path.join(folder_path, '计划.xlsx')

# 检查输出文件是否存在，如果存在则删除
if os.path.exists(output_excel):
os.remove(output_excel)

# 初始化一个空的DataFrame来存储数据
data = []

# 遍历文件夹中的所有PDF文件
for filename in sorted(os.listdir(folder_path)):
if filename.endswith('.pdf'):
   pdf_path = os.path.join(folder_path, filename)
   order_number = None# 存储订单编号
   is_total_found = False# 是否找到“合计”字样

   with pdf_open(pdf_path) as pdf:
         # 遍历页面
         for page in pdf.pages:
            # 从PDF页面中提取表格
            table = page.extract_table()
            if not table:
               continue

            # 提取订单编号
            if not order_number and table:
               order_number = table# 第1行第4列

            # 遍历表格的每一行
            for index, row in enumerate(table):
               # 跳过标题行和空行
               if index < 5:
                     continue

               # 检查是否遇到“合计”，如果遇到则设置标志并跳出内层循环
               if not is_total_found:
                     for cell in row:
                        if "合计" in str(cell).upper():# 转换为大写以忽略大小写
                           is_total_found = True
                           break

               if is_total_found:
                     break

               # 从第6行开始提取数据
               if index >= 6:# 从第6行开始
                     product_code = row.strip() if row else ''
                     product_name = row.strip() if row else ''
                     specification = row.strip() if row else ''
                     quantity = row.strip() if row else ''
                     delivery_date = row.strip() if row else ''

                     # 将交货日期作为字符串存储，以避免Excel将其转换为数值
                     data.append({
                        '订单编号': order_number,
                        '品号': product_code,
                        '品名': product_name,
                        '规格': specification,
                        '数量': quantity,
                        '交货日期': delivery_date
                     })

# 使用pandas创建DataFrame
df = pd.DataFrame(data)

# 将DataFrame保存到Excel文件
df.to_excel(output_excel, index=False, engine='openpyxl')

print(f'数据已成功提取并保存到 {output_excel}')

以上代码运行后，会生成EXCEL表，但是在EXCEL表里，订单编号不会根据每个PDF文档表格里的订单编号生成对应的编号，而是空白；在交货日期里，提取出来的不会显示日期，而是数值，请问要怎样调整代码？
下面图2是PDF文档里的原始表，图1是生成的EXCEL表格，订单编号没有对应数据，订单编号没生成日期格式：

许我浅笑而安 发表于 2024-8-17 23:04

本帖最后由许我浅笑而安° 于 2024-8-17 23:07 编辑

PDF传一下看看
# ...（之前的代码保持不变）

for filename in sorted(os.listdir(folder_path)):
if filename.endswith('.pdf'):
   pdf_path = os.path.join(folder_path, filename)
   order_number = None# 确保每次处理新文件时重置订单编号
   is_total_found = False

   with pdf_open(pdf_path) as pdf:
         for page in pdf.pages:
            table = page.extract_table()
            if not table:
               continue

            # 提取订单编号（确保这里的索引与PDF表格结构相匹配）
            if not order_number and table:
               order_number = table if len(table) > 3 else None

            for index, row in enumerate(table):
               if index < 5:
                     continue

               if not is_total_found:
                     for cell in row:
                        if "合计" in str(cell).upper():
                           is_total_found = True
                           break

               if is_total_found:
                     break

               if index >= 6:
                     product_code = row.strip() if row else ''
                     product_name = row.strip() if row else ''
                     specification = row.strip() if row else ''
                     quantity = row.strip() if row else ''

                     # 确保交货日期是字符串格式
                     delivery_date_str = str(row.strip() if row else '')

                     data.append({
                        '订单编号': order_number,
                        '品号': product_code,
                        '品名': product_name,
                        '规格': specification,
                        '数量': quantity,
                        '交货日期': delivery_date_str# 保存为字符串格式
                     })

# ...（之后的代码保持不变）

matxi 发表于 2024-8-18 10:19

import os
import pandas as pd
import re
from pdfplumber import open as pdf_open
from openpyxl import load_workbook

# 定义文件夹路径和输出Excel文件的路径
folder_path = r'E:\JF'
output_excel = os.path.join(folder_path, '计划.xlsx')

# 检查输出文件是否存在，如果存在则删除
if os.path.exists(output_excel):
os.remove(output_excel)

# 初始化一个空的DataFrame来存储数据
data = []

# 正则表达式查找订单编号的模式（假设订单编号为一串数字）
order_number_pattern = re.compile(r"订单编号[:：]?\s*(\d+)")

# 遍历文件夹中的所有PDF文件
for filename in sorted(os.listdir(folder_path)):
if filename.endswith('.pdf'):
   pdf_path = os.path.join(folder_path, filename)
   order_number = None# 存储订单编号
   is_total_found = False# 是否找到“合计”字样

   with pdf_open(pdf_path) as pdf:
         # 遍历页面
         for page in pdf.pages:
            # 尝试从文本中提取订单编号
            if not order_number:
               text = page.extract_text()
               if text:
                     match = re.search(order_number_pattern, text)
                     if match:
                        order_number = match.group(1)

            # 从PDF页面中提取表格
            table = page.extract_table()
            if not table:
               continue

            # 遍历表格的每一行
            for index, row in enumerate(table):
               # 跳过标题行和空行
               if index < 5:
                     continue

               # 检查是否遇到“合计”，如果遇到则设置标志并跳出内层循环
               if not is_total_found:
                     for cell in row:
                        if "合计" in str(cell).upper():# 转换为大写以忽略大小写
                           is_total_found = True
                           break

               if is_total_found:
                     break

               # 从第6行开始提取数据
               if index >= 6:
                     product_code = row.strip() if row else ''
                     product_name = row.strip() if row else ''
                     specification = row.strip() if row else ''
                     quantity = row.strip() if row else ''
                     delivery_date = row.strip() if row else ''

                     # 将交货日期解析为字符串或保留原始值，以避免Excel将其转换为数值
                     try:
                        delivery_date = pd.to_datetime(delivery_date).strftime('%Y-%m-%d')
                     except:
                        pass

                     # 确保订单编号不为空，默认值为PDF文件名
                     if not order_number:
                        order_number = filename

                     # 将数据追加到列表中
                     data.append({
                        '订单编号': order_number,
                        '品号': product_code,
                        '品名': product_name,
                        '规格': specification,
                        '数量': quantity,
                        '交货日期': delivery_date
                     })

# 使用pandas创建DataFrame
df = pd.DataFrame(data)

# 将DataFrame保存到Excel文件
df.to_excel(output_excel, index=False, engine='openpyxl')

print(f'数据已成功提取并保存到 {output_excel}')

试一下这个

china-ray 发表于 2024-8-18 11:09

问题里没有图，还想学习学习哪！不知道这PDF是不是业务系统形成的？那种图片或扫描件的表格应该不行吧？

thornjay 发表于 2024-8-19 09:00

许我浅笑而安° 发表于 2024-8-17 23:04
PDF传一下看看
# ...（之前的代码保持不变）

PDF文档表格如图

thornjay 发表于 2024-8-19 09:02

china-ray 发表于 2024-8-18 11:09
问题里没有图，还想学习学习哪！不知道这PDF是不是业务系统形成的？那种图片或扫描件的表格应该不行吧？

不知道为什么图没显示：
图1图2如下：

thornjay 发表于 2024-8-19 09:12

matxi 发表于 2024-8-18 10:19
import os
import pandas as pd
import re

您好！首先非常感谢您的指导，谢谢！提取出来的订单编号不完整，只提取了前几位，其次交货日期显示的不是日期，而是成了数值。

页: [1]

吾爱破解 - 52pojie.cn's Archiver

python提取PDF文档表格数据格式不对