python提取PDF文档表格数据格式不对

thornjay · 发表于 2024-8-17 17:02

本帖最后由 thornjay 于 2024-8-19 09:07 编辑

各位大佬，我是python新手，在这里向各位大佬请教一下：
以下是我写的代码，主要目的是电脑E盘有一个叫jf的文件夹，里面有很多PDF文件，每个PDF文件里都有一个表格，我想用python写一段代码，要求能自动提取E:\JF文件夹里所有PDF文件里面表格中的订单编号、品号、品名、规格、数量等对应的值，并在该文件夹下生成一个叫计划的EXCEL表。我的代码如下：
import os
import pandas as pd
from pdfplumber import open as pdf_open
from openpyxl import load_workbook

# Folder holding the source PDFs; the summary workbook is written into the same folder.
folder_path = r'E:\JF'
output_excel = os.path.join(folder_path, '计划.xlsx')

# Remove a workbook left over from a previous run so the result is rebuilt from scratch.
if os.path.exists(output_excel):
    os.remove(output_excel)

# One dict per extracted product row; turned into a DataFrame at the end.
data = []

# Table layout, taken from the positions the original script used.
# NOTE(review): confirm these positions against the real PDF layout.
ORDER_LABEL = '订单编号'
TOTAL_LABEL = '合计'
HEADER_ROWS = 5      # rows 0-4 are never treated as data
FIRST_DATA_ROW = 6   # product rows are read from index 6 onwards
OUTPUT_COLUMNS = ['订单编号', '品号', '品名', '规格', '数量', '交货日期']


def _cell_text(row, column):
    """Return the stripped text of row[column], or '' if the cell is missing or empty."""
    if column >= len(row) or not row[column]:
        return ''
    return str(row[column]).strip()


def _find_order_number(table):
    """Return the order number found in the header rows of *table*, or ''.

    The header rows are searched for a cell containing the order-number label.
    The value is taken from the text following the label inside that cell, or,
    when the label stands alone, from the first non-empty cell to its right
    (presumably merged cells come back empty from pdfplumber - TODO confirm).
    If no label is found, the original fixed position (row 1, column 4) is used.
    """
    for row in table[:HEADER_ROWS]:
        for position, cell in enumerate(row):
            text = str(cell) if cell else ''
            if ORDER_LABEL not in text:
                continue
            # Same-cell form such as "<label>: <value>"; keep only the first token.
            inline = text.split(ORDER_LABEL, 1)[1].strip(' :：\t\r\n')
            if inline:
                return inline.split()[0]
            for neighbour in row[position + 1:]:
                neighbour_text = str(neighbour).strip() if neighbour else ''
                if neighbour_text:
                    return neighbour_text
    return _cell_text(table[0], 3) if table else ''


# Process the PDFs in name order so the output is reproducible.
for filename in sorted(os.listdir(folder_path)):
    # Accept ".PDF" as well as ".pdf".
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(folder_path, filename)
    order_number = ''        # reset for every file
    is_total_found = False   # set once the "total" row has been reached

    with pdf_open(pdf_path) as pdf:
        for page in pdf.pages:
            # Nothing after the total row is data, so later pages can be skipped.
            if is_total_found:
                break

            table = page.extract_table()
            if not table:
                continue

            # Keep trying on later pages until an order number has been found.
            if not order_number:
                order_number = _find_order_number(table)

            for index, row in enumerate(table):
                if index < HEADER_ROWS:
                    continue

                # The total row marks the end of the product rows.
                if any(TOTAL_LABEL in str(cell) for cell in row if cell):
                    is_total_found = True
                    break

                # NOTE(review): the original script also skipped index 5; kept as-is.
                if index < FIRST_DATA_ROW:
                    continue

                product_code = _cell_text(row, 0)
                product_name = _cell_text(row, 1)
                specification = _cell_text(row, 2)
                quantity = _cell_text(row, 4)
                # Kept as text exactly as extracted from the PDF.
                # NOTE(review): if Excel still shows a number here, the PDF cell itself
                # probably holds a number, or column 8 is not the date column - inspect
                # the raw row to confirm.
                delivery_date = _cell_text(row, 7)

                # Skip rows in which every extracted field is blank.
                if not any((product_code, product_name, specification,
                            quantity, delivery_date)):
                    continue

                data.append({
                    '订单编号': order_number,
                    '品号': product_code,
                    '品名': product_name,
                    '规格': specification,
                    '数量': quantity,
                    '交货日期': delivery_date
                })

# Passing the column list keeps the header row even when nothing was extracted.
df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)

# Write the collected rows to the Excel workbook.
df.to_excel(output_excel, index=False, engine='openpyxl')

print(f'数据已成功提取并保存到 {output_excel}')

以上代码运行后，会生成EXCEL表，但是在EXCEL表里，订单编号不会根据每个PDF文档表格里的订单编号生成对应的编号，而是空白；在交货日期里，提取出来的不会显示日期，而是数值，请问要怎样调整代码？
下面图2是PDF文档里的原始表，图1是生成的EXCEL表格：订单编号一列没有对应数据，交货日期一列显示的是数值而不是日期格式：

许我浅笑而安 · 发表于 2024-8-17 23:04

本帖最后由许我浅笑而安° 于 2024-8-17 23:07 编辑

PDF传一下看看

[Python] 纯文本查看 复制代码

# ...（之前的代码保持不变）  
  
# Walk the PDFs in name order; every file starts with a fresh order number
# and a fresh "total row seen" flag.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pdf'):
        continue

    pdf_path = os.path.join(folder_path, filename)
    order_number = None
    is_total_found = False

    with pdf_open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                continue

            # Order number: row 1, column 4 of the first table that provides one
            # (the index must match the PDF's table structure).
            if not order_number:
                first_row = table[0]
                order_number = first_row[3] if len(first_row) > 3 else None

            for index, row in enumerate(table):
                if index < 5:
                    continue

                # Stop reading rows once a cell containing "合计" has been seen;
                # the flag is kept for the remaining pages of this file.
                if not is_total_found:
                    is_total_found = any("合计" in str(cell).upper() for cell in row)
                if is_total_found:
                    break

                if index < 6:
                    continue

                # Columns 1, 2, 3, 5 and 8 hold code, name, spec, quantity and date.
                values = [row[col].strip() if row[col] else '' for col in (0, 1, 2, 4, 7)]
                # The delivery date is explicitly stored as a string.
                values[-1] = str(values[-1])

                data.append(dict(zip(
                    ('订单编号', '品号', '品名', '规格', '数量', '交货日期'),
                    [order_number, *values],
                )))
  
# ...（之后的代码保持不变）

matxi · 发表于 2024-8-18 10:19

[Python] 纯文本查看 复制代码

import os
import pandas as pd
import re
from pdfplumber import open as pdf_open
from openpyxl import load_workbook

# Folder holding the source PDFs; the summary workbook is written into the same folder.
folder_path = r'E:\JF'
output_excel = os.path.join(folder_path, '计划.xlsx')

# Remove a workbook left over from a previous run so the result is rebuilt from scratch.
if os.path.exists(output_excel):
    os.remove(output_excel)

# One dict per extracted product row; turned into a DataFrame at the end.
data = []

# Order number as it appears in the page text after its label.
# Letters, digits, '-', '_' and '/' are accepted so that identifiers which are
# not purely numeric are captured in full instead of being cut at the first
# non-digit character.
# NOTE(review): adjust the character class if real order numbers use other symbols.
order_number_pattern = re.compile(r"订单编号[:：]?\s*([A-Za-z0-9][A-Za-z0-9_\-/]*)")

TOTAL_LABEL = '合计'
HEADER_ROWS = 5      # rows 0-4 are never treated as data
FIRST_DATA_ROW = 6   # product rows are read from index 6 onwards
OUTPUT_COLUMNS = ['订单编号', '品号', '品名', '规格', '数量', '交货日期']


def _cell_text(row, column):
    """Return the stripped text of row[column], or '' if the cell is missing or empty."""
    if column >= len(row) or not row[column]:
        return ''
    return str(row[column]).strip()


def _format_date(text):
    """Return *text* as 'YYYY-MM-DD' when pandas can parse it, otherwise unchanged."""
    if not text:
        return text
    try:
        return pd.to_datetime(text).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError):
        # Not a recognisable date: keep the raw text rather than lose it.
        return text


# Process the PDFs in name order so the output is reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.pdf'):
        continue

    pdf_path = os.path.join(folder_path, filename)
    order_number = None      # reset for every file
    is_total_found = False   # set once the "total" row has been reached

    with pdf_open(pdf_path) as pdf:
        for page in pdf.pages:
            # Try the page text until an order number has been found.
            if not order_number:
                text = page.extract_text()
                match = order_number_pattern.search(text) if text else None
                if match:
                    order_number = match.group(1)

            table = page.extract_table()
            if not table:
                continue

            for index, row in enumerate(table):
                if index < HEADER_ROWS:
                    continue

                # The total row marks the end of the product rows; the flag is
                # kept for the remaining pages of this file.
                if not is_total_found:
                    is_total_found = any(TOTAL_LABEL in str(cell) for cell in row)
                if is_total_found:
                    break

                # NOTE(review): the original script also skipped index 5; kept as-is.
                if index < FIRST_DATA_ROW:
                    continue

                product_code = _cell_text(row, 0)
                product_name = _cell_text(row, 1)
                specification = _cell_text(row, 2)
                quantity = _cell_text(row, 4)
                # Stored as text so Excel does not reinterpret the value.
                # NOTE(review): if Excel still shows a number, inspect the raw cell -
                # the PDF may hold a number there, or column 8 may not be the date.
                delivery_date = _format_date(_cell_text(row, 7))

                # Fall back to the file name so the column is never empty.
                if not order_number:
                    order_number = filename

                data.append({
                    '订单编号': order_number,
                    '品号': product_code,
                    '品名': product_name,
                    '规格': specification,
                    '数量': quantity,
                    '交货日期': delivery_date
                })

# Passing the column list keeps the header row even when nothing was extracted.
df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)

# Write the collected rows to the Excel workbook.
df.to_excel(output_excel, index=False, engine='openpyxl')

print(f'数据已成功提取并保存到 {output_excel}')

试一下这个

china-ray · 发表于 2024-8-18 11:09

问题里没有图，还想学习学习呢！不知道这PDF是不是业务系统生成的？如果是图片或扫描件里的表格，应该提取不了吧？

thornjay · 发表于 2024-8-19 09:00

许我浅笑而安° 发表于 2024-8-17 23:04
PDF传一下看看
[mw_shl_code=python,true]# ...（之前的代码保持不变）

PDF文档表格如图

thornjay · 发表于 2024-8-19 09:02

china-ray 发表于 2024-8-18 11:09
问题里没有图，还想学习学习哪！不知道这PDF是不是业务系统形成的？那种图片或扫描件的表格应该不行吧？

不知道为什么图没显示：
图1图2如下：

thornjay · 发表于 2024-8-19 09:12

matxi 发表于 2024-8-18 10:19
[mw_shl_code=python,true]import os
import pandas as pd
import re

您好！首先非常感谢您的指导，谢谢！提取出来的订单编号不完整，只提取了前几位，其次交货日期显示的不是日期，而是成了数值。

帐号		自动登录	找回密码
密码			注册[Register]

[求助] python提取PDF文档表格数据格式不对