import os
import pandas as pd
import re
from pdfplumber import open as pdf_open
from openpyxl import load_workbook
# Folder that holds the source PDFs; the Excel workbook is written there too.
folder_path = r'E:\JF'
output_excel = os.path.join(folder_path, '计划.xlsx')

# Order-number label followed by digits. The label may be followed by an
# ASCII or a full-width colon (the pasted pattern listed the ASCII colon
# twice, so the full-width form could never match).
order_number_pattern = re.compile(r"订单编号[::]?\s*(\d+)")

# Marker text that ends the data rows of an order.
TOTAL_MARKER = "合计"

# Rows with a smaller index are treated as header/blank rows and ignored.
FIRST_SCANNED_ROW = 5
# Rows from this index onward are exported.
# NOTE(review): row index 5 is only checked for the total marker and is
# never exported; this mirrors the original script -- confirm against the
# actual PDF layout.
FIRST_DATA_ROW = 6

# Output column name -> position of the source cell inside a table row.
COLUMN_POSITIONS = {
    '品号': 0,
    '品名': 1,
    '规格': 2,
    '数量': 4,
    '交货日期': 7,
}

# Column order of the generated workbook.
OUTPUT_COLUMNS = ['订单编号', *COLUMN_POSITIONS]


def find_order_number(text):
    """Return the order number found in *text*, or None when there is none.

    *text* may be None or empty (a page without extractable text).
    """
    if not text:
        return None
    match = order_number_pattern.search(text)
    return match.group(1) if match else None


def cell_text(row, position):
    """Return the stripped text of ``row[position]``.

    Returns '' when the row is too short or the cell is empty/None, so a
    malformed row no longer aborts the whole run with IndexError.
    """
    if position >= len(row):
        return ''
    value = row[position]
    return str(value).strip() if value else ''


def format_delivery_date(raw):
    """Return *raw* normalised to 'YYYY-MM-DD' when it parses as a date.

    The value is kept as text so Excel does not turn it into a number.
    Anything that cannot be parsed (including '') is returned unchanged.
    """
    if not raw:
        return raw
    try:
        parsed = pd.to_datetime(raw)
    except (ValueError, TypeError, OverflowError):
        # Not a recognisable date: keep the original text.
        return raw
    if pd.isna(parsed):
        return raw
    return parsed.strftime('%Y-%m-%d')


def rows_from_table(table):
    """Collect the data rows of one page table.

    Returns ``(records, total_found)``. ``records`` is a list of dicts keyed
    by the names in COLUMN_POSITIONS; ``total_found`` is True when a row
    containing TOTAL_MARKER was reached, in which case collection stopped
    at that row.
    """
    records = []
    for index, row in enumerate(table):
        if index < FIRST_SCANNED_ROW:
            continue
        if any(TOTAL_MARKER in str(cell) for cell in row):
            return records, True
        if index < FIRST_DATA_ROW:
            continue
        record = {
            name: cell_text(row, position)
            for name, position in COLUMN_POSITIONS.items()
        }
        record['交货日期'] = format_delivery_date(record['交货日期'])
        records.append(record)
    return records, False


def extract_pdf_records(pdf_path, fallback_order_number):
    """Return the output records of a single PDF.

    The order number is searched in the page text until it is found. If it
    is still unknown when the first data rows are collected,
    *fallback_order_number* is used for the rest of the file. Reading stops
    once the total row has been seen.
    """
    records = []
    order_number = None
    with pdf_open(pdf_path) as pdf:
        for page in pdf.pages:
            if not order_number:
                order_number = find_order_number(page.extract_text())
            table = page.extract_table()
            if not table:
                continue
            page_records, total_found = rows_from_table(table)
            if page_records and not order_number:
                order_number = fallback_order_number
            for record in page_records:
                records.append({'订单编号': order_number, **record})
            if total_found:
                break
    return records


def main():
    """Extract the order rows of every PDF in folder_path into output_excel."""
    # Remove a stale workbook first, as the original script did.
    if os.path.exists(output_excel):
        os.remove(output_excel)

    data = []
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive so that files named '*.PDF' are processed too.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        data.extend(extract_pdf_records(pdf_path, filename))

    # Explicit columns keep the header row even when nothing was extracted.
    df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)
    df.to_excel(output_excel, index=False, engine='openpyxl')
    print(f'数据已成功提取并保存到 {output_excel}')


if __name__ == "__main__":
    main()
# Forum note: "试一下这个" (give this one a try).