我想从多个不同 PDF 的首页中,准确提取特定的文字和数字,然后导出到一个表格里。下面附有待提取页面的图片。希望提取速度快,不需要等很久。请问有没有软件或方法可以实现?目前数据提取的问题已经解决(下面的代码是文心一言写的),想请教一下:怎样才能把提取出的数据分开,比如让"招标人""投标报价""投标人""甲供材料费"各自单独显示在一列?
import os
import re
import fitz # PyMuPDF
import csv
def extract_first_page_text(pdf_path):
    """Return the plain text of the first page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of page 0 as produced by ``get_text("text")``, or an empty
        string if the file could not be opened or read. In that case the
        error is printed rather than raised.
    """
    try:
        # The context manager closes the document even if loading the page
        # or extracting its text fails, so no file handle is leaked when
        # this function is called for many PDFs in a row.
        with fitz.open(pdf_path) as document:
            first_page = document.load_page(0)  # pages are 0-indexed
            return first_page.get_text("text")  # "text" = plain text output
    except Exception as e:
        # Deliberately best-effort: report the problem and return an empty
        # result instead of raising, so one bad file does not abort a batch.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines count as whitespace, so the result is always a single line.
    Leading and trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (empty if *text* was empty or all whitespace).
    """
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    return cleaned_text
def split_text_into_lines(text):
    """Split *text* into a list of lines at each newline character.

    Args:
        text: The string to split.

    Returns:
        A list of the pieces between ``'\\n'`` characters. A string without
        any newline yields a one-element list; empty pieces are kept.
    """
    return text.split('\n')
# NOTE(review): `directory` and `csv_output_path` are used below but were not
# assigned anywhere in this file. The values here are placeholders; point
# them at the real PDF folder and the desired output file.
directory = "."
csv_output_path = "output.csv"

# One row per PDF: [filename, line 1, line 2, ...]
csv_data = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    # Compare case-insensitively so files named "*.PDF" are picked up too.
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(directory, filename)
    print(f"Processing {pdf_path}")

    # Extract text from the first page
    text = extract_first_page_text(pdf_path)

    # Split on the page's line breaks FIRST and tidy each line afterwards.
    # Cleaning the whole page first turns every newline into a space, which
    # leaves nothing to split on and puts the entire page into one cell.
    lines = [clean_text(line) for line in split_text_into_lines(text)]
    # Drop lines that were empty or whitespace-only so they do not produce
    # empty columns.
    lines = [line for line in lines if line]

    # Filename is the first column, then one column per text line.
    csv_data.append([filename] + lines)

# Size the header to the widest row; pages can have different line counts.
max_lines = max((len(row) - 1 for row in csv_data), default=0)
header = ["Filename"] + [f"Line {i + 1}" for i in range(max_lines)]

# "utf-8-sig" prepends a byte-order mark, which lets Excel recognise the file
# as UTF-8 and display Chinese text correctly instead of as mojibake.
with open(csv_output_path, mode='w', newline='', encoding='utf-8-sig') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header)
    writer.writerows(csv_data)

print(f"Data has been written to {csv_output_path}")
import fitz
# Print the first page's text of a single PDF, then print only the lines
# of that text which contain '投标报价'.
pdf_file = "xxx.pdf"

# Read the page inside a context manager so the document is closed as soon
# as the text has been extracted.
with fitz.open(pdf_file) as doc:
    text = doc[0].get_text()

print(text)
for line in text.split("\n"):
    if '投标报价' in line:
        print(line)