# [Python] View as plain text / Copy code -- web-page residue from the original code listing; not part of the program.
from PIL import Image as PI
import io
import os
import ddddocr
import pandas as pd
import easyocr
import cv2
import fitz # PyMuPDF
from PIL import Image
# Module-level ddddocr recogniser, created once at import time;
# parse_invoice() calls ocr.classification() on each cropped image region.
ocr = ddddocr.DdddOcr()
# Convert a PDF into an image.
def convert_pdf_to_image(pdf_path, output_image_path, dpi=100.0):
    """Render the first page of a PDF and save it as a PNG file.

    Args:
        pdf_path: Path of the PDF document to open.
        output_image_path: Path the PNG image is written to.
        dpi: Rendering resolution in dots per inch. The page is scaled by
            ``dpi / 72`` on both axes. Defaults to 100.0, the value that
            was previously hard-coded.
    """
    # The context manager closes the document even if rendering or saving
    # raises; previously close() was skipped on any exception.
    with fitz.open(pdf_path) as pdf_document:
        first_page = pdf_document[0]
        zoom = dpi / 72
        # alpha=False keeps the sample buffer at three bytes per pixel,
        # which is what the "RGB" mode given to Image.frombytes expects.
        pixmap = first_page.get_pixmap(
            matrix=fitz.Matrix(zoom, zoom),
            alpha=False,
        )
        pil_image = Image.frombytes(
            "RGB", (pixmap.width, pixmap.height), pixmap.samples
        )
        pil_image.save(output_image_path, "PNG")
# Read an invoice image.
def readPic(img_url):
    """Load the image file at ``img_url`` and return it as a PIL image.

    The file is read completely into memory and the image is opened from
    that in-memory buffer rather than from the file object itself.

    Args:
        img_url: Path of the image file on disk.

    Returns:
        The object returned by ``PI.open`` for the file's bytes.
    """
    with open(img_url, 'rb') as image_file:
        raw_bytes = image_file.read()
    buffer = io.BytesIO(raw_bytes)
    return PI.open(buffer)
# Recursively walk a folder and collect image files.
def traverse_directory(path, file_names=None):
    """Collect the paths of all PNG/JPEG files found under ``path``.

    Args:
        path: Root folder; it is walked recursively with ``os.walk``.
        file_names: Optional list that matching paths are appended to.
            A new list is created when it is omitted or ``None``.

    Returns:
        The list holding the matches (the same object as ``file_names``
        when one was passed in).
    """
    if file_names is None:
        file_names = []
    image_suffixes = ('.png', '.jpeg', '.jpg')
    for root, _dirs, files in os.walk(path):
        for filename in files:
            # Compare case-insensitively so that names such as "SCAN.JPG"
            # are picked up as well; they used to be skipped silently.
            if filename.lower().endswith(image_suffixes):
                file_names.append(os.path.join(root, filename))
    return file_names
# Extract the invoice number and the other key fields.
def parse_invoice(new_img):
    """Detect the invoice type of an image and OCR its key fields.

    The image is scaled to a width of 827 pixels (aspect ratio kept), the
    title region is recognised to decide which layout applies, and then
    one fixed rectangle per field is cropped and passed to
    ``ocr.classification``.

    Args:
        new_img: Image object offering ``width``, ``height``, ``size``,
            ``resize`` and ``crop`` (a PIL image when produced by readPic).

    Returns:
        dict with the recognised text under the keys ``voice_no``,
        ``voice_owner``, ``voice_date``, ``voice_amount`` and
        ``voice_amount_cn``, inserted in that order.

    Raises:
        ValueError: If the recognised title contains neither '医疗' nor
            '普通'. Previously this case ended in a bare KeyError because
            the fallback type had no entry in the layout table.
    """
    # Reference sizes noted by the original author:
    #   748 x 500 - outpatient fee receipt
    #   827 x 552 - VAT electronic ordinary invoice
    # NOTE(review): every image is scaled to width 827 regardless of its
    # type; confirm the 'menzhen' rectangles were measured at that width.
    print('识别图片宽、高为:', new_img.width, new_img.height)
    target_width = 827
    scaled_height = int(new_img.size[1] * target_width / new_img.size[0])
    new_img = new_img.resize((target_width, scaled_height))
    print('调整后识别图片宽、高为:', new_img.width, new_img.height)

    # The title region decides which layout is used.
    image_voice_type = new_img.crop((236, 28, 583, 57))
    voice_type = ocr.classification(image_voice_type)
    print('识别出的voice_type为:' + voice_type)
    if '医疗' in voice_type:
        invoice_type = 'menzhen'
    elif '普通' in voice_type:
        invoice_type = 'putong'
    else:
        raise ValueError(
            'Unsupported invoice type; title OCR result: ' + repr(voice_type)
        )

    # Crop rectangles per invoice type, each (left, upper, right, lower).
    invoice_type_map = {
        # Guangdong VAT electronic ordinary invoice
        'putong': {
            'tuple_voice_no': (642, 47, 703, 65),
            'tuple_voice_owner': (137, 122, 187, 137),
            'tuple_voice_date': (641, 70, 739, 88),
            'tuple_voice_amount': (659, 384, 738, 404),
            'tuple_voice_amount_cn': (225, 382, 390, 405)
        },
        # Guangdong medical outpatient fee receipt
        'menzhen': {
            'tuple_voice_no': (584, 88, 650, 102),
            'tuple_voice_owner': (92, 120, 133, 135),
            'tuple_voice_date': (584, 121, 650, 133),
            'tuple_voice_amount': (495, 335, 559, 351),
            'tuple_voice_amount_cn': (145, 335, 299, 351)
        }
    }
    print(invoice_type_map)
    layout = invoice_type_map[invoice_type]

    # (result key, console label) in the order the fields are recognised;
    # this is also the key order of the returned dict.
    field_labels = (
        ('voice_no', '识别出的voice_no为:'),
        ('voice_owner', '识别出的voice_owner为:'),
        ('voice_date', '识别出的voice_date为:'),
        ('voice_amount', '识别出的voice_amount小写为:'),
        ('voice_amount_cn', '识别出的voice_amount_cn大写为:'),
    )
    invoice = {}
    for field, label in field_labels:
        field_image = new_img.crop(layout['tuple_' + field])
        text = ocr.classification(field_image)
        print(label + text)
        invoice[field] = text
    print('--------------------------------------------------')
    return invoice
def get_output_list():
    """Placeholder with no implementation; calling it returns ``None``."""
    return None
# Save the records to an Excel workbook.
def saveExcel(output_list, output_path='发票记录.xlsx'):
    """Write the records to sheet ``sheet_1`` of an Excel workbook.

    Args:
        output_list: Data accepted by ``pd.DataFrame`` (the script passes
            a list of dicts, one per invoice).
        output_path: Path of the workbook to write. Defaults to the
            file name that was previously hard-coded.
    """
    data = pd.DataFrame(output_list)
    # The context manager saves and closes the workbook even if to_excel
    # raises; sheet_name is passed by keyword because pandas deprecates
    # passing it positionally.
    with pd.ExcelWriter(output_path) as writer:
        data.to_excel(
            writer,
            sheet_name='sheet_1',
            float_format='%f',
            header=True,
            index=False,
        )
if __name__ == '__main__':
    # Input folders; pdf_path is assigned but not used below.
    pdf_path = r"xxx\pdf"
    pic_path = r"xxx\png"

    # Gather every image under pic_path.
    file_names = []
    files = traverse_directory(pic_path, file_names=file_names)
    print(files)

    # Recognise each image and remember which file each row came from.
    output_list = []
    for img_url in files:
        invoice = parse_invoice(readPic(img_url=img_url))
        invoice['img_url'] = img_url
        output_list.append(invoice)

    # Write all rows to the workbook in one go.
    saveExcel(output_list)