# Forum code-viewer residue ("[Python] view as plain text / copy code"); not part of the program.
# InvoiceTemp.py
import os, sys, zipfile
from xml.dom.minidom import parse
import pdfplumber
# Full-width / compatibility characters rewritten to their plain equivalents.
# Written as escapes on purpose: in the copy of this file that was reviewed,
# most of these replacements had degraded into identity replacements
# (e.g. ':' -> ':'), so the left-hand sides below are a reconstruction.
# NOTE(review): confirm the left-hand characters against the original program.
_PDF_CHAR_MAP = str.maketrans({
    '\uff1a': ':',       # full-width colon
    '\uff08': '(',       # full-width left parenthesis
    '\uff09': ')',       # full-width right parenthesis
    '\uffe5': '\u00a5',  # full-width yen sign -> yen sign
    '\u3000': ' ',       # ideographic space
    '\uff0a': '*',       # full-width asterisk
    '\u2f47': '\u65e5',  # Kangxi radical "sun" -> CJK ideograph
    '\u2f08': '\u4eba',  # Kangxi radical "man" -> CJK ideograph
    '\u2f29': '\u5c0f',  # Kangxi radical "small" -> CJK ideograph
})


def _normalize_pdf_text(txt):
    """Map full-width characters to ASCII and tidy spacing around '*' and ':'."""
    txt = txt.translate(_PDF_CHAR_MAP)
    # Collapse every run of spaces to a single space.
    while '  ' in txt:
        txt = txt.replace('  ', ' ')
    # Re-join asterisks that were separated by a space during extraction.
    while '* *' in txt:
        txt = txt.replace('* *', '**')
    # Glue colons to their neighbours so "label : value" becomes "label:value".
    return txt.replace(': ', ':').replace(' :', ':')


def get_txt_from_pdf(pdf_path):
    """Extract the text of the first page of a PDF as one tagged line.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        ``'dyery;PDF16221; <text> newalg'`` where ``<text>`` is the normalised
        page text with newlines replaced by spaces.  The tag becomes
        ``PDF16222`` when the token following the last ``发票号码`` label holds
        a 20-character value after its colon.  If no text can be extracted,
        a fixed failure message is returned instead.

    Raises:
        Whatever ``pdfplumber`` raises for unreadable files, and ``IndexError``
        for a document without pages; the caller is expected to handle these.
    """
    # The context manager closes the file even when extraction fails.
    with pdfplumber.open(pdf_path, password='') as pdf:
        txt = pdf.pages[0].extract_text(x_tolerance=1, y_tolerance=2)
    print(txt)
    # Treat an empty result like None: either way there is no text layer.
    # NOTE(review): newer pdfplumber releases appear to return '' here - confirm.
    if not txt:
        return 'pdf中提取字符失败,估计是pdf中是一张图片'

    txt = _normalize_pdf_text(txt)
    txt = 'dyery;PDF16221; ' + txt.replace('\n', ' ') + ' newalg'

    label_pos = txt.rfind('发票号码')
    if label_pos != -1:
        # First space-delimited token starting at the label, split on ':'.
        pieces = txt[label_pos:].split(' ')[0].split(':')
        # Guard the index: a label without a colon must not abort the read.
        if len(pieces) > 1 and len(pieces[1]) == 20:
            txt = txt.replace('PDF16221', 'PDF16222')
    return txt
def get_info_from_xml(element, tag_name):
    """Return the data of the first child node of the first *tag_name* element.

    Args:
        element: A DOM node supporting ``getElementsByTagName``.
        tag_name: Qualified tag name to look up below *element*.

    Returns:
        The ``data`` of the first child of the first matching element, or
        ``''`` when no element matches or the match has no children.

    Raises:
        AttributeError: If the first child has no ``data`` attribute
            (for example when it is an element rather than a text node).
    """
    matches = element.getElementsByTagName(tag_name)
    if not matches:
        return ''
    children = matches[0].childNodes
    if not children:
        return ''
    return children[0].data
def _ofd_root(archive, member):
    """Parse XML member *member* of the open zip *archive*; return its root element."""
    with archive.open(member) as handle:
        return parse(handle).documentElement


def _ofd_tag_ref(tag_root, tag_name):
    """Return the text nested two levels below the first *tag_name* element, or '0'."""
    matches = tag_root.getElementsByTagName(tag_name)
    if not matches:
        return '0'
    return matches[0].firstChild.firstChild.data


def _ofd_find_title(nodes, read_text):
    """Return the first text containing '电子发票' among *nodes*, or None."""
    for node in nodes:
        text = read_text(node)
        if '电子发票' in text:
            return text
    return None


def _ofd_escape_note(text):
    """Keep a note from breaking the ';'-delimited record.

    NOTE(review): in the reviewed copy this was ``replace(';', ';')``, a no-op;
    the full-width replacement is a reconstruction - confirm the direction.
    """
    return text.replace(';', '\uff1b')


def _ofd_mark_fallback(parts):
    """Switch the record tag from OFD85991 to OFD85992, keeping collected fields."""
    parts[:] = [''.join(parts).replace('OFD85991', 'OFD85992')]


def _ofd_fields_from_original_invoice(archive, parts):
    """Strategy 1: read the fields from Doc_0/Attachs/original_invoice.xml."""
    invoice = _ofd_root(archive, 'Doc_0/Attachs/original_invoice.xml')
    buyer_fields = (
        ('机器编号', 'fp:MachineNo'),
        ('购买方名称', 'fp:BuyerName'),
        ('badd', 'fp:BuyerAddrTel'),
        ('bbank', 'fp:BuyerFinancialAccount'),
    )
    for label, tag in buyer_fields:
        parts.append(label + ':' + get_info_from_xml(invoice, tag) + ';')

    goods = invoice.getElementsByTagName('fp:GoodsInfos')
    if goods:
        for child in goods[0].childNodes:
            # Read the item from each goods entry.  The previous code looked
            # the tag up on the whole document, repeating the first item once
            # per child node (whitespace text nodes included).
            if child.nodeType == child.ELEMENT_NODE:
                parts.append('项目名称:' + get_info_from_xml(child, 'fp:Item') + ';')

    seller_fields = (
        ('销售方名称', 'fp:SellerName'),
        ('sadd', 'fp:SellerAddrTel'),
        ('sbank', 'fp:SellerFinancialAccount'),
        ('开票人', 'fp:InvoiceClerk'),
        ('收款人', 'fp:Payee'),
        ('复核', 'fp:Checker'),
    )
    for label, tag in seller_fields:
        parts.append(label + ':' + get_info_from_xml(invoice, tag) + ';')

    page = _ofd_root(archive, 'Doc_0/Pages/Page_0/Content.xml')
    for node in page.getElementsByTagName('ofd:TextObject'):
        if node.getAttribute('ID') != '62':
            continue
        text = node.lastChild.firstChild.data
        # "> 0" is kept from the original: a match at index 0 does not count.
        if text.find('发票') > 0:
            parts.append('标题:' + text + ';')


def _ofd_fields_from_custom_tags(archive, parts):
    """Strategy 2: resolve object IDs from CustomTag.xml against the page text objects."""
    page = _ofd_root(archive, 'Doc_0/Pages/Page_0/Content.xml')
    tags = _ofd_root(archive, 'Doc_0/Tags/CustomTag.xml')

    # NOTE(review): the reviewed copy had an `else:` after the buyer lookup whose
    # extent was lost with the indentation; the lookups are treated as sequential.
    buyer_id = _ofd_tag_ref(tags, 'ofd:BuyerName')
    seller_id = _ofd_tag_ref(tags, 'ofd:SellerName')
    clerk_id = _ofd_tag_ref(tags, 'ofd:InvoiceClerk')
    # The item tag may be unprefixed; with neither form present the IndexError
    # below hands over to the next strategy, as before.
    item_nodes = tags.getElementsByTagName('ofd:Item') or tags.getElementsByTagName('Item')
    item_id = item_nodes[0].firstChild.firstChild.data
    try:
        note_id = _ofd_tag_ref(tags, 'ofd:Note')
    except Exception:
        # The note is optional: a malformed note tag must not abort this strategy.
        note_id = '0'

    for node in page.getElementsByTagName('ofd:TextObject'):
        object_id = node.getAttribute('ID')
        if object_id == buyer_id:
            parts.append('购买方名称:' + node.firstChild.firstChild.data + ';')
        elif object_id == seller_id:
            parts.append('销售方名称:' + node.firstChild.firstChild.data + ';')
        elif object_id == clerk_id:
            parts.append('开票人:' + node.firstChild.firstChild.data + ';')
        elif object_id == item_id:
            parts.append('项目名称:' + node.firstChild.firstChild.data + ';')
        elif object_id == note_id:
            parts.append('备注:' + _ofd_escape_note(node.firstChild.firstChild.data) + ';')

    template = _ofd_root(archive, 'Doc_0/Tpls/Tpl_0/Content.xml')
    try:
        title = _ofd_find_title(
            template.getElementsByTagName('ofd:TextObject'),
            lambda node: node.lastChild.firstChild.data,
        )
    except Exception:
        # Text objects of an unexpected shape: fall back to the raw text codes.
        title = None
    if title is None:
        title = _ofd_find_title(
            template.getElementsByTagName('ofd:TextCode'),
            lambda node: node.firstChild.data,
        )
    if title is not None:
        parts.append('标题:' + title + ';')


def _ofd_fields_from_text_scan(archive, parts):
    """Strategy 3: walk the page's text codes, taking the text after each label as its value."""
    page = _ofd_root(archive, 'Doc_0/Pages/Page_0/Content.xml')
    value_labels = {'buyer': '购买方名称', 'seller': '销售方名称', 'clerk': '开票人'}
    pending = None       # which value the next text code is expected to carry
    buyer_seen = False   # the first name label is the buyer, the next the seller

    for node in page.getElementsByTagName('ofd:TextCode'):
        text = node.firstChild.data
        compact = text.replace(' ', '')
        # Accept the full-width colon as well as the ASCII one in labels.
        with_ascii_colon = text.replace('\uff1a', ':')

        if pending in value_labels:
            parts.append(value_labels[pending] + ':' + text + ';')
            if pending == 'buyer':
                buyer_seen = True
            pending = None
            continue
        if pending == 'note':
            pending = None
            if compact.replace('\uff1a', ':') != '开票人:':
                parts.append('备注:' + _ofd_escape_note(text) + ';')
                continue
            # Empty note: this text is the clerk label, so let it be handled
            # below instead of being swallowed as the note.

        if '名称:' in with_ascii_colon:
            pending = 'seller' if buyer_seen else 'buyer'
            continue
        if '开票人' in compact:
            pending = 'clerk'
            continue
        if compact == '注':
            pending = 'note'
            continue
        if '电子发票' in text:
            parts.append('标题:' + text + ';')
        if '*' in text:
            parts.append('项目名称:' + text + ';')


def get_txt_from_ofd2(filename):
    """Extract invoice fields from an OFD file as one ';'-delimited record.

    The record starts with ``dyery:OFD85991;`` followed by ``name:value;``
    entries: first every ``ofd:CustomData`` entry of ``OFD.xml``, then the
    fields found by the first of three strategies that completes without an
    exception (attached original invoice XML, custom tags, label/value scan of
    the page text).  Whenever a strategy fails the tag is changed to
    ``OFD85992``; fields it had already appended are kept.

    Args:
        filename: Path (or file-like object) of the OFD zip archive.

    Returns:
        The record string.  Only the bare ``dyery:OFD85991;`` header is
        returned when ``OFD.xml`` cannot be opened.

    Raises:
        Errors from opening the archive or parsing ``OFD.xml`` propagate.
    """
    parts = ['dyery:OFD85991;']
    with zipfile.ZipFile(filename, 'r') as archive:
        try:
            handle = archive.open('OFD.xml')
        except Exception:
            # Not an OFD document we understand: report the bare header.
            return ''.join(parts)
        with handle:
            root = parse(handle).documentElement

        for node in root.getElementsByTagName('ofd:CustomData'):
            try:
                value = node.firstChild.data
            except AttributeError:
                # Entry without a text value: skip it.
                continue
            parts.append(node.getAttribute('Name') + ':' + value + ';')

        strategies = (
            _ofd_fields_from_original_invoice,
            _ofd_fields_from_custom_tags,
            _ofd_fields_from_text_scan,
        )
        for attempt, strategy in enumerate(strategies):
            if attempt:
                _ofd_mark_fallback(parts)
            try:
                strategy(archive, parts)
            except Exception:
                # Best effort: any failure hands over to the next strategy.
                continue
            break
    return ''.join(parts)
def _extract_invoice_text(path):
    """Return the newline-terminated result line for one requested file.

    The reader is chosen from the last three characters of *path*
    (case-insensitive): 'pdf' and 'ofd' are extracted, 'xml' yields the
    literal 'xml', anything else - or any extraction error - yields the
    failure marker.
    """
    suffix = path[-3:].lower()
    if suffix == 'xml':
        return 'xml\n'
    if suffix == 'pdf':
        reader = get_txt_from_pdf
    elif suffix == 'ofd':
        reader = get_txt_from_ofd2
    else:
        return '读取失败\n'
    try:
        return reader(path) + '\n'
    except Exception:
        # Best effort per file: one unreadable invoice must not stop the batch.
        return '读取失败\n'


# Script body: runs at import time, as before.
# temp.ini sits next to the script.  Its first line is skipped, each following
# line is a file to read, and a line of '*****' ends the list.  One result line
# per file is appended to the end of temp.ini and echoed to stdout.
print('----------------------------------------------------------------------------------')
# Resolve against the absolute script path: with a bare script name
# dirname() is '' and the old concatenation pointed at the drive root.
filepath = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), 'temp.ini')
with open(filepath, 'r+', encoding='utf-8') as f:
    lines = f.readlines()
    # readlines() leaves the position at end of file, so writes append.
    # Make sure the first result does not get glued onto an unterminated line.
    if lines and not lines[-1].endswith('\n'):
        f.write('\n')
    for line in lines[1:]:
        line = line.replace('\n', '')
        print(line)
        if line == '*****':
            break
        a = _extract_invoice_text(line)
        f.write(a)
        print(a)