import
osimport re
import
cv2
import
numpy as np
import
threading
from
concurrent.futures
import
ThreadPoolExecutor
from
paddleocr
import
PaddleOCR
import
pandas as pd
import
sys
import
os
import
paddleocr.tools
# Default locations used by the script.
# MODEL_DIR is only an initial value: it is reassigned further down from
# get_resource_path("paddle_models") before the OCR engine is created.
MODEL_DIR = r"D:\paddle_models"

# Folder that main() scans for input files.
INPUT_FOLDER = r"D:\images\im"

# Spreadsheet that main() writes the collected rows to.
OUTPUT_EXCEL = "商户数据.xlsx"
def get_resource_path(relative_path):
    """Return *relative_path* joined onto the resource base directory.

    The base is ``sys._MEIPASS`` when that attribute exists (PyInstaller sets
    it to the unpack directory of a frozen build); otherwise it is the
    absolute form of the current working directory.
    """
    base_dir = sys._MEIPASS if hasattr(sys, '_MEIPASS') else os.path.abspath(".")
    return os.path.join(base_dir, relative_path)
# Replace the hard-coded default with the location resolved at runtime
# (bundle directory when frozen, current working directory otherwise).
MODEL_DIR = get_resource_path("paddle_models")
print(f"[DEBUG] 模型路径: {MODEL_DIR}")

# Detection and recognition model folders live under MODEL_DIR.
_det_model_dir = os.path.join(MODEL_DIR, "det/ch_PP-OCRv4_det_infer")
_rec_model_dir = os.path.join(MODEL_DIR, "rec/ch_PP-OCRv4_rec_infer")

# Single module-level OCR engine used by process_image().
# NOTE(review): this one instance is called from several worker threads in
# main(); confirm that PaddleOCR inference is safe to share across threads.
ocr = PaddleOCR(
    lang="ch",
    use_angle_cls=False,
    det_model_dir=_det_model_dir,
    rec_model_dir=_rec_model_dir,
    use_gpu=False,
    det_db_thresh=0.3,
    det_db_box_thresh=0.4,
    use_dilation=True,
)

# Rows collected by process_image(); appended only while holding `lock`.
results = []
lock = threading.Lock()
def preprocess_image(img_path):
    """Load an image and return a contrast-enhanced, denoised grayscale copy.

    Pipeline: decode the file, convert to grayscale, apply CLAHE
    (clipLimit=2.0, 8x8 tiles), then non-local-means denoising.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The processed single-channel image, or ``None`` when the file cannot
        be read, decoded or processed (a message is printed in that case).
    """
    try:
        # Read the bytes with NumPy and decode in memory instead of using
        # cv2.imread (presumably so non-ASCII Windows paths work - confirm).
        raw = np.fromfile(img_path, dtype=np.uint8)
        img = cv2.imdecode(raw, cv2.IMREAD_UNCHANGED)
        if img is None:
            # imdecode signals undecodable data by returning None.
            raise ValueError("无法解码图像数据")

        # With IMREAD_UNCHANGED a grayscale file is already 2-D; BGR2GRAY
        # would reject it, so only convert multi-channel images.
        gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        clahed = clahe.apply(gray)
        denoised = cv2.fastNlMeansDenoising(
            clahed, h=7, templateWindowSize=7, searchWindowSize=21
        )
        return denoised
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file.
        print(f"图像预处理失败: {img_path}, 错误: {str(e)}")
        return None
# Keyword, optional separators (ASCII or full-width colon, hyphen, whitespace),
# then "89" followed by exactly 13 more digits that may be split by spaces or
# hyphens. The trailing lookahead rejects runs of more than 15 adjacent digits.
_MERCHANT_NO_RE = re.compile(
    r'(?:商户编号|商编|商户号|Merchant\s*ID)\s*[::\-\s]*'
    r'(89(?:[\s\-]*\d){13})(?!\d)',
    re.IGNORECASE,
)


def extract_merchant_number(text):
    """Extract a 15-digit merchant number starting with "89" from OCR text.

    The number must follow one of the keywords 商户编号 / 商编 / 商户号 /
    "Merchant ID" (case-insensitive). Spaces and hyphens inside the number
    are removed. Only the first 15 digits after the keyword are taken, so
    digits that follow after a separator (for example a date) no longer
    cause the number to be rejected.

    Args:
        text: Text to search.

    Returns:
        The 15-digit number as a string, or ``None`` when none is found.
    """
    if not text:
        return None
    match = _MERCHANT_NO_RE.search(text)
    if match is None:
        return None
    # Strip the separators that were allowed between digits.
    return re.sub(r'\D', '', match.group(1))
def process_image(img_path):
    """Run OCR on one image and record the outcome in the shared ``results``.

    The image is preprocessed, passed to the module-level ``ocr`` engine, and
    the recognised text fragments are joined with spaces. A row holding the
    file name, the extracted merchant number (or "未找到") and the first 200
    characters of text is appended to ``results`` while holding ``lock``.
    Nothing is recorded when preprocessing returns ``None``. Any exception is
    printed rather than raised.
    """
    try:
        prepared = preprocess_image(img_path)
        if prepared is None:
            return

        ocr_output = ocr.ocr(prepared, cls=False)
        # The first element may be empty/None when nothing was detected.
        detections = ocr_output[0] or []
        # assumes each detection looks like (box, (text, score)) - TODO confirm
        fragments = [entry[1][0] for entry in detections]
        text = " ".join(fragments)
        print(f"DEBUG - {os.path.basename(img_path)} 原始文本:\n{text[:230]}...")

        merchant_no = extract_merchant_number(text)
        row = {
            "图片名称": os.path.basename(img_path),
            "商户编号": merchant_no or "未找到",
            "原始文本": text[:200] + "...",
        }
        with lock:
            results.append(row)
    except Exception as e:
        print(f"处理失败: {os.path.basename(img_path)}, 错误类型: {type(e).__name__}, 详情: {str(e)}")
def main():
    """Process every matching file in INPUT_FOLDER and export the results.

    Files whose lower-cased name ends with one of the accepted extensions are
    handed to ``process_image`` on a thread pool. When at least one row was
    collected in ``results`` it is written to OUTPUT_EXCEL with the columns
    图片名称 / 商户编号 / 原始文本; otherwise a notice is printed.
    """
    # NOTE(review): '.pdf' files go through the same image decoder as the
    # other extensions; confirm that path can actually handle PDFs.
    valid_ext = ('.png', '.jpg', '.jpeg', '.pdf')
    files = [
        os.path.join(INPUT_FOLDER, f)
        for f in os.listdir(INPUT_FOLDER)
        if f.lower().endswith(valid_ext)
    ]

    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so the multiplication cannot raise TypeError.
    worker_count = (os.cpu_count() or 1) * 2
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        # Leaving the with-block waits for all submitted tasks to finish.
        executor.map(process_image, files)

    if not results:
        print("未处理任何有效图片")
        return

    df = pd.DataFrame(results)
    # Fix the column order of the exported sheet.
    df = df[["图片名称", "商户编号", "原始文本"]]
    df.to_excel(OUTPUT_EXCEL, index=False)
    print(f"生成成功!文件保存至: {os.path.abspath(OUTPUT_EXCEL)}")
# Run the batch job only when the file is executed as a script.
if __name__ == "__main__":
    main()