# [Python] plain-text view / copy code  (code-listing header left over from the forum post)
import numpy as np
from collections import Counter
import requests, re, base64, cv2
from PIL import ImageFont, ImageDraw, Image
# Side length, in pixels, of a single-character image (85 x 85).
boxsize = 85
# Values collected from every successfully fetched page.
num_list = []
# Page numbers to request: 1 through 5.
pages = [*range(1, 6)]
# Black padding strip (boxsize x 5 x 3) appended to cells of the right-most
# grid column, whose crop is narrower than boxsize.
blank_array = np.zeros((boxsize, 5, 3), dtype=np.uint8)
# Answer code for each of the nine grid cells, in row-major order
# (presumably the click positions the server expects -- verify against the API).
block_list = [
    159, 169, 179,
    459, 469, 479,
    759, 769, 779,
]
# Top-left [row, col] of each character cell in the captcha image; found by
# trial and error, so they may need tuning to sit close to the glyphs.
fontboxlist = [[row, col] for row in (10, 108, 206) for col in (20, 121, 220)]
# Custom font file location and the size used to render reference glyphs.
fontpath = r'C:\Users\win10\YaheiCu.ttf'
font = ImageFont.truetype(fontpath, 88)
# Request headers; replace the sessionid placeholder with a real one.
headers = {
    'user-agent': 'yuanrenxue project',
    'referer': 'https://match.yuanrenxue.com/match/8',
    'cookie': 'sessionid=你的ID',
}
# A colour only counts as an interference line when more than `threshold`
# pixels of it fall inside the inspected strip; this keeps stray noise dots
# that happen to share a glyph's colour from wiping the glyph out.
def noiseline(imgarray, color, threshold=10):
    """Return True if `color` occurs in more than `threshold` pixels.

    Args:
        imgarray: image region indexed as [row, col, channel].
        color: per-channel colour value to look for (e.g. a 3-tuple).
        threshold: pixel count that must be exceeded; defaults to 10.

    Returns:
        bool: whether the number of pixels equal to `color` in every
        channel is strictly greater than `threshold`.
    """
    # A pixel matches only when all of its channels equal `color`.
    matches = np.all(np.asarray(imgarray) == color, axis=-1)
    return int(np.count_nonzero(matches)) > threshold
# Similarity score = number of identical pixels (cosine similarity would be
# an alternative metric).
def image_similarity_count(image1, image2, size=85):
    """Count pixels that are identical in the top-left `size` x `size` window.

    Args:
        image1: first image, indexed as [row, col] or [row, col, channel].
        image2: second image with the same layout as `image1`.
        size: side length of the compared window; defaults to 85.

    Returns:
        int: number of positions whose pixel values match in every channel.

    Raises:
        IndexError: if either image is smaller than `size` in its first two
            dimensions.
    """
    window1 = np.asarray(image1)[:size, :size]
    window2 = np.asarray(image2)[:size, :size]
    if window1.shape[:2] != (size, size) or window2.shape[:2] != (size, size):
        raise IndexError(
            f'both images must be at least {size}x{size} pixels'
        )
    same = window1 == window2
    if same.ndim > 2:
        # Collapse the channel axis: every channel must agree.
        same = same.reshape(size, size, -1).all(axis=-1)
    return int(np.count_nonzero(same))
# One session carries the configured headers on every captcha request.
session = requests.Session()
session.headers.update(headers)

for i in pages:
    # Keep fetching a fresh captcha for this page until the answer is accepted.
    while True:
        answer = ''
        r = session.get('https://match.yuanrenxue.com/api/match/8_verify')
        html = r.json()['html']  # parse the response body once
        # Single characters wrapped in <p> tags; the first four are matched below.
        char_list = re.findall(r'<p>(\S)</p>', html)
        # The captcha picture is embedded as a base64 data URI.
        img = base64.b64decode(re.search(r'base64,(\S+)\" alt=\"\">', html).group(1))
        im = cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_COLOR)
        # Every distinct colour with its pixel count, most frequent first.
        color_list = Counter([tuple(px) for px in im.reshape(-1, 3)]).most_common()

        # Margins and gutters between the grid cells. These are views of `im`,
        # so they reflect the in-place recolouring done in the loop below.
        border_strips = (
            im[:, :15, :],
            im[:, 105:120, :],
            im[:, 205:220, :],
            im[:9, :, :],
            im[95:105, :, :],
            im[195:205, :, :],
        )
        # NOTE(review): colours are tested against the already-recoloured
        # image, so a later colour equal to (0, 0, 0) or (255, 255, 255) also
        # matches pixels painted earlier -- confirm this is intended.
        for color, count in color_list:
            # Very frequent colours (> 5000) and rare ones (< 200) are treated
            # as background/noise. The lower bound is a trade-off: too small
            # and interference lines inside the cells survive, too large and
            # strokes/radicals of the characters disappear. Colours crossing a
            # border strip are treated as interference lines.
            is_noise = (
                count > 5000
                or count < 200
                or any(noiseline(strip, color) for strip in border_strips)
            )
            im[np.all(im == color, axis=-1)] = (0, 0, 0) if is_noise else (255, 255, 255)

        # Thicken the remaining white strokes before comparing.
        kernel = np.ones((3, 3), np.uint8)
        dilate_img = cv2.dilate(im, kernel, iterations=2)

        for flag in range(4):
            # Render the target character white-on-black as the reference glyph.
            image = Image.new('RGB', (85, 85), (0, 0, 0))
            draw = ImageDraw.Draw(image)
            # The text offset was found by trial and error.
            draw.text((5, -12), char_list[flag], (255, 255, 255), font=font)
            image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)

            similar_list = []
            for q, (top, left) in enumerate(fontboxlist):
                if q % 3 != 2:
                    image2 = dilate_img[top:top + boxsize, left:left + boxsize, :]
                else:
                    # Right-most column: the crop stops at column 300, so pad
                    # it on the right with the blank strip.
                    image2 = np.hstack((dilate_img[top:top + boxsize, left:300, :], blank_array))
                similar_list.append(image_similarity_count(image, image2))
            # Pick the cell that shares the most pixels with the reference glyph.
            answer += f'{block_list[np.argmax(np.array(similar_list))]}|'

        params = {'page': i, 'answer': answer}
        response = requests.get('https://match.yuanrenxue.com/api/match/8', headers=headers, params=params)
        if response.status_code == 200:
            num_list.extend([num['value'] for num in response.json()['data']])
            break
        else:
            print('运气欠佳,识别失败!')

# Report the value that occurs most often across all fetched pages.
print(Counter(num_list).most_common(1)[0][0])