论字体反爬通杀

DreamMark · 发表于 2023-11-23 17:48

本帖最后由 DreamMark 于 2023-11-23 17:52 编辑

前言
字体反爬，其实就是在网页端加载一种自定义字体文件，通过字符映射，让源码中的乱码字符在页面上显示为正常文字。传统的方式就是找到目标 ttf 文件，人为地建立乱码和真实字符的映射。但是当 ttf 更新频繁时，人为对照的成本就很高了。针对上面的痛点，想了一种成本较低的方法来解决这个问题。通过调研发现 Python 的 PIL（Pillow）库中的 ImageFont 是可以加载字体文件的，把每个字符画成图片，再结合机器学习文字识别，就可以很轻松地拿到乱码和真实字符的映射。现在的问题是找到一个靠谱的汉字识别模型。对比市面上的汉字识别模型，最终采用了 cnocr 模型。当然使用模型是有准确率的，最终的结果最好还是自己验证一次。
环境安装
python : 3.8.8
win10 系统
安装cnocr
pip install cnocr
如果报错：
1. ERROR: Failed building wheel for Polygon3
直接通过whl 安装Polygon3
https://www.lfd.uci.edu/~gohlke/pythonlibs/#polygon
即在上述链接中选择 Polygon3-3.0.9.1-cp38-cp38-win_amd64.whl 下载安装即可
2. 在实际执行中发现 cv2 找不到某个函数，不要犹豫，大概率是 opencv-python 的版本差异，建议直接升级到最高版本
pip install --user opencv-python --upgrade
代码&效果展示

[Python] 纯文本查看 复制代码

import os
from PIL import ImageFont, Image, ImageDraw

from cnocr import CnOcr
import numpy as np
from fontTools.ttLib import TTFont
import requests
from io import BytesIO


def font_to_img(code_list, filename, ignore_flag=True, score=0.95):
    """Render each obfuscated character with the font and OCR the picture.

    Args:
        code_list: The obfuscated characters to decode, one string per glyph.
        filename: Path of the font file used to draw the characters.
        ignore_flag: When true, a recognition scoring below ``score`` is
            still written to the first returned dict.
        score: Minimum OCR score for a recognition to be treated as
            reliable (default 0.95).

    Returns:
        A ``(normal_dict, be_padding_dict)`` tuple. ``normal_dict`` maps a
        character to its recognised text. ``be_padding_dict`` maps every
        character whose score was below ``score`` to the whole OCR result
        (the object indexed with ``"text"`` and ``"score"``) so it can be
        checked by hand; the rendered image of each such character is also
        saved in the current directory as ``<char>_<text>_be_padding.jpg``.
    """
    normal_dict = {}
    be_padding_dict = {}
    # Nothing to decode: return before building the OCR model and the font.
    if not code_list:
        return normal_dict, be_padding_dict

    img_size = 1024
    ocr = CnOcr()
    # The font does not change between characters, so load it only once.
    font = ImageFont.truetype(filename, int(img_size * 0.7))

    for char_code in code_list:
        img = Image.new('1', (img_size, img_size), 255)
        draw = ImageDraw.Draw(img)
        if hasattr(draw, "textbbox"):
            # ImageDraw.textsize was removed in Pillow 10; textbbox is its
            # replacement. Subtracting left/top centres the inked area.
            left, top, right, bottom = draw.textbbox((0, 0), char_code, font=font)
            origin = (
                (img_size - (right - left)) // 2 - left,
                (img_size - (bottom - top)) // 2 - top,
            )
        else:
            # Older Pillow without textbbox: keep the original centring.
            width, height = draw.textsize(char_code, font=font)
            origin = ((img_size - width) // 2, (img_size - height) // 2)
        draw.text(origin, char_code, font=font, fill=0)
        # Convert the single-channel image to three channels for the OCR.
        img = img.convert("RGB")
        word = ocr.ocr_for_single_line(np.array(img))

        reliable = word["score"] >= score
        if not reliable:
            be_padding_dict[char_code] = word
            img.save("%s_%s_be_padding.jpg" % (char_code, word["text"]))
        if reliable or ignore_flag:
            normal_dict[char_code] = word["text"]
    return normal_dict, be_padding_dict


def ttf_parse(url, ttf_name, timeout=30):
    """Download a font, OCR every mapped character and print the result.

    The font is fetched from ``url``, written to ``ttf_name`` so that
    ``font_to_img`` can load it from disk, and removed again afterwards.
    Two dicts are printed: the decoded mapping, then the low-score entries.

    Args:
        url: Address of the font file to download.
        ttf_name: Path of the temporary font file written to disk.
        timeout: Seconds to wait for the HTTP request (default 30).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on an HTTP error instead of handing an error page to TTFont.
    response.raise_for_status()
    font_parse = TTFont(BytesIO(response.content))
    # getBestCmap() returns None when the font has no Unicode cmap subtable.
    m_dict = font_parse.getBestCmap() or {}
    # Characters to recognise: one per code point present in the cmap.
    char_list = [chr(ch_unicode) for ch_unicode in m_dict]

    font_parse.save(ttf_name)
    try:
        normal_dict, error_dict = font_to_img(char_list, ttf_name)
        print(normal_dict)
        print(error_dict)
    finally:
        # Remove the temporary font file even when recognition fails.
        os.remove(ttf_name)

效果
{'\uec20': '公', '\uec29': '是', '\uec2a': '当', '\uec31': '矮', '\uec33': '了', '\uec3a': '保', '\uec43': '近', '\uec44': '味', '\uec4c': '七', '\uec4d': '的', '\uec56': '量', '\uec5f': '坐', '\uec60': '地', '\uec66': '盘', '\uec67': '只', '\uec70': '身', '\uec79': '不', '\uec7a': '八', '\uec82': '三', '\uec8c': '养', '\uec93': '雨', '\uec95': '二', '\uec9c': '很', '\uec9d': '内', '\ueca6': '硬', '\uecad': '过', '\uecaf': '下', '\uecb8': '控', '\uecc0': '空', '\uecc2': '长', '\uecc9': '开', '\ueccb': '低', '\uecda': '真', '\uecdc': '副', '\uece3': '动', '\uece5': '实', '\ueced': '自', '\uecf6': '少', '\uecf8': '启', '\uecfd': '右', '\uecff': '和', '\ued08': '十', '\ued09': '多', '\ued10': '比', '\ued12': '五', '\ued19': '门', '\ued23': '着', '\ued2b': '油', '\ued2c': '音', '\ued35': '手', '\ued3e': '软', '\ued3f': '小', '\ued45': '无', '\ued46': '六', '\ued4f': '泥', '\ued58': '中', '\ued59': '路', '\ued61': '大', '\ued62': '响', '\ued6b': '加', '\ued72': '排', '\ued74': '机', '\ued7b': '高', '\ued7c': '级', '\ued85': '冷', '\ued8e': '灯', '\ued8f': '耗', '\ued97': '呢', '\ued9f': '问', '\ueda1': '上', '\ueda8': '四', '\uedaa': '九', '\uedb1': '一', '\uedbb': '档', '\uedbd': '皮', '\uedc2': '性', '\uedc4': '左', '\uedcc': '里', '\uedcd': '更', '\uedd5': '短', '\uedd7': '光', '\ueddc': '好', '\uedde': '外', '\uedef': '来', '\uedf1': '远', '\uedf8': '有', '\uedfa': '坏', '\uee02': '孩', '\uee0a': '得', '\uee0b': '电'}
以评分95% 做分割，下面是可能不对的，经过对比图片发现没有错误。只是评分较低。
{'\uec33': {'text': '了', 'score': 0.8509207963943481}, '\uec7a': {'text': '八', 'score': 0.8456708192825317}, '\uec82': {'text': '三', 'score': 0.8965854644775391}, '\uec95': {'text': '二', 'score': 0.9056768417358398}, '\uecaf': {'text': '下', 'score': 0.8665554523468018}, '\uece3': {'text': '动', 'score': 0.9179448485374451}, '\ueced': {'text': '自', 'score': 0.9478009343147278}, '\uecfd': {'text': '右', 'score': 0.8674691319465637}, '\uedb1': {'text': '一', 'score': 0.8194031715393066}, '\uedde': {'text': '外', 'score': 0.9190731048583984}, '\uee02': {'text': '孩', 'score': 0.6236460208892822}}

wapython · 发表于 2023-11-24 17:25

大佬nb，测试可用，补充依赖：

[Python] 纯文本查看 复制代码

import os
# pip install Pillow==9.5.0
from PIL import ImageFont, Image, ImageDraw


# pip install Polygon3-3.0.9.1-cp38-cp38-win_amd64.whl
# pip install cnocr     # 注：cnocr==2.2.4.2
from cnocr import CnOcr

import numpy as np
from fontTools.ttLib import TTFont
import requests
from io import BytesIO

# 其他依赖
# pip install onnxruntime    # 注：onnxruntime==1.16.3

雪辉 · 发表于 2023-11-23 18:22

看到标题我就想到了起点

狂三丶 · 发表于 2024-2-22 01:32

楼主牛逼，确实通杀大部分css加密，实测了一波，文字识别改成了ddddocr，感觉识别率不错。

[Python] 纯文本查看 复制代码

import ddddocr
from PIL import ImageFont, Image, ImageDraw
from fontTools.ttLib import TTFont


def font_to_img(code_list, filename):
    """Render each obfuscated character with the font and OCR it with ddddocr.

    Args:
        code_list: The obfuscated characters to decode, one string per glyph.
        filename: Path of the font file used to draw the characters.

    Returns:
        A dict mapping each character to the value returned by
        ``DdddOcr.classification`` for its rendered image.
    """
    normal_dict = {}
    # Nothing to decode: return before building the OCR model and the font.
    if not code_list:
        return normal_dict

    img_size = 1024
    # Build the recogniser and the font once instead of once per character.
    ocr = ddddocr.DdddOcr()
    font = ImageFont.truetype(filename, int(img_size * 0.7))

    for char_code in code_list:
        img = Image.new('1', (img_size, img_size), 255)
        draw = ImageDraw.Draw(img)
        if hasattr(draw, "textbbox"):
            # ImageDraw.textsize was removed in Pillow 10; textbbox is its
            # replacement. Subtracting left/top centres the inked area.
            left, top, right, bottom = draw.textbbox((0, 0), char_code, font=font)
            origin = (
                (img_size - (right - left)) // 2 - left,
                (img_size - (bottom - top)) // 2 - top,
            )
        else:
            # Older Pillow without textbbox: keep the original centring.
            width, height = draw.textsize(char_code, font=font)
            origin = ((img_size - width) // 2, (img_size - height) // 2)
        draw.text(origin, char_code, font=font, fill=0)
        # Convert the single-channel image to three channels for the OCR.
        img = img.convert("RGB")
        normal_dict[char_code] = ocr.classification(img)
    return normal_dict

def ttf_parse(ttf_name):
    """Print a local font's cmap and the OCR-decoded character mapping.

    ``ttf_name`` is the path of a local TTF font file. Every cmap entry is
    printed as ``<glyph name>--><character>``, then the dict returned by
    ``font_to_img`` is printed. The font file is left on disk.
    """
    with open(ttf_name, 'rb') as font_file:
        font = TTFont(font_file)
        best_cmap = font.getBestCmap()
        # Collect the characters to recognise while listing the cmap.
        glyph_chars = []
        for code_point, glyph_name in best_cmap.items():
            character = chr(code_point)
            print("{}-->{}".format(glyph_name, character))
            glyph_chars.append(character)
        decoded = font_to_img(glyph_chars, ttf_name)
        print(decoded)

# Script entry point: decode the font file "7.ttf" from the current
# working directory when this file is run directly.
if __name__ == '__main__':
    ttf_parse("7.ttf")

rsndm · 发表于 2023-11-23 17:55

支持一下

XuJingDaoZhang · 发表于 2023-11-23 18:03

起点反爬很厉害这个工具可以解决起点的反盗版

mcby · 发表于 2023-11-23 18:37

大佬手机可以吗？

moruye · 发表于 2023-11-23 20:44

提示: 作者被禁止或删除内容自动屏蔽

2303 · 发表于 2023-11-23 20:45

厉害厉害

xiaofei09520 · 发表于 2023-11-23 21:20

必须支持一波

amwquhwqas128 · 发表于 2023-11-23 22:07

非常感谢分享的文章

ARKyuyan · 发表于 2023-11-23 22:25

非常不错的文章

帐号		自动登录	找回密码
密码			注册[Register]

[Web逆向] 论字体反爬通杀

免费评分

本帖被以下淘专辑推荐:

免费评分

免费评分

浏览过的版块

moruye moruye 当前离线好友阅读权限 0 听众最后登录 1970-1-1 头像被屏蔽	6^# moruye 发表于 2023-11-23 20:44 提示: 作者被禁止或删除内容自动屏蔽

	回复支持举报