菜鸟读capstone与keystone源码入门
本帖最后由 L剑仙 于 2020-3-31 19:00 编辑。原文发自看雪论坛 https://bbs.pediy.com/thread-258473.htm
菜鸟最近想入门ollvm分析,得先找个反编译器,看了斑竹大佬的各种开源汇编、反汇编引擎的非专业比较https://bbs.pediy.com/thread-205590.htm,决定入门capstone
搜了搜网上,可能这玩意太简单,大佬都是直接用,只有c的入门,没啥python的入门帖子,菜鸟对各种函数参数还是没搞太明白,只好自己看下源码,记录一下用法。。。
一、capstone反汇编引擎,最重要功能是把二进制转化为汇编语言,关键代码在capstone 包里的__init__.py
最重要的2个类Cs和CsInsn
Cs类的disasm是最重要的反汇编函数,我得先搞明白它的参数和返回
# Disassemble binary & return disassembled instructions in CsInsn objects
def disasm(self, code, offset, count=0):
    """Disassemble ``code`` and lazily yield one ``CsInsn`` per instruction.

    Args:
        code: raw machine code as ``bytes``; a ``bytearray`` is converted
            to ``bytes`` first.
        offset: forwarded to ``cs_disasm`` as the address of the first
            instruction (see the Capstone documentation).
        count: forwarded to ``cs_disasm``; Capstone documents 0 as
            "decode as many instructions as possible".

    Yields:
        CsInsn: one wrapper per decoded instruction.

    Raises:
        CsError: when nothing was decoded and ``cs_errno`` reports a
            status other than ``CS_ERR_OK``.
    """
    # _cs_insn is short for "capstone instruction" (a C struct). This is a
    # NULL pointer to it; cs_disasm fills it in through byref() below.
    all_insn = ctypes.POINTER(_cs_insn)()
    # Hack, unicorn's memory accessors give you back bytearrays, but they
    # cause TypeErrors when you hand them into Capstone.
    if isinstance(code, bytearray):
        code = bytes(code)
    res = _cs.cs_disasm(self.csh, code, len(code), offset, count, ctypes.byref(all_insn))
    if res > 0:
        try:
            for i in range(res):
                # Index the C array: each CsInsn must wrap the i-th decoded
                # instruction, not the array pointer itself. CsInsn.__init__
                # stores a copy of it in ``_raw``.
                yield CsInsn(self, all_insn[i])
        finally:
            # Runs once the generator is exhausted or closed, so the C
            # buffer is released even if the caller stops iterating early.
            _cs.cs_free(all_insn, res)
    else:
        status = _cs.cs_errno(self.csh)
        if status != CS_ERR_OK:
            raise CsError(status)
函数原型disasm(self, code, offset, count=0),code就是要反汇编的二进制数据binary,应当是bytes或bytearray(例如打开文件后read()得到的内容,而不是文件对象本身),offset是第一条指令对应的起始地址(偏移),count是最多反汇编的指令条数,默认0
它调用了封装的c函数 cs_disasm,先通过c函数cs_disasm解析code,获得一个all_insn引用,然后通过yield 输出CsInsn 的实例
看一下CsInsn类的__init__,可以看到,类型为ctypes.POINTER转换的_cs_insn这个c结构体的all_insn引用作为all_info参数被传递给了CsInsn类的_raw字段
def __init__(self, cs, all_info):
    """Snapshot one decoded instruction.

    Args:
        cs: the owning ``Cs`` object; its ``_detail`` flag decides whether
            the detail struct is copied too.
        all_info: the ``_cs_insn`` structure produced by ``disasm``.
    """
    # _raw is a copy of the _cs_insn structure handed over by disasm().
    self._raw = copy_ctypes(all_info)
    self._cs = cs
    if self._cs._detail and self._raw.id != 0:
        # save detail: allocate a fresh detail struct, then copy the
        # pointed-to struct into it. The [0] dereferences matter: running
        # memmove on the pointer objects themselves would only overwrite
        # the new pointer with the old address.
        self._raw.detail = ctypes.pointer(all_info.detail._type_())
        ctypes.memmove(
            ctypes.byref(self._raw.detail[0]),
            ctypes.byref(all_info.detail[0]),
            ctypes.sizeof(type(all_info.detail[0])),
        )
而这个结构体定义如下,最重要的几个字段address地址,mnemonic操作码,op_str操作数,它内部的detail也是一个c结构体
class _cs_insn(ctypes.Structure):
    """ctypes mirror of the C instruction struct filled in by ``cs_disasm``."""

    _fields_ = (
        ('id', ctypes.c_uint),
        ('address', ctypes.c_uint64),      # instruction address
        ('size', ctypes.c_uint16),
        ('bytes', ctypes.c_ubyte * 16),
        ('mnemonic', ctypes.c_char * 32),  # mnemonic text
        ('op_str', ctypes.c_char * 160),   # operand text
        ('detail', ctypes.POINTER(_cs_detail)),  # pointer to the detail struct
    )
class _cs_detail(ctypes.Structure):
    """ctypes mirror of the per-instruction detail struct."""

    _fields_ = (
        ('regs_read', ctypes.c_uint16 * 12),   # implicitly read registers
        ('regs_read_count', ctypes.c_ubyte),   # presumably the used length of regs_read
        ('regs_write', ctypes.c_uint16 * 20),  # implicitly written registers
        ('regs_write_count', ctypes.c_ubyte),  # presumably the used length of regs_write
        ('groups', ctypes.c_ubyte * 8),        # groups the instruction belongs to
        ('groups_count', ctypes.c_ubyte),      # presumably the used length of groups
        ('arch', _cs_arch),                    # _cs_arch is declared elsewhere
    )
获得这些字段的方法在CsInsn里已经封装好了,id,address,size,bytes,mnemonic,op_str这些基本直接从_raw里面获取或者简单类型转换一下
detail这个c结构体包含一些额外的功能:
regs_read,字面理解是,返回存储所有读取的隐式寄存器的list,实测只有pc,lr,sp和状态寄存器会被存储在list中
regs_write,字面理解是,返回存储所有写入的隐式寄存器的list,实测只有pc,lr,sp和状态寄存器会被存储在list中
regs_access,合并上面2个的结果
举几个例子(节选自某个so的指令)
itttt eq的 regs_write是,代表
popeq.w {r8, sb, sl, fp}的 regs_read 与regs_write是,代表
cmp r1, r0的 regs_write是,代表
groups,返回存储instruction属于的groups的list,常用的有jump跳转功能组,call函数调用功能组,ret返回功能组,int中断功能组
举几个例子
blx #0x2274 的groups是,分别代表subs r0, r1, r0的groups是,分别代表
beq.w #0x41ce 的groups是,分别代表
# Common instruction groups - to be consistent across all architectures.
CS_GRP_INVALID = 0  # uninitialized/invalid group.
CS_GRP_JUMP = 1  # all jump instructions (conditional+direct+indirect jumps)
CS_GRP_CALL = 2  # all call instructions
CS_GRP_RET = 3  # all return instructions
CS_GRP_INT = 4  # all interrupt instructions (int+syscall)
CS_GRP_IRET = 5  # all interrupt return instructions
CS_GRP_PRIVILEGE = 6  # all privileged instructions
举个例子,如果我们想筛选出所有符合跳转却不是调用的instruction,怎么做呢
# Keep instructions that are jumps but not calls:
# 1 is CS_GRP_JUMP and 2 is CS_GRP_CALL.
if (1 in i.groups and 2 not in i.groups):
    print("0x%x:\t%s\t%s\n" %(i.address, i.mnemonic, i.op_str))
    # Wrap in a 1-tuple so %-formatting always treats groups as a single
    # value, whatever sequence type it is.
    print("\t%s\n" % (i.groups,))
后面还有几个基于上面字段的计数判断,没那么常用了,列举一个operand types
op_count(op_type)返回相同op_type的operand个数,举个例子
popeq {r4, r5, r6, r7, pc}的op_count(1)=op_count(CS_OP_REG )=5, 也就是操作数是寄存器的计数为5
blx #0x2274的op_count(2)=op_count(CS_OP_IMM )=1, 也就是操作数是立即数的计数为1
# Common instruction operand types - to be consistent across all architectures.
CS_OP_INVALID = 0
CS_OP_REG = 1  # register operand
CS_OP_IMM = 2  # immediate operand
CS_OP_MEM = 3  # presumably a memory operand
CS_OP_FP= 4  # presumably a floating-point operand
简单食用一下
# Import capstone and create a Cs instance; Android libraries are usually ARM.
import capstone

# Build the input: read the whole library, then pick the range to decode.
with open('/src/main/lib/armeabi/libshell-super.2019.so', 'rb') as f:
    data = f.read()
start = 0x0000307C
end = 0x00004df4

cs = capstone.Cs(capstone.CS_ARCH_ARM, capstone.CS_MODE_THUMB)
cs.detail = True  # detail must be switched on before regs_read/groups work

# Decode only the bytes in [start, end); passing the whole file would start
# decoding at the ELF header while labelling it with address `start`.
# NOTE(review): assumes file offset == virtual address for this range;
# verify against the ELF program headers.
for i in cs.disasm(data[start:end], start):
    # Print address, mnemonic and operands.
    print("0x%x:\t%s\t%s\n" %(i.address, i.mnemonic, i.op_str))
    # Print groups:
    # print("\t%s\n" %( i.groups))
    # for a in i.groups:
    #     print(i.group_name(a))
    # Print regs_read and regs_write.
    print("0x%x:\t%s\t%s\n" %(i.address, i.regs_read, i.regs_write))
    for a in i.regs_read:
        print("regs_read:"+i.reg_name(a))
    for a in i.regs_write:
        print("regs_write:"+i.reg_name(a))
__init__.py里面还有一个_dummy_cs类,提供了两个类似disasm但执行更快的反汇编函数,当然得到的信息也更少,我们可以选择使用
cs_disasm_quick比disasm快一点
cs_disasm_lite只返回(address, size, mnemonic, op_str)4个最重要的数据,比上面再快一点
二、下面学习keystone,最重要功能是把汇编代码变成二进制,关键代码在keystone包里的keystone.py,最关键的自然是asm函数了
原型def asm(self, string, addr=0, as_bytes=False) ,string是汇编字符串,addr开始地址
def asm(self, string, addr=0, as_bytes=False):
    """Assemble ``string`` and return ``(encoding, statement_count)``.

    Args:
        string: assembly source; a ``str`` is ASCII-encoded to ``bytes``.
        addr: forwarded to ``ks_asm`` as the address argument.
        as_bytes: when true the encoding is returned as ``bytes``,
            otherwise as a list of integer byte values.

    Returns:
        ``(None, 0)`` when no statement was assembled, else the encoding
        together with the number of statements reported by ``ks_asm``.

    Raises:
        KsError: when ``ks_asm`` returns a non-zero status.
    """
    encode = POINTER(c_ubyte)()
    encode_size = c_size_t()
    stat_count = c_size_t()
    if not isinstance(string, bytes) and isinstance(string, str):
        string = string.encode('ascii')

    # ks_asm fills encode / encode_size / stat_count through byref().
    status = _ks.ks_asm(self._ksh, string, addr, byref(encode), byref(encode_size), byref(stat_count))
    if status != 0:
        errno = _ks.ks_errno(self._ksh)
        raise KsError(errno, stat_count.value)

    if stat_count.value == 0:
        return (None, 0)

    if as_bytes:
        encoding = string_at(encode, encode_size.value)
    else:
        # Index the buffer: collect each byte value, not the pointer itself.
        encoding = [encode[i] for i in range(encode_size.value)]

    # The bytes were copied out above, so the C buffer can be released.
    _ks.ks_free(encode)
    return (encoding, stat_count.value)
如果string是str,就先ascii编码一下,然后调用c函数ks_asm处理,这个函数在keystone.dll里
status = _ks.ks_asm(self._ksh, string, addr, byref(encode), byref(encode_size), byref(stat_count))#这里调用c函数ks_asm处理参数string获得encode,encode_size的引用
最后再看一下Ks类的__init__,通过arch和mode定义Ks实例,比如对于arm就是ks=keystone.Ks(keystone.KS_ARCH_ARM, keystone.KS_MODE_ARM)
def __init__(self, arch, mode):
    """Open a Keystone engine for the given architecture and mode.

    Raises:
        KsError: with ``KS_ERR_VERSION`` when the binding and core API
            versions differ, or with the status returned by ``ks_open``
            when opening the engine fails.
    """
    # verify version compatibility with the core before doing anything
    (major, minor, _combined) = ks_version()
    if major != KS_API_MAJOR or minor != KS_API_MINOR:
        self._ksh = None
        # our binding version is different from the core's API version
        raise KsError(KS_ERR_VERSION)

    self._arch, self._mode = arch, mode
    self._ksh = c_void_p()
    # ks_open stores the engine handle into self._ksh through byref().
    status = _ks.ks_open(arch, mode, byref(self._ksh))
    if status != KS_ERR_OK:
        self._ksh = None
        raise KsError(status)

    if arch == KS_ARCH_X86:
        # Intel syntax is default for X86
        self._syntax = KS_OPT_SYNTAX_INTEL
    else:
        self._syntax = None
很明显,在__init__函数里面先通过c函数ks_open打开ks引擎获得一个引用,才能调用它内部的asm函数,在__del__里还会调用ks_close关闭。这几个函数的原型在这里
# setup all the function prototype
def _setup_prototype(lib, fname, restype, *argtypes):
getattr(lib, fname).restype = restype
getattr(lib, fname).argtypes = argtypes
# ctypes aliases used in the prototypes below.
kserr = c_int
ks_engine = c_void_p
ks_hook_h = c_size_t
# Register return type and argument types for each function looked up on _ks.
_setup_prototype(_ks, "ks_version", c_uint, POINTER(c_int), POINTER(c_int))
_setup_prototype(_ks, "ks_arch_supported", c_bool, c_int)
_setup_prototype(_ks, "ks_open", kserr, c_uint, c_uint, POINTER(ks_engine))
_setup_prototype(_ks, "ks_close", kserr, ks_engine)
_setup_prototype(_ks, "ks_strerror", c_char_p, kserr)
_setup_prototype(_ks, "ks_errno", kserr, ks_engine)
_setup_prototype(_ks, "ks_option", kserr, ks_engine, c_int, c_void_p)
# ks_asm: engine, source text, address, then three output pointers
# (encoding buffer, encoding size, statement count) as passed by Ks.asm.
_setup_prototype(_ks, "ks_asm", c_int, ks_engine, c_char_p, c_uint64, POINTER(POINTER(c_ubyte)), POINTER(c_size_t), POINTER(c_size_t))
_setup_prototype(_ks, "ks_free", None, POINTER(c_ubyte))
from keystone import Ks, KS_ARCH_ARM, KS_MODE_ARM

# Assemble one ARM instruction and print its encoding as hex bytes.
ks = Ks(KS_ARCH_ARM, KS_MODE_ARM)
code = b"sub r1, r2, r5"
encoding, count = ks.asm(code)
print("%s = [ " % code, end='')
for i in encoding:
    print("%02x " % i, end='')
print("]")
可以看到汇编语句转换成了16进制:b'sub r1, r2, r5' = [ 05 10 42 e0 ]
总结一下,
capstone的disasm把文件的输入转换成CsInsn实例,通过CsInsn实例 可以获得汇编代码的address地址,mnemonic操作码,op_str操作数等重要信息,用于分析
就是[ 05 10 42 e0 ]→sub r1, r2, r5
keystone的asm把汇编代码转换成输出的16进制,用于patch打包
就是'sub r1, r2, r5'→ [ 05 10 42 e0 ]
ps:原来总共就封装了俩c函数,怪不得大佬懒得写教程,我这个彩笔居然还看了半天,记录下笔记,免得下次再忘了。。。。。。
参考:
https://bbs.pediy.com/thread-205590.htm
https://xz.aliyun.com/t/5753
表示看不懂 一见到阅读源码的,不管是啥,就感觉很牛逼{:1_921:} 感谢楼主 学习到了! 膜拜大佬 膜拜大佬
页:
[1]