题目
在python3中运行下边代码,对应到第五关
python3 -c "exec(__import__('urllib.request',fromlist='Hacking8 ohye').urlopen('http://i.hacking8.com/static/i2.py').read().decode())"
不了解brainfuck也没关系,只需要分析题目中brainfuck vm的逻辑也可以得出代码的含义,再用一点数学知识就可以解决。
下边是对应的代码
from sys import stdin
def find_bracket(code, pos, bracket):
cont = 0
pair = '[' if bracket == ']' else ']'
for a, i in zip(code[pos:], range(pos, len(code))):
if a == bracket:
cont = cont + 1
if a == pair:
if cont == 0:
return i
else:
cont = cont - 1
raise Exception
def prepare_code(code):
def map_left_bracket(b, p):
return (b, find_bracket(code, p + 1, b))
def map_right_bracket(b, p):
offset = find_bracket(list(reversed(code[:p])), 0, ']')
return (b, p - offset)
def map_bracket(b, p):
if b == '[':
return map_left_bracket(b, p)
else:
return map_right_bracket(b, p)
return [map_bracket(c, i) if c in ('[', ']') else c
for c, i in zip(code, range(len(code)))]
def read(string):
valid = ['>', '<', '+', '-', '.', ',', '[', ']', '%']
return prepare_code([c for c in string if c in valid])
def eval_step(code, data, code_pos, data_pos):
c = code[code_pos]
d = data[data_pos]
step = 1
if c == '>':
data_pos = data_pos + 1
if data_pos > len(data):
data_pos = 0
elif c == '<':
if data_pos != 0:
data_pos -= 1
elif c == '+':
data[data_pos] += 1
elif c == '-':
data[data_pos] -= 1
elif c == '.':
print(str(d))
elif c == '%':
if len(data) >= data_pos + 1:
data[data_pos] %= data[data_pos + 1]
elif c == ',':
data[data_pos] = ord(stdin.read(1))
else:
bracket, jmp = c
if bracket == '[' and d == 0:
step = 0
code_pos = jmp
elif bracket == ']' and d != 0:
step = 0
code_pos = jmp
return (data, code_pos, data_pos, step)
def shellcode_eval(code, c_pos=0, d_pos=0):
data = [0 for i in range(100)]
while c_pos < len(code):
(n_data, n_c_pos, n_d_pos, step) = eval_step(code, data, c_pos, d_pos)
print('eval step:',c_pos, d_pos)
print('\t => ',n_c_pos, n_d_pos, n_data)
data, c_pos, d_pos = n_data, n_c_pos, n_d_pos
c_pos += step
return str(data[0])
shellcode = ">>>>[-]++>[-]++++++++>>>>[-]>[-]>[-]>[-]<<<<<<[-][-]+<[->>>>[-]>[-]>[-]<<<<<[-<<[->>>>>+>+<<<<<<]>>>>>>[-<<<<<<+>>>>>>]>+<<<<<]>>>>>[-<<<<<+>>>>>]<<.<<<[-]>>>[-<<<+>>>]>>>+<<<<<<<]>>>>>>>[-<<<<<<<+>>>>>>>]<<<<<<[-<<<<<<+>>>>>>]<<[-]>[-]>[-]>[-]<<<<<<<.>>>>[-]+++<<<<[->>>>>+>>>>>+<<<<<<<<<<]>>>>>>>>>>[-<<<<<<<<<<+>>>>>>>>>>]<[-]>[-]>[-]>[-]<<<<<<[-][-]+<[->>>>[-]>[-]>[-]<<<<<[-<<[->>>>>+>+<<<<<<]>>>>>>[-<<<<<<+>>>>>>]>+<<<<<]>>>>>[-<<<<<+>>>>>]<<.<<<[-]>>>[-<<<+>>>]>>>+<<<<<<<]>>>>>>>[-<<<<<<<+>>>>>>>]<<<<<<[-<<<<<+>>>>>]<<[-]>[-]>[-]>[-]<<<<<<."
c = read(shellcode)
key = shellcode_eval(c)
题目中的 shellcode
比较长,为了方便分析这里只截取一段,具体以题目中的代码为准
分析
分析brainfuck
对应到代码中的 eval_step
方法
>
, <
分别对应数据指针的自增1 和 自减1
+
, -
分别对应当前指针所指数据的自增1 和 自减1
[...]
对应于 while
循环块
,
,.
分别对应输入和输出
另外还增加了一个 %
的取余操作
嗯,很简单的几个操作,但回过头看 shellcode
,还是一脸懵逼...
这里主要有几个难点阻碍我们理解brainfuck代码
1、没有助记符帮我们记住当前操作的数据指针是什么
2、每次增减的大小都是1,对于一个简单如 data[0] = n
的赋值操作也需要用一个循环递增n次,既不利于静态分析,也不利于动态调试
3、毕竟不是我们熟悉的语言,理解起需要额外多一层"翻译", 如果能直接转换成熟悉的语言 如 python, 理解起来也会直接很多。
化简brainfuck
根据上述分析的3个问题,采用下边两个方法来解决
1、合并繁琐的增减操作,增减不再是固定的1,可以一步到位。
当然这样需要维护一个额外的指针映射表和数据增减映射表
2、第1步化简后还会剩下 []
, +
, %
, .
, .
,
这些可以直接从 eval_step
中提取对应的处理语句,
直接进行翻译即可得到我们最终需要的python语言
只不过对while 要稍微做一点特殊的处理
说了这么多,直接撸代码吧
第1步方法
def shrinkBFCode(code):
cPos2Vars = {} #位置对应的变量
cPos2Change = {} #位置中 + 号 增加的值
varPos = 0
nCode = []
incVal = 0
lc = None
dataChangeOp = set(['+', '-'])
dataShiftOp = set(['>', '<'])
for i in range(len(code)):
c = code[i]
if c not in dataChangeOp and lc in dataChangeOp:
cPos2Change[len(nCode)] = incVal
cPos2Vars[len(nCode)] = varPos
nCode.append('+')
incVal = 0
if c == '>':
varPos += 1
elif c == '<':
varPos -= 1
else:
if c in dataChangeOp:
incVal += 1 if c == '+' else -1
else:
#if lc == '>' or lc == '<':
# cPos2Vars[len(nCode)] = varPos
cPos2Vars[len(nCode)] = varPos
nCode.append(c)
lc = c
return ''.join(nCode), cPos2Vars, cPos2Change
第2步的方法
def generatePyCode(shellCode, pVars, pChange):
pyCodes = []
bStacks = []
whileVarCache = {}
for i, c in enumerate(shellCode):
d_pos = i if i not in pVars else pVars[i]
d_change = 1 if i not in pChange else pChange[i]
indentLevel = len(bStacks)
indentStr = ' '*(4*indentLevel)
if c == '[':
pyCodes.append('{}while data[{}] != 0:'.format(indentStr, d_pos))
bStacks.append((c, i))
whileVarCache[i] = {}
elif c == ']':
if bStacks[-1][0] != '[':
raise Exception('miss match of {}] found between {} and {}'.format(bStacks[-1][0], bStacks[-1][1], i))
cNum = i-bStacks[-1][1]
if cNum == 2:
del pyCodes[-1]
del pyCodes[-1]
d_pos_l = i-1 if i-1 not in pVars else pVars[i-1]
pyCodes.append('{}data[{}] = 0'.format(' '*(4*(indentLevel-1)), d_pos_l))
whileCode = shellCode[bStacks[-1][1]+1 : i]
if cNum>2 and '[' not in whileCode and not '%' in whileCode: # nested loop is a bit complicated, just skip
loopCondvar = bStacks[-1][1]
d_pos_l = loopCondvar if loopCondvar not in pVars else pVars[loopCondvar]
whileVars = whileVarCache[bStacks[-1][1]]
cVarChange = whileVars[d_pos_l]
# remove statement of same indent
while len(pyCodes)>0 and pyCodes[-1].startswith(indentStr) and pyCodes[-1][len(indentStr)]!=' ':
pyCodes.pop()
pyCodes.pop()
#del pyCodes[bStacks[-1][1]-i:]
for vPos, vChange in whileVars.items():
if vPos == d_pos_l:
continue
ctimes = abs(vChange / cVarChange)
ctimesStr = '' if ctimes==1 else '{}*'.format(ctimes)
cSign = '+' if vChange > 0 else '-'
pyCodes.append('{}data[{}] {}= {}data[{}]'.format(' '*(4*(indentLevel-1)),
vPos, cSign, ctimesStr, d_pos_l))
pyCodes.append('{}data[{}] = 0'.format(' '*(4*(indentLevel-1)), d_pos_l))
del whileVarCache[bStacks[-1][1]]
bStacks.pop()
elif c == '.':
pyCodes.append('{}print(data[{}])'.format(indentStr, d_pos))
elif c == ',':
pyCodes.append('{}data[{}] = ord(stdin.read(1))'.format(indentStr, d_pos))
elif c == '+':
opSign = '-=' if d_change < 0 else '+='
if pyCodes and pyCodes[-1] == '{}data[{}] = 0'.format(indentStr, d_pos):
pyCodes[-1] = '{}data[{}] = {}'.format(indentStr, d_pos, d_change)
else:
pyCodes.append('{}data[{}] {} {}'.format(indentStr, d_pos, opSign, abs(d_change)))
if bStacks:
whileVarCache[bStacks[-1][1]].setdefault(d_pos, 0)
whileVarCache[bStacks[-1][1]][d_pos] += d_change
elif c == '-':
opSign = '+=' if d_change < 0 else '-='
if pyCodes and pyCodes[-1] == '{}data[{}] = 0'.format(indentStr, d_pos):
pyCodes[-1] = '{}data[{}] = {}'.format(indentStr, d_pos, -d_change)
else:
pyCodes.append('{}data[{}] {} {}'.format(indentStr, d_pos, opSign, abs(d_change)))
if bStacks:
whileVarCache[bStacks[-1][1]].setdefault(d_pos, 0)
whileVarCache[bStacks[-1][1]][d_pos] -= d_change
elif c == '%':
pyCodes.append('{}data[{}] %= data[{}]'.format(indentStr, d_pos, d_pos+1))
return '\n'.join(pyCodes)
具体使用
shrinkCode, pVars, pChange = shrinkBFCode(shellcode)
print(generatePyCode(shrinkCode, pVars, pChange))
这样我们就可以把 shellcode
转化成如下的python代码
data[4] = 2
data[5] = 8
data[9] = 0
data[10] = 0
data[11] = 0
data[12] = 0
data[6] = 0
data[6] = 1
while data[5] != 0:
data[5] -= 1
data[9] = 0
data[10] = 0
data[11] = 0
while data[6] != 0:
data[6] -= 1
data[9] += data[4]
data[10] += data[4]
data[4] = 0
data[4] += data[10]
data[10] = 0
data[11] += 1
data[6] += data[11]
data[11] = 0
print(data[9])
data[6] = 0
data[6] += data[9]
data[9] = 0
data[12] += 1
data[5] += data[12]
data[12] = 0
data[0] += data[6]
data[6] = 0
data[4] = 0
data[5] = 0
data[6] = 0
data[7] = 0
print(data[0])
data[4] = 3
data[5] += data[0]
data[10] += data[0]
data[0] = 0
data[0] += data[10]
data[10] = 0
data[9] = 0
data[10] = 0
data[11] = 0
data[12] = 0
data[6] = 0
data[6] = 1
while data[5] != 0:
data[5] -= 1
data[9] = 0
data[10] = 0
data[11] = 0
while data[6] != 0:
data[6] -= 1
data[9] += data[4]
data[10] += data[4]
data[4] = 0
data[4] += data[10]
data[10] = 0
data[11] += 1
data[6] += data[11]
data[11] = 0
print(data[9])
data[6] = 0
data[6] += data[9]
data[9] = 0
data[12] += 1
data[5] += data[12]
data[12] = 0
data[1] += data[6]
data[6] = 0
data[4] = 0
data[5] = 0
data[6] = 0
data[7] = 0
print(data[1])
至此 我们得出来比较容易读懂的代码了,虽然看起来还是有些绕,但结合动态调试已经足够分析代码的行为了。
这段代码的含义大致是:
先计算 2^8 = 256,
再计算 3^256 = 139008452377144732764939786789661303114218850808529137991604824430036072629766435941001769154109609521811665540548899435521
题目中完整的shellcode还有一些更”夸张“的数学操作,但依然可以通过上述方法比较快速的分析出含义,这里就不详述了,有兴趣的可以自己试一试。
后记
上述的代码转化其实还可以做一些优化,以便得出更容易理解的代码。
包括消除冗余的代码,缓存中间结果减少多层嵌套循环 等
当然这些会涉及到编译原理方面的知识,只能后续抽空补一补,再做优化了
以上只是解题的一种思路,有别的思路或者优化方案也欢迎各位大佬提出,先谢过了