题目来源:Cryptopals set1 challenge 6
题意大致为需要你攻击一个用相同流密钥重复加密的密文文件,密钥长度大致为 2~40 之间
想要对此类流密钥重用加密进行攻击,主要分为两部分
由于我们并不知道密钥长度,所以要先对密钥长度进行爆破,我们已知此类加密的明文一般都是英文文章或歌词,即有通顺语义的英文字符串,也包含一些特殊符号,但大部分仍然是大小写英文字母
根据这种已知条件,我们可以通过 汉明距离 来判断密钥长度
什么是汉明距离? 在信息论中,两个等长字符串之间的汉明距离是两个字符串对应位置的不同字符的个数。换句话说,它就是将一个字符串变换成另外一个字符串所需要替换的字符个数。而对于二进制字符串来说,两个等长01字符串的汉明距离,即是对应位 xor 后 1 的数量。 a = '01011' b = '01100' a ^ b = '00111' hamming_distance = 3
两个英文字母之间的平均汉明距离为 2~3,两个任意字符(非英文字母)之间的平均汉明距离为 4,另外,正确分组的密文和密文之间的汉明距离等于对应明文与明文之间的汉明距离,据此,我们可以通过将密文按照密钥长度分块,计算前几块密文间每个字符的平均汉明距离,汉明距离越小则越有可能是正确的密钥长度
def ham_dist(x, y):
x = libnum.s2b(x)
y = libnum.s2b(y)
dist = 0
for i in range(len(x)):
if int(x[i]) ^ int(y[i]) == 1:
dist += 1
return dist
keysize_res = []
for keysize in range(2, 40):
dist = 0
dist += ham_dist(cry[keysize * 0 : keysize * 1], cry[keysize * 1 : keysize * 2])
dist += ham_dist(cry[keysize * 1 : keysize * 2], cry[keysize * 2 : keysize * 3])
dist += ham_dist(cry[keysize * 2 : keysize * 3], cry[keysize * 3 : keysize * 4])
dist += ham_dist(cry[keysize * 3 : keysize * 4], cry[keysize * 4 : keysize * 5])
dist += ham_dist(cry[keysize * 4 : keysize * 5], cry[keysize * 5 : keysize * 6])
#print dist
aver_dist = float(dist) / (keysize * 5) # 平均每个字母间的汉明距离
data = {'keysize': keysize, 'aver_dist': aver_dist}
keysize_res.append(data)
#print sorted(keysize_res, key=lambda x : x['aver_dist'])
keysize_res = sorted(keysize_res, key=lambda x : x['aver_dist'])
为保证准确性,我们可以选择平均汉明距离最小的五个密钥长度进行进一步爆破
针对此类有意义的长篇英文字符串,爆破准确率最高的方式就是判断明文的词频大小
我们先将整体密文按照密钥长度分块,由于明文是使用相同的流密钥进行加密,所以每块密文的相同位置都是由密钥的同一位进行加密,例如
mess = 'abcdefghijkl'
key = '1234'
m1 = 'abcd'
m2 = 'efjh'
m3 = 'ijkl'
上述例子中,a e i
都由 '1'
加密,b f j
都由 '2'
加密,以此类推
我们通过爆破每个密钥位置对应的所有明文词频之和,取和最大的密钥字符连接在一起,则是此密钥长度下密钥,得到密钥后与密文进行 xor 解密,即可得到明文
char_freq = {
'a': .08167, 'b': .01492, 'c': .02782, 'd': .04253,
'e': .12702, 'f': .02228, 'g': .02015, 'h': .06094,
'i': .06094, 'j': .00153, 'k': .00772, 'l': .04025,
'm': .02406, 'n': .06749, 'o': .07507, 'p': .01929,
'q': .00095, 'r': .05987, 's': .06327, 't': .09056,
'u': .02758, 'v': .00978, 'w': .02360, 'x': .00150,
'y': .01974, 'z': .00074, ' ': .13000
}
for i in range(5):
KEY = ''
MESS = ''
for key_pos in range(keysize_res[i]['keysize']): # 逐个字节计算频率
freq_res = []
for k in range(0, 256):
freq_add = 0
for block in range(0, len(cry) - keysize_res[i]['keysize'] + 1, keysize_res[i]['keysize']):
freq_add += char_freq.get(chr(k ^ ord(cry[block + key_pos])).lower(), 0) # 找到每位key对应的字符频率最大值
data = {'key[i]': k, 'freq_add': freq_add}
freq_res.append(data)
freq_res = sorted(freq_res, key=lambda x:x['freq_add'], reverse=True)
KEY += hex(freq_res[0]['key[i]'])[2:]
#print freq_res
#print KEY
key = KEY * len(cry)
key = key.decode('hex')
MESS = "".join(chr(ord(x) ^ ord(y)) for x, y in zip(cry, key))
print "len(key): ", keysize_res[i]['keysize'], "\n", "key: ", KEY, "\n", MESS
print '=' * 50
# coding=utf-8
import libnum
#a = 'this is a test'
#b = 'wokka wokka!!!'
def ham_dist(x, y):
x = libnum.s2b(x)
y = libnum.s2b(y)
dist = 0
for i in range(len(x)):
if int(x[i]) ^ int(y[i]) == 1:
dist += 1
return dist
#print float(ham_dist(a, b)) / len(a)
f = open('set1-chall6.txt', 'r')
cry = ""
while 1:
a = f.readline().strip()
if a:
cry += a
else:
break
#print cry.decode('base64')
cry = cry.decode('base64')
#print len(cry)
keysize_res = []
for keysize in range(2, 40):
dist = 0
dist += ham_dist(cry[keysize * 0 : keysize * 1], cry[keysize * 1 : keysize * 2])
dist += ham_dist(cry[keysize * 1 : keysize * 2], cry[keysize * 2 : keysize * 3])
dist += ham_dist(cry[keysize * 2 : keysize * 3], cry[keysize * 3 : keysize * 4])
dist += ham_dist(cry[keysize * 3 : keysize * 4], cry[keysize * 4 : keysize * 5])
dist += ham_dist(cry[keysize * 4 : keysize * 5], cry[keysize * 5 : keysize * 6])
#print dist
aver_dist = float(dist) / (keysize * 5) # 平均每个字母间的汉明距离
data = {'keysize': keysize, 'aver_dist': aver_dist}
keysize_res.append(data)
#print sorted(keysize_res, key=lambda x : x['aver_dist'])
keysize_res = sorted(keysize_res, key=lambda x : x['aver_dist'])
#print keysize_res
char_freq = {
'a': .08167, 'b': .01492, 'c': .02782, 'd': .04253,
'e': .12702, 'f': .02228, 'g': .02015, 'h': .06094,
'i': .06094, 'j': .00153, 'k': .00772, 'l': .04025,
'm': .02406, 'n': .06749, 'o': .07507, 'p': .01929,
'q': .00095, 'r': .05987, 's': .06327, 't': .09056,
'u': .02758, 'v': .00978, 'w': .02360, 'x': .00150,
'y': .01974, 'z': .00074, ' ': .13000
}
for i in range(5):
KEY = ''
MESS = ''
for key_pos in range(keysize_res[i]['keysize']): # 逐个字节计算频率
freq_res = []
for k in range(0, 256):
freq_add = 0
for block in range(0, len(cry) - keysize_res[i]['keysize'] + 1, keysize_res[i]['keysize']):
freq_add += char_freq.get(chr(k ^ ord(cry[block + key_pos])).lower(), 0) # 找到每位key对应的字符频率最大值
data = {'key[i]': k, 'freq_add': freq_add}
freq_res.append(data)
freq_res = sorted(freq_res, key=lambda x:x['freq_add'], reverse=True)
KEY += hex(freq_res[0]['key[i]'])[2:]
#print freq_res
#print KEY
key = KEY * len(cry)
key = key.decode('hex')
MESS = "".join(chr(ord(x) ^ ord(y)) for x, y in zip(cry, key))
print "len(key): ", keysize_res[i]['keysize'], "\n", "key: ", KEY, "\n", MESS
print '=' * 50
当密钥长度为 29 时,得到正确明文