Cracking simple XOR cipher with simulated annealing

I did this previously using Z3, but it was a wrong tool for the job (although it was a fun to do it).

This is way better. It can find 17-byte XOR key in matter of minutes on any decent computer, giving the fact that plain text is written in English. However, any other language has its own digrams and trigrams.

The solver can be modified easily to support most frequent x86 (or any other CPU) instructions, etc, to decrypt executable code...

It wouldn't work for any more advanced cipher than XOR-ing...

#!/usr/bin/env python3

from __future__ import print_function
import sys, hexdump
import math, os
import random
from simanneal import Annealer

# requires https://github.com/perrygeo/simanneal

KEYLEN=17

def xor_strings(s,t):
    # https://en.wikipedia.org/wiki/XOR_cipher#Example_implementation
    """xor two strings together"""
    return b"".join(bytes([a^b]) for a,b in zip(s,t))

def read_file(fname):
    file=open(fname, mode='rb')
    content=file.read()
    file.close()
    return content

def chunks(l, n):
    """divide input buffer by n-len chunks"""
    n = max(1, n)
    return [l[i:i + n] for i in range(0, len(l), n)]

trigrams=[b"the", b"and", b"tha", b"ent", b"ing", b"ion", b"tio", b"for", b"nde", b"has", b"nce", b"edt", b"tis", b"oft", b"sth", b"men"]
digrams=[b"th", b"he", b"in", b"er"]

# additional tuning may be needed:
BONUS_FOR_PRINTABLE=1
BONUS_FOR_LOWERCASE=3
BONUS_FOR_SPACE=8
BONUS_FOR_DIGRAM=8
BONUS_FOR_TRIGRAM=16
PENALTY_FOR_DIGIT=-10

cnt=0
def fitness(state):
    global cnt
    fitness=0
    state_as_string=b"".join(map(lambda x: bytes([x]), state))
    tmp=chunks(cipher_file, KEYLEN)
    plain=b"".join(map(lambda x: xor_strings(x, state_as_string), tmp))
    for p in plain:
        if p in b"qwertyuiopasdfghjklzxcvbnm":
            fitness=fitness+BONUS_FOR_LOWERCASE
        if p in b"0123456789":
            fitness=fitness+PENALTY_FOR_DIGIT
        if (p>=0x20 and p<=0x7E) or p==0xA or p==0xD:
            fitness=fitness+BONUS_FOR_PRINTABLE

    for digram in digrams:
        fitness=fitness + plain.count(digram)*BONUS_FOR_DIGRAM

    for trigram in trigrams:
        fitness=fitness + plain.count(trigram)*BONUS_FOR_TRIGRAM
        
    fitness=fitness + plain.count(b" ")*BONUS_FOR_SPACE

    if (cnt%500)==0:
        print("\u001B[0;0H")
        possible_key=b"".join(map(lambda x: bytes([x]), state))
        print ("state/key:")
        hexdump.hexdump (possible_key)
        print ("decrypted:")
        tmp=chunks(cipher_file, KEYLEN)
        plain=b"".join(map(lambda x: xor_strings(x, possible_key), tmp))
        hexdump.hexdump (plain)
    cnt+=1

    return fitness

class TestProblem(Annealer):

    def __init__(self, state):
        super(TestProblem, self).__init__(state)  # important!

    def move(self):
        # set random byte in state to random byte
        i = random.randint(0, len(self.state) - 1)
        self.state[i]=random.randint(0, 255)

    def energy(self):
        return -fitness(self.state)

cipher_file=read_file("cipher1.txt")

init_state = [0 for i in range(KEYLEN)]

test = TestProblem(init_state)
# increase if you're not satisfied with result:
test.steps = 200000
# since our state is just a list, slice is the fastest way to copy
test.copy_strategy = "slice"
#test.copy_strategy = "deepcopy"
state, e = test.anneal()

possible_key=b"".join(map(lambda x: bytes([x]), state))
print ("state/key:")
hexdump.hexdump (possible_key)

print ("decrypted:")
tmp=chunks(cipher_file, KEYLEN)
plain=b"".join(map(lambda x: xor_strings(x, possible_key), tmp))
hexdump.hexdump (plain)


state/key:
00000000: 90 A0 21 50 4F 8F FB FF  85 83 CF 56 41 12 7A F9  ..!PO......VA.z.
00000010: 31                                                1
decrypted:
00000000: 4D 72 2E 20 53 68 65 72  6C 6F 63 6B 20 48 6F 6C  Mr. Sherlock Hol
00000010: 6D 65 73 2C 20 77 68 6F  20 77 61 73 20 75 73 75  mes, who was usu
00000020: 61 6C 6C 79 20 76 65 72  79 20 6C 61 74 65 20 69  ally very late i
00000030: 6E 20 74 68 65 20 6D 6F  72 6E 69 6E 67 73 2C 20  n the mornings, 
00000040: 73 61 76 65 0D 0A 75 70  6F 6E 20 74 68 6F 73 65  save..upon those
00000050: 20 6E 6F 74 20 69 6E 66  72 65 71 75 65 6E 74 20   not infrequent 
00000060: 6F 63 63 61 73 69 6F 6E  73 20 77 68 65 6E 20 68  occasions when h
00000070: 65 20 77 61 73 20 75 70  20 61 6C 6C 20 6E 69 67  e was up all nig
00000080: 68 74 2C 20 77 61 73 20  73 65 61 74 65 64 0D 0A  ht, was seated..
00000090: 61 74 20 74 68 65 20 62  72 65 61 6B 66 61 73 74  at the breakfast
000000A0: 20 74 61 62 6C 65 2E 20  49 20 73 74 6F 6F 64 20   table. I stood 
000000B0: 75 70 6F 6E 20 74 68 65  20 68 65 61 72 74 68 2D  upon the hearth-
000000C0: 72 75 67 20 61 6E 64 20  70 69 63 6B 65 64 20 75  rug and picked u
000000D0: 70 20 74 68 65 0D 0A 73  74 69 63 6B 20 77 68 69  p the..stick whi
000000E0: 63 68 20 6F 75 72 20 76  69 73 69 74 6F 72 20 68  ch our visitor h
000000F0: 61 64 20 6C 65 66 74 20  62 65 68 69 6E 64 20 68  ad left behind h
00000100: 69 6D 20 74 68 65 20 6E  69 67 68 74 20 62 65 66  im the night bef
00000110: 6F 72 65 2E 20 49 74 20  77 61 73 20 61 0D 0A 66  ore. It was a..f
00000120: 69 6E 65 2C 20 74 68 69  63 6B 20 70 69 65 63 65  ine, thick piece
00000130: 20 6F 66 20 77 6F 6F 64  2C 20 62 75 6C 62 6F 75   of wood, bulbou
00000140: 73 2D 68 65 61 64 65 64  2C 20 6F 66 20 74 68 65  s-headed, of the
00000150: 20 73 6F 72 74 20 77 68  69 63 68 20 69 73         sort which is

All the files, including plain text I used, etc: https://yurichev.com/blog/SA_XOR/files.


UPD: at reddit.

UPD: May-2024: Updated to Python 3. And fancy video recorded.


→ [list of blog posts]

'