This is an implementation of Knuth-Morris-Pratt algorithm, it searches for a substring in a string. (Copypasted from http://cprogramming.com/snippets/source-code/knuthmorrispratt-kmp-string-search-algorithm).
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
int64_t T[1024];
char *kmp_search(char *haystack, size_t haystack_size, char *needle, size_t needle_size)
{
//int *T;
int64_t i, j;
char *result = NULL;
if (needle_size==0)
return haystack;
/* Construct the lookup table */
//T = (int*) malloc((needle_size+1) * sizeof(int));
T[0] = -1;
for (i=0; i<needle_size; i++)
{
T[i+1] = T[i] + 1;
while (T[i+1] > 0 && needle[i] != needle[T[i+1]-1])
T[i+1] = T[T[i+1]-1] + 1;
}
/* Perform the search */
for (i=j=0; i<haystack_size; )
{
if (j < 0 || haystack[i] == needle[j])
{
++i, ++j;
if (j == needle_size)
{
result = haystack+i-j;
break;
}
}
else j = T[j];
}
//free(T);
return result;
}
char* helper(char* haystack, char* needle)
{
return kmp_search(haystack, strlen(haystack), needle, strlen(needle));
};
int main()
{
printf ("%s\n", helper("hello world", "world"));
printf ("%s\n", helper("hello world", "ell"));
};
... as you can see, I simplified it a bit, there are no more calls to malloc/free and T[] array is now global.
Then I compiled it using GCC 7.3 x64 and reworked assembly listing a little, now there are no registers, but rather vXX variables, each one is assigned only once, in a SSA (Static single assignment form) manner. No variable assigned more than once. This is AT&T syntax.
#RDI - haystack v1
#RSI - haystack_size v2
#RDX - needle v3
#RCX - needle_size v4
.text
.globl kmp_search
.type kmp_search, @function
kmp_search:
# v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16
testq %v4, %v4 # | | | U
movq %v1, %rax # U | | |
je .exit # | | | |
leaq (%v4,%v3), %v7 # | | U U D
movq %v3, %v5 # | | U | D |
leaq 8+T(%rip), %v9 # | | | | | | D
movq $-1, T(%rip) # | | | | | | |
cmpq %v5, %v7 # | | | | U U |
leaq -8(%v9), %v8 # | | | | | | D U
je .L20 # | | | | | | | |
.L6: # | | | | | | | |
movq -8(%v9), %v14 # | | | | | | | U D
addq $1, %v14 # | | | | | | | | U
testq %v14, %v14 # | | | | | | | | U
movq %v14, (%v9) # | | | | | | | U U
jle .L4 # | | | | | | | | |
movzbl (%v5), %v11 # | | | | U | | | D |
cmpb %v11_byte, -1(%v3,%v14) # | | U | | | | | U U
jne .L5 # | | | | | | | | |
jmp .L4 # | | | | | | | | |
.L21: # | | | | | | | | |
movzbl -1(%v3,%v14), %v12 # | | U | | | | | D U
cmpb %v12_byte, (%v5) # | | | | U | | | U |
je .L4 # | | | | | | | | |
.L5: # | | | | | | | | |
movq -8(%v8,%v14,8), %v15 # | | | | | | U | U D
addq $1, %v15 # | | | | | | | U
testq %v15, %v15 # | | | | | | | U
movq %v15, (%v9) # | | | | | | U U
jg .L21 # | | | | | | |
.L4: # | | | | | | |
addq $1, %v5 # | | | | U | |
addq $8, %v9 # | | | | | | U
cmpq %v5, %v7 # | | | | U U |
jne .L6 # | | | | | | |
.L20:
# v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16
# | | | |
leaq T(%rip), %v6 # | | | | D
xorq %v16, %v16 # | | | | | D
xorq %v10, %v10 # | | | | | D |
.L7: # | | | | | | |
cmpq %v10, %v2 # | U | | | U |
jbe .L22 # | | | | | | |
.L11: # | | | | | | |
testq %v16, %v16 # | | | | | | U
js .L8 # | | | | | | |
movzbl (%v3,%v16), %v13 # | | U | | | D U
cmpb %v13_byte, (%v1,%v10) # U | | | U U |
je .L8 # | | | | | |
cmpq %v10, %v2 # | U | | U |
movq (%v6,%v16,8), %v16 # | | U | U
ja .L11 # | | | |
.L22: # | | | |
xorq %rax, %rax # | | | |
ret # | | | |
.L8: # | | | |
addq $1, %v16 # | | | U
addq $1, %v10 # | | U |
cmpq %v16, %v4 # | U | U
jne .L7 # | | |
subq %v4, %v10 # | U U
leaq (%v1,%v10), %rax # U U
.exit:
ret
.comm T,8192,32
Dangling "noodles" you see at right are "live ranges" of each vXX variable. "D" means "defined", "U" - "used" or "used and then defined again". Whenever live range is started, we need to allocate variable (in a register or a local stack). When it's ending, we do not need to keep it somewhere in storage (in a register or a local stack).
As you can see, the function has two parts: preparation and processing. You can clearly see how live ranges are divided by two groups, except of first 4 variables, which are function arguments.
You see, there are 16 variables. But we want to use as small number of registers, as possible. If several live ranges are present at some address or point of time, these variables cannot be allocated in the same register.
This is how we can assign a register to each live range using Z3 SMT-solver:
from z3 import *
def attempt(colors):
v=[Int('v%d' % i) for i in range(18)]
s=Solver()
for i in range(18):
s.add(And(v[i]>=0, v[i]<colors))
# a bit redundant, but that is not an issue:
s.add(Distinct(v[1], v[2], v[3], v[4]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[7]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[9]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[14]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[11], v[14]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[12], v[14]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[14], v[15]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[9], v[14], v[15]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[9], v[14]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[14]))
s.add(Distinct(v[1], v[2], v[3], v[4]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[6]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[6], v[16]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[6], v[10], v[16]))
s.add(Distinct(v[1], v[2], v[3], v[4], v[6], v[10], v[13], v[16]))
s.add(Distinct(v[1], v[2], v[4], v[6], v[10], v[13], v[16]))
s.add(Distinct(v[1], v[2], v[4], v[6], v[10], v[16]))
s.add(Distinct(v[1], v[4], v[6], v[10], v[16]))
s.add(Distinct(v[1], v[4], v[10], v[16]))
s.add(Distinct(v[1], v[4], v[10]))
s.add(Distinct(v[1], v[10]))
registers=["RDI", "RSI", "RDX", "RCX", "R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15"]
# first 4 variables are function arguments and they are always linked to rdi/rsi/rdx/rcx:
s.add(v[1]==0)
s.add(v[2]==1)
s.add(v[3]==2)
s.add(v[4]==3)
if s.check()==sat:
print "* colors=", colors
m=s.model()
for i in range(1, 17):
print "v%d=%s" % (i, registers[m[v[i]].as_long()])
for colors in range(12, 0, -1):
attempt(colors)
What we've got for 12, 11 and 10 registers:
* colors= 12 v1=RDI v2=RSI v3=RDX v4=RCX v5=R8 v6=R8 v7=R9 v8=R10 v9=R11 v10=R9 v11=R12 v12=R12 v13=R10 v14=R13 v15=R14 v16=R11 * colors= 11 v1=RDI v2=RSI v3=RDX v4=RCX v5=R14 v6=R8 v7=R13 v8=R8 v9=R12 v10=R11 v11=R9 v12=R11 v13=R9 v14=R10 v15=R9 v16=R10 * colors= 10 v1=RDI v2=RSI v3=RDX v4=RCX v5=R10 v6=R11 v7=R8 v8=R13 v9=R11 v10=R10 v11=R12 v12=R12 v13=R8 v14=R9 v15=R12 v16=R9
It's not possible to assign 9 or less registers. 10 is a minimum.
Now all I do is replacing vXX variables to registers the SMT-solver offered:
#RDI - haystack v1
#RSI - haystack_size v2
#RDX - needle v3
#RCX - needle_size v4
.text
.globl kmp_search
.type kmp_search, @function
kmp_search:
# v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16
testq %rcx, %rcx # | | | U
movq %rdi, %rax # U | | |
je .exit # | | | |
leaq (%rcx,%rdx), %r8 # | | U U D
movq %rdx, %r10 # | | U | D |
leaq 8+T(%rip), %r11 # | | | | | | D
movq $-1, T(%rip) # | | | | | | |
cmpq %r10, %r8 # | | | | U U |
leaq -8(%r11), %r13 # | | | | | | D U
je .L20 # | | | | | | | |
.L6: # | | | | | | | |
movq -8(%r11), %r9 # | | | | | | | U D
addq $1, %r9 # | | | | | | | | U
testq %r9, %r9 # | | | | | | | | U
movq %r9, (%r11) # | | | | | | | U U
jle .L4 # | | | | | | | | |
movzbl (%r10), %r12d # | | | | U | | | D |
cmpb %r12b, -1(%rdx,%r9) # | | U | | | | | U U
jne .L5 # | | | | | | | | |
jmp .L4 # | | | | | | | | |
.L21: # | | | | | | | | |
movzbl -1(%rdx,%r9), %r12d # | | U | | | | | D U
cmpb %r12b, (%r10) # | | | | U | | | U |
je .L4 # | | | | | | | | |
.L5: # | | | | | | | | |
movq -8(%r13,%r9,8), %r12 # | | | | | | U | U D
addq $1, %r12 # | | | | | | | U
testq %r12, %r12 # | | | | | | | U
movq %r12, (%r11) # | | | | | | U U
jg .L21 # | | | | | | |
.L4: # | | | | | | |
addq $1, %r10 # | | | | U | |
addq $8, %r11 # | | | | | | U
cmpq %r10, %r8 # | | | | U U |
jne .L6 # | | | | | | |
.L20:
# v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15 v16
# | | | |
leaq T(%rip), %r11 # | | | | D
xorq %r9, %r9 # | | | | | D
xorq %r10, %r10 # | | | | | D |
.L7: # | | | | | | |
cmpq %r10, %rsi # | U | | | U |
jbe .L22 # | | | | | | |
.L11: # | | | | | | |
testq %r9, %r9 # | | | | | | U
js .L8 # | | | | | | |
movzbl (%rdx,%r9), %r8d # | | U | | | D U
cmpb %r8b, (%rdi,%r10) # U | | | U U |
je .L8 # | | | | | |
cmpq %r10, %rsi # | U | | U |
movq (%r11,%r9,8), %r9 # | | U | U
ja .L11 # | | | |
.L22: # | | | |
xorq %rax, %rax # | | | |
ret # | | | |
.L8: # | | | |
addq $1, %r9 # | | | U
addq $1, %r10 # | | U |
cmpq %r9, %rcx # | U | U
jne .L7 # | | |
subq %rcx, %r10 # | U U
leaq (%rdi,%r10), %rax # U U
.exit:
ret
.comm T,8192,32
That works and it's almost the same as GCC does.
The problem of register allocation as a graph coloring problem: each live range is a vertex. It a live range must coexist with another live range, put an edge between vertices, that means, vertices cannot share same color. Color reflecting register number.
Almost all compilers (except simplest) do this in code generator. They use simpler algorithms, though, instead of such a heavy machinery as SAT/SMT solvers.
See also: https://en.wikipedia.org/wiki/Register_allocation, https://en.wikipedia.org/wiki/Live_variable_analysis.
Since graph coloring can have many solutions, you can probably hide some information in "coloring": https://yurichev.com/blog/lehmer/.