Register allocation using graph coloring

This is an implementation of Knuth-Morris-Pratt algorithm, it searches for a substring in a string. (Copypasted from http://cprogramming.com/snippets/source-code/knuthmorrispratt-kmp-string-search-algorithm).

#include <stdlib.h>                                                                                                               
#include <stdio.h>
#include <string.h>

int64_t T[1024];

char *kmp_search(char *haystack, size_t haystack_size, char *needle, size_t needle_size)
{
        //int *T;
        int64_t i, j;
        char *result = NULL;

        if (needle_size==0)
                return haystack;

        /* Construct the lookup table */
        //T = (int*) malloc((needle_size+1) * sizeof(int));
        T[0] = -1;
        for (i=0; i<needle_size; i++)
        {
                T[i+1] = T[i] + 1;
                while (T[i+1] > 0 && needle[i] != needle[T[i+1]-1])
                        T[i+1] = T[T[i+1]-1] + 1;
        }

        /* Perform the search */
        for (i=j=0; i<haystack_size; )
        {
                if (j < 0 || haystack[i] == needle[j])
                {
                        ++i, ++j;
                        if (j == needle_size)
                        {
                                result = haystack+i-j;
                                break;
                        }
                }
                else j = T[j];
        }

        //free(T);
        return result;
}

char* helper(char* haystack, char* needle)
{
        return kmp_search(haystack, strlen(haystack), needle, strlen(needle));
};

int main()
{
        printf ("%s\n", helper("hello world", "world"));
        printf ("%s\n", helper("hello world", "ell"));
};

... as you can see, I simplified it a bit, there are no more calls to malloc/free and T[] array is now global.

Then I compiled it using GCC 7.3 x64 and reworked assembly listing a little, now there are no registers, but rather vXX variables, each one is assigned only once, in a SSA (Static single assignment form) manner. No variable assigned more than once. This is AT&T syntax.

#RDI - haystack       v1
#RSI - haystack_size  v2
#RDX - needle         v3
#RCX - needle_size    v4

	.text
	.globl	kmp_search
	.type	kmp_search, @function
kmp_search:                      
                                        # v1  v2  v3  v4  v5  v6  v7  v8  v9  v10 v11 v12 v13 v14 v15 v16
	testq	%v4, %v4                #  |   |   |   U
	movq	%v1, %rax               #  U   |   |   |
	je	.exit                   #  |   |   |   |
	leaq	(%v4,%v3), %v7          #  |   |   U   U           D
	movq	%v3, %v5                #  |   |   U   |   D       |
	leaq	8+T(%rip), %v9          #  |   |   |   |   |       |       D
	movq	$-1, T(%rip)            #  |   |   |   |   |       |       |
	cmpq	%v5, %v7                #  |   |   |   |   U       U       |
	leaq	-8(%v9), %v8            #  |   |   |   |   |       |   D   U
	je	.L20                    #  |   |   |   |   |       |   |   |
.L6:                                    #  |   |   |   |   |       |   |   |
	movq	-8(%v9), %v14           #  |   |   |   |   |       |   |   U                    D
	addq	$1, %v14                #  |   |   |   |   |       |   |   |                    U
	testq	%v14, %v14              #  |   |   |   |   |       |   |   |                    U
	movq	%v14, (%v9)             #  |   |   |   |   |       |   |   U                    U
	jle	.L4                     #  |   |   |   |   |       |   |   |                    |
	movzbl	(%v5), %v11             #  |   |   |   |   U       |   |   |        D           |
	cmpb	%v11_byte, -1(%v3,%v14) #  |   |   U   |   |       |   |   |        U           U
	jne	.L5                     #  |   |   |   |   |       |   |   |                    |
	jmp	.L4                     #  |   |   |   |   |       |   |   |                    |
.L21:                                   #  |   |   |   |   |       |   |   |                    |
	movzbl	-1(%v3,%v14), %v12      #  |   |   U   |   |       |   |   |            D       U
	cmpb	%v12_byte, (%v5)        #  |   |   |   |   U       |   |   |            U       |
	je	.L4                     #  |   |   |   |   |       |   |   |                    |
.L5:                                    #  |   |   |   |   |       |   |   |                    |
	movq	-8(%v8,%v14,8), %v15    #  |   |   |   |   |       |   U   |                    U   D
	addq	$1, %v15                #  |   |   |   |   |       |       |                        U
	testq	%v15, %v15              #  |   |   |   |   |       |       |                        U
	movq	%v15, (%v9)             #  |   |   |   |   |       |       U                        U
	jg	.L21                    #  |   |   |   |   |       |       |
.L4:                                    #  |   |   |   |   |       |       |
	addq	$1, %v5                 #  |   |   |   |   U       |       |
	addq	$8, %v9                 #  |   |   |   |   |       |       U
	cmpq	%v5, %v7                #  |   |   |   |   U       U       |
	jne	.L6                     #  |   |   |   |   |       |       |
.L20:
                                        # v1  v2  v3  v4  v5  v6  v7  v8  v9  v10 v11 v12 v13 v14 v15 v16
                                        #  |   |   |   |
	leaq	T(%rip), %v6            #  |   |   |   |       D
	xorq	%v16, %v16              #  |   |   |   |       |                                        D
	xorq	%v10, %v10              #  |   |   |   |       |                D                       |
.L7:                                    #  |   |   |   |       |                |                       |
	cmpq	%v10, %v2               #  |   U   |   |       |                U                       |
	jbe	.L22                    #  |   |   |   |       |                |                       |
.L11:                                   #  |   |   |   |       |                |                       |
	testq	%v16, %v16              #  |   |   |   |       |                |                       U
	js	.L8                     #  |   |   |   |       |                |                       |
	movzbl	(%v3,%v16), %v13        #  |   |   U   |       |                |           D           U
	cmpb	%v13_byte, (%v1,%v10)   #  U   |       |       |                U           U           |
	je	.L8                     #  |   |       |       |                |                       |
	cmpq	%v10, %v2               #  |   U       |       |                U                       |
	movq	(%v6,%v16,8), %v16      #  |           |       U                |                       U
	ja	.L11                    #  |           |                        |                       |
.L22:                                   #  |           |                        |                       |
	xorq	%rax, %rax              #  |           |                        |                       |
	ret                             #  |           |                        |                       |
.L8:                                    #  |           |                        |                       |
	addq	$1, %v16                #  |           |                        |                       U
	addq	$1, %v10                #  |           |                        U                       |
	cmpq	%v16, %v4               #  |           U                        |                       U
	jne	.L7                     #  |           |                        |
	subq	%v4, %v10               #  |           U                        U
	leaq	(%v1,%v10), %rax        #  U                                    U
.exit:
	ret
	.comm	T,8192,32

Dangling "noodles" you see at right are "live ranges" of each vXX variable. "D" means "defined", "U" - "used" or "used and then defined again". Whenever live range is started, we need to allocate variable (in a register or a local stack). When it's ending, we do not need to keep it somewhere in storage (in a register or a local stack).

As you can see, the function has two parts: preparation and processing. You can clearly see how live ranges are divided by two groups, except of first 4 variables, which are function arguments.

You see, there are 16 variables. But we want to use as small number of registers, as possible. If several live ranges are present at some address or point of time, these variables cannot be allocated in the same register.

This is how we can assign a register to each live range using Z3 SMT-solver:

from z3 import *                                                                                                                  

def attempt(colors):

    v=[Int('v%d' % i) for i in range(18)]

    s=Solver()

    for i in range(18):
        s.add(And(v[i]>=0, v[i]<colors))

    # a bit redundant, but that is not an issue:
    s.add(Distinct(v[1], v[2], v[3], v[4]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[7]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[9]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[14]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[11], v[14]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[12], v[14]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[8], v[9], v[14], v[15]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[9], v[14], v[15]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[9], v[14]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[5], v[7], v[14]))
    s.add(Distinct(v[1], v[2], v[3], v[4]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[6]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[6], v[16]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[6], v[10], v[16]))
    s.add(Distinct(v[1], v[2], v[3], v[4], v[6], v[10], v[13], v[16]))
    s.add(Distinct(v[1], v[2], v[4], v[6], v[10], v[13], v[16]))
    s.add(Distinct(v[1], v[2], v[4], v[6], v[10], v[16]))
    s.add(Distinct(v[1], v[4], v[6], v[10], v[16]))
    s.add(Distinct(v[1], v[4], v[10], v[16]))
    s.add(Distinct(v[1], v[4], v[10]))
    s.add(Distinct(v[1], v[10]))

    registers=["RDI", "RSI", "RDX", "RCX", "R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15"]
    # first 4 variables are function arguments and they are always linked to rdi/rsi/rdx/rcx:
    s.add(v[1]==0)
    s.add(v[2]==1)
    s.add(v[3]==2)
    s.add(v[4]==3)

    if s.check()==sat:
        print "* colors=", colors
        m=s.model()
        for i in range(1, 17):
            print "v%d=%s" % (i, registers[m[v[i]].as_long()])

for colors in range(12, 0, -1):
    attempt(colors)

What we've got for 12, 11 and 10 registers:

* colors= 12
v1=RDI
v2=RSI
v3=RDX
v4=RCX
v5=R8
v6=R8
v7=R9
v8=R10
v9=R11
v10=R9
v11=R12
v12=R12
v13=R10
v14=R13
v15=R14
v16=R11
* colors= 11
v1=RDI
v2=RSI
v3=RDX
v4=RCX
v5=R14
v6=R8
v7=R13
v8=R8
v9=R12
v10=R11
v11=R9
v12=R11
v13=R9
v14=R10
v15=R9
v16=R10
* colors= 10
v1=RDI
v2=RSI
v3=RDX
v4=RCX
v5=R10
v6=R11
v7=R8
v8=R13
v9=R11
v10=R10
v11=R12
v12=R12
v13=R8
v14=R9
v15=R12
v16=R9

It's not possible to assign 9 or less registers. 10 is a minimum.

Now all I do is replacing vXX variables to registers the SMT-solver offered:

#RDI - haystack       v1
#RSI - haystack_size  v2
#RDX - needle         v3
#RCX - needle_size    v4

	.text
	.globl	kmp_search
	.type	kmp_search, @function
kmp_search:                      
                                        # v1  v2  v3  v4  v5  v6  v7  v8  v9  v10 v11 v12 v13 v14 v15 v16
	testq	%rcx, %rcx              #  |   |   |   U
	movq	%rdi, %rax              #  U   |   |   |
	je	.exit                   #  |   |   |   |
	leaq	(%rcx,%rdx), %r8        #  |   |   U   U           D
	movq	%rdx, %r10              #  |   |   U   |   D       |
	leaq	8+T(%rip), %r11         #  |   |   |   |   |       |       D
	movq	$-1, T(%rip)            #  |   |   |   |   |       |       |
	cmpq	%r10, %r8               #  |   |   |   |   U       U       |
	leaq	-8(%r11), %r13          #  |   |   |   |   |       |   D   U
	je	.L20                    #  |   |   |   |   |       |   |   |
.L6:                                    #  |   |   |   |   |       |   |   |
	movq	-8(%r11), %r9           #  |   |   |   |   |       |   |   U                    D
	addq	$1, %r9                 #  |   |   |   |   |       |   |   |                    U
	testq	%r9, %r9                #  |   |   |   |   |       |   |   |                    U
	movq	%r9, (%r11)             #  |   |   |   |   |       |   |   U                    U
	jle	.L4                     #  |   |   |   |   |       |   |   |                    |
	movzbl	(%r10), %r12d           #  |   |   |   |   U       |   |   |        D           |
	cmpb	%r12b, -1(%rdx,%r9)     #  |   |   U   |   |       |   |   |        U           U
	jne	.L5                     #  |   |   |   |   |       |   |   |                    |
	jmp	.L4                     #  |   |   |   |   |       |   |   |                    |
.L21:                                   #  |   |   |   |   |       |   |   |                    |
	movzbl	-1(%rdx,%r9), %r12d     #  |   |   U   |   |       |   |   |            D       U
	cmpb	%r12b, (%r10)           #  |   |   |   |   U       |   |   |            U       |
	je	.L4                     #  |   |   |   |   |       |   |   |                    |
.L5:                                    #  |   |   |   |   |       |   |   |                    |
	movq	-8(%r13,%r9,8), %r12    #  |   |   |   |   |       |   U   |                    U   D
	addq	$1, %r12                #  |   |   |   |   |       |       |                        U
	testq	%r12, %r12              #  |   |   |   |   |       |       |                        U
	movq	%r12, (%r11)            #  |   |   |   |   |       |       U                        U
	jg	.L21                    #  |   |   |   |   |       |       |
.L4:                                    #  |   |   |   |   |       |       |
	addq	$1, %r10                #  |   |   |   |   U       |       |
	addq	$8, %r11                #  |   |   |   |   |       |       U
	cmpq	%r10, %r8               #  |   |   |   |   U       U       |
	jne	.L6                     #  |   |   |   |   |       |       |
.L20:
                                        # v1  v2  v3  v4  v5  v6  v7  v8  v9  v10 v11 v12 v13 v14 v15 v16
                                        #  |   |   |   |
	leaq	T(%rip), %r11           #  |   |   |   |       D
	xorq	%r9, %r9                #  |   |   |   |       |                                        D
	xorq	%r10, %r10              #  |   |   |   |       |                D                       |
.L7:                                    #  |   |   |   |       |                |                       |
	cmpq	%r10, %rsi              #  |   U   |   |       |                U                       |
	jbe	.L22                    #  |   |   |   |       |                |                       |
.L11:                                   #  |   |   |   |       |                |                       |
	testq	%r9, %r9                #  |   |   |   |       |                |                       U
	js	.L8                     #  |   |   |   |       |                |                       |
	movzbl	(%rdx,%r9), %r8d        #  |   |   U   |       |                |           D           U
	cmpb	%r8b, (%rdi,%r10)       #  U   |       |       |                U           U           |
	je	.L8                     #  |   |       |       |                |                       |
	cmpq	%r10, %rsi              #  |   U       |       |                U                       |
	movq	(%r11,%r9,8), %r9       #  |           |       U                |                       U
	ja	.L11                    #  |           |                        |                       |
.L22:                                   #  |           |                        |                       |
	xorq	%rax, %rax              #  |           |                        |                       |
	ret                             #  |           |                        |                       |
.L8:                                    #  |           |                        |                       |
	addq	$1, %r9                 #  |           |                        |                       U
	addq	$1, %r10                #  |           |                        U                       |
	cmpq	%r9, %rcx               #  |           U                        |                       U
	jne	.L7                     #  |           |                        |
	subq	%rcx, %r10              #  |           U                        U
	leaq	(%rdi,%r10), %rax       #  U                                    U
.exit:
	ret
	.comm	T,8192,32

That works and it's almost the same as GCC does.

The problem of register allocation as a graph coloring problem: each live range is a vertex. It a live range must coexist with another live range, put an edge between vertices, that means, vertices cannot share same color. Color reflecting register number.

Almost all compilers (except simplest) do this in code generator. They use simpler algorithms, though, instead of such a heavy machinery as SAT/SMT solvers.

See also: https://en.wikipedia.org/wiki/Register_allocation, https://en.wikipedia.org/wiki/Live_variable_analysis.


→ [list of blog posts]

Please drop me email about any bug(s) and suggestion(s): dennis(@)yurichev.com.