7-Jun-2017: Eight queens problem in 93 bytes

There is a well known eight queens problem: place 8 queens on chess board so they will not attack each other; also, count all possible positions. There are 92 possible positions for 8*8 chess board (let's leave aside all symmetries).

Here is my piece of code to count them:

#include <stdio.h>
#include <assert.h>

// https://en.wikipedia.org/wiki/Eight_queens_puzzle
// https://oeis.org/A000170/b000170.txt

int try_row(int row, int attack_vertical, int attack_RL, int attack_LR, int queens, int start_bit)
{
        if (row==queens)
        {
                // all rows are filled, so we have a correct position
                return 1;
        };

        int total=0;
        // squares attacked in current row:
        int attack=attack_vertical | attack_RL | attack_LR;

        // enumerate all squares in current row:
        for (int new_queen=start_bit; new_queen; new_queen=new_queen>>1)
        {
                // square is not attacked by previous/upper row(s),
                // so place the queen here:
                if ((new_queen & attack)==0)
                {
                        // add new queen to "attack" variables:
                        // "vertical" is squares attacked vertically
                        // "RL" is squares attacked diagonally R-to-L
                        // "LR" is squares attacked diagonally L-to-R

                        total+=try_row(row+1,
                                attack_vertical | new_queen,
                                (attack_RL | new_queen)<<1,
                                (attack_LR | new_queen)>>1,
                                queens, start_bit);
                };
        };

        // no more space for queen(s) in current row, stop:
        return total;
};

int main()
{
        for (int i=1; i<=14; i++)
                printf ("%d*%d total=%d\n", i, i, try_row(0, 0, 0, 0, i, 1<<(i-1)));
/*
        assert (try_row(0, 0, 0, 0, 1, 1<<(1-1))==1);
        assert (try_row(0, 0, 0, 0, 2, 1<<(2-1))==0);
        assert (try_row(0, 0, 0, 0, 3, 1<<(3-1))==0);
        assert (try_row(0, 0, 0, 0, 4, 1<<(4-1))==2);
        assert (try_row(0, 0, 0, 0, 5, 1<<(5-1))==10);
        assert (try_row(0, 0, 0, 0, 6, 1<<(6-1))==4);
        assert (try_row(0, 0, 0, 0, 7, 1<<(7-1))==40);
        assert (try_row(0, 0, 0, 0, 8, 1<<(8-1))==92);
        assert (try_row(0, 0, 0, 0, 9, 1<<(9-1))==352);
        assert (try_row(0, 0, 0, 0, 10, 1<<(10-1))==724);
        assert (try_row(0, 0, 0, 0, 11, 1<<(11-1))==2680);
        assert (try_row(0, 0, 0, 0, 12, 1<<(12-1))==14200);
        assert (try_row(0, 0, 0, 0, 13, 1<<(13-1))==73712);
        assert (try_row(0, 0, 0, 0, 14, 1<<(14-1))==365596);
*/
};

We can try to rewrite it to assembly. It's very hard to compete with modern compilers in efficient code generation, but we can still compete with small code generation.

This is what GCC generated, but I optimized it manually:

	.intel_syntax noprefix
	.globl	try_row

try_row:
	cmp	edi, r8d		# row==queens?
	je	.L7			# exit

	push	r14
	push	r12
	push	r13
	push	rbp
	push	rbx

	# "queens" is always in R8 and will be used in subsequent calls
	# precomputed "1<<(queens-1)" is always in R9

	# registers we use as local variables:
	mov	r12d, edx		# r12=edx=attack_RL
	mov	r13d, ecx		# r13=ecx=attack_LR
	mov	r14d, esi		# r14=esi=attack_vertical
	mov	ebx, r9d		# ebx=r9=1<<(queens-1) (precomputed)
	xor	ebp, ebp		# ebp=total=0

.L3:
# we check all 3 variables subsequently
# we could OR all them and check once,
# but then we would have to use additional register, which is also would be saved in stack
	test	ebx, r12d		# new_queen & attack_RL
	jnz	.L4			# not zero? skip it: this queen can be attacked from previous/upper row(s)
	test	ebx, r13d		# new_queen & attack_LR
	jnz	.L4			# not zero? skip it: this queen can be attacked from previous/upper row(s)
	test	ebx, r14d		# new_queen & attack_vertical
	jnz	.L4			# not zero? skip it: this queen can be attacked from previous/upper row(s)

# preparing arguments for subsequent call:

	inc	edi			# (1st arg) row++ for subsequent call
	
	mov	esi, ebx		# new_queen
	or	esi, r14d		# (2nd arg) esi=attack_vertical | new_queen
	
	mov	edx, ebx		# new_queen
	or	edx, r12d		# edx=attack_RL | new_queen
	shl	edx, 1			# (3rd arg) edx=(attack_RL | new_queen)<<1

	mov	ecx, ebx		# new_queen
	or	ecx, r13d		# ecx=attack_LR | new_queen
	shr	ecx, 1			# (4th arg) ecx=(attack_LR | new_queen)>>1

	call	try_row
	dec	edi			# row-- (restore it to the previous value, we will use it)
	add	ebp, eax		# total+=try_row(...)
.L4:
	shr	ebx, 1			# shift new_queen
	jnz	.L3			# bit is still present in new_queen, continue
	# if the shifting queen has disappeared, exit:
	mov	eax, ebp		# ebp=total

	pop	rbx
	pop	rbp
	pop	r13
	pop	r12
	pop	r14
	ret
.L7:
	mov	eax, 1
	ret


It works slightly slower, but, judging by objdump (objdump -d 8queens_asm), occupies 93 bytes. It still can be optimized further.

There is no point to write assembly code these days, but it can be good exercise, to understand it better. Surprisingly, it also helps during SIMD coding, since you work with CPU instructions rather than functions.

The files: https://github.com/dennis714/yurichev.com/tree/master/blog/8queens.


→ [list of blog posts, my twitter/facebook]

The page last updated on 07-June-2017