A reverse engineering challenge: an intricate visual pattern in 39 bytes: the solution

One month ago I posted a challenge. Now here is a solution.

I've found it in Donald Knuth's TAOCP section 7.1.3 (Bitwise tricks and techniques):

I translated it to 16-bit x86 assembly taking my Mandelbrot intro as a base:

; Two-colour pattern: every pixel is bit 12 of (row * row * column).
; Flat 16-bit DOS .COM image (origin 100h). Runs forever; there is no exit path.
; Only AL is loaded before the video call, so the program relies on AH
; already being 0 when it starts.

SCREEN_W        equ     320             ; divisor that splits a pixel offset into row/column
FRAME_END       equ     0FA00h          ; 64000: first offset past the last pixel drawn
VIDEO_SEG       equ     0a000h          ; segment loaded into ES for the stosb writes

                org     100h

                mov     al, 13h
                int     10h             ; video BIOS call with AL = 13h

                ; the palette is intentionally left untouched:
                ; only colour indexes 0 and 1 are ever written

                push    VIDEO_SEG
                pop     es              ; ES = VIDEO_SEG without using a scratch register

draw_frame:
                xor     di, di          ; DI = offset of the pixel being drawn; restart at 0

plot_pixel:
                mov     ax, di
                mov     cx, SCREEN_W

                xor     dx, dx          ; DX:AX = DI, zero-extended for the 16-bit divide
                div     cx              ; AX = DI / 320 (row), DX = DI mod 320 (column)

                push    dx              ; the multiply below overwrites DX, so park the column
                imul    ax              ; DX:AX = row * row
                pop     dx              ; DX = column again
                imul    dx              ; DX:AX = row * row * column; only AX is used from here on
                shr     ax, 12
                and     al, 1           ; AL = bit 12 of the low word of the product

                stosb                   ; store AL at ES:DI and advance DI (assumes DF is clear)
                cmp     di, FRAME_END
                jb      plot_pixel      ; unsigned compare: offsets run up to 0FA00h

                ; whole screen written: start over and redraw endlessly
                jmp     draw_frame

After some experimenting in Wolfram Mathematica, I realized that after shifting I could isolate not just a single bit, but 8 bits, i.e. a whole byte:

; Multi-colour pattern: every pixel is (low word of row * row * column) >> 10,
; displayed through a palette that this program loads itself.
; Flat 16-bit DOS .COM image (origin 100h). Runs forever; there is no exit path.
; Only AL is loaded before the video call, so the program relies on AH
; already being 0 when it starts.

DAC_WRITE_INDEX equ     3c8h            ; port that receives the first palette index to write
SCREEN_W        equ     320             ; divisor that splits a pixel offset into row/column
FRAME_END       equ     0FA00h          ; 64000: first offset past the last pixel drawn
VIDEO_SEG       equ     0a000h          ; segment loaded into ES for the stosb writes

                org     100h

                mov     al, 13h
                int     10h             ; video BIOS call with AL = 13h

                ; ---- load the palette ----
                mov     dx, DAC_WRITE_INDEX
                mov     al, 0
                out     dx, al          ; start writing at palette entry 0
                mov     cx, 100h        ; 256 entries to fill
                inc     dx              ; DX = 3c9h, the port the three writes per entry go to
next_colour:
                mov     al, cl          ; CL runs 0, 255, 254, ... 1 as CX counts down from 100h
                shl     ax, 2           ; AL = low 8 bits of CL * 4 (old AH contents never reach AL)
                ; NOTE(review): a standard VGA DAC keeps 6 bits per component, in which
                ; case only the low 6 bits of AL take effect - confirm this ramp is intended
                out     dx, al          ; red
                out     dx, al          ; green
                out     dx, al          ; blue
                loop    next_colour

                push    VIDEO_SEG
                pop     es              ; ES = VIDEO_SEG without using a scratch register

draw_frame:

                xor     di, di          ; DI = offset of the pixel being drawn; restart at 0

plot_pixel:
                mov     ax, di
                mov     cx, SCREEN_W

                xor     dx, dx          ; DX:AX = DI, zero-extended for the 16-bit divide
                div     cx              ; AX = DI / 320 (row), DX = DI mod 320 (column)

                push    dx              ; the multiply below overwrites DX, so park the column
                imul    ax              ; DX:AX = row * row
                pop     dx              ; DX = column again
                imul    dx              ; DX:AX = row * row * column; only AX is used from here on
                shr     ax, 10          ; AX = bits 10..15 of the low word, so AL is 0..63

                stosb                   ; store AL at ES:DI and advance DI (assumes DF is clear)
                cmp     di, FRAME_END
                jb      plot_pixel      ; unsigned compare: offsets run up to 0FA00h

                ; whole screen written: start over and redraw endlessly
                jmp     draw_frame

Also, here are the pouet.net links (I don't know whether anyone liked it): 1, 2.

I also got an email from Peter Ferrie:

The initial value of cx in DOS is 0x00ff, so your "mov  cx, 100h" can
be "inc cx" to save two bytes.

Also, email from Gonzo Veliki:

Here is my proposal for 2.COM, it's just slightly shorter, I have another
one in mind, but didn't test...

...

here is another one, which is optimised for speed, it just runs faster...

'Smaller' (55 bytes):

   ; Size-optimised variant of the palette program: the only change is at
   ; offset 27, where an 8-bit multiply replaces the push/imul/pop sequence.
   ; Offsets are relative to the start of the image; last byte is at 0x36 (55 bytes).
   0:   b0 13                   mov    al,0x13
   2:   cd 10                   int    0x10     ; video BIOS call with AL = 13h (AH is not set here)
   4:   ba c8 03                mov    dx,0x3c8
   7:   b0 00                   mov    al,0x0
   9:   ee                      out    dx,al    ; start palette writes at entry 0
   a:   b9 00 01                mov    cx,0x100 ; 256 palette entries
   d:   42                      inc    dx       ; DX = 0x3c9 for the colour writes
   e:   88 c8                   mov    al,cl    ; CL runs 0, 255, 254, ... 1
  10:   c1 e0 02                shl    ax,0x2   ; AL = low 8 bits of CL * 4
  13:   ee                      out    dx,al    ; red
  14:   ee                      out    dx,al    ; green
  15:   ee                      out    dx,al    ; blue
  16:   e2 f6                   loop   0xe      ; next entry until CX reaches 0
  18:   68 00 a0                push   0xa000
  1b:   07                      pop    es       ; ES = 0xa000 for the stos below
  1c:   31 ff                   xor    di,di    ; frame start: DI = pixel offset 0
  1e:   89 f8                   mov    ax,di
  20:   b9 40 01                mov    cx,0x140 ; 320, the divisor
  23:   31 d2                   xor    dx,dx    ; DX:AX = DI for the divide
  25:   f7 f1                   div    cx       ; AX = row (quotient), DX = column (remainder)
  27:   f6 e0                   mul    al       ; in place of push/imul/pop
  29:   f7 ea                   imul   dx       ; DX:AX = row*row*column; mul al wrote only AX, so DX still held the column
  2b:   c1 e8 0a                shr    ax,0xa   ; AX = bits 10..15 of the low word (0..63)
  2e:   aa                      stos   BYTE PTR es:[di],al  ; store the pixel; DI advances (assumes DF is clear)
  2f:   81 ff 00 fa             cmp    di,0xfa00  ; 64000 = end of the area drawn
  33:   72 e9                   jb     0x1e     ; next pixel
  35:   eb e5                   jmp    0x1c     ; redraw forever

'Faster' (62 bytes):

   ; Speed-optimised variant of the palette program: no div or imul per pixel.
   ; BX holds row*row, built up by adding successive odd numbers (BP) after each row;
   ; DX holds the low word of row*row*column, built up by adding BX after each pixel.
   ; Offsets are relative to the start of the image; last byte is at 0x3d (62 bytes).
   0:   b0 13                   mov    al,0x13
   2:   cd 10                   int    0x10     ; video BIOS call with AL = 13h (AH is not set here)
   4:   ba c8 03                mov    dx,0x3c8
   7:   b0 00                   mov    al,0x0
   9:   ee                      out    dx,al    ; start palette writes at entry 0
   a:   b9 00 01                mov    cx,0x100 ; 256 palette entries
   d:   42                      inc    dx       ; DX = 0x3c9 for the colour writes
   e:   88 c8                   mov    al,cl    ; CL runs 0, 255, 254, ... 1
  10:   c1 e0 02                shl    ax,0x2   ; AL = low 8 bits of CL * 4
  13:   ee                      out    dx,al    ; red
  14:   ee                      out    dx,al    ; green
  15:   ee                      out    dx,al    ; blue
  16:   e2 f6                   loop   0xe      ; next entry until CX reaches 0
  18:   68 00 a0                push   0xa000
  1b:   07                      pop    es       ; ES = 0xa000 for the stos below
  1c:   31 ff                   xor    di,di    ; frame start: DI = pixel offset 0
  1e:   31 db                   xor    bx,bx    ; BX = row*row, 0 for the first row
  20:   bd 01 00                mov    bp,0x1   ; BP = odd number added to BX after the current row
  23:   31 d2                   xor    dx,dx    ; row start: running product is 0 at column 0
  25:   b9 40 01                mov    cx,0x140 ; 320 pixels in this row
  28:   89 d0                   mov    ax,dx    ; copy so the shift leaves the running sum intact
  2a:   c1 e8 0a                shr    ax,0xa   ; same >>10 as the multiply-based version
  2d:   aa                      stos   BYTE PTR es:[di],al  ; store the pixel; DI advances (assumes DF is clear)
  2e:   01 da                   add    dx,bx    ; next column: DX += row*row (wraps at 16 bits)
  30:   e2 f6                   loop   0x28     ; until the row's 320 pixels are done
  32:   01 eb                   add    bx,bp    ; BX goes 0, 1, 4, 9, ... one square per row
  34:   45                      inc    bp
  35:   45                      inc    bp       ; BP += 2: next odd number
  36:   81 ff 00 fa             cmp    di,0xfa00  ; 64000 = end of the area drawn
  3a:   72 e7                   jb     0x23     ; next row
  3c:   eb de                   jmp    0x1c     ; redraw forever, resetting DI, BX and BP