format ELF64 executable 3
entry _start
;-------------------------------------------------------------------------------
; NAME:         XORWOW
; DESC:         Pseudo random number generator. Advances the five-word xorshift
;               state (g_xorwow_x .. g_xorwow_v) and the additive counter
;               g_xorwow_d by one step.
; OUT:          eax         [0;2^32-1]
; CLOBBERS:     edx, edi, flags
;-------------------------------------------------------------------------------
macro           XORWOW      {
                ; t = x ^ (x >> 2)
                mov         eax,[g_xorwow_x]
                mov         edx,eax
                shr         edx,2
                xor         edx,eax                 ; edx = t
                ; shift the state down by one word: x <- y <- z <- w <- v
                mov         eax,[g_xorwow_y]
                mov         [g_xorwow_x],eax
                mov         eax,[g_xorwow_z]
                mov         [g_xorwow_y],eax
                mov         eax,[g_xorwow_w]
                mov         [g_xorwow_z],eax
                mov         edi,[g_xorwow_v]
                mov         [g_xorwow_w],edi
                ; v = (v ^ (v << 4)) ^ (t ^ (t << 1))
                mov         eax,edi
                shl         edi,4
                xor         edi,eax                 ; edi = v ^ (v << 4)
                lea         eax,[rdx+rdx]           ; eax = t << 1 (upper half of rdx is zero)
                xor         eax,edx                 ; eax = t ^ (t << 1)
                xor         eax,edi
                mov         [g_xorwow_v],eax        ; store the new v
                ; d += 362437, result = d + v
                add         dword [g_xorwow_d],362437
                add         eax,[g_xorwow_d]        ; eax = v + d
}
;-------------------------------------------------------------------------------
; NAME:         RANDOM
; DESC:         Returns a pseudo random float: the XORWOW result is treated as a
;               signed 32-bit integer, converted to single precision and
;               multiplied by g_rand_scale (1.0 / 2^32).
; OUT:          xmm0.x      roughly [-0.5;0.5]
; CLOBBERS:     eax, edx, edi, flags (through XORWOW)
; NOTE(review): the int -> float conversion rounds, so integers just below 2^31
;               appear to round up to 2^31 and yield exactly +0.5 under the
;               default round-to-nearest mode; do not rely on a strict "< 0.5".
;-------------------------------------------------------------------------------
macro           RANDOM {
                XORWOW                              ; eax = next 32-bit random value
                cvtsi2ss    xmm0,eax                ; signed conversion: [-2^31;2^31)
                mulss       xmm0,[g_rand_scale]     ; scale by 1.0 / 2^32
}
;-------------------------------------------------------------------------------
segment readable executable
;-------------------------------------------------------------------------------
; NAME:         GenerateSequence
; DESC:         Iterates c = c*c + c0 starting from c = 0 and stores every
;               iterate as a (re, im) pair of floats, until the squared
;               magnitude of an iterate exceeds g_max_dist or the array is full.
; IN:           xmm0.x      re (c0.x)
; IN:           xmm1.x      im (c0.y)
; IN:           rdi         array size in 8-byte elements (only edi is read)
; IN/OUT:       rsi         pointer to the allocated array
; OUT:          rax         generated sequence size: the index of the iterate
;                           that escaped, or 0 when no iterate escaped (0 is
;                           also returned when the array size is 0)
; CLOBBERS:     xmm2-xmm7, flags
;-------------------------------------------------------------------------------
align 16
GenerateSequence:
                xor         eax,eax     ; eax = loop index, also the "nothing to report" result
                test        edi,edi
                jz          .EndLoop    ; zero-capacity array: never store anything
                xorps       xmm4,xmm4   ; xmm4 is c.x
                xorps       xmm5,xmm5   ; xmm5 is c.y
.Loop:
                ; cn.x = c.x * c.x - c.y * c.y + c0.x
                movaps      xmm6,xmm4
                mulss       xmm6,xmm4
                movaps      xmm3,xmm5
                mulss       xmm3,xmm5
                subss       xmm6,xmm3
                addss       xmm6,xmm0   ; xmm6 is cn.x
                ; cn.y = 2.0 * c.x * c.y + c0.y
                movaps      xmm7,xmm4
                mulss       xmm7,xmm5
                addss       xmm7,xmm7
                addss       xmm7,xmm1   ; xmm7 is cn.y
                ; store cn; the escaping iterate is stored too, but the
                ; returned size does not include it
                movd        dword [rsi+rax*8],xmm6
                movd        dword [rsi+rax*8+4],xmm7
                ; if (cn.x * cn.x + cn.y * cn.y > g_max_dist) return eax;
                movaps      xmm2,xmm6
                movaps      xmm3,xmm7
                mulss       xmm2,xmm6
                mulss       xmm3,xmm7
                addss       xmm2,xmm3
                ucomiss     xmm2,[g_max_dist]
                ja          .EndLoop    ; not taken for NaN (unordered sets CF)
                movaps      xmm4,xmm6   ; c.x = cn.x
                movaps      xmm5,xmm7   ; c.y = cn.y
                ; continue loop
                add         eax,1
                cmp         eax,edi
                jb          .Loop
                ; the array filled up without an escape: return 0
                xor         eax,eax
.EndLoop:
                ret
;-------------------------------------------------------------------------------
; NAME:         AllocateMemory
; DESC:         Maps a private, anonymous, readable and writable memory region
;               with the mmap system call.
; IN:           rdi         size in bytes (the full 64-bit value is used)
; OUT:          rax         memory address; on failure the raw syscall result,
;                           a negative errno in the range [-4095;-1]
; CLOBBERS:     rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags
;-------------------------------------------------------------------------------
align 16
AllocateMemory:
                mov         rsi,rdi         ; length (not truncated to 32 bits)
                xor         edi,edi         ; addr = 0, the kernel picks the address
                mov         edx,0x1+0x2     ; PROT_READ | PROT_WRITE
                mov         r10d,0x02+0x20  ; MAP_PRIVATE | MAP_ANONYMOUS
                mov         r8,-1           ; fd
                xor         r9d,r9d         ; offset
                mov         eax,9           ; sys_mmap
                syscall
                ret
;-------------------------------------------------------------------------------
; NAME:         main
; DESC:         Program main function.
;               1. Allocates the sequence buffer and the IMG_SIZE x IMG_SIZE
;                  image of 32-bit hit counters.
;               2. For ITERATIONS * 1000000 random points c0 generates the
;                  sequence with GenerateSequence and, when a non-zero size is
;                  returned, increments the counter of every pixel the stored
;                  iterates fall on.
;               3. Finds the smallest and the largest counter, converts every
;                  counter in place to a gray BGRA pixel and writes the TGA
;                  header plus the pixels to g_tga_name.
;               Returns early, without rendering, when an allocation or the
;               file creation fails.
; OUT:          nothing (rax holds no meaningful value)
; PRESERVES:    rbx, rbp, r12-r15
;-------------------------------------------------------------------------------
align 16
main:
img_ptr         equ         rbp-8
seq_ptr         equ         rbp-16
save_rbx        equ         rbp-24
save_r12        equ         rbp-32
save_r13        equ         rbp-40
save_r14        equ         rbp-48
save_r15        equ         rbp-56
                push        rbp
                mov         rbp,rsp
                sub         rsp,128
                ; keep the callee-saved registers this function uses
                mov         [save_rbx],rbx
                mov         [save_r12],r12
                mov         [save_r13],r13
                mov         [save_r14],r14
                mov         [save_r15],r15
                ; alloc mem for the sequence
                mov         edi,SEQ_SIZE*8
                call        AllocateMemory
                cmp         rax,-4095
                jae         .Exit             ; [-4095;-1] is a negative errno
                mov         [seq_ptr],rax
                ; alloc mem for the image
                mov         edi,IMG_SIZE*IMG_SIZE*4
                call        AllocateMemory
                cmp         rax,-4095
                jae         .Exit
                mov         [img_ptr],rax
                ; begin loops
                xor         r13d,r13d         ; .LoopIterations counter
.LoopIterations:
                xor         r12d,r12d         ; .LoopOneMillion counter
.LoopOneMillion:
                RANDOM
                mulss       xmm0,[g_range]
                movaps      xmm1,xmm0         ; xmm1 = c0.y
                RANDOM
                mulss       xmm0,[g_range]    ; xmm0 = c0.x
                mov         edi,SEQ_SIZE
                mov         rsi,[seq_ptr]
                call        GenerateSequence  ; eax = n sequence size
                test        eax,eax
                jz          .LoopSequenceEnd
                xor         ecx,ecx           ; ecx = i = 0 loop counter
                mov         r9,[seq_ptr]      ; r9 = sequence base address
                mov         r8,[img_ptr]      ; r8 = image base address
                ; the scale is derived from IMG_SIZE so that it can never
                ; disagree with the bounds checks below; the xmm registers are
                ; reloaded for every point because GenerateSequence clobbers them
                mov         edx,IMG_SIZE
                cvtsi2ss    xmm2,edx          ; xmm2 = float(IMG_SIZE)
                movaps      xmm3,xmm2
                mulss       xmm3,[g_0_5]      ; xmm3 = IMG_SIZE/2
                movss       xmm4,[g_zoom]
                mulss       xmm4,xmm2         ; xmm4 = g_zoom * IMG_SIZE
                movss       xmm5,[g_offsetx]  ; xmm5 = g_offsetx
                movss       xmm6,[g_offsety]  ; xmm6 = g_offsety
.LoopSequence:
                cmp         ecx,eax           ; i < n
                je          .LoopSequenceEnd
                movd        xmm0,[r9+rcx*8]   ; load re
                movd        xmm1,[r9+rcx*8+4] ; load im
                addss       xmm0,xmm5         ; xmm0 = re+g_offsetx
                addss       xmm1,xmm6         ; xmm1 = im+g_offsety
                mulss       xmm0,xmm4         ; xmm0 = (re+g_offsetx)*IMG_SIZE*g_zoom
                mulss       xmm1,xmm4         ; xmm1 = (im+g_offsety)*IMG_SIZE*g_zoom
                addss       xmm0,xmm3         ; xmm0 = ... + IMG_SIZE/2
                addss       xmm1,xmm3         ; xmm1 = ... + IMG_SIZE/2
                cvtss2si    edi,xmm0          ; edi = x = int(xmm0.x)
                cvtss2si    esi,xmm1          ; esi = y = int(xmm1.x)
                ; one unsigned compare per axis rejects both negative values
                ; and values >= IMG_SIZE
                cmp         edi,IMG_SIZE
                jae         @f
                cmp         esi,IMG_SIZE
                jae         @f
                imul        esi,esi,IMG_SIZE
                add         esi,edi           ; esi = y*IMG_SIZE + x
                add         dword [r8+rsi*4],1
@@:
                add         ecx,1
                jmp         .LoopSequence
.LoopSequenceEnd:
                ; continue .LoopOneMillion
                add         r12d,1
                cmp         r12d,1000000
                jb          .LoopOneMillion
                ; continue .LoopIterations
                add         r13d,1
                cmp         r13d,ITERATIONS
                jb          .LoopIterations
                ; find max and min value in a single pass
                mov         r8,[img_ptr]      ; r8 = image base address
                xor         r12d,r12d         ; r12d = max_val = 0
                mov         r13d,-1           ; r13d = min_val = 0xffffffff
                xor         eax,eax           ; eax = i = loop counter
.LoopMinMax:
                mov         edx,[r8+rax*4]
                cmp         edx,r12d
                cmova       r12d,edx          ; unsigned max
                cmp         edx,r13d
                cmovb       r13d,edx          ; unsigned min
                add         eax,1
                cmp         eax,IMG_SIZE*IMG_SIZE
                jb          .LoopMinMax
                ; create TGA file
                mov         eax,85            ; sys_creat
                mov         rdi,g_tga_name
                mov         esi,110000000b    ; mode: owner read + write
                syscall
                test        rax,rax
                js          .Exit             ; negative errno: no file to write to
                mov         rbx,rax           ; rbx = file descriptor
                ; write TGA header
                mov         eax,1             ; sys_write
                mov         rdi,rbx
                mov         rsi,g_tga_head
                mov         edx,18
                syscall
                cmp         rax,18
                jne         .CloseFile        ; error or short write of the header
                ; convert the counters in place to BGRA pixels
                mov         r14,[img_ptr]     ; r14 = image base address
                xor         r15d,r15d         ; r15d = i = loop counter
                cvtsi2ss    xmm0,r12d         ; load max_value
                cvtsi2ss    xmm1,r13d         ; load min_value
                movaps      xmm2,xmm0
                subss       xmm2,xmm1         ; xmm2 = r = max_value - min_value
.LoopConvert:
                mov         eax,[r14+r15*4]   ; eax = image_value
                sub         eax,r13d          ; eax = image_value - min_value
                cvtsi2ss    xmm0,eax          ; xmm0 = float(image_value - min_value)
                addss       xmm0,xmm0         ; xmm0 = 2.0f * float(image_value - min_value)
                divss       xmm0,xmm2         ; xmm0 = 2.0f * float(image_value - min_value) / r
                minss       xmm0,[g_1_0]      ; clamp to 1.0
                maxss       xmm0,[g_0_0]      ; clamp to 0.0
                mulss       xmm0,[g_255_0]    ; convert to 0 - 255
                cvtss2si    eax,xmm0
                movzx       eax,al            ; gray level
                imul        eax,eax,0x010101  ; replicate it into the B, G and R bytes
                or          eax,0xFF000000    ; fourth byte = 255
                mov         [r14+r15*4],eax
                ; continue .LoopConvert
                add         r15d,1
                cmp         r15d,IMG_SIZE*IMG_SIZE
                jb          .LoopConvert
                ; write image pixels; sys_write may accept fewer bytes than
                ; requested, so repeat until everything is written
                mov         r15d,IMG_SIZE*IMG_SIZE*4 ; r15 = bytes left, r14 = next byte
.LoopWrite:
                mov         eax,1             ; sys_write
                mov         rdi,rbx
                mov         rsi,r14
                mov         rdx,r15
                syscall
                test        rax,rax
                jle         .CloseFile        ; error or no progress: give up
                add         r14,rax
                sub         r15,rax
                jnz         .LoopWrite
.CloseFile:
                mov         eax,3             ; sys_close
                mov         rdi,rbx
                syscall
.Exit:
                mov         rbx,[save_rbx]
                mov         r12,[save_r12]
                mov         r13,[save_r13]
                mov         r14,[save_r14]
                mov         r15,[save_r15]
                mov         rsp,rbp
                pop         rbp
                ret
                restore     img_ptr,seq_ptr,save_rbx,save_r12,save_r13,save_r14,save_r15
;-------------------------------------------------------------------------------
; NAME:         _start
; DESC:         Program entry point. Runs main, then terminates the process
;               with exit code 0. Never returns.
;-------------------------------------------------------------------------------
_start:
                call        main
                xor         edi,edi ; exit code = 0
                mov         eax,60  ; syscall number (exit)
                syscall             ; exit
;-------------------------------------------------------------------------------
segment readable writeable
;-------------------------------------------------------------------------------
; Assembly-time constants (they emit no bytes).
IMG_SIZE        =           800     ; image width and height, in pixels
SEQ_SIZE        =           50      ; sequence buffer capacity, in (re, im) pairs
ITERATIONS      =           1000    ; outer loop count; each pass draws 1000000 points

; Output file name and the 18-byte TGA header.
g_tga_name      db          'buddhabrot.tga',0
g_tga_head      db          0                   ; ID length
                db          0                   ; color map type
                db          2                   ; image type
                db          9 dup 0             ; color map specification, x origin, y origin
                dw          IMG_SIZE and 0xffff ; width, little endian
                dw          IMG_SIZE and 0xffff ; height, little endian
                db          32                  ; bits per pixel
                db          0                   ; image descriptor

; XORWOW generator state.
align 4
g_xorwow_x      dd          123456789
g_xorwow_y      dd          362436069
g_xorwow_z      dd          521288629
g_xorwow_w      dd          88675123
g_xorwow_v      dd          5783321
g_xorwow_d      dd          6615241
g_rand_scale    dd          2.3283064e-10       ; 1.0 / 2^32

; Image mapping parameters.
g_img_size      dd          800.0               ; IMG_SIZE as a float, keep both equal
g_offsetx       dd          0.5                 ; added to re before scaling
g_offsety       dd          0.0                 ; added to im before scaling
g_zoom          dd          0.4

; Other float constants.
g_max_dist      dd          10.0                ; escape threshold for the squared magnitude
g_range         dd          4.2                 ; factor applied to the RANDOM result
g_0_5           dd          0.5
g_0_0           dd          0.0
g_1_0           dd          1.0
g_255_0         dd          255.0
;-------------------------------------------------------------------------------
