format ELF64 executable 3
entry _start
;-------------------------------------------------------------------------------
; NAME:         SMOOTHSTEP
; DESC:         Four-lane smoothstep as defined in GLSL:
;                   t = clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0)
;                   result = t * t * (3.0 - 2.0 * t)
; IN:           xmm0        edge0
; IN:           xmm1        edge1
; IN:           xmm2        x
; OUT:          xmm0        smoothstep(edge0, edge1, x)
; CLOBBERS:     xmm1 (left holding 2*t), xmm2 (left holding t*t)
;-------------------------------------------------------------------------------
macro           SMOOTHSTEP  {
                ; both differences need edge0, so form them before xmm0 is reused
                subps       xmm1,xmm0           ; xmm1 = edge1 - edge0
                subps       xmm2,xmm0           ; xmm2 = x - edge0
                movaps      xmm0,dqword [g_3_0] ; xmm0 = 3.0
                ; t = clamp(ratio, 0.0, 1.0)
                divps       xmm2,xmm1
                minps       xmm2,dqword [g_1_0]
                maxps       xmm2,dqword [g_0_0]
                ; result = (3.0 - 2.0 * t) * (t * t)
                movaps      xmm1,xmm2
                addps       xmm1,xmm1           ; xmm1 = 2.0 * t
                mulps       xmm2,xmm2           ; xmm2 = t * t
                subps       xmm0,xmm1           ; xmm0 = 3.0 - 2.0 * t
                mulps       xmm0,xmm2
}
;-------------------------------------------------------------------------------
segment readable executable
;-------------------------------------------------------------------------------
; NAME:         sin
; DESC:         Four-lane sine approximation. The magnitude of the argument is
;               scaled by 2/pi so that the integer part selects the quadrant
;               and the fractional part y in [0,1] feeds an odd polynomial
;               y * (p0 + p1*y^2 + p2*y^4 + p3*y^6).
; IN:           xmm0        | w z y x |
; OUT:          xmm0        | sin(w) sin(z) sin(y) sin(x) |
; CLOBBERS:     xmm1-xmm7
;-------------------------------------------------------------------------------
align 16
sin:
                ; separate sign and magnitude, then scale to quarter periods
                movaps      xmm7,dqword [g_sign_mask]
                andps       xmm7,xmm0           ; xmm7 = sign bit of the input
                andps       xmm0,dqword [g_inv_sign_mask]
                mulps       xmm0,dqword [g_2_div_pi]
                ; q = trunc(t), frac = min(t - q, 1.0)
                cvttps2dq   xmm2,xmm0           ; xmm2 = q (integer)
                cvtdq2ps    xmm6,xmm2
                subps       xmm0,xmm6
                movaps      xmm4,dqword [g_1_0]
                minps       xmm0,xmm4           ; xmm0 = frac
                subps       xmm4,xmm0           ; xmm4 = 1.0 - frac
                ; xmm5 = all-ones where q is even
                movdqa      xmm5,xmm2
                pand        xmm5,dqword [g_1]
                pxor        xmm3,xmm3
                pcmpeqd     xmm5,xmm3
                ; y = q even ? frac : 1.0 - frac
                andps       xmm0,xmm5
                andnps      xmm5,xmm4
                orps        xmm0,xmm5
                ; result sign = input sign xor bit 1 of q (moved to bit 31)
                pand        xmm2,dqword [g_2]
                pslld       xmm2,30
                xorps       xmm2,xmm7
                movaps      xmm1,xmm0
                orps        xmm1,xmm2           ; xmm1 = signed y
                ; Horner evaluation in y^2
                mulps       xmm0,xmm0
                movaps      xmm7,xmm0           ; xmm7 = y^2
                mulps       xmm0,dqword [g_sincos_p3]
                addps       xmm0,dqword [g_sincos_p2]
                mulps       xmm0,xmm7
                addps       xmm0,dqword [g_sincos_p1]
                mulps       xmm0,xmm7
                addps       xmm0,dqword [g_sincos_p0]
                mulps       xmm0,xmm1
                ret
;-------------------------------------------------------------------------------
; NAME:         atanr2
; DESC:         Four-lane atan2 taking the reciprocal of the second argument.
;               With q = a * b (a in xmm0, b in xmm1) it evaluates atan(q),
;               using z = 1/q and sign(z)*pi/2 - atan(z) where |q| > 1, and
;               adds pi carrying the sign of a in lanes where b is negative.
; IN:           xmm0        | w0 z0 y0 x0 |
; IN:           xmm1        | w1 z1 y1 x1 |
; OUT:          xmm0        | atan2(w0,1/w1) atan2(z0,1/z1) atan2(y0,1/y1) atan2(x0,1/x1) |
; CLOBBERS:     xmm1-xmm9
;-------------------------------------------------------------------------------
align 16
atanr2:
                ; xmm9 = all-ones where the second argument is >= 0
                xorps       xmm3,xmm3
                cmpleps     xmm3,xmm1
                movaps      xmm9,xmm3
                ; xmm8 = pi carrying the sign of the first argument
                movaps      xmm2,dqword [g_sign_mask]
                andps       xmm2,xmm0
                orps        xmm2,dqword [g_pi]
                movaps      xmm8,xmm2
                ; q = first * second
                mulps       xmm0,xmm1
                ; xmm7 = all-ones where q > 1.0 or not (-1.0 <= q)
                movaps      xmm5,dqword [g_1_0]
                cmpltps     xmm5,xmm0
                movaps      xmm6,dqword [g_m1_0]
                cmpnleps    xmm6,xmm0
                orps        xmm5,xmm6
                movaps      xmm7,xmm5
                ; z = large ? 1/q : q
                rcpps       xmm4,xmm0
                andps       xmm4,xmm5
                andnps      xmm5,xmm0
                orps        xmm4,xmm5
                movaps      xmm0,xmm4           ; xmm0 = z
                mulps       xmm4,xmm4           ; xmm4 = z^2
                ; continued fraction, innermost term first:
                ; t3*z / (s3+z^2 + t2/(s2+z^2 + t1/(s1+z^2 + t0/(s0+z^2))))
                movaps      xmm1,dqword [g_atan_s0]
                addps       xmm1,xmm4
                rcpps       xmm1,xmm1
                movaps      xmm2,dqword [g_atan_t0]
                mulps       xmm1,xmm2
                movaps      xmm3,dqword [g_atan_s1]
                addps       xmm3,xmm4
                addps       xmm1,xmm3
                rcpps       xmm1,xmm1
                movaps      xmm6,dqword [g_atan_t1]
                mulps       xmm1,xmm6
                movaps      xmm5,dqword [g_atan_s2]
                addps       xmm5,xmm4
                addps       xmm1,xmm5
                rcpps       xmm1,xmm1
                movaps      xmm2,dqword [g_atan_t2]
                mulps       xmm1,xmm2
                movaps      xmm3,dqword [g_atan_s3]
                addps       xmm3,xmm4
                addps       xmm1,xmm3
                rcpps       xmm1,xmm1
                movaps      xmm6,dqword [g_atan_t3]
                mulps       xmm6,xmm0
                mulps       xmm1,xmm6           ; xmm1 = atan(z)
                ; undo the reciprocal: large ? sign(z)*pi/2 - atan(z) : atan(z)
                movaps      xmm5,dqword [g_sign_mask]
                andps       xmm0,xmm5
                movaps      xmm4,dqword [g_0_5pi]
                orps        xmm0,xmm4
                subps       xmm0,xmm1
                andps       xmm0,xmm7
                andnps      xmm7,xmm1
                orps        xmm0,xmm7
                ; quadrant fix: keep where second >= 0, otherwise add signed pi
                movaps      xmm3,xmm9
                movaps      xmm2,xmm8
                movaps      xmm1,xmm0
                andps       xmm0,xmm3
                addps       xmm1,xmm2
                andnps      xmm3,xmm1
                orps        xmm0,xmm3
                ret
;-------------------------------------------------------------------------------
; NAME:         ComputeColors
; DESC:         Shades four pixels of the clover image and stores one float
;               colour vector per pixel. Per pixel:
;                   r = sqrt(x*x + y*y), a = atan2(y, x)
;                   s = 0.5 + 0.5*sin(3a), g = sin(0.5*pi + 3a)
;                   d = 0.3 + 0.6*sqrt(s) + 0.15*g*g, h = r / d
;                   f = 1 - smoothstep(0.95, 1, h)
;                   h *= 1 - 0.5*(1 - h)*smoothstep(0.95 + 0.05*h, 1, sin(3a))
;                   colour = f*(g_c0*h + g_c1) + (1 - f)*1.0
; IN:           rdi         pointer to 4 output pixels (4 x 16 bytes, 16-byte
;                           aligned because movaps is used for the stores)
; IN:           xmm0        | x3 x2 x1 x0 | normalized pixel x coordinates
; IN:           xmm1        | y3 y2 y1 y0 | normalized pixel y coordinates
; OUT:          [rdi+16*i]  float colour of pixel i (i = 0..3); lane 0 is the
;                           first component of g_c0/g_c1, and so on
; CLOBBERS:     xmm0-xmm9 (via atanr2/sin), xmm11-xmm15
;-------------------------------------------------------------------------------
; Blends and stores one pixel. Expects xmm6 = 1.0 in every lane, xmm14 = f and
; xmm15 = h for the four pixels; lane (0..3) selects the pixel. Each pixel uses
; its OWN h: multiplying g_c0 by xmm15 directly would pair colour channels with
; the h values of different pixels.
macro           SHADE_PIXEL lane {
                movaps      xmm0,xmm14
                shufps      xmm0,xmm0,lane*01010101b    ; xmm0 = | f f f f |
                movaps      xmm2,xmm15
                shufps      xmm2,xmm2,lane*01010101b    ; xmm2 = | h h h h |
                mulps       xmm2,dqword [g_c0]
                addps       xmm2,dqword [g_c1]          ; xmm2 = shape colour
                movaps      xmm1,xmm6
                subps       xmm1,xmm0                   ; xmm1 = | 1-f 1-f 1-f 1-f |
                mulps       xmm0,xmm2
                mulps       xmm1,xmm6                   ; background colour is 1.0
                addps       xmm0,xmm1
                movaps      [rdi+lane*16],xmm0
}
align 16
ComputeColors:
                ; r = sqrt(x * x + y * y)
                movaps      xmm2,xmm0
                mulps       xmm2,xmm0
                movaps      xmm3,xmm1
                mulps       xmm3,xmm1
                addps       xmm2,xmm3
                sqrtps      xmm15,xmm2          ; xmm15 = r
                ; a = atan2(y, x) = atanr2(y, 1/x)
                rcpps       xmm2,xmm0           ; xmm2 = 1/x (approximate)
                movaps      xmm0,xmm1           ; xmm0 = y
                movaps      xmm1,xmm2           ; xmm1 = 1/x
                call        atanr2              ; xmm0 = a
                ; s = 0.5 + 0.5 * sin(3.0 * a)
                mulps       xmm0,dqword [g_3_0]
                movaps      xmm12,xmm0          ; xmm12 = 3.0 * a
                call        sin
                movaps      xmm11,xmm0          ; xmm11 = sin(3.0 * a), reused below
                movaps      xmm1,dqword [g_0_5]
                mulps       xmm0,xmm1
                addps       xmm0,xmm1
                movaps      xmm13,xmm0          ; xmm13 = s
                ; g = sin(0.5 * pi + 3.0 * a)
                movaps      xmm0,xmm12
                addps       xmm0,dqword [g_0_5pi]
                call        sin                 ; xmm0 = g (sin leaves xmm11-xmm15 alone)
                ; d = 0.3 + 0.6 * sqrt(s) + 0.15 * g * g
                mulps       xmm0,xmm0
                mulps       xmm0,dqword [g_0_15]
                sqrtps      xmm13,xmm13
                mulps       xmm13,dqword [g_0_6]
                addps       xmm0,xmm13
                addps       xmm0,dqword [g_0_3] ; xmm0 = d
                ; h = r / d
                divps       xmm15,xmm0          ; xmm15 = h
                ; f = 1.0 - smoothstep(0.95, 1.0, h)
                movaps      xmm0,dqword [g_0_95]  ; edge0
                movaps      xmm1,dqword [g_1_0]   ; edge1
                movaps      xmm2,xmm15            ; x
                SMOOTHSTEP
                movaps      xmm14,dqword [g_1_0]
                subps       xmm14,xmm0            ; xmm14 = f
                ; h *= 1.0 - 0.5 * (1.0 - h) * smoothstep(0.95 + 0.05 * h, 1.0, sin(3.0 * a))
                movaps      xmm2,xmm11            ; x = cached sin(3.0 * a)
                movaps      xmm0,xmm15
                mulps       xmm0,dqword [g_0_05]
                addps       xmm0,dqword [g_0_95]  ; edge0 = 0.95 + 0.05 * h
                movaps      xmm1,dqword [g_1_0]   ; edge1
                SMOOTHSTEP
                movaps      xmm1,dqword [g_1_0]
                movaps      xmm2,xmm1
                subps       xmm1,xmm15
                mulps       xmm1,dqword [g_0_5]
                mulps       xmm0,xmm1
                subps       xmm2,xmm0
                mulps       xmm15,xmm2
                ; colour_i = f_i * (g_c0 * h_i + g_c1) + (1 - f_i) * 1.0
                movaps      xmm6,dqword [g_1_0]
                SHADE_PIXEL 0
                SHADE_PIXEL 1
                SHADE_PIXEL 2
                SHADE_PIXEL 3
                ret
;-------------------------------------------------------------------------------
; NAME:         AllocateMemory
; DESC:         Requests a private anonymous read/write mapping from the kernel
;               through a raw mmap system call.
; IN:           rdi         size in bytes
; OUT:          rax         raw syscall result: pointer to the mapping, or a
;                           negative errno value in [-4095,-1] on failure
; CLOBBERS:     rdi, rsi, rdx, r8, r9, r10, plus rcx and r11 (syscall)
;-------------------------------------------------------------------------------
align 16
AllocateMemory:
                ; arguments in kernel order: addr, length, prot, flags, fd, offset
                mov         rsi,rdi             ; length (taken before rdi is cleared)
                xor         edi,edi             ; addr = 0, kernel picks the address
                mov         edx,0x1+0x2         ; PROT_READ | PROT_WRITE
                mov         r10d,0x02+0x20      ; MAP_PRIVATE | MAP_ANONYMOUS
                mov         r8,-1               ; fd = -1, no backing file
                xor         r9d,r9d             ; offset = 0
                mov         eax,9               ; sys_mmap
                syscall
                ret
;-------------------------------------------------------------------------------
; NAME:         main
; DESC:         Program main function. Allocates a float image and a 32-bit
;               image, shades the float image four pixels at a time with
;               ComputeColors, converts it to 8-bit BGRA and writes it to
;               'clover.tga'. Returns early, without touching memory or the
;               file, if an allocation or the file creation fails.
; IN:           -
; OUT:          -           (rax is not meaningful)
; NOTES:        rbx is preserved for the caller. IMG_W is expected to be a
;               multiple of 4 because pixels are produced in groups of four.
;-------------------------------------------------------------------------------
main:
imgptr          equ         rbp-8
imgptr_float    equ         rbp-16
x               equ         rbp-32
y               equ         rbp-48
saved_rbx       equ         rbp-56
                push        rbp
                mov         rbp,rsp
                sub         rsp,64              ; keeps rsp and the locals 16-byte aligned
                mov         [saved_rbx],rbx     ; rbx is callee-saved
                ; alloc memory (a failed mmap returns -errno in [-4095,-1])
                mov         edi,IMG_W*IMG_H*4
                call        AllocateMemory
                cmp         rax,-4095
                jae         .Done
                mov         [imgptr],rax
                mov         edi,IMG_W*IMG_H*16
                call        AllocateMemory
                cmp         rax,-4095
                jae         .Done
                mov         [imgptr_float],rax
                mov         rbx,rax             ; rbx = current float pixel pointer
                ;;
                ;; Compute image
                ;;
                ; begin loops
                pxor        xmm0,xmm0
                movdqa      [y],xmm0
.LoopY:
                movdqa      xmm0,dqword [g_3_2_1_0]
                movdqa      [x],xmm0
.LoopX:
                mov         rdi,rbx
                ; compute normalized x coordinate [-1.0 , 1.0]
                cvtdq2ps    xmm0,[x]
                divps       xmm0,dqword [g_img_w]
                subps       xmm0,dqword [g_0_5]
                addps       xmm0,xmm0
                ; compute normalized y coordinate [-1.0 , 1.0]
                cvtdq2ps    xmm1,[y]
                divps       xmm1,dqword [g_img_h]
                subps       xmm1,dqword [g_0_5]
                addps       xmm1,xmm1
                call        ComputeColors
                ; advance pixel pointer (4 pixels * 16 bytes)
                add         rbx,64
                ; continue .LoopX while the highest lane index is below IMG_W
                movdqa      xmm0,[x]
                paddd       xmm0,dqword [g_4]
                movdqa      [x],xmm0
                cmp         dword [x+12],IMG_W
                jb          .LoopX
                ; continue .LoopY
                movdqa      xmm0,[y]
                paddd       xmm0,dqword [g_1]
                movdqa      [y],xmm0
                cmp         dword [y],IMG_H
                jb          .LoopY
                ;;
                ;; Convert image: clamp to [0,1], scale to 0..255, pack to 4 bytes
                ;;
                mov         rsi,[imgptr_float]
                mov         rdi,[imgptr]
                mov         ecx,IMG_W*IMG_H
.Convert:
                movaps      xmm0,[rsi]
                maxps       xmm0,dqword [g_0_0]
                minps       xmm0,dqword [g_1_0]
                mulps       xmm0,dqword [g_255_0]
                cvttps2dq   xmm0,xmm0
                pshufb      xmm0,dqword [g_img_conv_mask]
                movd        eax,xmm0
                or          eax,0xff000000      ; force the top byte to 255
                mov         [rdi],eax
                add         rsi,16
                add         rdi,4
                sub         ecx,1
                jnz         .Convert
                ;;
                ;; Save image
                ;;
                ; create TGA file
                mov         eax,85              ; sys_creat
                mov         rdi,g_tga_name
                mov         esi,110000000b      ; mode 0600
                syscall
                test        rax,rax
                js          .Done               ; negative result: file was not created
                mov         rbx,rax             ; rbx = file descriptor
                ; write header
                mov         eax,1               ; sys_write
                mov         rdi,rbx
                mov         rsi,g_tga_head
                mov         edx,18
                syscall
                test        rax,rax
                js          .Close
                ; write pixel data
                mov         eax,1               ; sys_write
                mov         rdi,rbx
                mov         rsi,[imgptr]
                mov         edx,IMG_W*IMG_H*4
                syscall
.Close:
                mov         eax,3               ; sys_close
                mov         rdi,rbx
                syscall
.Done:
                mov         rbx,[saved_rbx]
                mov         rsp,rbp
                pop         rbp
                ret
                restore     imgptr,imgptr_float,x,y,saved_rbx
;-------------------------------------------------------------------------------
; NAME:         _start
; DESC:         Program entry point. Runs main and then ends the process with
;               exit status 0; control never returns from the final syscall.
;-------------------------------------------------------------------------------
_start:
                call        main
                ; terminate process
                xor         edi,edi             ; exit status = 0
                mov         eax,60              ; sys_exit
                syscall
;-------------------------------------------------------------------------------
segment readable
;-------------------------------------------------------------------------------
; Read-only data. Everything from g_img_w onwards is laid out in 16-byte units
; after the 'align 16' below, which the movaps/movdqa and SSE memory operands
; in the code rely on.
align 1
; output file name passed to sys_creat in main (zero terminated)
g_tga_name      db          'clover.tga',0
; TGA file header; main writes exactly 18 bytes from here.
; Byte 2 = 2 is the image type field, followed by 9 zero bytes, then width and
; height as 16-bit little-endian values, 32 bits per pixel and descriptor 0.
g_tga_head      db          0,0,2,9 dup 0
                db          (IMG_W and 0x00ff),(IMG_W and 0xff00) shr 8
                db          (IMG_H and 0x00ff),(IMG_H and 0xff00) shr 8,32,0

align 16
; Image size as floats (divisors for coordinate normalization in main) and as
; assembly-time integers (loop bounds, buffer sizes, TGA header).
; NOTE: the float and integer values are written separately and must be kept
; in sync by hand.
g_img_w         dd          4 dup 800.0
g_img_h         dd          4 dup 800.0
IMG_W=800
IMG_H=800

; ComputeColors: per-channel scale (g_c0) and bias (g_c1) applied to h, and
; the scalar constants of the shape formula replicated into all four lanes
g_c0            dd          0.4,0.3,0.0,0.0
g_c1            dd          0.0,0.2,0.0,0.0
g_3_0           dd          4 dup 3.0
g_0_3           dd          4 dup 0.3
g_0_6           dd          4 dup 0.6
g_0_15          dd          4 dup 0.15
g_0_95          dd          4 dup 0.95
g_0_05          dd          4 dup 0.05

; pshufb control used by main's .Convert loop: result bytes 0..3 are taken
; from source bytes 8, 4, 0 and 12; 0x80 entries produce zero bytes
g_img_conv_mask db          8,4,0,12,12 dup 0x80
; general float constants
g_0_0           dd          4 dup 0.0
g_0_5           dd          4 dup 0.5
g_1_0           dd          4 dup 1.0
g_m1_0          dd          4 dup -1.0
g_255_0         dd          4 dup 255.0
; integer constants: g_1 is the y step in main and the parity mask in sin,
; g_2 is the quadrant bit mask in sin, g_4 is the x step in main
g_1             dd          4 dup 1
g_2             dd          4 dup 2
g_4             dd          4 dup 4
; starting x indices of the four pixels processed together
g_3_2_1_0       dd          0,1,2,3
; IEEE-754 single precision sign bit and its complement (abs mask)
g_sign_mask     dd          4 dup 0x80000000
g_inv_sign_mask dd          4 dup not 0x80000000
; pi/2, 2/pi and pi
g_0_5pi         dd          4 dup 1.57079633
g_2_div_pi      dd          4 dup 0.636619772
g_pi            dd          4 dup 3.1415926535897

; polynomial coefficients used by sin: y * (p0 + p1*y^2 + p2*y^4 + p3*y^6)
g_sincos_p0     dd          4 dup 1.5707963267948963959
g_sincos_p1     dd          4 dup -0.64596409750621907082
g_sincos_p2     dd          4 dup 0.07969262624561800806
g_sincos_p3     dd          4 dup -0.00468175413106023168

; continued-fraction coefficients used by atanr2:
; t3*z / (s3+z^2 + t2/(s2+z^2 + t1/(s1+z^2 + t0/(s0+z^2))))
g_atan_t0       dd          4 dup -0.091646118527267623468
g_atan_t1       dd          4 dup -1.3956945682312098640
g_atan_t2       dd          4 dup -94.393926122725531747
g_atan_t3       dd          4 dup 12.888383034157279340
g_atan_s0       dd          4 dup 1.2797564625607904396
g_atan_s1       dd          4 dup 2.1972168858277355914
g_atan_s2       dd          4 dup 6.8193064729268275701
g_atan_s3       dd          4 dup 28.205206687035841409
;-------------------------------------------------------------------------------
