format ELF64 executable 3
entry _start
;-------------------------------------------------------------------------------
; NAME:         DOT3
; DESC:         3-component dot product, result broadcast to every lane.
; IN:           xmm0        | ? z0 y0 x0 |
; IN:           xmm1        | ? z1 y1 x1 |
; OUT:          xmm0        | s s s s | s = (x0*x1 + y0*y1) + z0*z1
; CLOBBERS:     xmm1, xmm2
;-------------------------------------------------------------------------------
macro           DOT3        {
                mulps       xmm0,xmm1           ; xmm0 = per-lane products
                pshufd      xmm1,xmm0,0x55      ; xmm1 = y product in all lanes
                pshufd      xmm2,xmm0,0xaa      ; xmm2 = z product in all lanes
                shufps      xmm0,xmm0,0x00      ; xmm0 = x product in all lanes
                addps       xmm0,xmm1           ; x + y
                addps       xmm0,xmm2           ; (x + y) + z
}
;-------------------------------------------------------------------------------
; NAME:         DOT4
; DESC:         4-component dot product, result broadcast to every lane.
; IN:           xmm0        | w0 z0 y0 x0 |
; IN:           xmm1        | w1 z1 y1 x1 |
; OUT:          xmm0        | s s s s | s = (x0*x1 + y0*y1) + (z0*z1 + w0*w1)
; CLOBBERS:     xmm1, xmm2, xmm3
;-------------------------------------------------------------------------------
macro           DOT4        {
                mulps       xmm0,xmm1           ; xmm0 = per-lane products
                pshufd      xmm1,xmm0,0x55      ; xmm1 = y product in all lanes
                pshufd      xmm2,xmm0,0xaa      ; xmm2 = z product in all lanes
                pshufd      xmm3,xmm0,0xff      ; xmm3 = w product in all lanes
                shufps      xmm0,xmm0,0x00      ; xmm0 = x product in all lanes
                addps       xmm0,xmm1           ; x + y
                addps       xmm2,xmm3           ; z + w
                addps       xmm0,xmm2           ; (x + y) + (z + w)
}
;-------------------------------------------------------------------------------
; NAME:         FLOOR
; DESC:         Per-lane floor (round toward negative infinity).
; IN:           xmm0        | w z y x |
; OUT:          xmm0        | floor(w) floor(z) floor(y) floor(x) |
; CLOBBERS:     xmm1
; NOTE:         The value goes through a signed 32-bit integer conversion, so
;               lanes outside the int32 range (or NaN) do not produce a
;               meaningful result.
;               One is subtracted only when the input is strictly below its
;               truncation, so exact negative integers and -0.0 are returned
;               unchanged in value (floor(-2.0) = -2.0, floor(-0.0) = 0.0).
;-------------------------------------------------------------------------------
macro           FLOOR       {
                cvttps2dq   xmm1,xmm0           ; xmm1 = int32 truncation of x (toward zero)
                cvtdq2ps    xmm1,xmm1           ; xmm1 = trunc(x) as float
                cmpltps     xmm0,xmm1           ; xmm0 = all-ones where x < trunc(x)
                cvtdq2ps    xmm0,xmm0           ; all-ones is int32 -1, so -1.0 or 0.0
                addps       xmm0,xmm1           ; xmm0 = trunc(x) - 1 where x < trunc(x)
}
;-------------------------------------------------------------------------------
; NAME:         STEP
; DESC:         Per-lane step: 0.0 where value < edge, 1.0 otherwise.
; IN:           xmm0        | ew ez ey ex | edge vector
; IN:           xmm1        | w z y x | value vector
; OUT:          xmm0        | step(ew,w) step(ez,z) step(ey,y) step(ex,x) |
; CLOBBERS:     xmm1 (left equal to the result)
; USES:         g_1_0
;-------------------------------------------------------------------------------
macro           STEP        {
                cmpnltps    xmm1,xmm0           ; xmm1 = all-ones where not (value < edge)
                andps       xmm1,dqword [g_1_0] ; keep 1.0 in those lanes, 0.0 elsewhere
                movaps      xmm0,xmm1           ; return the result in xmm0
}
;-------------------------------------------------------------------------------
; NAME:         MOD289
; DESC:         Per-lane remainder modulo 289, computed with a reciprocal
;               multiply and FLOOR.
; IN:           xmm0        | w z y x |
; OUT:          xmm0        | mod289(w) mod289(z) mod289(y) mod289(x) |
;                           mod289(s) = s - floor(s * (1.0/289.0)) * 289.0
; CLOBBERS:     xmm1 (by FLOOR), xmm2 (left equal to the result)
; USES:         FLOOR, g_1_div_289, g_289_0
;-------------------------------------------------------------------------------
macro           MOD289      {
                movaps      xmm2,xmm0                   ; xmm2 = s, kept across FLOOR
                movaps      xmm0,dqword [g_1_div_289]
                mulps       xmm0,xmm2                   ; xmm0 = s * (1/289)
                FLOOR                                   ; xmm0 = floor(s / 289)
                mulps       xmm0,dqword [g_289_0]       ; xmm0 = floor(s / 289) * 289
                subps       xmm2,xmm0                   ; xmm2 = s - floor(s / 289) * 289
                movaps      xmm0,xmm2                   ; return the result in xmm0
}
;-------------------------------------------------------------------------------
; NAME:         PERMUTE
; DESC:         Per-lane permutation polynomial reduced modulo 289.
; IN:           xmm0        | w z y x |
; OUT:          xmm0        | perm(w) perm(z) perm(y) perm(x) |
;                           perm(s) = mod289(((s*34.0)+1.0)*s)
; CLOBBERS:     xmm1, xmm2 (via MOD289)
; USES:         MOD289, g_34_0, g_1_0
;-------------------------------------------------------------------------------
macro           PERMUTE     {
                movaps      xmm1,dqword [g_34_0]
                mulps       xmm1,xmm0                   ; xmm1 = s * 34
                addps       xmm1,dqword [g_1_0]         ; xmm1 = s * 34 + 1
                mulps       xmm0,xmm1                   ; xmm0 = (s * 34 + 1) * s
                MOD289                                  ; xmm0 = perm(s)
}
;-------------------------------------------------------------------------------
; NAME:         ABS
; DESC:         Per-lane absolute value, computed as max(s, 0 - s).
; IN:           xmm0        | w z y x |
; OUT:          xmm0        | abs(w) abs(z) abs(y) abs(x) |
; CLOBBERS:     xmm1 (left holding the negated input)
;-------------------------------------------------------------------------------
macro           ABS         {
                pxor        xmm1,xmm1           ; xmm1 = all lanes zero
                subps       xmm1,xmm0           ; xmm1 = 0 - s
                maxps       xmm0,xmm1           ; xmm0 = max(s, -s)
}
;-------------------------------------------------------------------------------
segment readable executable
;-------------------------------------------------------------------------------
; NAME:         snoise3
; DESC:         3D Simplex noise (https://github.com/ashima/webgl-noise)
;               The comments quote the GLSL expressions each group of
;               instructions implements. Vector comments list lanes from
;               highest to lowest: | w z y x |.
; IN:           xmm0        | ? z y x |
; OUT:          xmm0        | s s s s | s is noise value [-1.0,1.0]
; CLOBBERS:     xmm0-xmm8, xmm10, xmm12-xmm15
; STACK:        rbp frame; locals occupy [rbp-128, rbp) and are accessed with
;               movaps, so rbp must be 16-byte aligned after the prologue
;               (i.e. rsp must be 16-byte aligned at the call instruction).
; USES:         DOT3, DOT4, FLOOR, STEP, MOD289, PERMUTE, ABS and the g_*
;               constants.
;-------------------------------------------------------------------------------
align 16
snoise3:
; 16-byte local slots below rbp
v               equ         rbp-16                    ; input vector
i               equ         rbp-32                    ; floor(v + dot(v, C.yyy))
x0              equ         rbp-48                    ; corner offsets x0..x3
x1              equ         rbp-64
x2              equ         rbp-80
x3              equ         rbp-96
i1              equ         rbp-112                   ; min(g.xyz, l.zxy)
i2              equ         rbp-128                   ; max(g.xyz, l.zxy)
                push        rbp
                mov         rbp,rsp
                sub         rsp,256                   ; reserve locals (128 bytes are used)
                movaps      [v],xmm0                  ; save input on the stack
                ;
                ; Compute corners (x0, x1, x2, x3)
                ;
                ; i = floor(v + dot(v, C.yyy))
                movaps      xmm1,dqword [g_snoise_C]
                shufps      xmm1,xmm1,0x55            ; xmm1 = C.yyy
                DOT3                                  ; xmm0 = dot(xmm0,xmm1)
                addps       xmm0,[v]                  ; xmm0 = v + dot(v, C.yyy)
                FLOOR                                 ; xmm0 = i
                movaps      [i],xmm0
                ; x0 = v - i + dot(i, C.xxx)
                movaps      xmm3,[v]
                subps       xmm3,xmm0                 ; xmm3 = v - i (xmm3 survives DOT3)
                movaps      xmm1,dqword [g_snoise_C]
                shufps      xmm1,xmm1,0x00            ; xmm1 = C.xxx
                DOT3                                  ; xmm0 = dot(xmm0,xmm1)
                addps       xmm3,xmm0                 ; xmm3 = x0
                movaps      [x0],xmm3
                ; compute i1 and i2
                ; g = step(x0.yzx, x0.xyz), l = 1.0 - g
                movaps      xmm0,xmm3
                shufps      xmm0,xmm0,11001001b       ; xmm0 = | w x z y |
                movaps      xmm1,xmm3
                STEP                                  ; xmm0 = step(edge = x0.yzx, value = x0)
                movaps      xmm7,xmm0                 ; xmm7 = g
                movaps      xmm6,dqword [g_1_0]
                subps       xmm6,xmm7                 ; xmm6 = 1.0 - g = l
                shufps      xmm6,xmm6,11010010b       ; xmm6 = | w y x z |
                movaps      xmm0,xmm7
                movaps      xmm1,xmm7
                minps       xmm0,xmm6                 ; xmm0 = min(g.xyz, l.zxy)
                maxps       xmm1,xmm6                 ; xmm1 = max(g.xyz, l.zxy)
                movaps      [i1],xmm0                 ; store i1 (still live in xmm0)
                movaps      [i2],xmm1                 ; store i2 (still live in xmm1)
                ; compute x1, x2 and x3
                ; x1 = x0 - i1 + C.xxx, x2 = x0 - i2 + C.yyy, x3 = x0 - D.yyy
                movaps      xmm7,[x0]                 ; xmm7 = x0
                movaps      xmm6,xmm7                 ; xmm6 = x0
                movaps      xmm5,xmm7                 ; xmm5 = x0
                movaps      xmm4,dqword [g_snoise_C]
                movaps      xmm3,xmm4
                movaps      xmm2,dqword [g_snoise_D]
                shufps      xmm4,xmm4,0x00            ; xmm4 = C.xxx
                shufps      xmm3,xmm3,0x55            ; xmm3 = C.yyy
                shufps      xmm2,xmm2,0x55            ; xmm2 = D.yyy
                subps       xmm5,xmm0                 ; xmm5 = x0 - i1
                subps       xmm6,xmm1                 ; xmm6 = x0 - i2
                subps       xmm7,xmm2                 ; xmm7 = x0 - D.yyy
                addps       xmm5,xmm4                 ; xmm5 = x0 - i1 + C.xxx
                addps       xmm6,xmm3                 ; xmm6 = x0 - i2 + C.yyy
                movaps      [x1],xmm5
                movaps      [x2],xmm6
                movaps      [x3],xmm7
                ;
                ; Compute permutations (p)
                ;
                ; p = permute(permute(permute(
                ;         i.z + vec4(0.0, i1.z, i2.z, 1.0))
                ;       + i.y + vec4(0.0, i1.y, i2.y, 1.0))
                ;       + i.x + vec4(0.0, i1.x, i2.x, 1.0))
                ; with i replaced by mod289(i). MOD289 and PERMUTE clobber
                ; xmm0-xmm2 only, so xmm3-xmm8 stay live across them.
                movaps      xmm0,[i]
                MOD289                                ; xmm0 = mod289(i)
                movaps      xmm7,xmm0
                movaps      xmm6,xmm0
                movaps      xmm5,xmm0
                shufps      xmm7,xmm7,10101010b       ; xmm7 = i.zzzz
                shufps      xmm6,xmm6,01010101b       ; xmm6 = i.yyyy
                shufps      xmm5,xmm5,00000000b       ; xmm5 = i.xxxx
                movaps      xmm4,[i1]                 ; xmm4 = i1
                movaps      xmm3,[i2]                 ; xmm3 = i2
                ; innermost permute: z components
                movaps      xmm0,xmm4                 ; xmm0 = i1
                movaps      xmm1,xmm3                 ; xmm1 = i2
                shufps      xmm0,xmm0,10101010b       ; xmm0 = | i1.z i1.z i1.z i1.z |
                shufps      xmm1,xmm1,10101010b       ; xmm1 = | i2.z i2.z i2.z i2.z |
                andps       xmm0,dqword [g_mask_0010] ; xmm0 = | 0 0 i1.z 0 |
                andps       xmm1,dqword [g_mask_0100] ; xmm1 = | 0 i2.z 0 0 |
                orps        xmm0,xmm1                 ; xmm0 = | 0 i2.z i1.z 0 |
                orps        xmm0,dqword [g_1_0_w]     ; xmm0 = | 1 i2.z i1.z 0 |
                addps       xmm0,xmm7                 ; xmm0 = i.zzzz + | 1 i2.z i1.z 0 |
                PERMUTE
                movaps      xmm8,xmm0                 ; xmm8 = p
                ; middle permute: y components plus previous p
                movaps      xmm0,xmm4                 ; xmm0 = i1
                movaps      xmm1,xmm3                 ; xmm1 = i2
                shufps      xmm0,xmm0,01010101b       ; xmm0 = | i1.y i1.y i1.y i1.y |
                shufps      xmm1,xmm1,01010101b       ; xmm1 = | i2.y i2.y i2.y i2.y |
                andps       xmm0,dqword [g_mask_0010] ; xmm0 = | 0 0 i1.y 0 |
                andps       xmm1,dqword [g_mask_0100] ; xmm1 = | 0 i2.y 0 0 |
                orps        xmm0,xmm1                 ; xmm0 = | 0 i2.y i1.y 0 |
                orps        xmm0,dqword [g_1_0_w]     ; xmm0 = | 1 i2.y i1.y 0 |
                addps       xmm0,xmm6                 ; xmm0 = i.yyyy + | 1 i2.y i1.y 0 |
                addps       xmm0,xmm8                 ; add the previous permutation result
                PERMUTE
                movaps      xmm8,xmm0                 ; xmm8 = p
                ; outermost permute: x components plus previous p
                movaps      xmm0,xmm4                 ; xmm0 = i1
                movaps      xmm1,xmm3                 ; xmm1 = i2
                shufps      xmm0,xmm0,00000000b       ; xmm0 = | i1.x i1.x i1.x i1.x |
                shufps      xmm1,xmm1,00000000b       ; xmm1 = | i2.x i2.x i2.x i2.x |
                andps       xmm0,dqword [g_mask_0010] ; xmm0 = | 0 0 i1.x 0 |
                andps       xmm1,dqword [g_mask_0100] ; xmm1 = | 0 i2.x 0 0 |
                orps        xmm0,xmm1                 ; xmm0 = | 0 i2.x i1.x 0 |
                orps        xmm0,dqword [g_1_0_w]     ; xmm0 = | 1 i2.x i1.x 0 |
                addps       xmm0,xmm5                 ; xmm0 = i.xxxx + | 1 i2.x i1.x 0 |
                addps       xmm0,xmm8                 ; add the previous permutation result
                PERMUTE
                movaps      xmm8,xmm0                 ; xmm8 = p
                ;
                ; Compute gradients
                ;
                ; ns = (1.0/7.0) * D.wyz - D.xzx
                movaps      xmm0,dqword [g_snoise_D]
                movaps      xmm1,xmm0
                shufps      xmm0,xmm0,11100111b       ; xmm0 = | D.w D.z D.y D.w |
                shufps      xmm1,xmm1,11001000b       ; xmm1 = | D.w D.x D.z D.x |
                mulps       xmm0,dqword [g_1_div_7]
                subps       xmm0,xmm1
                movaps      xmm7,xmm0                 ; xmm7 = ns
                ; xmm8 = j = p - 49.0 * floor(p * ns.z * ns.z)
                shufps      xmm0,xmm0,10101010b       ; xmm0 = ns.zzzz
                mulps       xmm0,xmm0                 ; xmm0 = ns.z * ns.z
                mulps       xmm0,xmm8                 ; xmm0 = p * ns.z * ns.z
                FLOOR
                mulps       xmm0,dqword [g_49_0]
                subps       xmm8,xmm0                 ; xmm8 = j
                ; x_ = floor(j * ns.zzzz)
                movaps      xmm0,xmm7
                shufps      xmm0,xmm0,10101010b       ; xmm0 = ns.zzzz
                mulps       xmm0,xmm8
                FLOOR                                 ; xmm0 = x_
                movaps      xmm6,xmm0                 ; xmm6 = x_
                ; y_ = floor(j - 7.0 * x_)
                mulps       xmm0,dqword [g_7_0]       ; xmm0 = 7.0 * x_
                movaps      xmm1,xmm8
                subps       xmm1,xmm0                 ; xmm1 = j - 7.0 * x_
                movaps      xmm0,xmm1
                FLOOR
                movaps      xmm5,xmm0                 ; xmm5 = y_
                ; x = x_ * ns.xxxx + ns.yyyy
                ; y = y_ * ns.xxxx + ns.yyyy
                movaps      xmm0,xmm7                 ; xmm0 = ns
                movaps      xmm1,xmm7                 ; xmm1 = ns
                shufps      xmm0,xmm0,00000000b       ; xmm0 = ns.xxxx
                shufps      xmm1,xmm1,01010101b       ; xmm1 = ns.yyyy
                mulps       xmm6,xmm0                 ; xmm6 = x_ * ns.xxxx
                mulps       xmm5,xmm0                 ; xmm5 = y_ * ns.xxxx
                addps       xmm6,xmm1                 ; xmm6 = x = x_ * ns.xxxx + ns.yyyy
                addps       xmm5,xmm1                 ; xmm5 = y = y_ * ns.xxxx + ns.yyyy
                ; h = 1.0 - abs(x) - abs(y)
                movaps      xmm4,dqword [g_1_0]       ; xmm4 = h = | 1 1 1 1 |
                movaps      xmm0,xmm6                 ; xmm0 = x
                ABS                                   ; xmm0 = abs(x)
                movaps      xmm3,xmm0                 ; xmm3 = abs(x)
                movaps      xmm0,xmm5                 ; xmm0 = y
                ABS                                   ; xmm0 = abs(y)
                subps       xmm4,xmm3                 ; xmm4 = h = 1.0 - abs(x)
                subps       xmm4,xmm0                 ; xmm4 = h = 1.0 - abs(x) - abs(y)
                ; b0 = vec4(x.xy, y.xy)
                movaps      xmm0,xmm6                 ; xmm0 = x
                movaps      xmm1,xmm5                 ; xmm1 = y
                unpcklps    xmm0,xmm1                 ; xmm0 = | y.y x.y y.x x.x |
                shufps      xmm0,xmm0,11011000b       ; xmm0 = | y.y y.x x.y x.x |
                movaps      xmm7,xmm0                 ; xmm7 = b0
                ; b1 = vec4(x.zw, y.zw)
                movaps      xmm0,xmm6                 ; xmm0 = x
                movaps      xmm1,xmm5                 ; xmm1 = y
                unpckhps    xmm0,xmm1                 ; xmm0 = | y.w x.w y.z x.z |
                shufps      xmm0,xmm0,11011000b       ; xmm0 = | y.w y.z x.w x.z |
                movaps      xmm3,xmm0                 ; xmm3 = b1
                ; s0 = floor(b0) * 2.0 + 1.0
                movaps      xmm0,xmm7
                FLOOR
                addps       xmm0,xmm0                 ; xmm0 = floor(b0) * 2.0
                addps       xmm0,dqword [g_1_0]
                movaps      xmm15,xmm0                ; xmm15 = s0
                ; s1 = floor(b1) * 2.0 + 1.0
                movaps      xmm0,xmm3
                FLOOR
                addps       xmm0,xmm0                 ; xmm0 = floor(b1) * 2.0
                addps       xmm0,dqword [g_1_0]
                movaps      xmm14,xmm0                ; xmm14 = s1
                ; sh = -step(h, vec4(0.0))
                movaps      xmm0,xmm4                 ; xmm0 = h
                xorps       xmm1,xmm1                 ; xmm1 = | 0 0 0 0 |
                STEP                                  ; xmm0 = step(edge = h, value = 0.0)
                xorps       xmm1,xmm1
                subps       xmm1,xmm0                 ; xmm1 = 0.0 - step(h, 0.0)
                movaps      xmm13,xmm1                ; xmm13 = sh
                ; a0 = b0.xzyw + s0.xzyw * sh.xxyy
                shufps      xmm7,xmm7,11011000b       ; xmm7 = b0 = | w y z x |
                shufps      xmm15,xmm15,11011000b     ; xmm15 = s0 = | w y z x |
                movaps      xmm0,xmm13
                shufps      xmm0,xmm0,01010000b       ; xmm0 = | y y x x |
                mulps       xmm0,xmm15
                addps       xmm0,xmm7
                movaps      xmm7,xmm0                 ; xmm7 = a0
                ; a1 = b1.xzyw + s1.xzyw * sh.zzww
                shufps      xmm3,xmm3,11011000b       ; xmm3 = b1 = | w y z x |
                shufps      xmm14,xmm14,11011000b     ; xmm14 = s1 = | w y z x |
                shufps      xmm13,xmm13,11111010b     ; xmm13 = sh = | w w z z |
                mulps       xmm14,xmm13
                addps       xmm14,xmm3
                movaps      xmm6,xmm14                ; xmm6 = a1
                ; p0 = vec3(a0.xy, h.x)
                movaps      xmm0,xmm7                 ; xmm0 = a0
                shufps      xmm0,xmm4,00000100b       ; | h.x h.x a0.y a0.x |
                movaps      xmm5,xmm0                 ; xmm5 = p0
                ; p1 = vec3(a0.zw, h.y)
                shufps      xmm7,xmm4,01011110b       ; xmm7 = p1 = | h.y h.y a0.w a0.z |
                ; p2 = vec3(a1.xy, h.z)
                movaps      xmm0,xmm6                 ; xmm0 = a1
                shufps      xmm0,xmm4,10100100b       ; | h.z h.z a1.y a1.x |
                movaps      xmm3,xmm0                 ; xmm3 = p2
                ; p3 = vec3(a1.zw, h.w)
                shufps      xmm6,xmm4,11111110b       ; xmm6 = p3 = | h.w h.w a1.w a1.z |
                ; h is no longer needed, so its register takes p2
                movaps      xmm4,xmm3                 ; xmm4 = p2
                ;
                ; Normalize gradients
                ;
                ; xmm5 = p0, xmm7 = p1, xmm4 = p2, xmm6 = p3
                ; Each p is scaled by g_taylor_bias + g_taylor_scale * dot(p, p).
                ; DOT3 broadcasts its result, so a plain mulps scales every lane.
                ;
                ; xmm15 = dot(p0, p0)
                movaps      xmm0,xmm5                 ; xmm0 = p0
                movaps      xmm1,xmm5                 ; xmm1 = p0
                DOT3
                movaps      xmm15,xmm0                ; xmm15 = dot(p0, p0)
                ; xmm14 = dot(p1, p1)
                movaps      xmm0,xmm7                 ; xmm0 = p1
                movaps      xmm1,xmm7                 ; xmm1 = p1
                DOT3
                movaps      xmm14,xmm0                ; xmm14 = dot(p1, p1)
                ; xmm13 = dot(p2, p2)
                movaps      xmm0,xmm4                 ; xmm0 = p2
                movaps      xmm1,xmm4                 ; xmm1 = p2
                DOT3
                movaps      xmm13,xmm0                ; xmm13 = dot(p2, p2)
                ; xmm12 = dot(p3, p3)
                movaps      xmm0,xmm6                 ; xmm0 = p3
                movaps      xmm1,xmm6                 ; xmm1 = p3
                DOT3
                movaps      xmm12,xmm0                ; xmm12 = dot(p3, p3)
                ; norm = bias + scale * dot(p, p) for each of the four gradients
                movaps      xmm0,dqword [g_taylor_scale]
                movaps      xmm1,dqword [g_taylor_bias]
                mulps       xmm15,xmm0
                mulps       xmm14,xmm0
                mulps       xmm13,xmm0
                mulps       xmm12,xmm0
                addps       xmm15,xmm1
                addps       xmm14,xmm1
                addps       xmm13,xmm1
                addps       xmm12,xmm1
                ; normalize
                mulps       xmm5,xmm15                ; xmm5 = p0
                mulps       xmm7,xmm14                ; xmm7 = p1
                mulps       xmm4,xmm13                ; xmm4 = p2
                mulps       xmm6,xmm12                ; xmm6 = p3
                ;
                ; Mix final noise value
                ;
                ; m = max(0.6 - vec4(dot(x0,x0), dot(x1,x1), dot(x2,x2), dot(x3,x3)), 0.0)
                ; result = 42.0 * dot(m^4, vec4(dot(x0,p0), dot(x1,p1), dot(x2,p2), dot(x3,p3)))
                ;
                ; xmm15 = dot(x0, x0)
                movaps      xmm0,[x0]                 ; xmm0 = x0
                movaps      xmm1,xmm0                 ; xmm1 = x0
                DOT3
                movaps      xmm15,xmm0                ; xmm15 = dot(x0, x0)
                ; xmm14 = dot(x1, x1)
                movaps      xmm0,[x1]                 ; xmm0 = x1
                movaps      xmm1,xmm0                 ; xmm1 = x1
                DOT3
                movaps      xmm14,xmm0                ; xmm14 = dot(x1, x1)
                ; xmm13 = dot(x2, x2)
                movaps      xmm0,[x2]                 ; xmm0 = x2
                movaps      xmm1,xmm0                 ; xmm1 = x2
                DOT3
                movaps      xmm13,xmm0                ; xmm13 = dot(x2, x2)
                ; xmm12 = dot(x3, x3)
                movaps      xmm0,[x3]                 ; xmm0 = x3
                movaps      xmm1,xmm0                 ; xmm1 = x3
                DOT3
                movaps      xmm12,xmm0                ; xmm12 = dot(x3, x3)
                ; keep one lane of each broadcast dot and merge them into xmm15
                andps       xmm15,dqword [g_mask_0001]
                andps       xmm14,dqword [g_mask_0010]
                andps       xmm13,dqword [g_mask_0100]
                andps       xmm12,dqword [g_mask_1000]
                orps        xmm15,xmm14
                orps        xmm13,xmm12
                orps        xmm15,xmm13
                movaps      xmm0,dqword [g_0_6]
                subps       xmm0,xmm15
                maxps       xmm0,dqword [g_0_0]       ; xmm0 = m
                mulps       xmm0,xmm0                 ; xmm0 = m^2
                mulps       xmm0,xmm0                 ; xmm0 = m^4
                movaps      xmm10,xmm0                ; xmm10 = m^4
                ;
                ; xmm15 = dot(x0, p0)
                movaps      xmm0,[x0]                 ; xmm0 = x0
                movaps      xmm1,xmm5                 ; xmm1 = p0
                DOT3
                movaps      xmm15,xmm0                ; xmm15 = dot(x0, p0)
                ; xmm14 = dot(x1, p1)
                movaps      xmm0,[x1]                 ; xmm0 = x1
                movaps      xmm1,xmm7                 ; xmm1 = p1
                DOT3
                movaps      xmm14,xmm0                ; xmm14 = dot(x1, p1)
                ; xmm13 = dot(x2, p2)
                movaps      xmm0,[x2]                 ; xmm0 = x2
                movaps      xmm1,xmm4                 ; xmm1 = p2
                DOT3
                movaps      xmm13,xmm0                ; xmm13 = dot(x2, p2)
                ; xmm12 = dot(x3, p3)
                movaps      xmm0,[x3]                 ; xmm0 = x3
                movaps      xmm1,xmm6                 ; xmm1 = p3
                DOT3
                movaps      xmm12,xmm0                ; xmm12 = dot(x3, p3)
                ; put all above dots into xmm15
                andps       xmm15,dqword [g_mask_0001]
                andps       xmm14,dqword [g_mask_0010]
                andps       xmm13,dqword [g_mask_0100]
                andps       xmm12,dqword [g_mask_1000]
                orps        xmm15,xmm14
                orps        xmm13,xmm12
                orps        xmm15,xmm13
                ; result = 42.0 * dot(m^4, dots), broadcast to all lanes by DOT4
                movaps      xmm0,xmm10                ; xmm0 = m^4
                movaps      xmm1,xmm15
                DOT4
                mulps       xmm0,dqword [g_42_0]
                mov         rsp,rbp
                pop         rbp
                ret
                ; drop the local symbolic names so later code can reuse them
                restore     v,i,x0,x1,x2,x3,i1,i2
;-------------------------------------------------------------------------------
; NAME:         main
; DESC:         Program main function. Evaluates snoise3 for every pixel of a
;               SIZE x SIZE image held in an anonymous mapping and writes the
;               image to the file g_tga_name as an 18-byte header followed by
;               4 bytes per pixel.
; OUT:          eax         0 on success, 1 if mmap, creat or write failed
; CLOBBERS:     rax, rcx, rdx, rsi, rdi, r8-r11, xmm0, xmm1 and every xmm
;               register snoise3 clobbers. rbx, r12 and r13 are preserved.
; NOTE:         System call results in [-4095,-1] are error codes. A failed
;               mmap returns before the pixel loop instead of storing through
;               the error value. Results of close and munmap are not checked.
;-------------------------------------------------------------------------------
align 16
main:
imgptr          equ         rbp-8
saved_rbx       equ         rbp-16
saved_r12       equ         rbp-24
saved_r13       equ         rbp-32
                push        rbp
                mov         rbp,rsp
                sub         rsp,128                   ; keeps rsp 16-byte aligned for snoise3
                mov         [saved_rbx],rbx           ; callee-saved registers used below
                mov         [saved_r12],r12
                mov         [saved_r13],r13
                ; alloc memory for the image
                mov         eax,9                     ; sys_mmap
                xor         edi,edi                   ; addr
                mov         esi,SIZE*SIZE*4           ; length
                mov         edx,0x1+0x2               ; PROT_READ | PROT_WRITE
                mov         r10d,0x02+0x20            ; MAP_PRIVATE | MAP_ANONYMOUS
                mov         r8,-1                     ; fd
                xor         r9d,r9d                   ; offset
                syscall
                cmp         rax,-4095
                jae         .MapFailed                ; rax in [-4095,-1] is -errno
                mov         [imgptr],rax
                mov         rbx,rax                   ; rbx = current pixel pointer
                ; begin loops
                xor         r13d,r13d                 ; .LoopY index
.LoopY:
                xor         r12d,r12d                 ; .LoopX index
.LoopX:
                ; compute noise at (x, y, 0) * 4.0 / SIZE
                xorps       xmm0,xmm0
                xorps       xmm1,xmm1
                cvtsi2ss    xmm0,r12d                 ; xmm0 = | 0 0 0 x |
                cvtsi2ss    xmm1,r13d                 ; xmm1 = | 0 0 0 y |
                unpcklps    xmm0,xmm1                 ; xmm0 = | 0 0 y x |
                divps       xmm0,dqword [g_size]
                addps       xmm0,xmm0
                addps       xmm0,xmm0                 ; xmm0 = coordinates * 4.0
                call        snoise3
                ; map the noise value to value * 0.5 + 0.5
                mulps       xmm0,dqword [g_0_5]
                addps       xmm0,dqword [g_0_5]
                ; clamp to [0.0,1.0]
                minps       xmm0,dqword [g_1_0]
                maxps       xmm0,dqword [g_0_0]
                ; convert from [0.0,1.0] to [0,255]
                mulps       xmm0,dqword [g_255_0]
                cvttps2dq   xmm0,xmm0
                movd        eax,xmm0
                mov         [rbx+2],al                ; red
                pshufd      xmm1,xmm0,00000001b
                movd        eax,xmm1
                mov         [rbx+1],al                ; green
                pshufd      xmm1,xmm0,00000010b
                movd        eax,xmm1
                mov         [rbx+0],al                ; blue
                mov         byte [rbx+3],255          ; alpha
                ; advance pixel pointer
                add         rbx,4
                ; continue .LoopX
                add         r12d,1
                cmp         r12d,SIZE
                jne         .LoopX
                ; continue .LoopY
                add         r13d,1
                cmp         r13d,SIZE
                jne         .LoopY
                ; create TGA file
                mov         r12d,1                    ; r12d = status, failure until proven otherwise
                mov         eax,85                    ; sys_creat
                mov         rdi,g_tga_name
                mov         esi,110000000b            ; mode 0600
                syscall
                cmp         rax,-4095
                jae         .Unmap                    ; creat failed: nothing to write or close
                mov         rbx,rax                   ; rbx = file descriptor
                ; write header
                mov         rsi,g_tga_head
                mov         edx,18
                call        .WriteAll
                jc          .Close
                ; write pixel data
                mov         rsi,[imgptr]
                mov         edx,SIZE*SIZE*4
                call        .WriteAll
                jc          .Close
                xor         r12d,r12d                 ; everything was written
.Close:
                mov         eax,3                     ; sys_close
                mov         rdi,rbx
                syscall
.Unmap:
                mov         eax,11                    ; sys_munmap
                mov         rdi,[imgptr]
                mov         esi,SIZE*SIZE*4
                syscall
                mov         eax,r12d                  ; return status
.Done:
                mov         rbx,[saved_rbx]
                mov         r12,[saved_r12]
                mov         r13,[saved_r13]
                mov         rsp,rbp
                pop         rbp
                ret
.MapFailed:
                mov         eax,1
                jmp         .Done
                ;
                ; .WriteAll: write rdx bytes from rsi to descriptor rbx, retrying
                ; after short writes. Returns CF = 0 when every byte was written,
                ; CF = 1 when write reported an error or made no progress.
                ; Clobbers rax, rcx, rdx, rsi, rdi, r11, r13.
                ;
.WriteAll:
                mov         r13,rdx                   ; r13 = bytes remaining
.WriteNext:
                test        r13,r13                   ; test also clears CF
                jz          .WriteDone
                mov         eax,1                     ; sys_write
                mov         rdi,rbx
                mov         rdx,r13
                syscall                               ; rsi is preserved by the kernel
                test        rax,rax
                jle         .WriteFailed              ; negative = error, zero = no progress
                add         rsi,rax
                sub         r13,rax
                jmp         .WriteNext
.WriteFailed:
                stc
.WriteDone:
                ret
                restore     imgptr,saved_rbx,saved_r12,saved_r13
;-------------------------------------------------------------------------------
; NAME:         Debug
; DESC:         Calls snoise3 once with the fixed input | 0.0 3.5 2.4 1.2 |.
;               The result is left in xmm0.
;-------------------------------------------------------------------------------
align 16
Debug:
sample          equ         rbp-16
                push        rbp
                mov         rbp,rsp
                sub         rsp,128
                ; build the input vector in the 16-byte slot below rbp
                mov         dword [sample+12],0.0     ; w
                mov         dword [sample+8],3.5      ; z
                mov         dword [sample+4],2.4      ; y
                mov         dword [sample+0],1.2      ; x
                movaps      xmm0,[sample]
                call        snoise3
                leave                                 ; mov rsp,rbp / pop rbp
                ret
                restore     sample
;-------------------------------------------------------------------------------
; NAME:         _start
; DESC:         Program entry point: runs main, then exits with status 0.
;-------------------------------------------------------------------------------
_start:
                call        main
                ; terminate process; the exit status is always 0
                xor         edi,edi                   ; status = 0
                mov         eax,60                    ; sys_exit
                syscall
;-------------------------------------------------------------------------------
; Read-only data: output file name, TGA header and SSE constants.
; Vector constants list lane 0 (x) first. Everything from g_size onward is
; 16 bytes long and follows the align 16 below, so each constant is 16-byte
; aligned as the movaps and memory-operand SSE instructions require. Keep
; new entries a multiple of 16 bytes or re-align after them.
;-------------------------------------------------------------------------------
segment readable
;-------------------------------------------------------------------------------
align 1
; NUL-terminated name of the output file
g_tga_name      db          'snoise.tga',0
; 18-byte TGA header: image type 2, 9 zero bytes (color map spec and origin),
; width and height = SIZE as little-endian words, 32 bits per pixel,
; image descriptor 0
g_tga_head      db          0,0,2,9 dup 0
                db          (SIZE and 0x00ff),(SIZE and 0xff00) shr 8
                db          (SIZE and 0x00ff),(SIZE and 0xff00) shr 8,32,0
align 16
; image width and height in pixels
SIZE=800
; SIZE as float, used to scale pixel coordinates; keep in sync with SIZE
g_size          dd          4 dup 800.0

; simplex noise constants: C = (1/6, 1/3), D = (0.0, 0.5, 1.0, 2.0)
g_snoise_C      dd          0.166666667,0.333333333,0.0,0.0
g_snoise_D      dd          0.0,0.5,1.0,2.0

; scalar constants broadcast to all four lanes
g_0_0           dd          4 dup 0.0
g_0_5           dd          4 dup 0.5
g_0_6           dd          4 dup 0.6
g_1_0           dd          4 dup 1.0
g_7_0           dd          4 dup 7.0
g_34_0          dd          4 dup 34.0
g_42_0          dd          4 dup 42.0
g_49_0          dd          4 dup 49.0
g_255_0         dd          4 dup 255.0
g_289_0         dd          4 dup 289.0
g_1_div_7       dd          4 dup 0.142857142857
g_1_div_289     dd          4 dup 0.003460208
; single-lane select masks; the name shows the selected lane as | w z y x |
g_mask_0001     dd          0xffffffff,0x00000000,0x00000000,0x00000000
g_mask_0010     dd          0x00000000,0xffffffff,0x00000000,0x00000000
g_mask_0100     dd          0x00000000,0x00000000,0xffffffff,0x00000000
g_mask_1000     dd          0x00000000,0x00000000,0x00000000,0xffffffff
; 1.0 in lane w only
g_1_0_w         dd          0.0,0.0,0.0,1.0
; gradient normalization factor = g_taylor_bias + g_taylor_scale * dot(p, p)
g_taylor_bias   dd          4 dup 1.79284291400159
g_taylor_scale  dd          4 dup -0.85373472095314
;-------------------------------------------------------------------------------
