format ELF64 executable 3
entry start
segment readable executable
;==============================================================================;
  align 16                                                                     ;
  thread:                                                                      ;
; Worker loop: claims tiles from the shared image_tile counter and fills them. ;
; in:  nothing (reads image_ptr, atomically increments image_tile)             ;
; out: nothing; rbx, rbp, r13-r15 are preserved, upper ymm halves are cleared  ;
; clobbers: rax, rdx, rdi, ymm0-ymm4, flags                                    ;
;------------------------------------------------------------------------------;
  virtual at rsp
    .startx dd ?                       ; image x of the current tile's left edge
    .starty dd ?                       ; image y of the current tile's top edge
  end virtual
  push rbp rbx r13 r14 r15             ; r13-r15 serve as loop counters below
  mov rbp,rsp                          ; kept so the epilogue can undo the alignment
  and rsp,not 31                       ; 32-byte aligned scratch area for the locals
  sub rsp,32
    .tile:
  mov r15d,1
  lock xadd [image_tile],r15d          ; r15d = index of the tile claimed here
  cmp r15d,TILE_COUNT
  jae .finish                          ; unsigned compare: the index is a counter
  xor edx,edx
  mov eax,r15d
  mov edi,TILE_X_COUNT
  div edi                              ; eax = tile row, edx = tile column
  imul eax,TILE_SIZE
  imul edx,TILE_SIZE
  mov [.startx],edx
  mov [.starty],eax
  imul eax,IMAGE_WIDTH
  add eax,edx
  shl eax,2                            ; 4 bytes per pixel
  mov rbx,[image_ptr]
  add rbx,rax                          ; rbx = first pixel of the tile's current row
  xor r14d,r14d                        ; r14d = row inside the tile
    .row:
  xor r13d,r13d                        ; r13d = column inside the tile (step 8)
    .pixel:
  mov edx,[.startx]
  mov eax,[.starty]
  add edx,r13d                         ; edx = image x of the first of 8 pixels
  add eax,r14d                         ; eax = image y
  ;------------------------------------; compute color
  ; placeholder: every lane gets ymm0 = 0.0, ymm1 = 0.0, ymm2 = 1.0
  vmovaps ymm2,[.c_1_0]
  vxorps ymm0,ymm0,ymm0
  vxorps ymm1,ymm1,ymm1
  ;------------------------------------; convert color from RGB32F to BGRA8
  ; clamp each channel to [0.0, 1.0]
  vxorps ymm3,ymm3,ymm3
  vmovaps ymm4,[.c_1_0]
  vminps ymm0,ymm0,ymm4
  vminps ymm1,ymm1,ymm4
  vminps ymm2,ymm2,ymm4
  vmaxps ymm0,ymm0,ymm3
  vmaxps ymm1,ymm1,ymm3
  vmaxps ymm2,ymm2,ymm3
  ; scale to 0..255 and truncate to integers
  vmovaps ymm3,[.c_255_0]
  vmulps ymm0,ymm0,ymm3
  vmulps ymm1,ymm1,ymm3
  vmulps ymm2,ymm2,ymm3
  vcvttps2dq ymm0,ymm0
  vcvttps2dq ymm1,ymm1
  vcvttps2dq ymm2,ymm2
  ; pack per pixel: 0xff in bits 24-31, ymm0 in 16-23, ymm1 in 8-15, ymm2 in 0-7
  vpslld ymm1,ymm1,8
  vpslld ymm0,ymm0,16
  vpor ymm0,ymm0,[.c_ff000000]
  vpor ymm1,ymm1,ymm2
  vpor ymm0,ymm0,ymm1
  ; aligned store: needs a 32-byte aligned address, which holds when image_ptr
  ; is 32-byte aligned because TILE_SIZE*4 and IMAGE_WIDTH*4 are multiples of 32
  vmovdqa [rbx+r13*4],ymm0
  ;------------------------------------;
  add r13d,8
  cmp r13d,TILE_SIZE
  jne .pixel
  add rbx,IMAGE_WIDTH*4                ; next image row, same tile column
  add r14d,1
  cmp r14d,TILE_SIZE
  jne .row
  jmp .tile
    .finish:
  vzeroupper                           ; leave no dirty upper ymm state behind
  mov rsp,rbp
  pop r15 r14 r13 rbx rbp
  ret
  align 32
    .c_1_0: dd 8 dup 1.0
    .c_255_0: dd 8 dup 255.0
    .c_ff000000: dd 8 dup 0xff000000
;==============================================================================;
  align 16                                                                     ;
  mem_alloc:                                                                   ;
; Maps anonymous private read/write memory with sys_mmap.                      ;
; rdi: size in bytes                                                           ;
; rax: pointer to the allocated memory, or -errno if sys_mmap failed           ;
; clobbers: rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags                        ;
;------------------------------------------------------------------------------;
    .SYS_MMAP = 9
    .PROT_RW = 0x1+0x2                 ; PROT_READ | PROT_WRITE
    .MAP_FLAGS = 0x02+0x20             ; MAP_PRIVATE | MAP_ANONYMOUS
  mov rsi,rdi                          ; arg 2: length (taken before rdi is reused)
  xor r9d,r9d                          ; arg 6: offset = 0
  mov r8,-1                            ; arg 5: fd = -1
  mov r10d,.MAP_FLAGS                  ; arg 4: flags
  mov edx,.PROT_RW                     ; arg 3: protection
  xor edi,edi                          ; arg 1: addr = 0, kernel picks the address
  mov eax,.SYS_MMAP
  syscall
  ret
;==============================================================================;
  align 16                                                                     ;
  thread_create:                                                               ;
; Starts one worker thread that runs `thread` on a freshly mapped stack.       ;
; rbx: mutex address (dword, 0 = running)                                      ;
; The worker stores 1 to the mutex and wakes one waiter when it finishes.      ;
; If the stack or the thread cannot be created the mutex is set to 1 here,     ;
; so a later thread_wait on it returns instead of blocking forever.            ;
;------------------------------------------------------------------------------;
    .STACK_SIZE = 4096
  mov edi,.STACK_SIZE
  call mem_alloc
  cmp rax,-4095                        ; -errno lies in [-4095,-1]
  jae .fail                            ; no stack: do not clone onto a bad pointer
  lea rsi,[rax+.STACK_SIZE]            ; child rsp = top of the mapping
  mov eax,56                           ; sys_clone
  ; CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD
  mov edi,0x100+0x200+0x400+0x800+0x10000
  xor edx,edx                          ; parent_tid pointer = 0
  syscall
  test rax,rax
  jz .child                            ; 0 is returned in the new thread
  js .fail                             ; negative: clone failed, no worker exists
  ret                                  ; parent: rax = id of the new thread
    .fail:
  mov dword [rbx],1                    ; report 'finished' so waiting cannot hang
  ret
    .child:
  call thread                          ; rbx (mutex address) is preserved by thread
  mov dword [rbx],1                    ; publish completion before waking
  mov eax,202                          ; sys_futex
  mov rdi,rbx                          ; mutex address
  mov esi,1                            ; FUTEX_WAKE
  mov edx,1                            ; wake 1 thread
  syscall
  ; NOTE(review): the stack mapping is never unmapped; it lives until the
  ; process ends.
  mov eax,60                           ; sys_exit (ends this thread only)
  xor edi,edi                          ; exit code
  syscall
;==============================================================================;
  align 16                                                                     ;
  thread_wait:                                                                 ;
; Blocks until the dword at rbx becomes non-zero.                              ;
; rbx: mutex address                                                           ;
; clobbers: rax, rcx, rdx, rsi, rdi, r10, r11, flags                           ;
;------------------------------------------------------------------------------;
    .check:
  cmp dword [rbx],0                    ; 0 = worker still running
  jne .done
  mov eax,202                          ; sys_futex
  mov rdi,rbx                          ; mutex address
  xor esi,esi                          ; FUTEX_WAIT
  xor edx,edx                          ; sleep only while the mutex still reads 0
  xor r10d,r10d                        ; timeout pointer = 0: wait without limit
  syscall
  ; FUTEX_WAIT may return early (interrupted by a signal, spurious wake-up, or
  ; the value already changed), so the mutex is re-read instead of trusting
  ; the return value.
  jmp .check
    .done:
  ret
;==============================================================================;
  align 16                                                                     ;
  image_save:                                                                  ;
; Writes an 18-byte TGA header and the pixels at image_ptr to fractal1.tga.    ;
; Errors are not reported; a failed creat skips the writes.                    ;
; clobbers: rax, rcx, rdx, rsi, rdi, r11, flags                                ;
;------------------------------------------------------------------------------;
  push rbx
  mov eax,85                           ; sys_creat
  mov rdi,.tga_name
  mov esi,110000000b                   ; mode 0600: owner read/write
  syscall
  test rax,rax
  js .done                             ; creat failed: nothing to write or close
  mov rbx,rax                          ; rbx = file descriptor
  mov rdi,rbx
  mov rsi,.tga_head
  mov edx,.tga_head_size
  call .write_all
  mov rdi,rbx
  mov rsi,[image_ptr]
  mov edx,IMAGE_WIDTH*IMAGE_HEIGHT*4   ; 4 bytes per pixel
  call .write_all
  mov eax,3                            ; sys_close
  mov rdi,rbx
  syscall
    .done:
  pop rbx
  ret
;------------------------------------------------------------------------------;
; .write_all - rdi: file descriptor, rsi: buffer, rdx: byte count              ;
; Repeats sys_write until everything is written or a write returns <= 0.       ;
; The syscall changes only rax, rcx and r11, so rdi/rsi/rdx stay live.         ;
;------------------------------------------------------------------------------;
    .write_all:
  mov eax,1                            ; sys_write
  syscall
  test rax,rax
  jle .write_end                       ; error or no progress: give up
  add rsi,rax                          ; skip the bytes already written
  sub rdx,rax
  jnz .write_all                       ; short write: continue with the rest
    .write_end:
  ret
    .tga_name db 'fractal1.tga',0
    ; TGA header: id length 0, no colour map, image type 2 (uncompressed
    ; true-colour), 9 zero bytes (colour map spec and x/y origin), then
    ; little-endian width and height, 32 bits per pixel, descriptor 0.
    ; NOTE(review): descriptor 0 presumably means bottom-left origin, so
    ; viewers may show the rows flipped - confirm against the TGA spec.
    .tga_head db 0,0,2,9 dup 0
              db (IMAGE_WIDTH and 0x00ff),(IMAGE_WIDTH and 0xff00) shr 8
              db (IMAGE_HEIGHT and 0x00ff),(IMAGE_HEIGHT and 0xff00) shr 8,32,0
    .tga_head_size = $ - .tga_head
;==============================================================================;
  align 16                                                                     ;
  main:                                                                        ;
; Maps the image buffer, renders it on four worker threads and saves it.       ;
; Exits the process with status 1 if the image buffer cannot be mapped.        ;
; clobbers: rbx (not restored) plus the callees' scratch registers             ;
;------------------------------------------------------------------------------;
  mov rdi,IMAGE_WIDTH*IMAGE_HEIGHT*4   ; 4 bytes per pixel
  call mem_alloc
  cmp rax,-4095                        ; -errno lies in [-4095,-1]
  jae .no_memory                       ; never let workers write through -errno
  mov [image_ptr],rax
  ; start the four workers; each gets its own completion mutex in rbx
  mov rbx,thread0_mutex
  call thread_create
  mov rbx,thread1_mutex
  call thread_create
  mov rbx,thread2_mutex
  call thread_create
  mov rbx,thread3_mutex
  call thread_create
  ; wait for every worker before the buffer is written to disk
  mov rbx,thread0_mutex
  call thread_wait
  mov rbx,thread1_mutex
  call thread_wait
  mov rbx,thread2_mutex
  call thread_wait
  mov rbx,thread3_mutex
  call thread_wait
  call image_save
  ret
    .no_memory:
  mov eax,231                          ; sys_exit_group
  mov edi,1                            ; exit code: failure
  syscall
;==============================================================================;
  align 16                                                                     ;
  start:                                                                       ;
; Program entry point: runs main, then issues sys_exit with status 0.          ;
;------------------------------------------------------------------------------;
    .SYS_EXIT = 60
  call main
  xor edi,edi                          ; arg 1: exit code 0
  mov eax,.SYS_EXIT
  syscall                              ; does not return
;==============================================================================;
segment readable writeable

; Assembly-time constants (they emit no bytes; the code above refers to them).
IMAGE_WIDTH = 1280                     ; image width in pixels
IMAGE_HEIGHT = 720                     ; image height in pixels

; Tile edge in pixels. The pixel loop in `thread` advances 8 pixels at a time
; and stops on equality with TILE_SIZE, so this must be a multiple of 8.
TILE_SIZE = 80
TILE_X_COUNT = IMAGE_WIDTH / TILE_SIZE ; tiles per image row
TILE_Y_COUNT = IMAGE_HEIGHT / TILE_SIZE ; tile rows per image
TILE_COUNT = TILE_X_COUNT * TILE_Y_COUNT ; tile indices run 0..TILE_COUNT-1

; One futex word per worker: 0 while the worker runs; the worker path in
; thread_create stores 1 and issues FUTEX_WAKE, thread_wait waits on it.
align 4
thread0_mutex dd 0
thread1_mutex dd 0
thread2_mutex dd 0
thread3_mutex dd 0

align 8
image_ptr dq 0                         ; pixel buffer address, stored by main
align 4
image_tile dd 0                        ; next tile index; `thread` claims tiles
                                       ; from it with lock xadd
;==============================================================================;