;    Single-precision floating-point vector-by-matrix multiplication using SSE instructions.
;    fasm console example: multiplies a 1x4 vector by a 4x4 matrix and prints the result.
;    assemble: fasm.exe fasm_matrix_sse.asm
format PE console  ;The type of program is a PE CONSOLE
entry startup           ;The entry point is startup
;---------------------------------------------
; code section. this name is '.text'(code readable executable).
section '.text' code readable executable
startup:
;   Program entry point.
;   Computes f32_out = f32_in1 (1x4) * f32_in2 (4x4), prints the four results
;   with printf, runs the 'pause' shell command, then returns to the caller
;   of the entry point.

;   Call mat_mul_sse. Arguments are pushed right-to-left; the callee removes
;   them itself (it ends with retn 3*4), so no cleanup is needed here.
    push    f32_out                         ; arg3: [out] 1x4 result vector
    push    f32_in2                         ; arg2: [in]  4x4 matrix
    push    f32_in1                         ; arg1: [in]  1x4 vector
    call    mat_mul_sse

;   Output results. '%f' consumes a double, so the four floats are widened
;   to doubles and stored in the printf argument area on the stack.
    cvtps2pd xmm0, qword [f32_out + 4*0]    ; elements 0,1 -> two doubles
    cvtps2pd xmm1, qword [f32_out + 4*2]    ; elements 2,3 -> two doubles
    sub     esp, 16*2                       ; reserve room for four doubles

    movups  [esp], xmm0                     ; doubles 0,1
    movups  [esp+16], xmm1                  ; doubles 2,3
    push    szFmt                           ; format string goes on top
    call    [printf]
    add     esp, 16*2+4                     ; caller cleanup: 4 doubles + format pointer

;   Pause --------------------
    push    szPause
    call    [system]
    add     esp, 4                          ; drop only the szPause argument

;   esp is back at its entry value, so [esp] is the entry point's return
;   address. Use a plain ret instead of popping it manually and jumping
;   through [esp-4]: data below esp is not guaranteed to stay intact.
    ret
    
;===================================================
; mat_mul_sse(v1@14, m44, v2@14)
; function:
;      Row vector times matrix:  v2@14 = v1@14 * m44
;        v2[j] = v1[0]*m44[0][j] + v1[1]*m44[1][j]
;              + v1[2]*m44[2][j] + v1[3]*m44[3][j]
; params (32-bit pointers on the stack, first parameter at [esp+4]):
;      v1@14 : [in]   pointer to a 1x4 single-precision vector.
;      m44   : [in]   pointer to a 4x4 single-precision matrix, stored
;                     row by row (16 bytes per row).
;      v2@14 : [out]  pointer to a 1x4 single-precision vector.
; returns:
;      nothing; the result is written through v2@14.
;      The callee removes its 12 bytes of arguments (retn 12).
; registers written:
;      eax, ecx, edx, xmm0-xmm7
; description:
;      All memory accesses use movups, so none of the three buffers has
;      to be 16-byte aligned. movaps may be substituted for a buffer that
;      is known to be 16-byte aligned.
;===================================================
mat_mul_sse:
    ; stack-relative names for the three pointer arguments
    label   .vec_in     dword at esp+4      ; arg1: v1@14
    label   .matrix     dword at esp+8      ; arg2: m44
    label   .vec_out    dword at esp+12     ; arg3: v2@14

    mov     eax, [.vec_in]
    mov     edx, [.matrix]
    mov     ecx, [.vec_out]

    ; fetch the input vector and keep one copy per element to broadcast
    movups  xmm0, [eax]                     ; xmm0 = d c b a
    movaps  xmm1, xmm0
    movaps  xmm2, xmm0
    movaps  xmm3, xmm0

    ; term 0: a * row0
    shufps  xmm0, xmm0, 00h                 ; a a a a
    movups  xmm4, [edx]                     ; row 0
    mulps   xmm0, xmm4

    ; term 1: b * row1, folded into term 0
    shufps  xmm1, xmm1, 55h                 ; b b b b
    movups  xmm5, [edx+16]                  ; row 1
    mulps   xmm1, xmm5
    addps   xmm0, xmm1                      ; a*row0 + b*row1

    ; term 2: c * row2
    shufps  xmm2, xmm2, 0AAh                ; c c c c
    movups  xmm6, [edx+32]                  ; row 2
    mulps   xmm2, xmm6

    ; term 3: d * row3, folded into term 2
    shufps  xmm3, xmm3, 0FFh                ; d d d d
    movups  xmm7, [edx+48]                  ; row 3
    mulps   xmm3, xmm7
    addps   xmm2, xmm3                      ; c*row2 + d*row3

    ; combine both halves and store the result vector
    addps   xmm0, xmm2
    movups  [ecx], xmm0

    retn    12                              ; pop the three dword arguments
;--------------- end of mat_mul_sse subroutine --------------------

; data section. this name is '.data' (data readable writeable).
; Holds the sample operands, the result buffer and the strings used by startup.
section '.data' data readable writeable
    ; 1x4 input vector; startup passes it as the first argument of mat_mul_sse
    f32_in1    dd  1.1   , 1.2  , 1.3  , 1.4
    ; 4x4 input matrix, one row of four floats per 'dd' line; second argument
    f32_in2    dd  10.1 , 11.2, 12.3, 13.4
                   dd  14.9 , 15.2, 11.4, 17.1
                   dd  18.3 , 10.1, 20.1, 21.4
		   dd  22.1 , 23.9, 24.6, 25.5
    ; 1x4 result vector; written by mat_mul_sse (third argument), then printed
    f32_out    dd  0.0   , 0.0  , 0.0 , 0.0

    ; printf format: a heading line, then the four result values
    szFmt    db  'Result=',0ah,'%f %f %f %f',0ah,0
    ; command string handed to system() by startup
    szPause db  'pause',0
;------- end of data section --------------------

; import section. this name is '.idata' (import data readable writeable).
; Hand-built import directory for the two msvcrt.dll functions that
; startup calls through [printf] and [system].
section '.idata' import data readable writeable
  ; IMAGE_IMPORT_DESCRIPTOR for msvcrt.dll, five dwords:
  ;   OriginalFirstThunk (0), TimeDateStamp (0), ForwarderChain (0),
  ;   Name (RVA of the DLL name), FirstThunk (RVA of the address table)
  dd 0, 0, 0, RVA msvcrt_name, RVA msvcrt_table
  dd 0, 0, 0, 0, 0  ; the end with zero-fill
  ; FirstThunk: one dword per imported function, initially the RVA of its
  ; IMAGE_IMPORT_BY_NAME entry. The code calls through these slots.
  msvcrt_table:
      printf      dd RVA _printf
      system    dd RVA _system
                    dd 0      ; the end with zero-fill
  ; Library name
  msvcrt_name db 'msvcrt.dll', 0
  ; IMAGE_IMPORT_BY_NAME: a 16-bit hint followed by the zero-terminated name
  _printf      dw 0               ;hint
                  db  'printf', 0   ;Name
  _system    dw 0               ;hint
		  db  'system', 0   ;Name
;------- end of import section --------------------

