format PE64 GUI 5.0
entry start
include 'win64a.inc'
include 'cuda.inc'
section '.text' code readable executable
  ;-----------------------------------------------------------------------
  ; start - process entry point (Microsoft x64 ABI, fasm `invoke` macros).
  ;
  ; Computes log2(x) for x = 1.0, 2.0, ... 256.0 twice: on the GPU through
  ; the CUDA driver API (PTX kernel `log_2`) and on the CPU with x87 fyl2x.
  ; The first 20 rows (x, GPU result, CPU result) are then formatted into
  ; Message and shown in a message box.
  ;
  ; Error convention: before every CUDA call rbx is pointed at that call's
  ; 20-byte name string; a non-zero result in eax jumps to Error, which
  ; consumes eax and rbx.  rbx and rdi are callee-saved in this ABI and are
  ; relied on to survive the calls made here.
  ; Never returns: both the success path and Error end in ExitProcess.
  ;-----------------------------------------------------------------------
  start:
          push  rbp                     ; rsp is 8 mod 16 on entry; one push re-aligns it for the calls below (never popped)
     ; init
           lea  rbx,[_cuInit]
        invoke  cuInit,0
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuDeviceGet]
        invoke  cuDeviceGet,CudaDevice,0   ; get the first device
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuCtxCreate]
        invoke  cuCtxCreate,CudaContext,CU_CTX_SCHED_SPIN+CU_CTX_MAP_HOST,[CudaDevice]      ; this context associates the device with this cpu thread
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuMemAlloc]
        invoke  cuMemAlloc,CudaNumberArray,256*4   ; device buffer for 256 floats, used in place by the kernel
          test  eax,eax
           jnz  Error
    ; load ptx source
           lea  rbx,[_cuModuleLoadData]
        invoke  cuModuleLoadData,CudaModule,PTXSourceData
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuModuleGetFunction]
        invoke  cuModuleGetFunction,CudaFunction,[CudaModule],PTXFunction
          test  eax,eax
           jnz  Error
    ; fill in NumberArray and LogNumberArray with x87 calculations for comparison
    ; x87 stack inside the loop: st0 = current x, st1 = 1.0 (the increment)
          fld1
          fld1
           mov  ecx,256*4
           lea  rax,[NumberArray+rcx]          ; rax/rbx point just past each array ...
           lea  rbx,[LogNumberArray+rcx]
           neg  rcx                            ; ... and rcx counts up from -1024 to 0
   @@:     fst  dword[rax+rcx]                 ; NumberArray[i] = x
          fld1
           fld  st1                            ; st0 = x, st1 = 1.0
        fyl2x                                  ; st0 = 1.0 * log2(x)
          fstp  dword[rbx+rcx]                 ; LogNumberArray[i] = log2(x)
          fadd  st0,st1                        ; x = x + 1.0
           add  rcx,4
           jnz  @b
        fcompp                                 ; pop both remaining x87 values
    ; call the function
           lea  rbx,[_cuMemcpyHtoD]
        invoke  cuMemcpyHtoD,[CudaNumberArray],NumberArray,256*4
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuParamSetSize]
        invoke  cuParamSetSize,[CudaFunction],12    ; 8 byte first argument + 4 byte second
          test  eax,eax
           jnz  Error
           mov  rax,[CudaNumberArray]              ; fill in the arguments, using Message as a staging buffer:
           mov  qword[Message+0],rax               ;   +0  device pointer (_inout)
           mov  dword[Message+8],256               ;   +8  element count  (_length)
           lea  rbx,[_cuParamSetv]
        invoke  cuParamSetv,[CudaFunction],0,Message,12        ; and pass in    (cuParamSeti doesn't always work so well)
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuFuncSetBlockShape]
        invoke  cuFuncSetBlockShape,[CudaFunction],256,1,1     ; one block of 256 threads, one per element
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuLaunchGrid]
        invoke  cuLaunchGrid,[CudaFunction],1,1
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuCtxSynchronize]
        invoke  cuCtxSynchronize
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuMemcpyDtoH]
        invoke  cuMemcpyDtoH,CudaLogNumberArray,[CudaNumberArray],256*4
          test  eax,eax
           jnz  Error
   ; clean up
           lea  rbx,[_cuMemFree]
        invoke  cuMemFree,[CudaNumberArray]
          test  eax,eax
           jnz  Error
           lea  rbx,[_cuCtxDestroy]
        invoke  cuCtxDestroy,[CudaContext]
          test  eax,eax
           jnz  Error
   ; build the report in Message: a 56-character header line + LF ...
           lea  rdi,[Message]
           cld
           mov  rax,'32bit fl'
        stosq
           mov  rax,'oats:  x'
        stosq
           mov  rax,'   lg2.a'
        stosq
           mov  rax,'pprox.f3'
        stosq
           mov  rax,'2(x)    '
        stosq
           mov  rax,'fyl2x(x,'
        stosq
           mov  rax,'1.0)    '
        stosq
           mov  al,10
        stosb
   ; ... followed by 20 rows of three 16-character columns + LF (49 bytes per row).
   ; rbx = row index, rdi = write position in Message.
   ; Each float is widened to a double through Temp and its bit pattern is
   ; handed to sprintf as the third argument.  Temp is used as the scratch
   ; slot because this ABI has no red zone: memory below rsp must not be
   ; relied on.  Temp is free at that point - sprintf overwrites it next.
           xor  ebx,ebx
     @@:   fld  dword[NumberArray+4*rbx]
          fstp  qword[Temp]
           mov  rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]               ; first 16 characters of the formatted text
        movdqu  [rdi],xmm0                     ; column 1: x
           fld  dword[CudaLogNumberArray+4*rbx]
          fstp  qword[Temp]
           mov  rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi+16],xmm0                  ; column 2: GPU result
           fld  dword[LogNumberArray+4*rbx]
          fstp  qword[Temp]
           mov  rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi+32],xmm0                  ; column 3: x87 result
           mov  byte[rdi+48],10
           add  rdi,49
           add  rbx,1
           cmp  rbx,20
            jb  @b
           mov  byte[rdi],0                    ; terminate the report
        invoke  MessageBox,NULL,Message,NULL,MB_OK
        invoke  ExitProcess,0
;-----------------------------------------------------------------------
; Error - report a failed CUDA driver call and terminate the process.
; Reached by `jnz Error` from start; never returns.
; In:   eax = non-zero result code of the failed call
;       rbx = address of the 20-byte, space-padded name of that call
; Out:  message box with "<call name>",10,"<result name>", then
;       ExitProcess with the result code as the process exit code, so a
;       failure is not reported to the OS as success.
; The tables err000..err999 hold fixed 42-byte names and are indexed by
; (code - first code of the range) * 42; a code outside every range
; selects errNoMatch.
;-----------------------------------------------------------------------
Error:
           mov  r12d,eax                ; keep the code for ExitProcess (r12 is callee-saved, so it survives MessageBox)
           mov  ecx,42                  ; table stride in bytes
           cmp  eax,8                   ; codes 0..8
            ja  @f
           mul  ecx                     ; eax = index*42 (edx is clobbered, it is not needed)
           lea  rax,[err000+rax]
           jmp  .Print
        @@:
           cmp  eax,100                 ; codes 100..101
            jb  @f
           cmp  eax,101
            ja  @f
           sub  eax,100
           mul  ecx
           lea  rax,[err100+rax]
           jmp  .Print
        @@:
           cmp  eax,200                 ; codes 200..216
            jb  @f
           cmp  eax,216
            ja  @f
           sub  eax,200
           mul  ecx
           lea  rax,[err200+rax]
           jmp  .Print
        @@:
           cmp  eax,300                 ; codes 300..304
            jb  @f
           cmp  eax,304
            ja  @f
           sub  eax,300
           mul  ecx
           lea  rax,[err300+rax]
           jmp  .Print
        @@:
           cmp  eax,400                 ; single-entry tables need no index arithmetic
           jne  @f
           lea  rax,[err400]
           jmp  .Print
        @@:
           cmp  eax,500
           jne  @f
           lea  rax,[err500]
           jmp  .Print
        @@:
           cmp  eax,600
           jne  @f
           lea  rax,[err600]
           jmp  .Print
        @@:
           cmp  eax,700                 ; codes 700..709
            jb  @f
           cmp  eax,709
            ja  @f
           sub  eax,700
           mul  ecx
           lea  rax,[err700+rax]
           jmp  .Print
        @@:
           cmp  eax,999
           jne  @f
           lea  rax,[err999]
           jmp  .Print
        @@:
           lea  rax,[errNoMatch]
  ; rax = 42-byte result name, rbx = 20-byte call name.
  ; Message layout: [0..18] call name, [19] LF, [20..61] result name, [62] NUL.
  .Print:
           mov  rdx,[rbx+0]
           mov  qword[Message+0],rdx
           mov  rdx,[rbx+8]
           mov  qword[Message+8],rdx
           mov  edx,[rbx+16]
           mov  dword[Message+16],edx
           mov  byte[Message+19],10     ; the last byte of the name field becomes the line break
           mov  rdx,[rax+0]
           mov  qword[Message+20],rdx
           mov  rdx,[rax+8]
           mov  qword[Message+28],rdx
           mov  rdx,[rax+16]
           mov  qword[Message+36],rdx
           mov  rdx,[rax+24]
           mov  qword[Message+44],rdx
           mov  rdx,[rax+32]
           mov  qword[Message+52],rdx
           mov  dx,[rax+40]             ; last 2 of the 42 bytes - do not read past the table entry
           mov  word[Message+60],dx
           mov  byte[Message+62],0
        invoke  MessageBox,NULL,Message,NULL,MB_OK
        invoke  ExitProcess,r12
section '.data' data readable
  ; NUL-terminated name of the kernel entry point, passed to cuModuleGetFunction.
  PTXFunction: db 'log_2',0
; Kernel pseudo-code:
; log_2(*inout,length):
; xind =  %ctaid.x * %ntid.x + %tid.x
; if xind < length , [inout + 4 * xind] = log2([inout + 4 * xind])
; return
;
; PTXSourceData is the PTX text passed to cuModuleLoadData.  The db lines
; below are emitted back to back with no line-break characters between
; them, forming a single NUL-terminated string; the text after each
; closing quote is an assembler comment and is not part of the PTX.
  PTXSourceData:
db '     .version 1.4'
db '     .target sm_13'
db '        .entry log_2 ('
db '                .param .u64 _inout,'
db '                .param .s32 _length)'
db '        {'
db '        .reg .u16 w1,w2;'                 ; word-sized registers
db '        .reg .u32 e1,xind;'               ; dwords
db '        .reg .u64 r1;'                    ; qwords
db '        .reg .f32 f1;'                    ; floats
db '        .reg .pred p1;'                   ; conditions
db '        mov.u16         w1,%ctaid.x;'
db '        mov.u16         w2,%ntid.x;'
db '        cvt.u32.u16     e1,%tid.x;'
db '        mad.wide.u16    xind,w1,w2,e1;'   ; xind = w1*w2 + e1
db '        ld.param.s32    e1,[_length];'
db '        setp.le.s32     p1,e1,xind;'      ; p1 = (length <= xind)
db '   @p1  bra             $Lt_Exit;'        ; if p1, branch
db '        ld.param.u64    r1,[_inout];'
db '        mad.wide.u32    r1,xind,4,r1;'    ; r1 = inout + 4*xind
db '        ld.global.f32   f1,[r1+0];'
db '        lg2.approx.f32  f1,f1;'
db '        st.global.f32   [r1+0],f1;'       ; result overwrites the input element
db '$Lt_Exit:'
db '        exit;'
db '        }'
db 0
section '.data' data readable writeable
align 16
  ; printf-style format strings (NUL-terminated).
  ; ErrorMessageFormat is not referenced by the code visible in this file.
  ErrorMessageFormat db 'error code:',10,'hex: 0x%.8x',10,'dec: %u',0
  ; Used by start with sprintf for every table cell; start copies the first
  ; 16 bytes of the formatted text as one fixed-width column.
  MessageFormat db '%16.7f',0
; Result-code name tables used by Error.
; Each label starts a run of consecutive codes; Error computes
; label + (code - first code of the run) * 42, so every entry must stay
; exactly 42 characters wide (space padded, no terminator).  '?' entries
; fill gaps in a run so that the indexing stays contiguous.
err000:
db    'CUDA_SUCCESS                              ';= 0
db    'CUDA_ERROR_INVALID_VALUE                  ';= 1
db    'CUDA_ERROR_OUT_OF_MEMORY                  ';= 2
db    'CUDA_ERROR_NOT_INITIALIZED                ';= 3
db    'CUDA_ERROR_DEINITIALIZED                  ';= 4
db    'CUDA_ERROR_PROFILER_DISABLED              ';= 5
db    'CUDA_ERROR_PROFILER_NOT_INITIALIZED       ';= 6
db    'CUDA_ERROR_PROFILER_ALREADY_STARTED       ';= 7
db    'CUDA_ERROR_PROFILER_ALREADY_STOPPED       ';= 8
err100:
db    'CUDA_ERROR_NO_DEVICE                      ';= 100
db    'CUDA_ERROR_INVALID_DEVICE                 ';= 101
err200:
db    'CUDA_ERROR_INVALID_IMAGE                  ';= 200
db    'CUDA_ERROR_INVALID_CONTEXT                ';= 201
db    'CUDA_ERROR_CONTEXT_ALREADY_CURRENT        ';= 202
db    '?                                         ';= 203 (gap filler)
db    '?                                         ';= 204 (gap filler)
db    'CUDA_ERROR_MAP_FAILED                     ';= 205
db    'CUDA_ERROR_UNMAP_FAILED                   ';= 206
db    'CUDA_ERROR_ARRAY_IS_MAPPED                ';= 207
db    'CUDA_ERROR_ALREADY_MAPPED                 ';= 208
db    'CUDA_ERROR_NO_BINARY_FOR_GPU              ';= 209
db    'CUDA_ERROR_ALREADY_ACQUIRED               ';= 210
db    'CUDA_ERROR_NOT_MAPPED                     ';= 211
db    'CUDA_ERROR_NOT_MAPPED_AS_ARRAY            ';= 212
db    'CUDA_ERROR_NOT_MAPPED_AS_POINTER          ';= 213
db    'CUDA_ERROR_ECC_UNCORRECTABLE              ';= 214
db    'CUDA_ERROR_UNSUPPORTED_LIMIT              ';= 215
db    'CUDA_ERROR_CONTEXT_ALREADY_IN_USE         ';= 216
err300:
db    'CUDA_ERROR_INVALID_SOURCE                 ';= 300
db    'CUDA_ERROR_FILE_NOT_FOUND                 ';= 301
db    'CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ';= 302
db    'CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      ';= 303
db    'CUDA_ERROR_OPERATING_SYSTEM               ';= 304
err400:
db    'CUDA_ERROR_INVALID_HANDLE                 ';= 400
err500:
db    'CUDA_ERROR_NOT_FOUND                      ';= 500
err600:
db    'CUDA_ERROR_NOT_READY                      ';= 600
err700:
db    'CUDA_ERROR_LAUNCH_FAILED                  ';= 700
db    'CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        ';= 701
db    'CUDA_ERROR_LAUNCH_TIMEOUT                 ';= 702
db    'CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  ';= 703
db    'CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    ';= 704
db    'CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        ';= 705
db    '?                                         ';= 706 (gap filler)
db    '?                                         ';= 707 (gap filler)
db    'CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         ';= 708
db    'CUDA_ERROR_CONTEXT_IS_DESTROYED           ';= 709
err999:
db    'CUDA_ERROR_UNKNOWN                        ';= 999
; Shown by Error for any code that falls outside all of the runs above.
errNoMatch:
db    '?                                         '
; Names of the CUDA driver calls, each space padded to exactly 20 bytes
; (no terminator).  start points rbx at the matching entry before every
; call; on failure Error copies 20 bytes from rbx and overwrites the last
; one with a line feed, so the 20th character of every entry must remain
; padding.  _cuParamSeti is not referenced by the code visible in this file.
_cuInit:                db 'cuInit              '
_cuDeviceGet:           db 'cuDeviceGet         '
_cuCtxCreate:           db 'cuCtxCreate         '
_cuMemAlloc:            db 'cuMemAlloc          '
_cuModuleLoadData:      db 'cuModuleLoadData    '
_cuModuleGetFunction:   db 'cuModuleGetFunction '
_cuMemcpyHtoD:          db 'cuMemcpyHtoD        '
_cuParamSeti:           db 'cuParamSeti         '
_cuParamSetv:           db 'cuParamSetv         '
_cuFuncSetBlockShape:   db 'cuFuncSetBlockShape '
_cuLaunchGrid:          db 'cuLaunchGrid        '
_cuParamSetSize:        db 'cuParamSetSize      '
_cuCtxSynchronize:      db 'cuCtxSynchronize    '
_cuMemcpyDtoH:          db 'cuMemcpyDtoH        '
_cuMemFree:             db 'cuMemFree           '
_cuCtxDestroy:          db 'cuCtxDestroy        '
align 16
  ; Uninitialised working storage: CUDA handles, host arrays, text buffers.
  CudaDevice       dq ?                 ; passed by address to cuDeviceGet, then by value to cuCtxCreate
  CudaContext      dq ?                 ; passed by address to cuCtxCreate, by value to cuCtxDestroy
  CudaModule       dq ?                 ; passed by address to cuModuleLoadData
  CudaFunction     dq ?                 ; passed by address to cuModuleGetFunction; used by the cuParam*/cuFunc*/cuLaunchGrid calls
  CudaNumberArray  dq ?                 ; passed by address to cuMemAlloc (256*4 bytes); the kernel's in/out buffer
  NumberArray      rd 256               ; host inputs 1.0, 2.0, ... 256.0 written by the x87 loop in start
  LogNumberArray   rd 256               ; x87 fyl2x results for the same inputs
  CudaLogNumberArray rd 256             ; destination of cuMemcpyDtoH: the results computed on the device
  Temp  rb 32                           ; sprintf output buffer for one formatted number
  Message rb 50*60                      ; message-box text; start also stages the 12 bytes of kernel parameters here
section '.idata' import data readable writeable
  library kernel32,'KERNEL32.DLL',\
          user32,'USER32.DLL',\
          msvcrt,'MSVCRT.DLL',\
          cuda,'NVCUDA.DLL'
  include 'api\kernel32.inc'
  include 'api\user32.inc'
  include 'api_cuda.inc'
import msvcrt,\
        sprintf,'sprintf'