format PE64 GUI 5.0
entry start
include 'win64a.inc'
include 'cuda.inc'
section '.text' code readable executable

; cuda_try api,args...
;   Calls the CUDA driver entry point `api` with `args` and jumps to Error
;   when it returns a non-zero result.  rbx is pointed at the matching
;   `_<api>` name string beforehand, because Error prints the name from rbx.
macro cuda_try api,[arg]
{
common
        lea     rbx,[_#api]
        invoke  api,arg
        test    eax,eax
        jnz     Error
}

start:
        push    rbp                     ; never popped (the program leaves via ExitProcess);
                                        ; presumably here only to 16-byte-align rsp for the calls

; ---- bring up the driver, take device 0 and reserve the device buffer ----
        cuda_try cuInit,0
        cuda_try cuDeviceGet,CudaDevice,0
        ; this context associates the device with this cpu thread
        cuda_try cuCtxCreate,CudaContext,CU_CTX_SCHED_SPIN+CU_CTX_MAP_HOST,[CudaDevice]
        cuda_try cuMemAlloc,CudaNumberArray,256*4       ; 256 dwords on the device

; ---- hand the PTX text to the driver and look up the kernel ----
        cuda_try cuModuleLoadData,CudaModule,PTXSourceData
        cuda_try cuModuleGetFunction,CudaFunction,[CudaModule],PTXFunction

; ---- host reference data, computed on the x87 for comparison ----
; NumberArray[i]    = 1.0, 2.0, ... 256.0
; LogNumberArray[i] = log2(NumberArray[i])
; rax/rbx point one past the end of each array and rcx runs from
; -(256*4) up to 0, so the add at the bottom also provides the loop test.
        fld1                            ; st1 = 1.0, the step
        fld1                            ; st0 = x, starting at 1.0
        mov     ecx,256*4
        lea     rax,[NumberArray+rcx]
        lea     rbx,[LogNumberArray+rcx]
        neg     rcx
@@:     fst     dword[rax+rcx]          ; NumberArray[i] = x
        fld1
        fld     st1                     ; st0 = x, st1 = 1.0
        fyl2x                           ; st0 = 1.0 * log2(x)
        fstp    dword[rbx+rcx]          ; LogNumberArray[i] = log2(x)
        fadd    st0,st1                 ; x = x + 1.0
        add     rcx,4
        jnz     @b
        fcompp                          ; drop x and the step: x87 stack is empty again

; ---- upload the input, run the kernel, download the result ----
        cuda_try cuMemcpyHtoD,[CudaNumberArray],NumberArray,256*4
        cuda_try cuParamSetSize,[CudaFunction],12       ; 8 byte first argument + 4 byte second
        ; Message doubles as the staging area for the two kernel arguments
        mov     rax,[CudaNumberArray]
        mov     qword[Message+0],rax    ; argument 1: device pointer
        mov     dword[Message+8],256    ; argument 2: element count
        ; pass both in one go (cuParamSeti doesn't always work so well)
        cuda_try cuParamSetv,[CudaFunction],0,Message,12
        cuda_try cuFuncSetBlockShape,[CudaFunction],256,1,1
        cuda_try cuLaunchGrid,[CudaFunction],1,1
        cuda_try cuCtxSynchronize
        cuda_try cuMemcpyDtoH,CudaLogNumberArray,[CudaNumberArray],256*4

; ---- release the device buffer and the context ----
        cuda_try cuMemFree,[CudaNumberArray]
        cuda_try cuCtxDestroy,[CudaContext]
; ---------------------------------------------------------------------
; Format the first 20 results as text in Message and show them.
;   rdi = write cursor into Message (callee-saved, survives sprintf)
;   rbx = row index 0..19          (callee-saved, survives sprintf)
; Layout: a 56-character header plus newline, then one 49-byte row per
; value: three 16-character "%16.7f" fields (x, GPU log2, x87 log2) and
; a newline.  A terminating NUL follows the last row.
; ---------------------------------------------------------------------
        lea     rdi,[Message]
        cld
        ; The header is stored as seven qwords.  Every literal must be
        ; exactly 8 characters: a shorter one is zero-extended by the
        ; assembler and the resulting NUL would cut the message short.
        mov     rax,'32bit fl'
        stosq
        mov     rax,'oats:  x'
        stosq
        mov     rax,'   lg2.a'
        stosq
        mov     rax,'pprox.f3'
        stosq
        mov     rax,'2(x)    '
        stosq
        mov     rax,'fyl2x(x,'
        stosq
        mov     rax,'1.0)    '
        stosq
        mov     al,10
        stosb
        xor     ebx,ebx
        ; Each float is widened to a double through Temp before it is
        ; passed to sprintf.  Temp is used rather than memory below rsp
        ; because Win64 has no red zone; the value is already in rax by
        ; the time sprintf overwrites Temp with its output.
@@:     fld     dword[NumberArray+4*rbx]
        fstp    qword[Temp]
        mov     rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]        ; take the 16 formatted characters
        movdqu  [rdi],xmm0              ; column 1: x
        fld     dword[CudaLogNumberArray+4*rbx]
        fstp    qword[Temp]
        mov     rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi+16],xmm0           ; column 2: log2 computed by the kernel
        fld     dword[LogNumberArray+4*rbx]
        fstp    qword[Temp]
        mov     rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi+32],xmm0           ; column 3: log2 computed by fyl2x
        mov     byte[rdi+48],10         ; end of row
        add     rdi,49
        add     rbx,1
        cmp     rbx,20
        jb      @b
        mov     byte[rdi],0             ; terminate the text
        invoke  MessageBox,NULL,Message,NULL,MB_OK
        invoke  ExitProcess,0
;-----------------------------------------------------------------------
; Error - report a failed CUDA driver call in a message box and exit.
; In:    eax = non-zero result code returned by the failed call
;        rbx = address of the 20-byte name string of that call (_cuXxx)
; Out:   never returns; the process exit code is the CUDA result code,
;        so a failure is no longer reported as exit code 0
; Uses:  rax, rcx, rdx, rsi, flags, Message
;
; The error-name tables (err000, err100, ...) consist of fixed 42-byte
; entries, one per consecutive code, so a name lives at
; base + 42*(code - first code of the range).  Codes outside every
; known range are shown using errNoMatch.
;-----------------------------------------------------------------------
Error:
        mov     esi,eax                 ; rsi is callee-saved on Win64: the code survives MessageBox
        mov     ecx,42                  ; bytes per table entry
        cmp     eax,8                   ; 1..8
        ja      @f
        mul     ecx
        lea     rax,[err000+rax]
        jmp     .Print
@@:
        cmp     eax,100                 ; 100..101
        jb      @f
        cmp     eax,101
        ja      @f
        sub     eax,100
        mul     ecx
        lea     rax,[err100+rax]
        jmp     .Print
@@:
        cmp     eax,200                 ; 200..216
        jb      @f
        cmp     eax,216
        ja      @f
        sub     eax,200
        mul     ecx
        lea     rax,[err200+rax]
        jmp     .Print
@@:
        cmp     eax,300                 ; 300..304
        jb      @f
        cmp     eax,304
        ja      @f
        sub     eax,300
        mul     ecx
        lea     rax,[err300+rax]
        jmp     .Print
@@:
        cmp     eax,400                 ; single-entry ranges need no index arithmetic
        jne     @f
        lea     rax,[err400]
        jmp     .Print
@@:
        cmp     eax,500
        jne     @f
        lea     rax,[err500]
        jmp     .Print
@@:
        cmp     eax,600
        jne     @f
        lea     rax,[err600]
        jmp     .Print
@@:
        cmp     eax,700                 ; 700..709
        jb      @f
        cmp     eax,709
        ja      @f
        sub     eax,700
        mul     ecx
        lea     rax,[err700+rax]
        jmp     .Print
@@:
        cmp     eax,999
        jne     @f
        lea     rax,[err999]
        jmp     .Print
@@:
        lea     rax,[errNoMatch]
.Print:
        ; Message[0..19]: the API name; its last byte becomes the line break
        mov     rdx,[rbx+0]
        mov     qword[Message+0],rdx
        mov     rdx,[rbx+8]
        mov     qword[Message+8],rdx
        mov     edx,[rbx+16]
        mov     dword[Message+16],edx
        mov     byte[Message+19],10
        ; Message[20..61]: the 42-byte error name.  44 bytes are copied
        ; (five qwords and a dword); the NUL at offset 62 hides the excess.
        mov     rdx,[rax+0]
        mov     qword[Message+20],rdx
        mov     rdx,[rax+8]
        mov     qword[Message+28],rdx
        mov     rdx,[rax+16]
        mov     qword[Message+36],rdx
        mov     rdx,[rax+24]
        mov     qword[Message+44],rdx
        mov     rdx,[rax+32]
        mov     qword[Message+52],rdx
        mov     edx,[rax+40]
        mov     dword[Message+60],edx
        mov     byte[Message+62],0
        invoke  MessageBox,NULL,Message,NULL,MB_OK
        invoke  ExitProcess,rsi         ; report the failure to the parent process
section '.data' data readable
; NUL-terminated kernel name; start passes it to cuModuleGetFunction.
PTXFunction: db 'log_2',0
; PTX source text; start passes it to cuModuleLoadData.  The db lines
; below form one continuous NUL-terminated string: no line breaks are
; emitted between them, each fragment just starts with a space.
; Pseudo-code of the kernel:
; log_2(*inout,length):
; xind = %ctaid.x * %ntid.x + %tid.x
; if xind < length , [inout + 4 * xind] = log2([inout + 4 * xind])
; return
PTXSourceData:
db ' .version 1.4'
db ' .target sm_13'
db ' .entry log_2 ('
db ' .param .u64 _inout,'
db ' .param .s32 _length)'
db ' {'
db ' .reg .u16 w1,w2;' ; word-sized registers
db ' .reg .u32 e1,xind;' ; dwords
db ' .reg .u64 r1;' ; qwords
db ' .reg .f32 f1;' ; floats
db ' .reg .pred p1;' ; conditions
db ' mov.u16 w1,%ctaid.x;'
db ' mov.u16 w2,%ntid.x;'
db ' cvt.u32.u16 e1,%tid.x;'
db ' mad.wide.u16 xind,w1,w2,e1;' ; xind = w1*w2 + e1
db ' ld.param.s32 e1,[_length];'
db ' setp.le.s32 p1,e1,xind;' ; p1 = (length <= xind)
db ' @p1 bra $Lt_Exit;' ; if p1, branch
db ' ld.param.u64 r1,[_inout];'
db ' mad.wide.u32 r1,xind,4,r1;' ; r1 = inout + 4*xind
db ' ld.global.f32 f1,[r1+0];'
db ' lg2.approx.f32 f1,f1;'
db ' st.global.f32 [r1+0],f1;' ; result overwrites the input element
db '$Lt_Exit:'
db ' exit;'
db ' }'
db 0 ; terminates the PTX string
section '.data' data readable writeable
align 16

; Fixed record widths the code relies on:
;  - Error locates an error name at base + 42*index and copies it into
;    Message, so every error-name entry must occupy exactly 42 bytes.
;  - Error copies 20 bytes from the API name in rbx and turns the last
;    one into a line break, so every API name must occupy exactly 20 bytes.
ERROR_NAME_WIDTH = 42
API_NAME_WIDTH   = 20

; padded_text width,text
;   Emits `text` followed by spaces up to exactly `width` bytes.  A text
;   longer than `width` makes the repeat count negative, which stops the
;   assembly instead of silently shifting every later entry.
macro padded_text width,text
{
local first
first   db text
        repeat width-($-first)
        db ' '
        end repeat
}

macro errname text { padded_text ERROR_NAME_WIDTH,text }
macro apiname text { padded_text API_NAME_WIDTH,text }

; not referenced by the code visible in this file
ErrorMessageFormat db 'error code:',10,'hex: 0x%.8x',10,'dec: %u',0
; sprintf format for one 16-character column of the result table
MessageFormat db '%16.7f',0

; ---- error names, one 42-byte entry per consecutive result code ----
err000:
        errname 'CUDA_SUCCESS'                                  ; = 0
        errname 'CUDA_ERROR_INVALID_VALUE'                      ; = 1
        errname 'CUDA_ERROR_OUT_OF_MEMORY'                      ; = 2
        errname 'CUDA_ERROR_NOT_INITIALIZED'                    ; = 3
        errname 'CUDA_ERROR_DEINITIALIZED'                      ; = 4
        errname 'CUDA_ERROR_PROFILER_DISABLED'                  ; = 5
        errname 'CUDA_ERROR_PROFILER_NOT_INITIALIZED'           ; = 6
        errname 'CUDA_ERROR_PROFILER_ALREADY_STARTED'           ; = 7
        errname 'CUDA_ERROR_PROFILER_ALREADY_STOPPED'           ; = 8
err100:
        errname 'CUDA_ERROR_NO_DEVICE'                          ; = 100
        errname 'CUDA_ERROR_INVALID_DEVICE'                     ; = 101
err200:
        errname 'CUDA_ERROR_INVALID_IMAGE'                      ; = 200
        errname 'CUDA_ERROR_INVALID_CONTEXT'                    ; = 201
        errname 'CUDA_ERROR_CONTEXT_ALREADY_CURRENT'            ; = 202
        errname '?'                                             ; = 203 (no name listed)
        errname '?'                                             ; = 204 (no name listed)
        errname 'CUDA_ERROR_MAP_FAILED'                         ; = 205
        errname 'CUDA_ERROR_UNMAP_FAILED'                       ; = 206
        errname 'CUDA_ERROR_ARRAY_IS_MAPPED'                    ; = 207
        errname 'CUDA_ERROR_ALREADY_MAPPED'                     ; = 208
        errname 'CUDA_ERROR_NO_BINARY_FOR_GPU'                  ; = 209
        errname 'CUDA_ERROR_ALREADY_ACQUIRED'                   ; = 210
        errname 'CUDA_ERROR_NOT_MAPPED'                         ; = 211
        errname 'CUDA_ERROR_NOT_MAPPED_AS_ARRAY'                ; = 212
        errname 'CUDA_ERROR_NOT_MAPPED_AS_POINTER'              ; = 213
        errname 'CUDA_ERROR_ECC_UNCORRECTABLE'                  ; = 214
        errname 'CUDA_ERROR_UNSUPPORTED_LIMIT'                  ; = 215
        errname 'CUDA_ERROR_CONTEXT_ALREADY_IN_USE'             ; = 216
err300:
        errname 'CUDA_ERROR_INVALID_SOURCE'                     ; = 300
        errname 'CUDA_ERROR_FILE_NOT_FOUND'                     ; = 301
        errname 'CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND'     ; = 302
        errname 'CUDA_ERROR_SHARED_OBJECT_INIT_FAILED'          ; = 303
        errname 'CUDA_ERROR_OPERATING_SYSTEM'                   ; = 304
err400:
        errname 'CUDA_ERROR_INVALID_HANDLE'                     ; = 400
err500:
        errname 'CUDA_ERROR_NOT_FOUND'                          ; = 500
err600:
        errname 'CUDA_ERROR_NOT_READY'                          ; = 600
err700:
        errname 'CUDA_ERROR_LAUNCH_FAILED'                      ; = 700
        errname 'CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES'            ; = 701
        errname 'CUDA_ERROR_LAUNCH_TIMEOUT'                     ; = 702
        errname 'CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING'      ; = 703
        errname 'CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED'        ; = 704
        errname 'CUDA_ERROR_PEER_ACCESS_NOT_ENABLED'            ; = 705
        errname '?'                                             ; = 706 (no name listed)
        errname '?'                                             ; = 707 (no name listed)
        errname 'CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE'             ; = 708
        errname 'CUDA_ERROR_CONTEXT_IS_DESTROYED'               ; = 709
err999:
        errname 'CUDA_ERROR_UNKNOWN'                            ; = 999
errNoMatch:
        errname '?'                                             ; any code not covered above

; ---- API names, 20 bytes each; start loads rbx with one before each call ----
_cuInit:
        apiname 'cuInit'
_cuDeviceGet:
        apiname 'cuDeviceGet'
_cuCtxCreate:
        apiname 'cuCtxCreate'
_cuMemAlloc:
        apiname 'cuMemAlloc'
_cuModuleLoadData:
        apiname 'cuModuleLoadData'
_cuModuleGetFunction:
        apiname 'cuModuleGetFunction'
_cuMemcpyHtoD:
        apiname 'cuMemcpyHtoD'
_cuParamSeti:                           ; not referenced by the code visible in this file
        apiname 'cuParamSeti'
_cuParamSetv:
        apiname 'cuParamSetv'
_cuFuncSetBlockShape:
        apiname 'cuFuncSetBlockShape'
_cuLaunchGrid:
        apiname 'cuLaunchGrid'
_cuParamSetSize:
        apiname 'cuParamSetSize'
_cuCtxSynchronize:
        apiname 'cuCtxSynchronize'
_cuMemcpyDtoH:
        apiname 'cuMemcpyDtoH'
_cuMemFree:
        apiname 'cuMemFree'
_cuCtxDestroy:
        apiname 'cuCtxDestroy'

; ---- uninitialised working storage ----
align 16
CudaDevice dq ?                 ; written by cuDeviceGet
CudaContext dq ?                ; written by cuCtxCreate
CudaModule dq ?                 ; written by cuModuleLoadData
CudaFunction dq ?               ; written by cuModuleGetFunction
CudaNumberArray dq ?            ; device address written by cuMemAlloc
NumberArray rd 256              ; host input: 1.0 .. 256.0 as 32-bit floats
LogNumberArray rd 256           ; x87 reference: log2 of each input
CudaLogNumberArray rd 256       ; results copied back from the device
Temp rb 32                      ; sprintf output buffer for one formatted value
Message rb 50*60                ; message-box text; also stages the kernel arguments
; Import table.  Four DLLs are bound: kernel32 and user32 through the
; stock API include files, the CUDA driver (NVCUDA.DLL) through
; api_cuda.inc, and the C runtime (MSVCRT.DLL), from which only sprintf
; is imported.
section '.idata' import data readable writeable
library kernel32,'KERNEL32.DLL',\
user32,'USER32.DLL',\
msvcrt,'MSVCRT.DLL',\
cuda,'NVCUDA.DLL'
include 'api\kernel32.inc'
include 'api\user32.inc'
include 'api_cuda.inc'
import msvcrt,\
sprintf,'sprintf'