flat assembler
Message board for the users of flat assembler.

Index > Windows > Nvidia Cuda examples.

Author
Thread Post new topic Reply to topic
Roman



Joined: 21 Apr 2012
Posts: 1769
Roman 07 Aug 2024, 12:25
I found this.
https://www.masm32.com/board/index.php?topic=7819.0

But I could not find the include file 'cuda.inc' that it uses.
Post 07 Aug 2024, 12:25
View user's profile Send private message Reply with quote
Roman



Joined: 21 Apr 2012
Posts: 1769
Roman 07 Aug 2024, 12:36
Post 07 Aug 2024, 12:36
View user's profile Send private message Reply with quote
Roman



Joined: 21 Apr 2012
Posts: 1769
Roman 07 Aug 2024, 12:46
Code:
; file 'api_cuda.inc'
; Import list for the library declared under the name "cuda" in the
; program's `library` statement.  Each pair is <label used with invoke>,
; <exported symbol name>.  Entries are listed in the order the demo
; program calls them.
  import cuda,\
         cuInit,'cuInit',\
         cuDeviceGet,'cuDeviceGet',\
         cuCtxCreate,'cuCtxCreate',\
         cuMemAlloc,'cuMemAlloc',\
         cuModuleLoadData,'cuModuleLoadData',\
         cuModuleGetFunction,'cuModuleGetFunction',\
         cuMemcpyHtoD,'cuMemcpyHtoD',\
         cuParamSetSize,'cuParamSetSize',\
         cuParamSetv,'cuParamSetv',\
         cuFuncSetBlockShape,'cuFuncSetBlockShape',\
         cuLaunchGrid,'cuLaunchGrid',\
         cuCtxSynchronize,'cuCtxSynchronize',\
         cuMemcpyDtoH,'cuMemcpyDtoH',\
         cuMemFree,'cuMemFree',\
         cuCtxDestroy,'cuCtxDestroy'
    

Code:
; file 'cuda.inc'
; Flag constants for the `flags` argument of cuCtxCreate.
; NOTE(review): names and values are meant to mirror the CUDA driver API
; context-flag enum -- confirm against the cuda.h of the targeted toolkit.
CU_CTX_SCHED_SPIN = 01h         ; combined with CU_CTX_MAP_HOST by the demo

CU_CTX_SCHED_MASK = 07h         ; not referenced by the demo program
CU_CTX_MAP_HOST   = 08h

    
Post 07 Aug 2024, 12:46
View user's profile Send private message Reply with quote
Roman



Joined: 21 Apr 2012
Posts: 1769
Roman 07 Aug 2024, 14:46
Code:
format PE64 GUI 5.0
entry start

include 'win64a.inc'
include 'cuda.inc'

section '.text' code readable executable

  ; Program entry point.
  ;
  ; Runs the PTX kernel `log_2` on 256 floats (1.0 .. 256.0) through the CUDA
  ; driver API, computes the same logarithms on the CPU with x87 `fyl2x`, and
  ; shows the first 20 rows of both results side by side in a message box.
  ;
  ; Conventions used throughout:
  ;   * Before every driver call rbx is loaded with the address of that call's
  ;     20-byte name string; on a nonzero return code control jumps to `Error`,
  ;     which prints the name from rbx together with the code from eax.
  ;   * rbx and rdi are relied upon to survive the sprintf calls in the
  ;     display loop (both are callee-saved in the Win64 ABI).
  start:
          push  rbp                                ; presumably realigns rsp to 16 bytes for the calls below;
                                                   ; never popped because the process leaves via ExitProcess

     ; init
           lea  rbx,[_cuInit]
        invoke  cuInit,0
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuDeviceGet]
        invoke  cuDeviceGet,CudaDevice,0   ; get the first device
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuCtxCreate]
        invoke  cuCtxCreate,CudaContext,CU_CTX_SCHED_SPIN+CU_CTX_MAP_HOST,[CudaDevice]      ; this context associates the device with this cpu thread
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuMemAlloc]
        invoke  cuMemAlloc,CudaNumberArray,256*4   ; device buffer for 256 floats
          test  eax,eax
           jnz  Error

    ; load ptx source
           lea  rbx,[_cuModuleLoadData]
        invoke  cuModuleLoadData,CudaModule,PTXSourceData
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuModuleGetFunction]
        invoke  cuModuleGetFunction,CudaFunction,[CudaModule],PTXFunction
          test  eax,eax
           jnz  Error

    ; fill in NumberArray and LogNumberArray with x87 calculations for comparison:
    ;   NumberArray[i] = i+1 ,  LogNumberArray[i] = log2(i+1)   for i = 0..255
          fld1                                     ; st1 = 1.0, the constant step
          fld1                                     ; st0 = x, starts at 1.0
           mov  ecx,256*4
           lea  rax,[NumberArray+rcx]              ; rax/rbx point one past the end of each array,
           lea  rbx,[LogNumberArray+rcx]           ; rcx runs from -1024 up to 0
           neg  rcx
   @@:     fst  dword[rax+rcx]                     ; store x
          fld1
           fld  st1
         fyl2x                                     ; st0 = 1.0 * log2(x)
          fstp  dword[rbx+rcx]                     ; store log2(x), x is back in st0
          fadd  st0,st1                            ; x = x + 1.0
           add  rcx,4
           jnz  @b
        fcompp                                     ; pop both remaining x87 values

    ; call the function
           lea  rbx,[_cuMemcpyHtoD]
        invoke  cuMemcpyHtoD,[CudaNumberArray],NumberArray,256*4
          test  eax,eax
           jnz  Error


           lea  rbx,[_cuParamSetSize]
        invoke  cuParamSetSize,[CudaFunction],12    ; 8 byte first argument + 4 byte second
          test  eax,eax
           jnz  Error

           mov  rax,[CudaNumberArray]              ; fill in the arguments, using Message as scratch space:
           mov  qword[Message+0],rax               ;   +0  device pointer (_inout)
           mov  dword[Message+8],256               ;   +8  element count  (_length)
           lea  rbx,[_cuParamSetv]
        invoke  cuParamSetv,[CudaFunction],0,Message,12        ; and pass in    (cuParamSeti doesn't always work so well)
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuFuncSetBlockShape]
        invoke  cuFuncSetBlockShape,[CudaFunction],256,1,1     ; 256 x 1 x 1 threads per block
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuLaunchGrid]
        invoke  cuLaunchGrid,[CudaFunction],1,1                ; 1 x 1 grid of blocks
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuCtxSynchronize]
        invoke  cuCtxSynchronize
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuMemcpyDtoH]
        invoke  cuMemcpyDtoH,CudaLogNumberArray,[CudaNumberArray],256*4   ; kernel worked in place; fetch its results
          test  eax,eax
           jnz  Error

   ; clean up
           lea  rbx,[_cuMemFree]
        invoke  cuMemFree,[CudaNumberArray]
          test  eax,eax
           jnz  Error

           lea  rbx,[_cuCtxDestroy]
        invoke  cuCtxDestroy,[CudaContext]
          test  eax,eax
           jnz  Error

   ; build the report text in Message: one 57-byte header line, then 20 rows
   ; of three 16-character columns followed by a line feed (49 bytes per row)
           lea  rdi,[Message]
           cld
           mov  rax,'32bit fl'
         stosq
           mov  rax,'oats:  x'
         stosq
           mov  rax,'   lg2.a'
         stosq
           mov  rax,'pprox.f3'
         stosq
           mov  rax,'2(x)    '
         stosq
           mov  rax,'fyl2x(x,'
         stosq
           mov  rax,'1.0)    '
         stosq
           mov  al,10
         stosb
           xor  ebx,ebx                            ; rbx = row index
   ; Each float is widened to a double for the %f conversion.  The scratch
   ; qword lives in Temp rather than below rsp: Win64 has no red zone, so
   ; memory under the stack pointer may be overwritten at any time.  rax is
   ; loaded before sprintf overwrites Temp with its output.
     @@:   fld  dword[NumberArray+4*rbx]           ; column 1: x
          fstp  qword[Temp]
           mov  rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi],xmm0
           fld  dword[CudaLogNumberArray+4*rbx]    ; column 2: result copied back from the device
          fstp  qword[Temp]
           mov  rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi+16],xmm0
           fld  dword[LogNumberArray+4*rbx]        ; column 3: x87 fyl2x result
          fstp  qword[Temp]
           mov  rax,qword[Temp]
        invoke  sprintf,Temp,MessageFormat,rax
        movdqu  xmm0,xword[Temp]
        movdqu  [rdi+32],xmm0
           mov  byte[rdi+48],10                    ; end of row
           add  rdi,49
           add  rbx,1
           cmp  rbx,20                             ; only the first 20 of the 256 values are shown
            jb  @b
           mov  byte[rdi],0                        ; terminate the report string
        invoke  MessageBox,NULL,Message,NULL,MB_OK

        invoke  ExitProcess,0

; Error: common failure exit for the driver-API calls.
;   in:  eax = nonzero result code returned by the failing call
;        rbx = address of the 20-byte, space-padded name of that call
; Looks up the 42-byte text entry that belongs to the code (errNoMatch when
; the code falls in none of the known ranges), shows "<name>\n<text>" in a
; message box and terminates the process.  Does not return.
Error:
   ; codes 0..8 -> err000 table
           cmp  eax,8
            ja  .range100
          imul  eax,eax,42                         ; 42 = size of one table entry
           lea  rax,[err000+rax]
           jmp  .Print

   ; codes 100..101 -> err100 table
  .range100:
           cmp  eax,100
            jb  .range200
           cmp  eax,101
            ja  .range200
           sub  eax,100
          imul  eax,eax,42
           lea  rax,[err100+rax]
           jmp  .Print

   ; codes 200..216 -> err200 table
  .range200:
           cmp  eax,200
            jb  .range300
           cmp  eax,216
            ja  .range300
           sub  eax,200
          imul  eax,eax,42
           lea  rax,[err200+rax]
           jmp  .Print

   ; codes 300..304 -> err300 table
  .range300:
           cmp  eax,300
            jb  .code400
           cmp  eax,304
            ja  .code400
           sub  eax,300
          imul  eax,eax,42
           lea  rax,[err300+rax]
           jmp  .Print

   ; single-entry groups: the entry offset is always zero
  .code400:
           cmp  eax,400
           jne  .code500
           lea  rax,[err400]
           jmp  .Print
  .code500:
           cmp  eax,500
           jne  .code600
           lea  rax,[err500]
           jmp  .Print
  .code600:
           cmp  eax,600
           jne  .range700
           lea  rax,[err600]
           jmp  .Print

   ; codes 700..709 -> err700 table
  .range700:
           cmp  eax,700
            jb  .code999
           cmp  eax,709
            ja  .code999
           sub  eax,700
          imul  eax,eax,42
           lea  rax,[err700+rax]
           jmp  .Print

  .code999:
           cmp  eax,999
           jne  .unknown
           lea  rax,[err999]
           jmp  .Print

  .unknown:
           lea  rax,[errNoMatch]

   ; Message layout:  [0..18] call name, [19] line feed,
   ;                  [20..61] error text, [62] terminating zero
  .Print:
           mov  rcx,[rbx]                          ; 20 bytes of call name ...
           mov  qword[Message],rcx
           mov  rcx,[rbx+8]
           mov  qword[Message+8],rcx
           mov  ecx,[rbx+16]
           mov  dword[Message+16],ecx
           mov  byte[Message+19],10                ; ... whose last byte becomes the line break

           mov  rcx,[rax]                          ; 44 bytes are copied, 42 of them are kept
           mov  qword[Message+20],rcx
           mov  rcx,[rax+8]
           mov  qword[Message+28],rcx
           mov  rcx,[rax+16]
           mov  qword[Message+36],rcx
           mov  rcx,[rax+24]
           mov  qword[Message+44],rcx
           mov  rcx,[rax+32]
           mov  qword[Message+52],rcx
           mov  ecx,[rax+40]
           mov  dword[Message+60],ecx
           mov  byte[Message+62],0                 ; cut the string right after the 42-byte text

        invoke  MessageBox,NULL,Message,NULL,MB_OK
        invoke  ExitProcess,0

; Read-only data: the kernel name and the PTX source text handed to
; cuModuleGetFunction and cuModuleLoadData respectively.
section '.data' data readable

  ; zero-terminated name of the kernel entry to look up in the loaded module
  PTXFunction: db 'log_2',0

; Kernel pseudo-code -- log_2(*inout, length):
;   xind = %ctaid.x * %ntid.x + %tid.x
;   if xind < length:  inout[xind] = log2(inout[xind])     (4-byte floats, in place)
;   exit

  ; PTX source as one zero-terminated string.
  ; NOTE: no line-break bytes are emitted between the db pieces below; the
  ; pieces are separated only by their own leading spaces.
  PTXSourceData:
db '     .version 1.4'
db '     .target sm_13'
db '        .entry log_2 ('
db '                .param .u64 _inout,'
db '                .param .s32 _length)'
db '        {'
db '        .reg .u16 w1,w2;'                 ; word-sized registers
db '        .reg .u32 e1,xind;'               ; dwords
db '        .reg .u64 r1;'                    ; qwords
db '        .reg .f32 f1;'                    ; floats
db '        .reg .pred p1;'                   ; conditions
db '        mov.u16         w1,%ctaid.x;'
db '        mov.u16         w2,%ntid.x;'
db '        cvt.u32.u16     e1,%tid.x;'
db '        mad.wide.u16    xind,w1,w2,e1;'   ; xind = w1*w2 + e1
db '        ld.param.s32    e1,[_length];'
db '        setp.le.s32     p1,e1,xind;'      ; p1 = (length <= xind)
db '   @p1  bra             $Lt_Exit;'        ; if p1, branch (index out of range: nothing to do)
db '        ld.param.u64    r1,[_inout];'
db '        mad.wide.u32    r1,xind,4,r1;'    ; r1 = address of element xind
db '        ld.global.f32   f1,[r1+0];'
db '        lg2.approx.f32  f1,f1;'
db '        st.global.f32   [r1+0],f1;'       ; result overwrites the input element
db '$Lt_Exit:'
db '        exit;'
db '        }'
db 0                                          ; string terminator

; Writable data: format strings, message tables and all program variables.
section '.data' data readable writeable

align 16
  ; not referenced by the code visible in this listing
  ErrorMessageFormat db 'error code:',10,'hex: 0x%.8x',10,'dec: %u',0
  ; sprintf format for one table cell: field width 16, 7 decimals.  The
  ; display loop copies exactly 16 bytes of each formatted value.
  MessageFormat db '%16.7f',0
; Result-code name tables used by the Error routine.
; Error locates an entry as <group label> + (code - first code of group) * 42,
; so every string below must stay exactly 42 characters long (space padded)
; and the entries of a group must stay in consecutive code order.  Codes
; without a listed name inside a group are kept as '?' placeholders so that
; the indexing stays correct.
err000:
db    'CUDA_SUCCESS                              ';= 0
db    'CUDA_ERROR_INVALID_VALUE                  ';= 1
db    'CUDA_ERROR_OUT_OF_MEMORY                  ';= 2
db    'CUDA_ERROR_NOT_INITIALIZED                ';= 3
db    'CUDA_ERROR_DEINITIALIZED                  ';= 4
db    'CUDA_ERROR_PROFILER_DISABLED              ';= 5
db    'CUDA_ERROR_PROFILER_NOT_INITIALIZED       ';= 6
db    'CUDA_ERROR_PROFILER_ALREADY_STARTED       ';= 7
db    'CUDA_ERROR_PROFILER_ALREADY_STOPPED       ';= 8
err100:
db    'CUDA_ERROR_NO_DEVICE                      ';= 100
db    'CUDA_ERROR_INVALID_DEVICE                 ';= 101
err200:
db    'CUDA_ERROR_INVALID_IMAGE                  ';= 200
db    'CUDA_ERROR_INVALID_CONTEXT                ';= 201
db    'CUDA_ERROR_CONTEXT_ALREADY_CURRENT        ';= 202
db    '?                                         ';= 203 (placeholder)
db    '?                                         ';= 204 (placeholder)
db    'CUDA_ERROR_MAP_FAILED                     ';= 205
db    'CUDA_ERROR_UNMAP_FAILED                   ';= 206
db    'CUDA_ERROR_ARRAY_IS_MAPPED                ';= 207
db    'CUDA_ERROR_ALREADY_MAPPED                 ';= 208
db    'CUDA_ERROR_NO_BINARY_FOR_GPU              ';= 209
db    'CUDA_ERROR_ALREADY_ACQUIRED               ';= 210
db    'CUDA_ERROR_NOT_MAPPED                     ';= 211
db    'CUDA_ERROR_NOT_MAPPED_AS_ARRAY            ';= 212
db    'CUDA_ERROR_NOT_MAPPED_AS_POINTER          ';= 213
db    'CUDA_ERROR_ECC_UNCORRECTABLE              ';= 214
db    'CUDA_ERROR_UNSUPPORTED_LIMIT              ';= 215
db    'CUDA_ERROR_CONTEXT_ALREADY_IN_USE         ';= 216
err300:
db    'CUDA_ERROR_INVALID_SOURCE                 ';= 300
db    'CUDA_ERROR_FILE_NOT_FOUND                 ';= 301
db    'CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ';= 302
db    'CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      ';= 303
db    'CUDA_ERROR_OPERATING_SYSTEM               ';= 304
err400:
db    'CUDA_ERROR_INVALID_HANDLE                 ';= 400
err500:
db    'CUDA_ERROR_NOT_FOUND                      ';= 500
err600:
db    'CUDA_ERROR_NOT_READY                      ';= 600
err700:
db    'CUDA_ERROR_LAUNCH_FAILED                  ';= 700
db    'CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        ';= 701
db    'CUDA_ERROR_LAUNCH_TIMEOUT                 ';= 702
db    'CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  ';= 703
db    'CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    ';= 704
db    'CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        ';= 705
db    '?                                         ';= 706 (placeholder)
db    '?                                         ';= 707 (placeholder)
db    'CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         ';= 708
db    'CUDA_ERROR_CONTEXT_IS_DESTROYED           ';= 709
err999:
db    'CUDA_ERROR_UNKNOWN                        ';= 999
; shown for any code that falls outside all of the groups above
errNoMatch:
db    '?                                         '


; Display names of the driver-API calls.  The main routine loads rbx with
; one of these addresses before each call; the Error routine copies 20 bytes
; from rbx into the message, so every string must stay exactly 20 characters
; long (space padded, no terminator).
_cuInit:                db 'cuInit              '
_cuDeviceGet:           db 'cuDeviceGet         '
_cuCtxCreate:           db 'cuCtxCreate         '
_cuMemAlloc:            db 'cuMemAlloc          '
_cuModuleLoadData:      db 'cuModuleLoadData    '
_cuModuleGetFunction:   db 'cuModuleGetFunction '
_cuMemcpyHtoD:          db 'cuMemcpyHtoD        '
_cuParamSeti:           db 'cuParamSeti         ' ; not referenced by the code visible in this listing
_cuParamSetv:           db 'cuParamSetv         '
_cuFuncSetBlockShape:   db 'cuFuncSetBlockShape '
_cuLaunchGrid:          db 'cuLaunchGrid        '
_cuParamSetSize:        db 'cuParamSetSize      '
_cuCtxSynchronize:      db 'cuCtxSynchronize    '
_cuMemcpyDtoH:          db 'cuMemcpyDtoH        '
_cuMemFree:             db 'cuMemFree           '
_cuCtxDestroy:          db 'cuCtxDestroy        '



; Uninitialized program variables (reserved space only, no initial values).
align 16
  ; values filled in by the driver API (each slot is one qword)
  CudaDevice          rq 1      ; written by cuDeviceGet
  CudaContext         rq 1      ; written by cuCtxCreate
  CudaModule          rq 1      ; written by cuModuleLoadData
  CudaFunction        rq 1      ; written by cuModuleGetFunction
  CudaNumberArray     rq 1      ; device address written by cuMemAlloc
  ; host-side arrays of 256 single-precision floats
  NumberArray         rd 256    ; kernel input: 1.0 .. 256.0
  LogNumberArray      rd 256    ; reference results computed with x87 fyl2x
  CudaLogNumberArray  rd 256    ; results copied back from the device
  ; text buffers
  Temp                rb 32     ; sprintf output for one table cell
  Message             rb 50*60  ; message-box text; also scratch for the kernel arguments

; Import directory: the four DLLs the program links against, followed by
; the per-library import lists.
section '.idata' import data readable writeable

  library kernel32, 'KERNEL32.DLL',\
          user32,   'USER32.DLL',\
          msvcrt,   'MSVCRT.DLL',\
          cuda,     'NVCUDA.DLL'

  ; stock import lists shipped with flat assembler
  include 'api\kernel32.inc'
  include 'api\user32.inc'
  ; driver-API import list ('api_cuda.inc' from this thread)
  include 'api_cuda.inc'

  ; the C runtime is used only for number formatting
  import msvcrt, sprintf,'sprintf'
    
Post 07 Aug 2024, 14:46
View user's profile Send private message Reply with quote
Display posts from previous:
Post new topic Reply to topic

Jump to:  


< Last Thread | Next Thread >
Forum Rules:
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum
You cannot attach files in this forum
You can download files in this forum


Copyright © 1999-2024, Tomasz Grysztar. Also on GitHub, YouTube.

Website powered by rwasa.