flat assembler
Message board for the users of flat assembler.
![]() |
Author |
|
Roman 05 Feb 2025, 16:48
|
|||
![]() |
|
Roman 05 Feb 2025, 18:15
|
|||
![]() |
|
Roman 07 Feb 2025, 14:45
Has anyone tried to install cuDNN 8.8?
I tried to download it, but I get "access denied" on the official Nvidia site. |
|||
![]() |
|
bitRAKE 07 Feb 2025, 15:06
If this URL is blocked, then perhaps bounce through a redirector?
https://developer.download.nvidia.com/compute/cudnn/redist/ |
|||
![]() |
|
Roman 07 Feb 2025, 17:57
cuDNN 9.7.1
8 DLLs, 1 gigabyte! 64-bit only — I did not find a 32-bit version. I found these cuDNN DLLs, but they are not from the official site. https://huggingface.co/MonsterMMORPG/94_CUDA_Fix/tree/main https://www.opendll.com/index.php?file-download=cudnn64_8.dll&arch=64bit&version=6.14.11.6050 |
|||
![]() |
|
Roman 08 Feb 2025, 04:47
Code: format PE64 GUI 5.0 entry start ;https://github.com/ggerganov/whisper.cpp/blob/master/src/whisper-mel-cuda.cu include 'win64a.inc' CUDNN_STATUS_SUCCESS = 0 CUDNN_POOLING_MAX = 0 CUDNN_NOT_PROPAGATE_NAN = 0 CUDNN_TENSOR_NCHW = 0 CUDNN_DTYPE = 0 CUDNN_DATA_FLOAT = 0 CUDNN_DATA_DOUBLE = 1 CUDNN_DATA_HALF = 2 CUDNN_DATA_INT8 = 3 CUDNN_DATA_INT32 = 4 ;CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5 CUDNN_DATA_UINT8 = 6 ;CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7 ;CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8 CUDNN_DATA_BFLOAT16 = 9 CUDNN_DATA_INT64 = 10 CUDNN_DATA_BOOLEAN = 11 CUDNN_DATA_FP8_E4M3 = 12 CUDNN_DATA_FP8_E5M2 = 13 CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14 CUDNN_DATA_FP8_E8M0 = 15 CUDNN_DATA_FP4_E2M1 = 16 cudaMemcpyHostToHost = 0 cudaMemcpyHostToDevice = 1 cudaMemcpyDeviceToHost = 2 cudaMemcpyDeviceToDevice = 3 cudaMemcpyDefault = 4 macro ifcuError { test eax,eax jz @f call GetcuError @@: } section '.text' code readable executable proc GetcuError invoke cudnnGetErrorString,rax invoke MessageBox,0,rax,'cuDNN error:',0 ret endp start: push rbp mov eax,enn invoke cudnnCreate,cuDNNHndl ifcuError invoke cudnnGetCudartVersion ;ifcuError invoke cudnnCreatePoolingDescriptor,pooling_desc ifcuError invoke cudnnSetPooling2dDescriptor,[pooling_desc],CUDNN_POOLING_MAX,\ CUDNN_NOT_PROPAGATE_NAN,3,3,0,0,1,1 ifcuError invoke cudnnCreateTensorDescriptor,in_desc ifcuError invoke cudnnSetTensor4dDescriptor,[in_desc],CUDNN_TENSOR_NCHW,\ CUDNN_DTYPE,2,2,10,10 ifcuError invoke cudnnCreateTensorDescriptor,out_desc ifcuError invoke cudnnSetTensor4dDescriptor,[out_desc],CUDNN_TENSOR_NCHW,\ CUDNN_DTYPE,2,2,8,8 ifcuError ;gpu mem alloc IN_DATA_BYTES = 2*2*10*10*4 OUT_DATA_BYTES = 2*2*8*8*4 invoke cudaMalloc,in_data,IN_DATA_BYTES invoke cudaMalloc,out_data,OUT_DATA_BYTES invoke cudaMemcpy,[in_data],input,IN_DATA_BYTES,cudaMemcpyHostToDevice invoke cudaMemset,[out_data],0,OUT_DATA_BYTES invoke cudnnPoolingForward,[cuDNNHndl],[pooling_desc],alpha,[in_desc],[in_data],\ 
beta,[out_desc],[out_data] ifcuError ;get gpu data invoke cudaMemcpy,result,[out_data],OUT_DATA_BYTES,cudaMemcpyDeviceToHost mov ebx,32 call printFlts enn: invoke cudaMalloc,in_grad,IN_DATA_BYTES invoke cudaMemset,[in_grad],0,IN_DATA_BYTES invoke cudnnPoolingBackward,[cuDNNHndl],[pooling_desc],alpha,[out_desc],[out_data],\ [out_desc],[out_data],[in_desc],[in_data],beta,[in_desc],[in_grad] ifcuError invoke cudaMemcpy,result,[in_grad],IN_DATA_BYTES,cudaMemcpyDeviceToHost mov ebx,32 call printFlts ;end invoke cudaFree,[in_data] invoke cudaFree,[in_grad] invoke cudaFree,[out_data] invoke cudnnDestroyTensorDescriptor,[in_desc] invoke cudnnDestroyTensorDescriptor,[out_desc] invoke cudnnDestroyPoolingDescriptor,[pooling_desc] invoke cudnnDestroy,[cuDNNHndl] invoke MessageBox,0,'cuDNN init !','Exit',0 invoke ExitProcess,0 macro nxtLine reg { mov byte [reg],13 inc reg } proc printFlts mov edi,result mov esi,Temp ;mov ebx,24 .up: cvtss2sd xmm1,[edi] movq rax,xmm1 invoke sprintf,rsi,MessageFormat,rax add esi,eax nxtLine esi add edi,4 dec ebx jnz .up invoke MessageBox,0,Temp,'out:',0 ret endp section '.data' data readable writeable MessageFormat db '%1.6f;;',0 alpha dd 1.0 beta dd 0 in_data dq 0 out_data dq 0 cuDNNHndl dq 0 pooling_desc dq 0 in_desc dq 0 out_desc dq 0 in_grad dq 0 include 'input.txt' ;any 400 floats numbers. 
input db 400 dup(2.0) section '.bss' readable writeable Message rb 50*600 result rd 40000 Temp rb 6400 section '.idata' import data readable writeable library kernel32,'KERNEL32.DLL',\ user32,'USER32.DLL',\ cudart,'cudart64_101.dll',\ msvcrt,'MSVCRT.DLL',\ cuDNN1,'cudnn64_9.dll' include 'api\kernel32.inc' include 'api\user32.inc' import cudart,\ cudaMalloc,'cudaMalloc',\ cudaMemcpy,'cudaMemcpy',\ cudaMemset,'cudaMemset',\ cudaFree,'cudaFree',\ cudaStreamCreate,'cudaStreamCreate' import cuDNN1,\ cudnnTransformFilter,'cudnnTransformFilter',\ cudnnSoftmaxForward,'cudnnSoftmaxForward',\ cudnnGetRNNForwardTrainingAlgorithmMaxCount,'cudnnGetRNNForwardTrainingAlgorithmMaxCount',\ cudnnSetFilter4dDescriptor,'cudnnSetFilter4dDescriptor',\ cudnnGetFilter4dDescriptor,'cudnnGetFilter4dDescriptor',\ cudnnSetFilterNdDescriptor,'cudnnSetFilterNdDescriptor',\ cudnnGetFilterNdDescriptor,'cudnnGetFilterNdDescriptor',\ cudnnSetTensor4dDescriptorEx,'cudnnSetTensor4dDescriptorEx',\ cudnnSetTensorNdDescriptor,'cudnnSetTensorNdDescriptor',\ cudnnGetCudartVersion,'cudnnGetCudartVersion',\ cudnnDestroyPoolingDescriptor,'cudnnDestroyPoolingDescriptor',\ cudnnDestroyTensorDescriptor,'cudnnDestroyTensorDescriptor',\ cudnnPoolingBackward,'cudnnPoolingBackward',\ cudnnPoolingForward,'cudnnPoolingForward',\ cudnnSetTensor4dDescriptor,'cudnnSetTensor4dDescriptor',\ cudnnCreateTensorDescriptor,'cudnnCreateTensorDescriptor',\ cudnnSetPooling2dDescriptor,'cudnnSetPooling2dDescriptor',\ cudnnCreatePoolingDescriptor,'cudnnCreatePoolingDescriptor',\ cudnnGetErrorString,'cudnnGetErrorString',\ cudnnCreate,'cudnnCreate',\ cudnnDestroy,'cudnnDestroy' import msvcrt,\ sprintf,'sprintf'
Last edited by Roman on 18 Feb 2025, 13:03; edited 6 times in total |
|||||||||||
![]() |
|
Roman 12 Feb 2025, 08:29
The output image.bmp is saved in the dlls folder.
In the dlls folder
|
||||||||||
![]() |
|
Roman 19 Feb 2025, 17:06
nvrtc64_120_0.dll is 84 MB.
Example how compiled Cuda kernel shader to PTX text asm shader. Code: macro Msg a { invoke MessageBox,0,a,0,MB_OK } NVRTC_SUCCESS = 0 NVRTC_ERROR_OUT_OF_MEMORY = 1 NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2 NVRTC_ERROR_INVALID_INPUT = 3 NVRTC_ERROR_INVALID_PROGRAM = 4 NVRTC_ERROR_INVALID_OPTION = 5 NVRTC_ERROR_COMPILATION = 6 NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7 NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8 NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9 NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10 NVRTC_ERROR_INTERNAL_ERROR = 11 start: ;compiled cuda kernel shader invoke nvrtcCreateProgram,ShaderCUDA,saxpyHLSL,0,0,0,0 test eax,eax jz @f invoke nvrtcGetErrorString,rax Msg rax @@: invoke nvrtcCompileProgram,[ShaderCUDA],2,ops_txt test eax,eax jz @f invoke nvrtcGetErrorString,rax Msg rax @@: invoke nvrtcGetProgramLog,[ShaderCUDA],Temp cmp byte [Temp],0 jz @f Msg Temp @@: invoke nvrtcGetPTX,[ShaderCUDA],Temp Msg Temp ;print PTX text shader invoke nvrtcGetPTXSize,[ShaderCUDA],PTXSize ;end compiling ;data PTXSize dd 0,0 ops_txt dq ops1,ops2,0 ops1 db '--gpu-architecture=compute_80',0 ops2 db '--fmad=false',0 saxpyHLSL: db 'extern "C" __global__ ',13,10 db 'void saxpy(float a, float *x, float *y, float *out, size_t n) {',13,10 db 'size_t tid = blockIdx.x * blockDim.x + threadIdx.x;',13,10 db 'if (tid < n) { out[tid] = a * x[tid] + y[tid]; }',13,10 db '}' db 0 Temp rb 32000 ShaderCUDA dq 1 section '.idata' import data readable writeable library kernel32,'KERNEL32.DLL',\ user32,'USER32.DLL',\ msvcrt,'MSVCRT.DLL',\ cudart,'cudart64_101.dll',\ cudacompiler,'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvrtc64_120_0.dll' import cudacompiler,\ nvrtcGetPTX,'nvrtcGetPTX',\ nvrtcGetProgramLog,'nvrtcGetProgramLog',\ nvrtcGetPTXSize,'nvrtcGetPTXSize',\ nvrtcCompileProgram,'nvrtcCompileProgram',\ nvrtcCreateProgram,'nvrtcCreateProgram',\ nvrtcDestroyProgram,'nvrtcDestroyProgram',\ nvrtcGetErrorString,'nvrtcGetErrorString' Last edited by Roman on 20 Feb 
2025, 07:22; edited 7 times in total |
|||
![]() |
|
Roman 19 Feb 2025, 17:22
For nvrtcCompileProgram:
When I tried ops1 db '--gpu-architecture=compute_30',0 I got NVRTC_ERROR_INVALID_OPTION. Then I changed it to ops1 db '--gpu-architecture=compute_80',0 and got NVRTC_SUCCESS. If cuModuleLoadData returns the error "a PTX JIT compilation failed", then try --gpu-architecture=compute_60 or 50. InfoStruct rb 512 Get the compute version: invoke cudaGetDeviceProperties,InfoStruct,0 mov eax,[InfoStruct+360] ;this int number is your GPU's compute version! https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html Or use this: Code: ;get your GPU compute version invoke cuDeviceComputeCapability,major,minor,0 major dd 0 minor dd 0 |
|||
![]() |
|
Roman 28 Feb 2025, 04:49
A fully connected layer is the same as a dense layer.
The main idea is to train the neurons and find the right weights.
|
||||||||||
![]() |
|
< Last Thread | Next Thread > |
Forum Rules:
|
Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.
Website powered by rwasa.