flat assembler
Message board for the users of flat assembler.
![]() |
Author |
|
Roman 05 Feb 2025, 16:48
|
|||
![]() |
|
Roman 05 Feb 2025, 18:15
|
|||
![]() |
|
Roman 07 Feb 2025, 14:45
Has anyone tried to install cuDNN 8.8?
I tried to download it, but access is denied on the official Nvidia site. |
|||
![]() |
|
bitRAKE 07 Feb 2025, 15:06
If this URL is blocked, then perhaps bounce through a redirector?
https://developer.download.nvidia.com/compute/cudnn/redist/ _________________ ¯\(°_o)/¯ “languages are not safe - uses can be” Bjarne Stroustrup |
|||
![]() |
|
Roman 07 Feb 2025, 17:57
cuDNN 9.7.1
8 DLLs, 1 gigabyte! They are 64-bit; I did not find a 32-bit version. I found these cuDNN DLLs, but they are not from the official site. https://huggingface.co/MonsterMMORPG/94_CUDA_Fix/tree/main https://www.opendll.com/index.php?file-download=cudnn64_8.dll&arch=64bit&version=6.14.11.6050 |
|||
![]() |
|
Roman 08 Feb 2025, 04:47
Code:
; cuDNN max-pooling demo for flat assembler (Win64 PE).
;
; Flow: create a cuDNN handle, describe a 3x3 max-pooling operation and a
; 2x2x10x10 input / 2x2x8x8 output tensor pair, upload the input, run
; cudnnPoolingForward, show the first 32 output floats, then run
; cudnnPoolingBackward and show the first 32 gradient floats.
;
; NOTE(review): only cuDNN status codes are checked (via ifcuError); the
; return values of the cudart calls (cudaMalloc/cudaMemcpy/cudaMemset) are
; not checked.

format PE64 GUI 5.0
entry start

;https://github.com/ggerganov/whisper.cpp/blob/master/src/whisper-mel-cuda.cu

include 'win64a.inc'

; ---- cuDNN enum values used below -------------------------------------
CUDNN_STATUS_SUCCESS    = 0
CUDNN_POOLING_MAX       = 0
CUDNN_NOT_PROPAGATE_NAN = 0
CUDNN_TENSOR_NCHW       = 0
CUDNN_DTYPE             = 0     ; data type used for both tensors (same value as CUDNN_DATA_FLOAT)

; ---- cudnnDataType_t ---------------------------------------------------
CUDNN_DATA_FLOAT              = 0
CUDNN_DATA_DOUBLE             = 1
CUDNN_DATA_HALF               = 2
CUDNN_DATA_INT8               = 3
CUDNN_DATA_INT32              = 4
;CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5
CUDNN_DATA_UINT8              = 6
;CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7
;CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8
CUDNN_DATA_BFLOAT16           = 9
CUDNN_DATA_INT64              = 10
CUDNN_DATA_BOOLEAN            = 11
CUDNN_DATA_FP8_E4M3           = 12
CUDNN_DATA_FP8_E5M2           = 13
CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14
CUDNN_DATA_FP8_E8M0           = 15
CUDNN_DATA_FP4_E2M1           = 16

; ---- cudaMemcpyKind ----------------------------------------------------
cudaMemcpyHostToHost     = 0
cudaMemcpyHostToDevice   = 1
cudaMemcpyDeviceToHost   = 2
cudaMemcpyDeviceToDevice = 3
cudaMemcpyDefault        = 4

; ---- buffer sizes in bytes: N*C*H*W elements of 4 bytes each ------------
IN_DATA_BYTES  = 2*2*10*10*4    ; 400 floats
OUT_DATA_BYTES = 2*2*8*8*4      ; 256 floats

; Report a cuDNN failure: if eax (status returned by the preceding call)
; is non-zero, call GetcuError with that status still in eax.
macro ifcuError
{
        test    eax,eax
        jz      @f
        call    GetcuError
    @@:
}

; Store a carriage return (13) at [reg] and advance reg by one byte.
macro nxtLine reg
{
        mov     byte [reg],13
        inc     reg
}

section '.text' code readable executable

; GetcuError - show the text for a cuDNN status code in a message box.
; In:  eax = cudnnStatus_t value returned by the failing call.
; Fix: the original passed the address of the Message buffer as the status
;      argument; cudnnGetErrorString takes the status code and returns a
;      pointer to the message text, which is what must be displayed.
proc GetcuError
        invoke  cudnnGetErrorString,eax
        invoke  MessageBox,0,rax,'cu error:',0
        ret
endp

start:
        push    rbp                     ; NOTE(review): presumably here to keep rsp 16-byte aligned for the calls below
        mov     eax,enn                 ; NOTE(review): value is overwritten by the next call; looks like a debugging leftover

        invoke  cudnnCreate,cuDNNHndl
        ifcuError
        invoke  cudnnGetCudartVersion   ; returns a version number, not a status, so no ifcuError here
        ;ifcuError

        ; pooling descriptor: max pooling, arguments 3,3,0,0,1,1
        ; (window h/w, padding v/h, stride v/h per the cuDNN API signature)
        invoke  cudnnCreatePoolingDescriptor,pooling_desc
        ifcuError
        invoke  cudnnSetPooling2dDescriptor,[pooling_desc],CUDNN_POOLING_MAX,\
                CUDNN_NOT_PROPAGATE_NAN,3,3,0,0,1,1
        ifcuError

        ; input tensor descriptor: 2 x 2 x 10 x 10
        invoke  cudnnCreateTensorDescriptor,in_desc
        ifcuError
        invoke  cudnnSetTensor4dDescriptor,[in_desc],CUDNN_TENSOR_NCHW,\
                CUDNN_DTYPE,2,2,10,10
        ifcuError

        ; output tensor descriptor: 2 x 2 x 8 x 8
        invoke  cudnnCreateTensorDescriptor,out_desc
        ifcuError
        invoke  cudnnSetTensor4dDescriptor,[out_desc],CUDNN_TENSOR_NCHW,\
                CUDNN_DTYPE,2,2,8,8
        ifcuError

        ;gpu mem alloc
        invoke  cudaMalloc,in_data,IN_DATA_BYTES
        invoke  cudaMalloc,out_data,OUT_DATA_BYTES
        invoke  cudaMemcpy,[in_data],input,IN_DATA_BYTES,cudaMemcpyHostToDevice
        invoke  cudaMemset,[out_data],0,OUT_DATA_BYTES

        invoke  cudnnPoolingForward,[cuDNNHndl],[pooling_desc],alpha,[in_desc],[in_data],\
                beta,[out_desc],[out_data]
        ifcuError

        ;get gpu data
        invoke  cudaMemcpy,result,[out_data],OUT_DATA_BYTES,cudaMemcpyDeviceToHost
        mov     ebx,32                  ; show the first 32 floats of result
        call    printFlts

enn:
        ; backward pass: the forward output buffer is passed as both y and dy
        invoke  cudaMalloc,in_grad,IN_DATA_BYTES
        invoke  cudaMemset,[in_grad],0,IN_DATA_BYTES
        invoke  cudnnPoolingBackward,[cuDNNHndl],[pooling_desc],alpha,[out_desc],[out_data],\
                [out_desc],[out_data],[in_desc],[in_data],beta,[in_desc],[in_grad]
        ifcuError
        invoke  cudaMemcpy,result,[in_grad],IN_DATA_BYTES,cudaMemcpyDeviceToHost
        mov     ebx,32                  ; show the first 32 floats of result
        call    printFlts

        ;end
        invoke  cudaFree,[in_data]
        invoke  cudaFree,[in_grad]
        invoke  cudaFree,[out_data]
        invoke  cudnnDestroyTensorDescriptor,[in_desc]
        invoke  cudnnDestroyTensorDescriptor,[out_desc]
        invoke  cudnnDestroyPoolingDescriptor,[pooling_desc]
        invoke  cudnnDestroy,[cuDNNHndl]
        invoke  MessageBox,0,'cuDNN init !','Exit',0
        invoke  ExitProcess,0

; printFlts - format floats from `result` into `Temp` and show them.
; In:  ebx = number of single-precision floats to print (must be > 0).
; Uses edi (read cursor), esi (write cursor), ebx (counter), xmm1, rax.
; Each value is written with MessageFormat and followed by a CR (13).
; Fix: the CR overwrites sprintf's terminating zero, so the text is now
;      explicitly zero-terminated after the loop; without this a second
;      call could display leftover characters from a longer first call.
proc printFlts
        mov     edi,result
        mov     esi,Temp
        ;mov ebx,24
  .up:
        cvtss2sd xmm1,[edi]             ; widen to double for the %f conversion
        movq    rax,xmm1                ; pass the double's bits as the third sprintf argument
        invoke  sprintf,rsi,MessageFormat,rax
        add     esi,eax                 ; eax = number of characters written
        nxtLine esi
        add     edi,4
        dec     ebx
        jnz     .up
        mov     byte [esi],0            ; terminate the string for MessageBox
        invoke  MessageBox,0,Temp,'out:',0
        ret
endp

section '.data' data readable writeable

MessageFormat db '%1.6f;;',0
alpha         dd 1.0                    ; passed by address as the alpha argument of the pooling calls
beta          dd 0                      ; passed by address as the beta argument of the pooling calls
in_data       dq 0                      ; device pointer: input tensor
out_data      dq 0                      ; device pointer: pooling output
cuDNNHndl     dq 0
pooling_desc  dq 0
in_desc       dq 0
out_desc      dq 0
in_grad       dq 0                      ; device pointer: input gradient

; Host input: IN_DATA_BYTES (1600) bytes are copied from here, i.e. 400
; single-precision floats. Fix: the original declared this with `db`,
; which does not reserve 4 bytes per value.
; Alternative source of the data (disabled so that the listing is
; self-contained and `input` labels exactly the data that is uploaded):
;include 'input.txt' ;any 400 floats numbers.
input dd 400 dup (2.0)

section '.bss' readable writeable

Message rb 50*600                       ; no longer referenced by GetcuError; kept for layout compatibility
result  rd 40000                        ; host copy of device results
Temp    rb 6400                         ; text buffer filled by printFlts

section '.idata' import data readable writeable

; NOTE(review): cudart64_101.dll and cudnn64_9.dll come from different
; CUDA generations - confirm that this pairing is supported on the target.
library kernel32,'KERNEL32.DLL',\
        user32,'USER32.DLL',\
        cudart,'cudart64_101.dll',\
        msvcrt,'MSVCRT.DLL',\
        cuDNN1,'cudnn64_9.dll'

include 'api\kernel32.inc'
include 'api\user32.inc'

import cudart,\
       cudaMalloc,'cudaMalloc',\
       cudaMemcpy,'cudaMemcpy',\
       cudaMemset,'cudaMemset',\
       cudaFree,'cudaFree',\
       cudaStreamCreate,'cudaStreamCreate'

import cuDNN1,\
       cudnnTransformFilter,'cudnnTransformFilter',\
       cudnnSoftmaxForward,'cudnnSoftmaxForward',\
       cudnnGetRNNForwardTrainingAlgorithmMaxCount,'cudnnGetRNNForwardTrainingAlgorithmMaxCount',\
       cudnnSetFilter4dDescriptor,'cudnnSetFilter4dDescriptor',\
       cudnnGetFilter4dDescriptor,'cudnnGetFilter4dDescriptor',\
       cudnnSetFilterNdDescriptor,'cudnnSetFilterNdDescriptor',\
       cudnnGetFilterNdDescriptor,'cudnnGetFilterNdDescriptor',\
       cudnnSetTensor4dDescriptorEx,'cudnnSetTensor4dDescriptorEx',\
       cudnnSetTensorNdDescriptor,'cudnnSetTensorNdDescriptor',\
       cudnnGetCudartVersion,'cudnnGetCudartVersion',\
       cudnnDestroyPoolingDescriptor,'cudnnDestroyPoolingDescriptor',\
       cudnnDestroyTensorDescriptor,'cudnnDestroyTensorDescriptor',\
       cudnnPoolingBackward,'cudnnPoolingBackward',\
       cudnnPoolingForward,'cudnnPoolingForward',\
       cudnnSetTensor4dDescriptor,'cudnnSetTensor4dDescriptor',\
       cudnnCreateTensorDescriptor,'cudnnCreateTensorDescriptor',\
       cudnnSetPooling2dDescriptor,'cudnnSetPooling2dDescriptor',\
       cudnnCreatePoolingDescriptor,'cudnnCreatePoolingDescriptor',\
       cudnnGetErrorString,'cudnnGetErrorString',\
       cudnnCreate,'cudnnCreate',\
       cudnnDestroy,'cudnnDestroy'

import msvcrt,\
       sprintf,'sprintf'
|
|||
![]() |
|
< Last Thread | Next Thread > |
Forum Rules:
|
Copyright © 1999-2025, Tomasz Grysztar. Also on GitHub, YouTube.
Website powered by rwasa.