        ; Set up the cuDNN descriptors for one 2-D convolution followed by an activation.
        ; NOTE(review): @cudnn is a project macro not visible here; presumably it creates
        ; one descriptor of the named kind for every variable listed - confirm.
        @cudnn Tensor,input_descriptor,output_descriptor
        @cudnn Filter,kernel_descriptor
        @cudnn Convolut,convolution_descriptor
        @cudnn Activat,activation_descriptor

        ; Input tensor: NHWC format, float data, numImgs images, 3 channels,
        ; height/width read from image.rows / image.cols.
        ; NOTE(review): SetTensor4 presumably wraps cudnnSetTensor4dDescriptor
        ; (format, dataType, n, c, h, w) - confirm against the macro.
        SetTensor4 input_descriptor,CUDNN_TENSOR_NHWC,CUDNN_DATA_FLOAT,numImgs,3,[image.rows],[image.cols]


        ; _step is an assembly-time constant that scales the filter and the padding
        ; together: filter edge = 3+_step*2, padding = 1+_step.
        _step = 0
        ; Filter: float data, NCHW format, 3 and 3 for the two channel counts,
        ; spatial size 3+_step*2 in both directions.
        ; NOTE(review): argument order looks like cudnnSetFilter4dDescriptor
        ; (dataType, format, k, c, h, w) - confirm against the macro.
        SetFilter4 kernel_descriptor,CUDNN_DATA_FLOAT,CUDNN_TENSOR_NCHW,3,3,3+_step*2,3+_step*2


        ; Convolution: 1+_step twice, then 1,1 and 1,1, cross-correlation mode, float.
        ; NOTE(review): presumably pad_h,pad_w, stride u,v, dilation_h,dilation_w as in
        ; cudnnSetConvolution2dDescriptor - confirm against the macro.
        SetConvolut2d convolution_descriptor,1+_step,1+_step,1,1,1,1,CUDNN_CROSS_CORRELATION,CUDNN_DATA_FLOAT

        ; Storage for the queried output dimensions and for the workspace size.
        ; NOTE(review): miRpd is a project macro not visible here; if it reserves dwords,
        ; workspace_bytes may be too small for the size_t that
        ; cudnnGetConvolutionForwardWorkspaceSize writes later - confirm.
        miRpd batch_size,channels,height,width,workspace_bytes

        ; Ask cuDNN for the output shape of this convolution; the four results are
        ; written to batch_size, channels, height and width (passed by address).
        invoke  cudnnGetConvolution2dForwardOutputDim,[convolution_descriptor],\
                                                      [input_descriptor],[kernel_descriptor],\
                                                      batch_size,channels,height,width
        ifcuError



        ; Output tensor: described with the same NHWC / float / numImgs,3,rows,cols
        ; values as the input tensor.
        ; NOTE(review): the dimensions queried just above are not used here; the two only
        ; agree while the padding preserves the spatial size and the filter produces
        ; 3 output channels - confirm this is intended.
        invoke  cudnnSetTensor4dDescriptor,[output_descriptor],CUDNN_TENSOR_NHWC,CUDNN_DATA_FLOAT,\
                                   numImgs,3,[image.rows],[image.cols]
        ifcuError
    ; Choose the forward convolution algorithm.
    ; Calling cudnnFindConvolutionForwardAlgorithm first fixed
    ; CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED in cudnnGetConvolutionForwardAlgorithm.
    ; Arguments after the descriptors: requested result count = 1024, Message receives
    ; the returned count, convolution_algorithm receives the results.
    ; NOTE(review): cuDNN writes an array of cudnnConvolutionFwdAlgoPerf_t records here;
    ; confirm convolution_algorithm has room for every record that can be returned and
    ; that Message may be overwritten with an integer.
        invoke  cudnnFindConvolutionForwardAlgorithm,[cuDNNHndl],[input_descriptor],[kernel_descriptor],\
                                  [convolution_descriptor],[output_descriptor],\
                                  1024,Message,convolution_algorithm
        ifcuError
; cudnnGetConvolutionForwardAlgorithm is not the same as cudnnGetConvolutionForwardAlgorithm_v7.
; cudnnGetConvolutionForwardAlgorithm_v7 loads cudnn_cnn64_9.dll by itself.
; NOTE(review): this second call writes to convolution_algorithm again, replacing
; whatever the Find call above stored there - confirm that is intended.
        invoke  cudnnGetConvolutionForwardAlgorithm,[cuDNNHndl],[input_descriptor],[kernel_descriptor],\
                                  [convolution_descriptor],[output_descriptor],\
                                  CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,0,convolution_algorithm
                                  ;CUDNN_CONVOLUTION_FWD_ALGO_COUNT,Message,convolution_algorithm
                                  ;CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,0,convolution_algorithm
        ; Author's note: this call reports an error.
        ; NOTE(review): the numeric argument of ifcuError looks like a call-site tag - confirm.
        ifcuError  1    ;error

        ; Query how many bytes of scratch memory the chosen algorithm needs; the result
        ; is written to workspace_bytes (passed by address).
        invoke  cudnnGetConvolutionForwardWorkspaceSize,[cuDNNHndl],[input_descriptor],[kernel_descriptor],\
                     [convolution_descriptor],[output_descriptor],[convolution_algorithm],workspace_bytes
        ifcuError  2

        ; Debug print of the reported size. The author observed 0 here.
        ; NOTE(review): presumably the selected algorithm simply needs no workspace, or
        ; the algorithm value is not what was expected - confirm.
        prInt [workspace_bytes]   ;=0 why ?

        ; Device pointer variables for workspace, input, output and filter weights.
        ; NOTE(review): miRpq is a project macro not visible here; presumably it reserves
        ; qword locals - confirm.
        miRpq  d_workspace,d_input,d_output,d_kernel
        ; Allocate the workspace on the device.
        ; NOTE(review): the cudaMalloc status is not checked.
        invoke cudaMalloc,d_workspace, [workspace_bytes]

        ; Compute the byte size of the image tensor into image_bytes.
        ; NOTE(review): GetImgSize is a project macro not visible here; per the formula
        ; kept below it presumably multiplies the four dimensions by sizeof(float).
        GetImgSize  image_bytes,batch_size,width,height,channels

        ;prInt [image_bytes]

  ;int image_bytes = batch_size * channels * height * width * sizeof(float);
        ;invoke  cudaMalloc,d_input, [image_bytes]
        ;invoke  cudaMemcpy,[d_input], imgF3, [image_bytes], cudaMemcpyHostToDevice
        ; Input buffer of image_bytes bytes sourced from imgF3.
        ; NOTE(review): NewGPUData presumably performs the cudaMalloc + host-to-device
        ; cudaMemcpy pair that is commented out above - confirm.
        NewGPUData d_input, [image_bytes], imgF3

        ; Output buffer: same byte size as the input, cleared to zero bytes.
        ; NOTE(review): neither status is checked.
        invoke  cudaMalloc,d_output, [image_bytes]
        invoke  cudaMemset,[d_output], 0, [image_bytes] ; Zero the array


        ; Filter weights: allocate size.h_kernel bytes on the device and copy h_kernel
        ; from host memory into it.
        ; NOTE(review): neither status is checked.
        invoke  cudaMalloc,d_kernel, size.h_kernel
        invoke  cudaMemcpy,[d_kernel], h_kernel, size.h_kernel, cudaMemcpyHostToDevice

        ; Run the convolution: d_input is filtered with d_kernel using the selected
        ; algorithm and workspace, and the result is written to d_output.
        ; alpha and beta are passed by address.
        ; NOTE(review): alpha and beta are defined outside this view; cuDNN uses them
        ; as blending factors for the new result and the previous output contents.
        invoke  cudnnConvolutionForward,[cuDNNHndl],alpha,[input_descriptor],[d_input],[kernel_descriptor],\
                                     [d_kernel],[convolution_descriptor],\
                                     [convolution_algorithm],[d_workspace],[workspace_bytes],beta,[output_descriptor],[d_output]
        ifcuError

        ; Storage for the three activation descriptor properties.
        ; NOTE(review): miRpq is a project macro not visible here; presumably it reserves
        ; qword locals - confirm.
        miRpq  mode, reluNanOpt, coef
        ; Read the current mode, NaN option and coefficient of the descriptor into the
        ; three variables (passed by address).
        ; NOTE(review): the status of this call is not checked.
        invoke cudnnGetActivationDescriptor,[activation_descriptor], mode, reluNanOpt, coef
        ; Clear xmm0..xmm6.
        rept 7 n:0 { xorps xmm#n,xmm#n }
        ; Load the constant 1.0 into rax; with the rept below commented out, nothing in
        ; the visible code copies it into an xmm register.
        mov rax,1.0f
        ;rept 7 n:0 { movq xmm#n,rax }
        ; Configure the activation: sigmoid mode, NaN propagation enabled, coefficient
        ; taken from the value just read back into coef.
        ; NOTE(review): cudnnSetActivationDescriptor takes coef as a double by value;
        ; whether [coef] reaches the callee in the register it reads depends on the
        ; invoke macro and the calling convention - confirm. The xmm clearing above and
        ; the author's xmm0 remark suggest this was being experimented with.
        invoke  cudnnSetActivationDescriptor,[activation_descriptor], CUDNN_ACTIVATION_SIGMOID,\
                                             CUDNN_PROPAGATE_NAN ,  [coef] ;last xmm0
        ifcuError 5     ;bad_param

; Timing test: measure the activation pass.
        ;cudaBgnTimer
        ; NOTE(review): Dbg is a project macro not visible here; presumably it emits the
        ; given instruction only when its first argument enables debugging - confirm.
        Dbg 1,call  cuda_BgnTimer ;test speed


        ; Apply the activation in place: d_output is both the source and the destination,
        ; described by output_descriptor on both sides. alpha and beta are passed by
        ; address. The rept count is 1, so the body is assembled once.
        rept 1 {   ;mset beta=0.19,alpha=1.0
        ;mov [beta],0.3
        ;mov [alpha],1.1
        invoke  cudnnActivationForward,[cuDNNHndl],[activation_descriptor],alpha,[output_descriptor],\
                                      [d_output],beta,[output_descriptor],[d_output]
        ifcuError 6

                 }

        ; Report the elapsed time, then pause.
        ; NOTE(review): Sleep is presumably the Win32 API, making this a 120 ms pause - confirm.
        Dbg 1,call PrintcudaTime  
        invoke Sleep,120