//image edges outline
int main() {
const float kernel_template[3][3] = {
    {1, 1, 1},
    {1, -8, 1},
    {1, 1, 1}
  };
  // clang-format on
  float h_kernel[3][3][3][3];
  for (int kernel = 0; kernel < 3; ++kernel) {
    for (int channel = 0; channel < 3; ++channel) {
      for (int row = 0; row < 3; ++row) {
        for (int column = 0; column < 3; ++column) {
          h_kernel[kernel][channel][row][column] = kernel_template[row][column];
        }
      }
    }
  }
  return void(&h_kernel);
}

В фасме просто
h_kernel dd 3*3 dup(1.0,1.0,1.0,  1.0,-8.0,1.0,  1.0,1.0,1.0)

int main() {   //godbolt x64 msvc 19.latest
const float kernel_template[3][3] = {
    {1, 1, 1},
    {1, -8, 1},
    {1, 1, 1}
  };
  // clang-format on

  float *h_kernel[3][3][3][3];
  for (int kernel = 0; kernel < 3; ++kernel) {
    for (int channel = 0; channel < 3; ++channel) {
      for (int row = 0; row < 3; ++row) {
        for (int column = 0; column < 3; ++column) {
          *h_kernel[kernel][channel][row][column] = kernel_template[row][column];
        }
      }
    }
  }
  return *h_kernel[3][3][3][3];
}
----------------------https://russianblogs.com/article/40291184869/
Tensor<float> output(input.shape());
    vector_set_gpu(output.count(), 0.0f, output.gptr());

    cudnnTensorDescriptor_t output_descriptor;
    cudnnCreateTensorDescriptor(&output_descriptor);
    cudnnSetTensor4dDescriptor(output_descriptor,
                               CUDNN_TENSOR_NHWC,
                               CUDNN_DATA_FLOAT,
                               output.shape(0), output.shape(1), output.shape(2), output.shape(3));

    // kernel array fill data
    Tensor<float> kernel({ output.shape(1), input.shape(1), 3, 3 });
    auto kernel_size = kernel.count(2, 4);
    float kernel_[kernel_size] = { 0, 1, 0, 1, -4, 1, 0, 1, 0 };
    for(auto i = 0; i < kernel.count(0, 2); ++i) {
        memcpy(kernel.cptr() + i * kernel_size, kernel_, kernel_size * sizeof(float));
    }

    cudnnFilterDescriptor_t kernel_descriptor;
    cudnnCreateFilterDescriptor(&kernel_descriptor);
    cudnnSetFilter4dDescriptor(kernel_descriptor,
                               CUDNN_DATA_FLOAT,
                               CUDNN_TENSOR_NCHW,
                               kernel.shape(0), kernel.shape(1), kernel.shape(2), kernel.shape(3));