// Device buffer of `size` bytes.
// NOTE(review): the original note on this line read "memory is transferred using
// the host CPU" — presumably it contrasts copies from ordinary pageable host
// memory (staged by the CPU) with the pinned allocation below; confirm.
cudaMalloc(&dev1, size);

// Pinned (page-locked) host buffer: required on the host side so the copy can
// use DMA, the fast path. Release it with cudaFreeHost(host1), not free().
cudaMallocHost(&host1, size);

cudaStreamCreate(&stream1);
// Host-to-device copy queued on stream1.
cudaMemcpyAsync(dev1, host1, size, cudaMemcpyHostToDevice, stream1);
cudaStreamSynchronize(stream1);  // wait for the copy to finish




/**
 * Stream callback that prints the integer carried in the user-data pointer.
 *
 * @param stream  Stream the callback was registered on (unused here).
 * @param status  Status passed in by the runtime (unused here).
 * @param data    User data; the registration site in this file passes a
 *                size_t index cast to void*, so it is cast back, never
 *                dereferenced.
 */
void CUDART_CB MyCallback(cudaStream_t stream, cudaError_t status, void *data){
    // %zu is the size_t conversion; the previous %d mismatched the argument
    // type (undefined behaviour, wrong output where size_t is wider than int).
    printf("Inside callback %zu\n", (size_t)data);
}

// For each of the two streams, enqueue in order: the copy of hostPtr[s] into
// devPtrIn[s], the MyKernel launch (100 blocks of 512 threads, no dynamic
// shared memory), the copy of devPtrOut[s] back into hostPtr[s], and finally
// MyCallback with the stream index passed as its user-data pointer.
for (size_t s = 0; s != 2; s++) {
    cudaMemcpyAsync(devPtrIn[s], hostPtr[s], size,
                    cudaMemcpyHostToDevice, stream[s]);

    MyKernel<<<100, 512, 0, stream[s]>>>(devPtrOut[s], devPtrIn[s], size);

    cudaMemcpyAsync(hostPtr[s], devPtrOut[s], size,
                    cudaMemcpyDeviceToHost, stream[s]);

    // The index is carried by value inside the pointer; nothing is dereferenced.
    cudaStreamAddCallback(stream[s], MyCallback, (void*)s, 0);
}

// cuDNN: cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
// sets the CUDA stream used by the cuDNN handle.