1 // Comes from
2 // https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
3 
4 #ifdef _WIN32
5 #  define EXPORT __declspec(dllexport)
6 #else
7 #  define EXPORT
8 #endif
9 
10 #include <cstdio>
11 #include <iostream>
12 
13 #include <assert.h>
14 #include <cuda_runtime_api.h>
15 #include <nppi_filtering_functions.h>
16 
nppif_main()17 EXPORT int nppif_main()
18 {
19   /**
20    * 8-bit unsigned single-channel 1D row convolution.
21    */
22   const int simgrows = 32;
23   const int simgcols = 32;
24   Npp8u *d_pSrc, *d_pDst;
25   const int nMaskSize = 3;
26   NppiSize oROI;
27   oROI.width = simgcols - nMaskSize;
28   oROI.height = simgrows;
29   const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
30   const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
31   const int simgpix = simgrows * simgcols;
32   const int dimgpix = oROI.width * oROI.height;
33   const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
34   const int nDstStep = oROI.width * sizeof(d_pDst[0]);
35   const int pixval = 1;
36   const int nDivisor = 1;
37   const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
38   Npp32s* d_pKernel;
39   const Npp32s nAnchor = 2;
40   cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
41   if (err != cudaSuccess) {
42     fprintf(stderr, "Cuda error %d\n", __LINE__);
43     return 1;
44   }
45   err = cudaMalloc((void**)&d_pDst, dimgsize);
46   if (err != cudaSuccess) {
47     fprintf(stderr, "Cuda error %d\n", __LINE__);
48     return 1;
49   }
50   err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
51   if (err != cudaSuccess) {
52     fprintf(stderr, "Cuda error %d\n", __LINE__);
53     return 1;
54   }
55   // set image to pixval initially
56   err = cudaMemset(d_pSrc, pixval, simgsize);
57   if (err != cudaSuccess) {
58     fprintf(stderr, "Cuda error %d\n", __LINE__);
59     return 1;
60   }
61   err = cudaMemset(d_pDst, 0, dimgsize);
62   if (err != cudaSuccess) {
63     fprintf(stderr, "Cuda error %d\n", __LINE__);
64     return 1;
65   }
66   err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
67                    cudaMemcpyHostToDevice);
68   if (err != cudaSuccess) {
69     fprintf(stderr, "Cuda error %d\n", __LINE__);
70     return 1;
71   }
72   // copy src to dst
73   NppStatus ret =
74     nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
75                          nMaskSize, nAnchor, nDivisor);
76   assert(ret == NPP_NO_ERROR);
77   Npp8u* h_imgres = new Npp8u[dimgpix];
78   err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
79   if (err != cudaSuccess) {
80     fprintf(stderr, "Cuda error %d\n", __LINE__);
81     return 1;
82   }
83   // test for filtering
84   for (int i = 0; i < dimgpix; i++) {
85     if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
86       fprintf(stderr, "h_imgres at index %d failed to match\n", i);
87       return 1;
88     }
89   }
90 
91   return 0;
92 }
93