1 // Comes from
2 // https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
3
// EXPORT marks a function as exported from a shared library: on Windows it
// expands to __declspec(dllexport); on every other platform it expands to
// nothing.
#if defined(_WIN32)
#  define EXPORT __declspec(dllexport)
#else
#  define EXPORT
#endif
9
10 #include <cstdio>
11 #include <iostream>
12
13 #include <assert.h>
14 #include <cuda_runtime_api.h>
15 #include <nppi_filtering_functions.h>
16
nppif_main()17 EXPORT int nppif_main()
18 {
19 /**
20 * 8-bit unsigned single-channel 1D row convolution.
21 */
22 const int simgrows = 32;
23 const int simgcols = 32;
24 Npp8u *d_pSrc, *d_pDst;
25 const int nMaskSize = 3;
26 NppiSize oROI;
27 oROI.width = simgcols - nMaskSize;
28 oROI.height = simgrows;
29 const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
30 const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
31 const int simgpix = simgrows * simgcols;
32 const int dimgpix = oROI.width * oROI.height;
33 const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
34 const int nDstStep = oROI.width * sizeof(d_pDst[0]);
35 const int pixval = 1;
36 const int nDivisor = 1;
37 const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
38 Npp32s* d_pKernel;
39 const Npp32s nAnchor = 2;
40 cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
41 if (err != cudaSuccess) {
42 fprintf(stderr, "Cuda error %d\n", __LINE__);
43 return 1;
44 }
45 err = cudaMalloc((void**)&d_pDst, dimgsize);
46 if (err != cudaSuccess) {
47 fprintf(stderr, "Cuda error %d\n", __LINE__);
48 return 1;
49 }
50 err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
51 if (err != cudaSuccess) {
52 fprintf(stderr, "Cuda error %d\n", __LINE__);
53 return 1;
54 }
55 // set image to pixval initially
56 err = cudaMemset(d_pSrc, pixval, simgsize);
57 if (err != cudaSuccess) {
58 fprintf(stderr, "Cuda error %d\n", __LINE__);
59 return 1;
60 }
61 err = cudaMemset(d_pDst, 0, dimgsize);
62 if (err != cudaSuccess) {
63 fprintf(stderr, "Cuda error %d\n", __LINE__);
64 return 1;
65 }
66 err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
67 cudaMemcpyHostToDevice);
68 if (err != cudaSuccess) {
69 fprintf(stderr, "Cuda error %d\n", __LINE__);
70 return 1;
71 }
72 // copy src to dst
73 NppStatus ret =
74 nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
75 nMaskSize, nAnchor, nDivisor);
76 assert(ret == NPP_NO_ERROR);
77 Npp8u* h_imgres = new Npp8u[dimgpix];
78 err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
79 if (err != cudaSuccess) {
80 fprintf(stderr, "Cuda error %d\n", __LINE__);
81 return 1;
82 }
83 // test for filtering
84 for (int i = 0; i < dimgpix; i++) {
85 if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
86 fprintf(stderr, "h_imgres at index %d failed to match\n", i);
87 return 1;
88 }
89 }
90
91 return 0;
92 }
93