/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in detail in the upcoming
// GPU Computing Gems chapter:
//
//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
//   PDF URL placeholder
//   email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////

#include <algorithm>
#include <cstdio>

#include "opencv2/cudev.hpp"
#include "opencv2/core/persistence.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_OBJDETECT
#  include "opencv2/objdetect.hpp"
//#  include "opencv2/objdetect/objdetect_c.h"
#endif

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"

#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"


//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================


const Ncv32u MAX_GRID_DIM = 65535;


const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;


#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2     6
#define NUM_THREADS_CLASSIFIERPARALLEL          (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)


/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;


/** \internal
* Haar classifiers flattened trees container.
* Two parts: the first contains root nodes, the second contains nodes referred to by root nodes.
* Drawback: breaks tree locality (might cause more cache misses).
* Advantage: no need to introduce an additional 32-bit field to index root node offsets.
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
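// Layout sketch (illustrative, not from the original source): with R root nodes and
// C non-root nodes in total, the flattened container looks like
//   [root_0 ... root_(R-1) | child_0 ... child_(C-1)]
// Stage traversal starts at curStage.getStartClassifierRootNodeOffset() inside the
// root part; deeper tree nodes are reached via getNextNodeOffset(), which indexes
// into the same array, so no extra per-stage table of root offsets is required.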


texture<Ncv32u, 1, cudaReadModeElementType> texIImage;


__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
    return d_Stages[iStage];
}


template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
    HaarClassifierNode128 tmpNode;
    if (tbCacheTextureCascade)
    {
        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
    }
    else
    {
        tmpNode = d_ClassifierNodes[iNode];
    }
    return tmpNode;
}


template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
                           Ncv32f *weight,
                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
    HaarFeature64 feature;
    if (tbCacheTextureCascade)
    {
        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
    }
    else
    {
        feature = d_Features[iFeature];
    }
    feature.getRect(rectX, rectY, rectWidth, rectHeight);
    *weight = feature.getWeight();
}


template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
    if (tbCacheTextureIImg)
    {
        return tex1Dfetch(texIImage, x);
    }
    else
    {
        return d_IImg[x];
    }
}


__device__ Ncv32u d_outMaskPosition;


__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

    Ncv32u incScan = cv::cudev::blockScanInclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem, threadIdx.x);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
        numPassed = incScan;
        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
    }

    if (threadPassFlag)
    {
        Ncv32u excScan = incScan - threadPassFlag;
        shmem[excScan] = threadElem;
    }

    __syncthreads();

    if (threadIdx.x < numPassed)
    {
        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
    }
#endif
}
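
// Worked example (illustrative, not part of the original code): with 4 threads and
// pass flags {1, 0, 1, 1}, the inclusive scan yields {1, 1, 2, 3}. The last thread
// reserves 3 output slots with a single atomicAdd on d_outMaskPosition; each passing
// thread stores its element at exclusive-scan position (incScan - flag) = {0, -, 1, 2}
// in shared memory, and the first numPassed threads copy the compacted block to
// vectorOut. One global atomic per block instead of one per surviving element.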


template <NcvBool tbInitMaskPositively,
          NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbReadPixelIndexFromVector,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u y_offs;
    Ncv32u x_offs;
    Ncv32u maskOffset;
    Ncv32u outMaskVal;

    NcvBool bInactiveThread = false;

    if (tbReadPixelIndexFromVector)
    {
        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (maskOffset >= mask1Dlen)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
        {
            outMaskVal = d_inMask[maskOffset];
            y_offs = outMaskVal >> 16;
            x_offs = outMaskVal & 0xFFFF;
        }
    }
    else
    {
        y_offs = blockIdx.y;
        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (x_offs >= mask2Dstride)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
        {
            maskOffset = y_offs * mask2Dstride + x_offs;

            if ((x_offs >= anchorsRoi.width) ||
                (!tbInitMaskPositively &&
                 d_inMask != d_outMask &&
                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
            {
                if (tbDoAtomicCompaction)
                {
                    bInactiveThread = true;
                }
                else
                {
                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
                    return;
                }
            }

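            // Packing convention used throughout these kernels: an anchor position
            // (x_offs, y_offs) is stored in one 32-bit mask element as (y << 16) | x,
            // e.g. (x=120, y=45) -> 0x002D0078; consumers unpack it with y = v >> 16
            // and x = v & 0xFFFF. OBJDET_MASK_ELEMENT_INVALID_32U marks rejected anchors.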
            outMaskVal = (y_offs << 16) | x_offs;
        }
    }

    NcvBool bPass = true;

    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
    {
        Ncv32f pixelStdDev = 0.0f;

        if (!bInactiveThread)
            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];

        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            HaarStage64 curStage = getStage(iStage, d_Stages);
            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
            Ncv32f stageThreshold = curStage.getStageThreshold();

            while (numRootNodesInStage--)
            {
                NcvBool bMoreNodesToTraverse = true;
                Ncv32u iNode = curRootNodeOffset;

                if (bPass && !bInactiveThread)
                {
                    while (bMoreNodesToTraverse)
                    {
                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                        Ncv32f curNodeVal = 0.0f;

                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                        {
                            Ncv32f rectWeight;
                            Ncv32u rectX, rectY, rectWidth, rectHeight;
                            getFeature<tbCacheTextureCascade>
                                (iFeature + iRect, d_Features,
                                &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                            Ncv32u iioffsTR = iioffsTL + rectWidth;
                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                            Ncv32u iioffsBR = iioffsBL + rectWidth;

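                            // Standard integral-image box sum: for the corners TL, TR,
                            // BL, BR of rect (rectX, rectY, rectWidth, rectHeight), the
                            // sum of pixels inside the rectangle is
                            // II[BR] - II[BL] + II[TL] - II[TR],
                            // i.e. four fetches per rectangle regardless of its size.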
                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

    #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
    #else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
    #endif
                        }

                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                        Ncv32f nodeThreshold = curNode.getThreshold();

                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                        NcvBool nextNodeIsLeaf;

                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                        {
                            nextNodeDescriptor = nodeLeft;
                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                        }
                        else
                        {
                            nextNodeDescriptor = nodeRight;
                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                        }

                        if (nextNodeIsLeaf)
                        {
                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                            curStageSum += tmpLeafValue;
                            bMoreNodesToTraverse = false;
                        }
                        else
                        {
                            iNode = nextNodeDescriptor.getNextNodeOffset();
                        }
                    }
                }

                __syncthreads();
                curRootNodeOffset++;
            }

            if (curStageSum < stageThreshold)
            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    __syncthreads();

    if (!tbDoAtomicCompaction)
    {
        if (!tbReadPixelIndexFromVector ||
            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
        {
            d_outMask[maskOffset] = outMaskVal;
        }
    }
    else
    {
        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
                                           outMaskVal,
                                           d_outMask);
    }
}


template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;

    if (maskOffset >= mask1Dlen)
    {
        return;
    }

    Ncv32u outMaskVal = d_inMask[maskOffset];
    Ncv32u y_offs = outMaskVal >> 16;
    Ncv32u x_offs = outMaskVal & 0xFFFF;

    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
    NcvBool bPass = true;

    for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
    {
        //this variable is subject to reduction
        Ncv32f curStageSum = 0.0f;

        HaarStage64 curStage = getStage(iStage, d_Stages);
        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
        Ncv32f stageThreshold = curStage.getStageThreshold();

        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;

        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
        {
            NcvBool bMoreNodesToTraverse = true;

            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
            {
                Ncv32u iNode = curRootNodeOffset;

                while (bMoreNodesToTraverse)
                {
                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                    Ncv32f curNodeVal = 0.0f;
                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                    {
                        Ncv32f rectWeight;
                        Ncv32u rectX, rectY, rectWidth, rectHeight;
                        getFeature<tbCacheTextureCascade>
                            (iFeature + iRect, d_Features,
                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                        Ncv32u iioffsTR = iioffsTL + rectWidth;
                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                        Ncv32u iioffsBR = iioffsBL + rectWidth;

                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                    }

                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                    Ncv32f nodeThreshold = curNode.getThreshold();

                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                    NcvBool nextNodeIsLeaf;

                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                    {
                        nextNodeDescriptor = nodeLeft;
                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                    }
                    else
                    {
                        nextNodeDescriptor = nodeRight;
                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                    }

                    if (nextNodeIsLeaf)
                    {
                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                        curStageSum += tmpLeafValue;
                        bMoreNodesToTraverse = false;
                    }
                    else
                    {
                        iNode = nextNodeDescriptor.getNextNodeOffset();
                    }
                }
            }
            __syncthreads();

            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
        }

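        // subReduce (from NCVAlg.hpp) combines the per-thread partial stage sums across
        // the block; presumably the reduced value is made available to every thread,
        // since all threads compare finalStageSum against the stage threshold below.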
        Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

        if (finalStageSum < stageThreshold)
        {
            bPass = false;
            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            break;
        }
    }

    if (!tbDoAtomicCompaction)
    {
        if (!bPass || d_inMask != d_outMask)
        {
            if (!threadIdx.x)
            {
                d_outMask[maskOffset] = outMaskVal;
            }
        }
    }
    else
    {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
        if (bPass && !threadIdx.x)
        {
            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
            d_outMask[outMaskOffset] = outMaskVal;
        }
#endif
    }
}


template <NcvBool tbMaskByInmask,
          NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    Ncv32u y_offs = blockIdx.y;
    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;

    Ncv32u y_offs_upsc = step * y_offs;
    Ncv32u x_offs_upsc = step * x_offs;
    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;

    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;

    if (x_offs_upsc < anchorsRoi.width &&
        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
    {
        outElem = (y_offs_upsc << 16) | x_offs_upsc;
    }

    if (!tbDoAtomicCompaction)
    {
        d_outMask[outMaskOffset] = outElem;
    }
    else
    {
        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
                                           outElem,
                                           d_outMask);
    }
}


struct applyHaarClassifierAnchorParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        CV_UNUSED(tl);
        applyHaarClassifierAnchorParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value,
            Loki::TL::TypeAt<TList, 3>::Result::value,
            Loki::TL::TypeAt<TList, 4>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                  NcvBool tbCacheTextureIImg,
                                                  NcvBool tbCacheTextureCascade,
                                                  NcvBool tbReadPixelIndexFromVector,
                                                  NcvBool tbDoAtomicCompaction,

                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                  Ncv32u endStageExc, Ncv32f scaleArea)
{

    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
                                                     d_IImg, IImgStride,
                                                     d_weights, weightsStride,
                                                     d_Features, d_ClassifierNodes, d_Stages,
                                                     d_inMask, d_outMask,
                                                     mask1Dlen, mask2Dstride,
                                                     anchorsRoi, startStageInc,
                                                     endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbReadPixelIndexFromVector,
                tbDoAtomicCompaction);
}
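
// Note (illustrative, not from the original source): NCVRuntimeTemplateBool::KernelCaller
// converts runtime NcvBool flags into compile-time template arguments by branching on
// each flag and prepending Loki::Int2Type<true/false> to the typelist, conceptually:
//   if (tb0) KernelCaller<Typelist<Int2Type<true>,  TList>, N-1, Func>::call(f, rest...);
//   else     KernelCaller<Typelist<Int2Type<false>, TList>, N-1, Func>::call(f, rest...);
// so all 2^5 = 32 specializations of the anchor-parallel kernel get instantiated, and
// the one matching the runtime flags is launched through Functor::call.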


struct applyHaarClassifierClassifierParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        CV_UNUSED(tl);
        applyHaarClassifierClassifierParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                      NcvBool tbCacheTextureCascade,
                                                      NcvBool tbDoAtomicCompaction,

                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                      Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
                                                         d_IImg, IImgStride,
                                                         d_weights, weightsStride,
                                                         d_Features, d_ClassifierNodes, d_Stages,
                                                         d_inMask, d_outMask,
                                                         mask1Dlen, mask2Dstride,
                                                         anchorsRoi, startStageInc,
                                                         endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
}


struct initializeMaskVectorFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u step;

    //Arguments are passed through the constructor
    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                NcvSize32u _anchorsRoi, Ncv32u _step) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    step(_step)
    {}

    template<class TList>
    void call(TList tl)
    {
        CV_UNUSED(tl);
        initializeMaskVector <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, step);
    }
};


void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                     NcvBool tbDoAtomicCompaction,

                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                     Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
                                        d_inMask, d_outMask,
                                        mask1Dlen, mask2Dstride,
                                        anchorsRoi, step);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
                tbMaskByInmask,
                tbDoAtomicCompaction);
}


Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
                                              NCVVector<HaarStage64> &h_HaarStages)
{
    Ncv32u i = 0;
    for (; i<haar.NumStages; i++)
    {
        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
        {
            break;
        }
    }
    return i;
}


NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
                                               HaarClassifierCascadeDescriptor &haar,
                                               NCVVector<HaarStage64> &h_HaarStages,
                                               NCVVector<HaarStage64> &d_HaarStages,
                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                               NCVVector<HaarFeature64> &d_HaarFeatures,
                                               NcvBool bMaskElements,
                                               NcvSize32u anchorsRoi,
                                               Ncv32u pixelStep,
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
{
    ncvAssertReturn(integral.memType() == d_weights.memType() &&
                    integral.memType() == d_pixelMask.memType() &&
                    integral.memType() == gpuAllocator.memType() &&
                   (integral.memType() == NCVMemoryTypeDevice ||
                    integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);

    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

#if defined _SELF_TEST_

    NCVStatus ncvStat;

    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width, integral.height, integral.pitch);
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width, d_weights.height, d_weights.pitch);
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch);
    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length);
    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length);
    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch);
    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
        {
            if ((i%pixelStep==0) && (j%pixelStep==0) && (j<(Ncv32u)anchorsRoi.width))
            {
                if (!bMaskElements || h_pixelMask.ptr[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)
                {
                    h_pixelMask.ptr[i*d_pixelMask.stride()+j] = (i << 16) | j;
                }
            }
            else
            {
                h_pixelMask.ptr[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    NCV_SKIP_COND_END

#endif

    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];

    NCV_SKIP_COND_BEGIN
    *hp_zero = 0;
    *hp_numDet = 0;
    NCV_SKIP_COND_END

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;

    Ncv32u szNppCompactTmpBuf;
    nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
    if (bDoAtomicCompaction)
    {
        szNppCompactTmpBuf = 0;
    }
    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);

    NCV_SKIP_COND_BEGIN

    if (bTexCacheIImg)
    {
        cudaChannelFormatDesc cfdTexIImage;
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    if (bTexCacheCascade)
    {
        cudaChannelFormatDesc cfdTexHaarFeatures;
        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
            d_HaarFeatures.ptr(), cfdTexHaarFeatures, sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    Ncv32u stageStartAnchorParallel = 0;
    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
        haar, h_HaarStages);
    Ncv32u stageEndClassifierParallel = haar.NumStages;
    if (stageMiddleSwitch == 0)
    {
        stageMiddleSwitch = 1;
    }

    //create stages subdivision for pixel-parallel processing
    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
    Ncv32u curStop = stageStartAnchorParallel;
    std::vector<Ncv32u> pixParallelStageStops;
    while (curStop < stageMiddleSwitch)
    {
        pixParallelStageStops.push_back(curStop);
        curStop += compactEveryNstage;
    }
    if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
    {
        pixParallelStageStops[pixParallelStageStops.size()-1] =
            (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
    }
    pixParallelStageStops.push_back(stageMiddleSwitch);
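    // Worked example (illustrative): with stageMiddleSwitch = 20 and atomic compaction
    // enabled (compactEveryNstage = 7), the stops become {0, 7, 14, 20}: the
    // anchor-parallel kernel then processes stage ranges [0,7), [7,14), [14,20),
    // compacting the surviving anchor list after each range, before the
    // classifier-parallel kernel takes over the remaining stages.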
    Ncv32u pixParallelStageStopsIndex = 0;

    if (pixelStep != 1 || bMaskElements)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                        (anchorsRoi.height + pixelStep - 1) / pixelStep);
        dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);

        if (gridInit.x == 0 || gridInit.y == 0)
        {
            numDetections = 0;
            return NCV_SUCCESS;
        }

        initializeMaskVectorDynTemplate(bMaskElements,
                                        bDoAtomicCompaction,
                                        gridInit, blockInit, cuStream,
                                        d_ptrNowData->ptr(),
                                        d_ptrNowTmp->ptr(),
                                        static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
                                        anchorsRoi, pixelStep);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            swap(d_ptrNowData, d_ptrNowTmp);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
        }
        numDetections = *hp_numDet;
    }
    else
    {
        //
        // 1. Run the first pixel-input pixel-parallel classifier for a few stages
        //

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                   anchorsRoi.height);
        dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
        applyHaarClassifierAnchorParallelDynTemplate(
            true,                         //tbInitMaskPositively
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid1,
            block1,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            0,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;

        pixParallelStageStopsIndex++;
    }

    //
    // 2. Run pixel-parallel stages
    //

    for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
    {
        if (numDetections == 0)
        {
            break;
        }

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
        if (numDetections > MAX_GRID_DIM)
        {
            grid2.x = MAX_GRID_DIM;
            grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
1201         dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
1202 
1203         applyHaarClassifierAnchorParallelDynTemplate(
1204             false,                        //tbInitMaskPositively
1205             bTexCacheIImg,                //tbCacheTextureIImg
1206             bTexCacheCascade,             //tbCacheTextureCascade
1207             pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
1208             bDoAtomicCompaction,          //tbDoAtomicCompaction
1209             grid2,
1210             block2,
1211             cuStream,
1212             integral.ptr(), integral.stride(),
1213             d_weights.ptr(), d_weights.stride(),
1214             d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
1215             d_ptrNowData->ptr(),
1216             bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
1217             numDetections,
1218             d_pixelMask.stride(),
1219             anchorsRoi,
1220             pixParallelStageStops[pixParallelStageStopsIndex],
1221             pixParallelStageStops[pixParallelStageStopsIndex+1],
1222             scaleAreaPixels);
1223         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
1224 
1225         if (bDoAtomicCompaction)
1226         {
1227             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1228             ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
1229                                                           0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
1230             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
1231         }
1232         else
1233         {
1234             NCVStatus nppSt;
1235             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
1236                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
1237                                       d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
1238             ncvAssertReturnNcvStat(nppSt);
1239         }
1240 
1241         swap(d_ptrNowData, d_ptrNowTmp);
1242         numDetections = *hp_numDet;
1243     }

    //
    // 3. Run all remaining stages in one stage-parallel kernel
    //

    if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid3(numDetections);
        if (numDetections > MAX_GRID_DIM)
        {
            grid3.x = MAX_GRID_DIM;
            grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);

        applyHaarClassifierClassifierParallelDynTemplate(
            bTexCacheIImg,                //tbCacheTextureIImg
            bTexCacheCascade,             //tbCacheTextureCascade
            bDoAtomicCompaction,          //tbDoAtomicCompaction
            grid3,
            block3,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            stageMiddleSwitch,
            stageEndClassifierParallel,
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    if (d_ptrNowData != &d_vecPixelMask)
    {
        d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
    }

#if defined _SELF_TEST_

    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

    if (bDoAtomicCompaction)
    {
        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);
    }

    Ncv32u fpu_oldcw, fpu_cw;
    _controlfp_s(&fpu_cw, 0, 0);
    fpu_oldcw = fpu_cw;
    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
    Ncv32u numDetGold;
    ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
                                                 h_HaarStages, h_HaarNodes, h_HaarFeatures,
                                                 bMaskElements, anchorsRoi, pixelStep, scaleArea);
    ncvAssertReturnNcvStat(ncvStat);
    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);

    bool bPass = true;

    if (numDetGold != numDetections)
    {
        printf("NCVHaarClassifierCascade::applyHaarClassifierCascade: number of detections doesn't match: cpu=%u, gpu=%u\n", numDetGold, numDetections);
        bPass = false;
    }
    else
    {
        for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
        {
            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])
            {
                printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%u, cpu=%u, gpu=%u\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);
                bPass = false;
            }
        }
    }

    printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");
#endif

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}


//==============================================================================
//
// HypothesesOperations file
//
//==============================================================================


const Ncv32u NUM_GROW_THREADS = 128;


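// Unpacks one pixel-mask entry into a detection rectangle. Mask entries
// encode the anchor position as ((y << 16) | x), so, for example, the entry
// 0x00050003 at scale = 2.0f yields a rectangle at (6, 10) with size
// (2 * width, 2 * height).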
__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
{
    NcvRect32u res;
    res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
    res.y = (Ncv32u)(scale * (pixel >> 16));
    res.width = (Ncv32u)(scale * width);
    res.height = (Ncv32u)(scale * height);
    return res;
}


__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
                                     NcvRect32u *hypotheses,
                                     Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
    if (elemAddr >= numElements)
    {
        return;
    }
    hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
}


NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32u numPixelMaskDetections,
                                         NCVVector<NcvRect32u> &hypotheses,
                                         Ncv32u &totalDetections,
                                         Ncv32u totalMaxDetections,
                                         Ncv32u rectWidth,
                                         Ncv32u rectHeight,
                                         Ncv32f curScale,
                                         cudaStream_t cuStream)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);

    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);

    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    dim3 block(NUM_GROW_THREADS);
    dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }
    growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
                                                       hypotheses.ptr() + totalDetections,
                                                       rectWidth, rectHeight, curScale);
    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    totalDetections += numDetsToCopy;
    return ncvStat;
}


//==============================================================================
//
// Pipeline file
//
//==============================================================================


NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                                            NcvSize32u srcRoi,
                                            NCVVector<NcvRect32u> &d_dstRects,
                                            Ncv32u &dstNumRects,

                                            HaarClassifierCascadeDescriptor &haar,
                                            NCVVector<HaarStage64> &h_HaarStages,
                                            NCVVector<HaarStage64> &d_HaarStages,
                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                            NCVVector<HaarFeature64> &d_HaarFeatures,

                                            NcvSize32u minObjSize,
                                            Ncv32u minNeighbors,      //default 4
                                            Ncv32f scaleStep,         //default 1.2f
                                            Ncv32u pixelStep,         //default 1
                                            Ncv32u flags,             //default NCVPipeObjDet_Default

                                            INCVMemAllocator &gpuAllocator,
                                            INCVMemAllocator &cpuAllocator,
                                            cudaDeviceProp &devProp,
                                            cudaStream_t cuStream)
{
    ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
                    d_srcImg.memType() == gpuAllocator.memType() &&
                     (d_srcImg.memType() == NCVMemoryTypeDevice ||
                      d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
                    d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
                    srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
                    d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    //TODO: set NPP active stream to cuStream

    NCVStatus ncvStat;
    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

    Ncv32u integralWidth = d_srcImg.width() + 1;
    Ncv32u integralHeight = d_srcImg.height() + 1;

    NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVStatus nppStat;
    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                       integral.ptr(), integral.pitch(),
                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
                                          NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    NCV_SKIP_COND_END

    dstNumRects = 0;

    Ncv32u lastCheckedScale = 0;
    NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
    std::vector<Ncv32u> scalesVector;

    NcvBool bFoundLargestFace = false;

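    // Enumerate integer decimation factors geometrically: scaleIter grows by
    // scaleStep, is truncated to an integer scale, and duplicate factors are
    // skipped (with scaleStep = 1.2f this visits 1, 2, 3, 4, ..., thinning
    // out as the steps widen). The loop stops once the downscaled image can
    // no longer contain the classifier window.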
    for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
    {
        Ncv32u scale = (Ncv32u)scaleIter;
        if (lastCheckedScale == scale)
        {
            continue;
        }
        lastCheckedScale = scale;

        if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
            haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
        {
            continue;
        }

        NcvSize32s srcRoi_, srcIIRoi, scaledIIRoi, searchRoi;

        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();

        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;

        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;

        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        if (searchRoi.width <= 0 || searchRoi.height <= 0)
        {
            break;
        }

        scalesVector.push_back(scale);

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    if (bReverseTraverseScale)
    {
        std::reverse(scalesVector.begin(), scalesVector.end());
    }

    //TODO: handle _fair_scale_ flag
    for (Ncv32u i=0; i<scalesVector.size(); i++)
    {
        Ncv32u scale = scalesVector[i];

        NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
        NcvSize32u srcIIRoi;
        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();
        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;
        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;
        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        NCV_SKIP_COND_BEGIN

        nppStat = nppiStDecimate_32u_C1R(
            integral.ptr(), integral.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        nppStat = nppiStDecimate_64u_C1R(
            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        const NcvRect32u rect(
            HAAR_STDDEV_BORDER,
            HAAR_STDDEV_BORDER,
            haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
            haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
        nppStat = nppiStRectStdDev_32f_C1R(
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            d_rectStdDev.ptr(), d_rectStdDev.pitch(),
            NcvSize32u(searchRoi.width, searchRoi.height), rect,
            (Ncv32f)scale*scale, true);
        ncvAssertReturnNcvStat(nppStat);

        NCV_SKIP_COND_END

        Ncv32u detectionsOnThisScale;
        ncvStat = ncvApplyHaarClassifierCascade_device(
            d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
            detectionsOnThisScale,
            haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
            searchRoi, pixelStep, (Ncv32f)scale*scale,
            gpuAllocator, cpuAllocator, devProp, cuStream);
        ncvAssertReturnNcvStat(ncvStat);

        NCV_SKIP_COND_BEGIN

        NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
        ncvStat = ncvGrowDetectionsVector_device(
            d_vecPixelMask,
            detectionsOnThisScale,
            d_hypothesesIntermediate,
            dstNumRects,
            static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
            haar.ClassifierSize.width,
            haar.ClassifierSize.height,
            (Ncv32f)scale,
            cuStream);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        if (flags & NCVPipeObjDet_FindLargestObject)
        {
            if (dstNumRects == 0)
            {
                continue;
            }

            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

            Ncv32u numStrongHypothesesNow = dstNumRects;
            ncvStat = ncvGroupRectangles_host(
                h_hypothesesIntermediate,
                numStrongHypothesesNow,
                minNeighbors,
                RECT_SIMILARITY_PROPORTION,
                NULL);
            ncvAssertReturnNcvStat(ncvStat);

            if (numStrongHypothesesNow > 0)
            {
                NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
                for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
                {
                    if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
                    {
                        maxRect = h_hypothesesIntermediate.ptr()[j];
                    }
                }

                h_hypothesesIntermediate.ptr()[0] = maxRect;
                dstNumRects = 1;

                ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
                ncvAssertReturnNcvStat(ncvStat);

                bFoundLargestFace = true;

                break;
            }
        }

        NCV_SKIP_COND_END

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    NCVStatus ncvRetCode = NCV_SUCCESS;

    NCV_SKIP_COND_BEGIN

    if (flags & NCVPipeObjDet_FindLargestObject)
    {
        if (!bFoundLargestFace)
        {
            dstNumRects = 0;
        }
    }
    else
    {
        //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
        if (dstNumRects != 0)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        ncvStat = ncvGroupRectangles_host(
            h_hypothesesIntermediate,
            dstNumRects,
            minNeighbors,
            RECT_SIMILARITY_PROPORTION,
            NULL);
        ncvAssertReturnNcvStat(ncvStat);

        if (dstNumRects > d_dstRects.length())
        {
            ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
            dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
        }

        if (dstNumRects != 0)
        {
            ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
        }
    }

    if (flags & NCVPipeObjDet_VisualizeInPlace)
    {
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
                               d_srcImg.width(), d_srcImg.height(),
                               d_dstRects.ptr(), dstNumRects, 255, cuStream);
    }

    NCV_SKIP_COND_END

    return ncvRetCode;
}

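
// Typical use of ncvDetectObjectsMultiScale_device (a minimal sketch, not
// part of this file; variable names and sizes are illustrative, and the
// allocator/image setup follows the usual NCV conventions):
//
//   Ncv32u numRects = 0;
//   NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, maxDetections);
//   ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
//   NCVStatus stat = ncvDetectObjectsMultiScale_device(
//       d_image, NcvSize32u(d_image.width(), d_image.height()), d_rects,
//       numRects, haar, h_HaarStages, d_HaarStages, d_HaarNodes,
//       d_HaarFeatures, NcvSize32u(24, 24) /*minObjSize*/, 4 /*minNeighbors*/,
//       1.2f /*scaleStep*/, 1 /*pixelStep*/, NCVPipeObjDet_Default,
//       gpuAllocator, cpuAllocator, devProp, cuStream);
//
// A first pass with a counting allocator (gpuAllocator.isCounting() == true)
// can be used to size a stack allocator before the real pass.
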

//==============================================================================
//
// Purely Host code: classifier IO, mock-ups
//
//==============================================================================


#ifdef _SELF_TEST_
#include <float.h>
#endif


NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             NCVMatrix<Ncv32f> &h_weights,
                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
                                             Ncv32u &numDetections,
                                             HaarClassifierCascadeDescriptor &haar,
                                             NCVVector<HaarStage64> &h_HaarStages,
                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                             NCVVector<HaarFeature64> &h_HaarFeatures,
                                             NcvBool bMaskElements,
                                             NcvSize32u anchorsRoi,
                                             Ncv32u pixelStep,
                                             Ncv32f scaleArea)
{
    ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
                    h_integralImage.memType() == h_pixelMask.memType() &&
                     (h_integralImage.memType() == NCVMemoryTypeHostPageable ||
                      h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
                    h_HaarStages.memType() == h_HaarFeatures.memType() &&
                     (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
                      h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
                    h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
                    h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
                    h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
                    h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    h_HaarFeatures.length() >= haar.NumFeatures &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
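    // Node thresholds are trained on variance-normalized windows: below, each
    // raw feature sum is compared against
    // nodeThreshold * windowStdDev * scaleAreaPixels, where scaleAreaPixels
    // folds the effective window area together with the caller's scaleArea
    // correction.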

    for (Ncv32u i=0; i<anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
        {
            if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
            {
                h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
            else
            {
                for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
                {
                    Ncv32f curStageSum = 0.0f;
                    Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
                    Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();

                    if (iStage == 0)
                    {
                        if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                        {
                            break;
                        }
                        else
                        {
                            h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
                        }
                    }
                    else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                    {
                        break;
                    }

                    while (numRootNodesInStage--)
                    {
                        NcvBool bMoreNodesToTraverse = true;
                        Ncv32u curNodeOffset = curRootNodeOffset;

                        while (bMoreNodesToTraverse)
                        {
                            HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
                            HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc();
                            Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures();
                            Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset();

                            Ncv32f curNodeVal = 0.f;
                            for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                            {
                                HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
                                Ncv32u rectX, rectY, rectWidth, rectHeight;
                                feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
                                Ncv32f rectWeight = feature.getWeight();
                                Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
                                Ncv32u iioffsTR = iioffsTL + rectWidth;
                                Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
                                Ncv32u iioffsBR = iioffsBL + rectWidth;

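                                // Rectangle sum via the integral image
                                // identity: S = II(BR) - II(BL) - II(TR) + II(TL).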
                                Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
                                Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
                                Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
                                Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
                                Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
                                curNodeVal += (Ncv32f)rectSum * rectWeight;
                            }

                            HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                            HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                            Ncv32f nodeThreshold = curNode.getThreshold();

                            HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                            NcvBool nextNodeIsLeaf;

                            if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
                            {
                                nextNodeDescriptor = nodeLeft;
                                nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf();
                            }
                            else
                            {
                                nextNodeDescriptor = nodeRight;
                                nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf();
                            }

                            if (nextNodeIsLeaf)
                            {
                                Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
                                curStageSum += tmpLeafValue;
                                bMoreNodesToTraverse = false;
                            }
                            else
                            {
                                curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
                            }
                        }

                        curRootNodeOffset++;
                    }

                    Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
                    if (curStageSum < tmpStageThreshold)
                    {
                        //drop
                        h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
                        break;
                    }
                }
            }
        }
    }

    std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
    Ncv32u i = 0;
    for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
    {
        if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
        {
            break;
        }
    }
    numDetections = i;

    return NCV_SUCCESS;
}


NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u numPixelMaskDetections,
                                       NCVVector<NcvRect32u> &hypotheses,
                                       Ncv32u &totalDetections,
                                       Ncv32u totalMaxDetections,
                                       Ncv32u rectWidth,
                                       Ncv32u rectHeight,
                                       Ncv32f curScale)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    for (Ncv32u i=0; i<numDetsToCopy; i++)
    {
        hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
    }

    totalDetections += numDetsToCopy;
    return ncvStat;
}

#define RECT_X_IDX              0
#define RECT_Y_IDX              1
#define RECT_W_IDX              2
#define RECT_H_IDX              3
#define RECT_WEIGHT_IDX         4

#define CUDA_CC_SIZE_W          0
#define CUDA_CC_SIZE_H          1

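// The RECT_*_IDX macros are positional indices into the five
// whitespace-separated values of a <rects> entry in the cascade XML
// ("x y w h weight"); CUDA_CC_SIZE_W/H index the two values of the top-level
// <size> node.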
static NCVStatus loadFromXML(const cv::String &filename,
                             HaarClassifierCascadeDescriptor &haar,
                             std::vector<HaarStage64> &haarStages,
                             std::vector<HaarClassifierNode128> &haarClassifierNodes,
                             std::vector<HaarFeature64> &haarFeatures)
{
    const char *CUDA_CC_SIZE = "size";
    const char *CUDA_CC_STAGES = "stages";
    const char *CUDA_CC_STAGE_THRESHOLD = "stage_threshold";
    const char *CUDA_CC_TREES = "trees";
    const char *CUDA_CC_FEATURE = "feature";
    const char *CUDA_CC_RECT = "rects";
    const char *CUDA_CC_TILTED = "tilted";
    const char *CUDA_CC_THRESHOLD = "threshold";
    const char *CUDA_CC_LEFT_VAL = "left_val";
    const char *CUDA_CC_RIGHT_VAL = "right_val";
    const char *CUDA_CC_LEFT_NODE = "left_node";
    const char *CUDA_CC_RIGHT_NODE = "right_node";

    NCVStatus ncvStat;

    haar.NumStages = 0;
    haar.NumClassifierRootNodes = 0;
    haar.NumClassifierTotalNodes = 0;
    haar.NumFeatures = 0;
    haar.ClassifierSize.width = 0;
    haar.ClassifierSize.height = 0;
    haar.bHasStumpsOnly = true;
    haar.bNeedsTiltedII = false;
    Ncv32u curMaxTreeDepth = 0;

    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
    haarStages.resize(0);
    haarClassifierNodes.resize(0);
    haarFeatures.resize(0);

    cv::FileStorage fs(filename, cv::FileStorage::READ | cv::FileStorage::FORMAT_XML);

    if (!fs.isOpened())
        return NCV_FILE_ERROR;

    const cv::FileNode &root = fs.getFirstTopLevelNode();
    const cv::FileNode &fnSize = root[CUDA_CC_SIZE];

    // collect the cascade classifier window size
    haar.ClassifierSize.width = (int)fnSize[CUDA_CC_SIZE_W];
    haar.ClassifierSize.height = (int)fnSize[CUDA_CC_SIZE_H];
    CV_Assert(haar.ClassifierSize.height > 0 && haar.ClassifierSize.width > 0);

    const cv::FileNode &fnStages = root[CUDA_CC_STAGES];
    cv::FileNodeIterator it = fnStages.begin(), it_end = fnStages.end();

    for (; it != it_end; ++it) // by stages
    {
        cv::FileNode fnStage = *it;
        HaarStage64 curStage;

        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
        curStage.setStageThreshold((float)fnStage[CUDA_CC_STAGE_THRESHOLD]);

        // iterate over the trees
        const cv::FileNode &fnTrees = fnStage[CUDA_CC_TREES];
        cv::FileNodeIterator it1 = fnTrees.begin(), it1_end = fnTrees.end();

        for (; it1 != it1_end; ++it1) // by trees
        {
            cv::FileNode tree = *it1;
            Ncv32u nodeId = 0;
            HaarClassifierNode128 curNode;

            curNode.setThreshold((float)tree[0][CUDA_CC_THRESHOLD]);

            NcvBool bIsLeftNodeLeaf = false;
            NcvBool bIsRightNodeLeaf = false;

            HaarClassifierNodeDescriptor32 nodeLeft;

            cv::FileNode leftNode = tree[0][CUDA_CC_LEFT_NODE];

            if (leftNode.fs == NULL)
            {
                Ncv32f leftVal = tree[0][CUDA_CC_LEFT_VAL];
                ncvStat = nodeLeft.create(leftVal);
                ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                bIsLeftNodeLeaf = true;
            }
            else
            {
                Ncv32u leftNodeOffset = (int)tree[0][CUDA_CC_LEFT_NODE];
                nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
                haar.bHasStumpsOnly = false;
            }

            curNode.setLeftNodeDesc(nodeLeft);

            HaarClassifierNodeDescriptor32 nodeRight;
            cv::FileNode rightNode = tree[0][CUDA_CC_RIGHT_NODE];

            if (rightNode.fs == NULL)
            {
                Ncv32f rightVal = tree[0][CUDA_CC_RIGHT_VAL];
                ncvStat = nodeRight.create(rightVal);
                ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                bIsRightNodeLeaf = true;
            }
            else
            {
                Ncv32u rightNodeOffset = (int)tree[0][CUDA_CC_RIGHT_NODE];
                nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
                haar.bHasStumpsOnly = false;
            }

            curNode.setRightNodeDesc(nodeRight);

            cv::FileNode fnFeature = tree[0][CUDA_CC_FEATURE];
            Ncv32u tiltedVal = (int)fnFeature[CUDA_CC_TILTED];
            haar.bNeedsTiltedII = (tiltedVal != 0);

            cv::FileNodeIterator it2 = fnFeature[CUDA_CC_RECT].begin(), it2_end = fnFeature[CUDA_CC_RECT].end();

            Ncv32u featureId = 0;
            for (; it2 != it2_end; ++it2) // by feature
            {
                cv::FileNode rect = *it2;

                Ncv32u rectX = (int)rect[RECT_X_IDX];
                Ncv32u rectY = (int)rect[RECT_Y_IDX];
                Ncv32u rectWidth = (int)rect[RECT_W_IDX];
                Ncv32u rectHeight = (int)rect[RECT_H_IDX];

                Ncv32f rectWeight = (float)rect[RECT_WEIGHT_IDX];

                if (rectWeight == 0)
                    break;

                HaarFeature64 curFeature;
                ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
                curFeature.setWeight(rectWeight);
                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);

                haarFeatures.push_back(curFeature);
                featureId++;
            }

            HaarFeatureDescriptor32 tmpFeatureDesc;
            ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
                featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
            ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
            curNode.setFeatureDesc(tmpFeatureDesc);

            if (!nodeId)
            {
                //root node
                haarClassifierNodes.push_back(curNode);
                curMaxTreeDepth = 1;
            }
            else
            {
                //other node
                h_TmpClassifierNotRootNodes.push_back(curNode);
                curMaxTreeDepth++;
            }

            nodeId++;
        }

        curStage.setNumClassifierRootNodes((Ncv32u)fnTrees.size());
        haarStages.push_back(curStage);
    }

    //fill in cascade stats
    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());

    //merge root and leaf nodes in one classifiers array
    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
    }

    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);

        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
    }

    return NCV_SUCCESS;
}

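
// Sketch of the legacy cascade XML layout parsed above (element names follow
// the CUDA_CC_* constants; "_" denotes anonymous list entries, and stump-only
// trees carry left_val/right_val instead of left_node/right_node):
//
//   <opencv_storage>
//     <cascade_name type_id="opencv-haar-classifier">
//       <size>24 24</size>
//       <stages>
//         <_>
//           <trees>
//             <_><_>
//               <feature>
//                 <rects><_>x y w h weight</_> ...</rects>
//                 <tilted>0</tilted>
//               </feature>
//               <threshold>t</threshold>
//               <left_val>a</left_val><right_val>b</right_val>
//             </_></_>
//           </trees>
//           <stage_threshold>T</stage_threshold>
//         </_>
//       </stages>
//     </cascade_name>
//   </opencv_storage>
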

#define NVBIN_HAAR_SIZERESERVED     16
#define NVBIN_HAAR_VERSION          0x1

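
// NVBIN on-disk layout implied by loadFromNVBIN below and
// ncvHaarStoreNVBIN_host at the end of this file:
//   [Ncv32u version][Ncv32u fileSize][reserved up to byte NVBIN_HAAR_SIZERESERVED]
//   [Ncv32u NumStages][Ncv32u NumClassifierRootNodes][Ncv32u NumClassifierTotalNodes]
//   [Ncv32u NumFeatures][NcvSize32u ClassifierSize][NcvBool bNeedsTiltedII]
//   [NcvBool bHasStumpsOnly][HaarStage64 stages[]][HaarClassifierNode128 nodes[]]
//   [HaarFeature64 features[]]
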

static NCVStatus loadFromNVBIN(const cv::String &filename,
                               HaarClassifierCascadeDescriptor &haar,
                               std::vector<HaarStage64> &haarStages,
                               std::vector<HaarClassifierNode128> &haarClassifierNodes,
                               std::vector<HaarFeature64> &haarFeatures)
{
    size_t readCount;
    FILE *fp = fopen(filename.c_str(), "rb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    Ncv32u fileVersion;
    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
    Ncv32u fsize;
    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fseek(fp, 0, SEEK_END);
    Ncv32u fsizeActual = static_cast<Ncv32u>(ftell(fp));
    ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);

    std::vector<unsigned char> fdata;
    fdata.resize(fsize);
    Ncv32u dataOffset = 0;
    fseek(fp, 0, SEEK_SET);
    readCount = fread(&fdata[0], fsize, 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fclose(fp);

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvSize32u);
    haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);
    haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);

    haarStages.resize(haar.NumStages);
    haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
    haarFeatures.resize(haar.NumFeatures);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
    dataOffset += szStages;
    memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
    dataOffset += szFeatures;

    return NCV_SUCCESS;
}


NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures)
{
    size_t readCount;
    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);

    if (fext == "nvbin")
    {
        FILE *fp = fopen(filename.c_str(), "rb");
        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
        Ncv32u fileVersion;
        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
        Ncv32u tmp;
        readCount = fread(&numStages,   sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&tmp,         sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numNodes,    sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        fclose(fp);
    }
    else if (fext == "xml")
    {
        HaarClassifierCascadeDescriptor haar;
        std::vector<HaarStage64> haarStages;
        std::vector<HaarClassifierNode128> haarNodes;
        std::vector<HaarFeature64> haarFeatures;

        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);

        numStages = haar.NumStages;
        numNodes = haar.NumClassifierTotalNodes;
        numFeatures = haar.NumFeatures;
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    return NCV_SUCCESS;
}

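
// Typical two-step host-side load (a minimal sketch; allocating the pinned
// h_HaarStages/h_HaarNodes/h_HaarFeatures vectors of the reported sizes is
// left to the caller):
//
//   Ncv32u numStages, numNodes, numFeatures;
//   ncvAssertReturnNcvStat(ncvHaarGetClassifierSize(filename, numStages,
//                                                   numNodes, numFeatures));
//   // ... allocate the three pinned host vectors, then:
//   ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host(filename, haar,
//                              h_HaarStages, h_HaarNodes, h_HaarFeatures));
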
2377 
ncvHaarLoadFromFile_host(const cv::String & filename,HaarClassifierCascadeDescriptor & haar,NCVVector<HaarStage64> & h_HaarStages,NCVVector<HaarClassifierNode128> & h_HaarNodes,NCVVector<HaarFeature64> & h_HaarFeatures)2378 NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename,
2379                                    HaarClassifierCascadeDescriptor &haar,
2380                                    NCVVector<HaarStage64> &h_HaarStages,
2381                                    NCVVector<HaarClassifierNode128> &h_HaarNodes,
2382                                    NCVVector<HaarFeature64> &h_HaarFeatures)
2383 {
2384     ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
2385                     h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
2386                     h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
2387 
2388     NCVStatus ncvStat;
2389 
2390     cv::String fext = filename.substr(filename.find_last_of(".") + 1);
2391     std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);
2392 
2393     std::vector<HaarStage64> haarStages;
2394     std::vector<HaarClassifierNode128> haarNodes;
2395     std::vector<HaarFeature64> haarFeatures;
2396 
2397     if (fext == "nvbin")
2398     {
2399         ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
2400         ncvAssertReturnNcvStat(ncvStat);
2401     }
2402     else if (fext == "xml")
2403     {
2404         ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
2405         ncvAssertReturnNcvStat(ncvStat);
2406     }
2407     else
2408     {
2409         return NCV_HAAR_XML_LOADING_EXCEPTION;
2410     }
2411 
2412     ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
2413     ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
2414     ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
2415 
2416     memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
2417     memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
2418     memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));
2419 
2420     return NCV_SUCCESS;
2421 }
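
// Usage sketch (illustrative only, error handling abbreviated): allocate the
// pinned host vectors with the sizes reported by ncvHaarGetClassifierSize and
// fill them here. NCVMemNativeAllocator/NCVVectorAlloc are the NCV.hpp types;
// the alignment value (32 below) is an arbitrary placeholder, and callers
// typically use the device's texture alignment instead.
//
//   Ncv32u nStages, nNodes, nFeatures;
//   ncvAssertReturnNcvStat(ncvHaarGetClassifierSize("cascade.xml",
//                                                   nStages, nNodes, nFeatures));
//
//   NCVMemNativeAllocator cpuAlloc(NCVMemoryTypeHostPinned, 32);
//   NCVVectorAlloc<HaarStage64>           h_stages(cpuAlloc, nStages);
//   NCVVectorAlloc<HaarClassifierNode128> h_nodes(cpuAlloc, nNodes);
//   NCVVectorAlloc<HaarFeature64>         h_features(cpuAlloc, nFeatures);
//
//   HaarClassifierCascadeDescriptor haar;
//   ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host("cascade.xml", haar,
//                                                   h_stages, h_nodes, h_features));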


NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,
                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                 NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    Ncv32u dataOffset = 0;
    std::vector<unsigned char> fdata;
    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);

    //header
    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
    dataOffset += sizeof(Ncv32u);
    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
    dataOffset += sizeof(NcvSize32u);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
    dataOffset += sizeof(NcvBool);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
    dataOffset += sizeof(NcvBool);

    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
    dataOffset += szStages;
    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
    dataOffset += szFeatures;
    Ncv32u fsize = dataOffset;

    //TODO: CRC32 here

    //update header: total file size, stored right after the version field
    dataOffset = sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;

    FILE *fp = fopen(filename.c_str(), "wb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    size_t writeCount = fwrite(&fdata[0], fsize, 1, fp);
    fclose(fp);
    ncvAssertReturn(1 == writeCount, NCV_FILE_ERROR);
    return NCV_SUCCESS;
}
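
// Usage sketch (illustrative only): a common pattern is to parse the OpenCV
// XML cascade once and cache it as NVBIN for faster subsequent loads. With
// haar and h_stages/h_nodes/h_features filled by ncvHaarLoadFromFile_host
// as in the sketch above:
//
//   ncvAssertReturnNcvStat(ncvHaarStoreNVBIN_host("cascade.nvbin", haar,
//                                                 h_stages, h_nodes, h_features));
//   // Later runs can pass "cascade.nvbin" straight to ncvHaarLoadFromFile_host,
//   // which dispatches on the file extension.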