/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in detail in the GPU Computing Gems
// chapter:
//
// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
// PDF URL placeholder
// email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////

#include <algorithm>
#include <cstdio>

#include "opencv2/cudev.hpp"
#include "opencv2/core/persistence.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_OBJDETECT
# include "opencv2/objdetect.hpp"
//# include "opencv2/objdetect/objdetect_c.h"
#endif

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"

#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"


//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================


const Ncv32u MAX_GRID_DIM = 65535;


const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;


#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)


/** \internal
 * Solid array of Haar features.
 */
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;


/** \internal
 * Container of flattened Haar classifier trees.
 * Two parts: the first holds the root nodes, the second holds the nodes referenced by the root nodes.
 * Drawback: breaks tree locality (might cause more cache misses).
 * Advantage: no need to introduce an additional 32-bit field to index root node offsets.
 */
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;


texture<Ncv32u, 1, cudaReadModeElementType> texIImage;


__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
    return d_Stages[iStage];
}


template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
    HaarClassifierNode128 tmpNode;
    if (tbCacheTextureCascade)
    {
        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
    }
    else
    {
        tmpNode = d_ClassifierNodes[iNode];
    }
    return tmpNode;
}


template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
                           Ncv32f *weight,
                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
    HaarFeature64 feature;
    if (tbCacheTextureCascade)
    {
        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
    }
    else
    {
        feature = d_Features[iFeature];
    }
    feature.getRect(rectX, rectY, rectWidth, rectHeight);
    *weight = feature.getWeight();
}


template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
    if (tbCacheTextureIImg)
    {
        return tex1Dfetch(texIImage, x);
    }
    else
    {
        return d_IImg[x];
    }
}


__device__ Ncv32u d_outMaskPosition;


__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

    Ncv32u incScan = cv::cudev::blockScanInclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem, threadIdx.x);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
        numPassed = incScan;
        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
    }

    if (threadPassFlag)
    {
        Ncv32u excScan = incScan - threadPassFlag;
        shmem[excScan] = threadElem;
    }

    __syncthreads();

    if (threadIdx.x < numPassed)
    {
        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
    }
#endif
}
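
/** \internal
 * Note on the compaction scheme above (example added for exposition): with pass
 * flags {1, 0, 1, 1} in a toy 4-thread block, the inclusive scan yields
 * {1, 1, 2, 3}; the last thread reserves 3 slots in the global output via
 * atomicAdd, and each passing thread stores its element at the exclusive-scan
 * offset (inclusive scan minus own flag), i.e. at offsets {0, -, 1, 2}. A
 * minimal sequential reference of the same operation (illustrative sketch, not
 * part of the NCV API):
 * \code
 * static void compactBlockReferenceHost(const Ncv32u *passFlags, const Ncv32u *elems,
 *                                       Ncv32u numElems, std::vector<Ncv32u> &out)
 * {
 *     //passing elements keep their relative order, which is exactly what the
 *     //exclusive-scan offsets guarantee on the device
 *     for (Ncv32u i = 0; i < numElems; i++)
 *     {
 *         if (passFlags[i])
 *         {
 *             out.push_back(elems[i]);
 *         }
 *     }
 * }
 * \endcode
 */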


template <NcvBool tbInitMaskPositively,
          NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbReadPixelIndexFromVector,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u y_offs;
    Ncv32u x_offs;
    Ncv32u maskOffset;
    Ncv32u outMaskVal;

    NcvBool bInactiveThread = false;

    if (tbReadPixelIndexFromVector)
    {
        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (maskOffset >= mask1Dlen)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            outMaskVal = d_inMask[maskOffset];
            y_offs = outMaskVal >> 16;
            x_offs = outMaskVal & 0xFFFF;
        }
    }
    else
    {
        y_offs = blockIdx.y;
        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (x_offs >= mask2Dstride)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            maskOffset = y_offs * mask2Dstride + x_offs;

            if ((x_offs >= anchorsRoi.width) ||
                (!tbInitMaskPositively &&
                 d_inMask != d_outMask &&
                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
            {
                if (tbDoAtomicCompaction)
                {
                    bInactiveThread = true;
                }
                else
                {
                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
                    return;
                }
            }

            outMaskVal = (y_offs << 16) | x_offs;
        }
    }

    NcvBool bPass = true;

    //NOTE: the condition below is always true; it is kept so that inactive
    //threads (atomic compaction mode) still walk the stage loop and reach the
    //__syncthreads() barriers together with the active threads
    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
    {
        Ncv32f pixelStdDev = 0.0f;

        if (!bInactiveThread)
            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];

        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            HaarStage64 curStage = getStage(iStage, d_Stages);
            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
            Ncv32f stageThreshold = curStage.getStageThreshold();

            while (numRootNodesInStage--)
            {
                NcvBool bMoreNodesToTraverse = true;
                Ncv32u iNode = curRootNodeOffset;

                if (bPass && !bInactiveThread)
                {
                    while (bMoreNodesToTraverse)
                    {
                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                        Ncv32f curNodeVal = 0.0f;

                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                        {
                            Ncv32f rectWeight;
                            Ncv32u rectX, rectY, rectWidth, rectHeight;
                            getFeature<tbCacheTextureCascade>
                                (iFeature + iRect, d_Features,
                                 &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                            Ncv32u iioffsTR = iioffsTL + rectWidth;
                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                            Ncv32u iioffsBR = iioffsBL + rectWidth;

                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                            curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                            curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                        }

                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                        Ncv32f nodeThreshold = curNode.getThreshold();

                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                        NcvBool nextNodeIsLeaf;

                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                        {
                            nextNodeDescriptor = nodeLeft;
                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                        }
                        else
                        {
                            nextNodeDescriptor = nodeRight;
                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                        }

                        if (nextNodeIsLeaf)
                        {
                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                            curStageSum += tmpLeafValue;
                            bMoreNodesToTraverse = false;
                        }
                        else
                        {
                            iNode = nextNodeDescriptor.getNextNodeOffset();
                        }
                    }
                }

                __syncthreads();
                curRootNodeOffset++;
            }

            if (curStageSum < stageThreshold)
            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    __syncthreads();

    if (!tbDoAtomicCompaction)
    {
        if (!tbReadPixelIndexFromVector ||
            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
        {
            d_outMask[maskOffset] = outMaskVal;
        }
    }
    else
    {
        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
                                           outMaskVal,
                                           d_outMask);
    }
}
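
/** \internal
 * Note on the rectangle sums computed above (example added for exposition): for
 * an integral image ii where ii[y][x] holds the sum of all source pixels above
 * and to the left of (x, y), the sum over the window [x, x+w) x [y, y+h) is
 * ii[y+h][x+w] - ii[y+h][x] - ii[y][x+w] + ii[y][x], i.e. exactly
 * BR - BL - TR + TL with the four offsets fetched in the kernel. A minimal
 * host-side sketch of the same lookup (illustrative, not part of the NCV API):
 * \code
 * static Ncv32u rectSumHost(const Ncv32u *ii, Ncv32u stride,
 *                           Ncv32u x, Ncv32u y, Ncv32u w, Ncv32u h)
 * {
 *     return ii[(y + h) * stride + (x + w)] - ii[(y + h) * stride + x]
 *          - ii[y * stride + (x + w)]       + ii[y * stride + x];
 * }
 * \endcode
 */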


template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;

    if (maskOffset >= mask1Dlen)
    {
        return;
    }

    Ncv32u outMaskVal = d_inMask[maskOffset];
    Ncv32u y_offs = outMaskVal >> 16;
    Ncv32u x_offs = outMaskVal & 0xFFFF;

    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
    NcvBool bPass = true;

    for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
    {
        //this variable is subject to reduction
        Ncv32f curStageSum = 0.0f;

        HaarStage64 curStage = getStage(iStage, d_Stages);
        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
        Ncv32f stageThreshold = curStage.getStageThreshold();

        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;

        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
        {
            NcvBool bMoreNodesToTraverse = true;

            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
            {
                Ncv32u iNode = curRootNodeOffset;

                while (bMoreNodesToTraverse)
                {
                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                    Ncv32f curNodeVal = 0.0f;
                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                    {
                        Ncv32f rectWeight;
                        Ncv32u rectX, rectY, rectWidth, rectHeight;
                        getFeature<tbCacheTextureCascade>
                            (iFeature + iRect, d_Features,
                             &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                        Ncv32u iioffsTR = iioffsTL + rectWidth;
                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                        Ncv32u iioffsBR = iioffsBL + rectWidth;

                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                    }

                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                    Ncv32f nodeThreshold = curNode.getThreshold();

                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                    NcvBool nextNodeIsLeaf;

                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                    {
                        nextNodeDescriptor = nodeLeft;
                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                    }
                    else
                    {
                        nextNodeDescriptor = nodeRight;
                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                    }

                    if (nextNodeIsLeaf)
                    {
                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                        curStageSum += tmpLeafValue;
                        bMoreNodesToTraverse = false;
                    }
                    else
                    {
                        iNode = nextNodeDescriptor.getNextNodeOffset();
                    }
                }
            }
            __syncthreads();

            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
        }

        Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

        if (finalStageSum < stageThreshold)
        {
            bPass = false;
            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            break;
        }
    }

    if (!tbDoAtomicCompaction)
    {
        if (!bPass || d_inMask != d_outMask)
        {
            if (!threadIdx.x)
            {
                d_outMask[maskOffset] = outMaskVal;
            }
        }
    }
    else
    {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
        if (bPass && !threadIdx.x)
        {
            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
            d_outMask[outMaskOffset] = outMaskVal;
        }
#endif
    }
}
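
/** \internal
 * Note on the reduction above (example added for exposition): each thread
 * accumulates a partial stage sum over root nodes threadIdx.x,
 * threadIdx.x + 64, threadIdx.x + 128, ... and subReduce folds the
 * NUM_THREADS_CLASSIFIERPARALLEL partials into one per-window stage sum. A
 * sequential reference of the quantity being computed (illustrative sketch,
 * not part of the NCV API; floating-point rounding may differ slightly from
 * the tree-shaped device reduction):
 * \code
 * static Ncv32f stageSumReferenceHost(const Ncv32f *rootNodeLeafVals, Ncv32u numRootNodes)
 * {
 *     Ncv32f sum = 0.0f;
 *     for (Ncv32u i = 0; i < numRootNodes; i++)
 *     {
 *         sum += rootNodeLeafVals[i];
 *     }
 *     return sum;
 * }
 * \endcode
 */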


template <NcvBool tbMaskByInmask,
          NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    Ncv32u y_offs = blockIdx.y;
    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;

    Ncv32u y_offs_upsc = step * y_offs;
    Ncv32u x_offs_upsc = step * x_offs;
    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;

    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;

    if (x_offs_upsc < anchorsRoi.width &&
        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
    {
        outElem = (y_offs_upsc << 16) | x_offs_upsc;
    }

    if (!tbDoAtomicCompaction)
    {
        d_outMask[outMaskOffset] = outElem;
    }
    else
    {
        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
                                           outElem,
                                           d_outMask);
    }
}
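
/** \internal
 * Note on the mask element encoding used throughout this file (helpers added
 * for exposition, not part of the NCV API): each surviving anchor is packed
 * into one 32-bit word as (y << 16) | x, and rejected anchors are marked with
 * OBJDET_MASK_ELEMENT_INVALID_32U. A minimal sketch of the packing:
 * \code
 * static Ncv32u packAnchor(Ncv32u x, Ncv32u y)
 * {
 *     //both coordinates must fit in 16 bits
 *     return (y << 16) | x;
 * }
 *
 * static void unpackAnchor(Ncv32u elem, Ncv32u &x, Ncv32u &y)
 * {
 *     x = elem & 0xFFFF;
 *     y = elem >> 16;
 * }
 * \endcode
 */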


struct applyHaarClassifierAnchorParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
        gridConf(_gridConf),
        blockConf(_blockConf),
        cuStream(_cuStream),
        d_IImg(_d_IImg),
        IImgStride(_IImgStride),
        d_weights(_d_weights),
        weightsStride(_weightsStride),
        d_Features(_d_Features),
        d_ClassifierNodes(_d_ClassifierNodes),
        d_Stages(_d_Stages),
        d_inMask(_d_inMask),
        d_outMask(_d_outMask),
        mask1Dlen(_mask1Dlen),
        mask2Dstride(_mask2Dstride),
        anchorsRoi(_anchorsRoi),
        startStageInc(_startStageInc),
        endStageExc(_endStageExc),
        scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        CV_UNUSED(tl);
        applyHaarClassifierAnchorParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value,
            Loki::TL::TypeAt<TList, 3>::Result::value,
            Loki::TL::TypeAt<TList, 4>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
             d_weights, weightsStride,
             d_Features, d_ClassifierNodes, d_Stages,
             d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, startStageInc,
             endStageExc, scaleArea);
    }
};


void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                  NcvBool tbCacheTextureIImg,
                                                  NcvBool tbCacheTextureCascade,
                                                  NcvBool tbReadPixelIndexFromVector,
                                                  NcvBool tbDoAtomicCompaction,

                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                  Ncv32u endStageExc, Ncv32f scaleArea)
{

    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
                                                     d_IImg, IImgStride,
                                                     d_weights, weightsStride,
                                                     d_Features, d_ClassifierNodes, d_Stages,
                                                     d_inMask, d_outMask,
                                                     mask1Dlen, mask2Dstride,
                                                     anchorsRoi, startStageInc,
                                                     endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbReadPixelIndexFromVector,
                tbDoAtomicCompaction);
}
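
/** \internal
 * Note on the runtime-to-compile-time dispatch above (example added for
 * exposition): KernelCaller maps the 5 runtime booleans onto a compile-time
 * typelist, selecting one of the 2^5 = 32 kernel instantiations. A hand-rolled
 * equivalent for a single flag would look like the sketch below (illustrative
 * only; launchKernel is a hypothetical placeholder):
 * \code
 * template <NcvBool staticFlag> void launchKernel();
 *
 * static void dispatchOneFlag(NcvBool runtimeFlag)
 * {
 *     //one runtime branch per flag; the Loki-based machinery nests this
 *     //decision once for every boolean in the typelist
 *     if (runtimeFlag) launchKernel<true>();
 *     else             launchKernel<false>();
 * }
 * \endcode
 */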


struct applyHaarClassifierClassifierParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
        gridConf(_gridConf),
        blockConf(_blockConf),
        cuStream(_cuStream),
        d_IImg(_d_IImg),
        IImgStride(_IImgStride),
        d_weights(_d_weights),
        weightsStride(_weightsStride),
        d_Features(_d_Features),
        d_ClassifierNodes(_d_ClassifierNodes),
        d_Stages(_d_Stages),
        d_inMask(_d_inMask),
        d_outMask(_d_outMask),
        mask1Dlen(_mask1Dlen),
        mask2Dstride(_mask2Dstride),
        anchorsRoi(_anchorsRoi),
        startStageInc(_startStageInc),
        endStageExc(_endStageExc),
        scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        CV_UNUSED(tl);
        applyHaarClassifierClassifierParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
             d_weights, weightsStride,
             d_Features, d_ClassifierNodes, d_Stages,
             d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, startStageInc,
             endStageExc, scaleArea);
    }
};


void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                      NcvBool tbCacheTextureCascade,
                                                      NcvBool tbDoAtomicCompaction,

                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                      Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
                                                         d_IImg, IImgStride,
                                                         d_weights, weightsStride,
                                                         d_Features, d_ClassifierNodes, d_Stages,
                                                         d_inMask, d_outMask,
                                                         mask1Dlen, mask2Dstride,
                                                         anchorsRoi, startStageInc,
                                                         endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
}


struct initializeMaskVectorFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u step;

    //Arguments are passed through the constructor
    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                NcvSize32u _anchorsRoi, Ncv32u _step) :
        gridConf(_gridConf),
        blockConf(_blockConf),
        cuStream(_cuStream),
        d_inMask(_d_inMask),
        d_outMask(_d_outMask),
        mask1Dlen(_mask1Dlen),
        mask2Dstride(_mask2Dstride),
        anchorsRoi(_anchorsRoi),
        step(_step)
    {}

    template<class TList>
    void call(TList tl)
    {
        CV_UNUSED(tl);
        initializeMaskVector <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, step);
    }
};


void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                     NcvBool tbDoAtomicCompaction,

                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                     Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
                                        d_inMask, d_outMask,
                                        mask1Dlen, mask2Dstride,
                                        anchorsRoi, step);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
                tbMaskByInmask,
                tbDoAtomicCompaction);
}


Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
                                              NCVVector<HaarStage64> &h_HaarStages)
{
    Ncv32u i = 0;
    for (; i<haar.NumStages; i++)
    {
        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
        {
            break;
        }
    }
    return i;
}
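
/** \internal
 * Usage note (added for exposition): this helper picks the stage where the
 * pipeline switches from the anchor-parallel to the classifier-parallel
 * kernel, i.e. the first stage with at least N classifier root nodes. In the
 * pipeline below it is called as:
 * \code
 * Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(
 *     NUM_THREADS_CLASSIFIERPARALLEL, haar, h_HaarStages);
 * \endcode
 * so the early, narrow stages run one thread per search window, while the
 * later, wider stages run one block per window.
 */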


NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
                                               HaarClassifierCascadeDescriptor &haar,
                                               NCVVector<HaarStage64> &h_HaarStages,
                                               NCVVector<HaarStage64> &d_HaarStages,
                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                               NCVVector<HaarFeature64> &d_HaarFeatures,
                                               NcvBool bMaskElements,
                                               NcvSize32u anchorsRoi,
                                               Ncv32u pixelStep,
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
{
    ncvAssertReturn(integral.memType() == d_weights.memType() &&
                    integral.memType() == d_pixelMask.memType() &&
                    integral.memType() == gpuAllocator.memType() &&
                    (integral.memType() == NCVMemoryTypeDevice ||
                     integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                    (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                     d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);

    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

#if defined _SELF_TEST_

    NCVStatus ncvStat;

    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width(), integral.height(), integral.pitch());
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width(), d_weights.height(), d_weights.pitch());
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length());
    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length());
    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width(), d_pixelMask.height(), d_pixelMask.pitch());
    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
        {
            if ((i%pixelStep==0) && (j%pixelStep==0) && (j<(Ncv32u)anchorsRoi.width))
            {
                if (!bMaskElements || h_pixelMask.ptr()[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)
                {
                    h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = (i << 16) | j;
                }
            }
            else
            {
                h_pixelMask.ptr()[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    NCV_SKIP_COND_END

#endif

    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];

    NCV_SKIP_COND_BEGIN
    *hp_zero = 0;
    *hp_numDet = 0;
    NCV_SKIP_COND_END

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;

    Ncv32u szNppCompactTmpBuf;
    nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
    if (bDoAtomicCompaction)
    {
        szNppCompactTmpBuf = 0;
    }
    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);

    NCV_SKIP_COND_BEGIN

    if (bTexCacheIImg)
    {
        cudaChannelFormatDesc cfdTexIImage;
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    if (bTexCacheCascade)
    {
        cudaChannelFormatDesc cfdTexHaarFeatures;
        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
            d_HaarFeatures.ptr(), cfdTexHaarFeatures, sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    Ncv32u stageStartAnchorParallel = 0;
    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
                                                                      haar, h_HaarStages);
    Ncv32u stageEndClassifierParallel = haar.NumStages;
    if (stageMiddleSwitch == 0)
    {
        stageMiddleSwitch = 1;
    }

    //create stages subdivision for pixel-parallel processing
    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
    Ncv32u curStop = stageStartAnchorParallel;
    std::vector<Ncv32u> pixParallelStageStops;
    while (curStop < stageMiddleSwitch)
    {
        pixParallelStageStops.push_back(curStop);
        curStop += compactEveryNstage;
    }
    if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
    {
        pixParallelStageStops[pixParallelStageStops.size()-1] =
            (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
    }
    pixParallelStageStops.push_back(stageMiddleSwitch);
    Ncv32u pixParallelStageStopsIndex = 0;

    if (pixelStep != 1 || bMaskElements)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                      (anchorsRoi.height + pixelStep - 1) / pixelStep);
        dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);

        if (gridInit.x == 0 || gridInit.y == 0)
        {
            numDetections = 0;
            return NCV_SUCCESS;
        }

        initializeMaskVectorDynTemplate(bMaskElements,
                                        bDoAtomicCompaction,
                                        gridInit, blockInit, cuStream,
                                        d_ptrNowData->ptr(),
                                        d_ptrNowTmp->ptr(),
                                        static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
                                        anchorsRoi, pixelStep);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            swap(d_ptrNowData, d_ptrNowTmp);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
        }
        numDetections = *hp_numDet;
    }
    else
    {
        //
        // 1. Run the first pixel-input pixel-parallel classifier for few stages
        //

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
                   anchorsRoi.height);
        dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
        applyHaarClassifierAnchorParallelDynTemplate(
            true,                                                   //tbInitMaskPositively
            bTexCacheIImg,                                          //tbCacheTextureIImg
            bTexCacheCascade,                                       //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0, //tbReadPixelIndexFromVector
            bDoAtomicCompaction,                                    //tbDoAtomicCompaction
            grid1,
            block1,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            0,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;

        pixParallelStageStopsIndex++;
    }

    //
    // 2. Run pixel-parallel stages
    //

    for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
    {
        if (numDetections == 0)
        {
            break;
        }

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
        if (numDetections > MAX_GRID_DIM)
        {
            grid2.x = MAX_GRID_DIM;
            grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block2(NUM_THREADS_ANCHORSPARALLEL);

        applyHaarClassifierAnchorParallelDynTemplate(
            false,                                                  //tbInitMaskPositively
            bTexCacheIImg,                                          //tbCacheTextureIImg
            bTexCacheCascade,                                       //tbCacheTextureCascade
            pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, //tbReadPixelIndexFromVector
            bDoAtomicCompaction,                                    //tbDoAtomicCompaction
            grid2,
            block2,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            pixParallelStageStops[pixParallelStageStopsIndex],
            pixParallelStageStops[pixParallelStageStopsIndex+1],
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    //
    // 3. Run all remaining stages in one stage-parallel kernel
    //

    if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
    {
        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
                                                        0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        dim3 grid3(numDetections);
        if (numDetections > MAX_GRID_DIM)
        {
            grid3.x = MAX_GRID_DIM;
            grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
        }
        dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);

        applyHaarClassifierClassifierParallelDynTemplate(
            bTexCacheIImg,       //tbCacheTextureIImg
            bTexCacheCascade,    //tbCacheTextureCascade
            bDoAtomicCompaction, //tbDoAtomicCompaction
            grid3,
            block3,
            cuStream,
            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
            bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
            numDetections,
            d_pixelMask.stride(),
            anchorsRoi,
            stageMiddleSwitch,
            stageEndClassifierParallel,
            scaleAreaPixels);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

        if (bDoAtomicCompaction)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
                                                          0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }
        else
        {
            NCVStatus nppSt;
            nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
                                      d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
                                      d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
            ncvAssertReturnNcvStat(nppSt);
        }

        swap(d_ptrNowData, d_ptrNowTmp);
        numDetections = *hp_numDet;
    }

    if (d_ptrNowData != &d_vecPixelMask)
    {
        d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
    }

#if defined _SELF_TEST_

    ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

    if (bDoAtomicCompaction)
    {
        std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + numDetections);
    }

    Ncv32u fpu_oldcw, fpu_cw;
    _controlfp_s(&fpu_cw, 0, 0);
    fpu_oldcw = fpu_cw;
    _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
    Ncv32u numDetGold;
    ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
                                                 h_HaarStages, h_HaarNodes, h_HaarFeatures,
                                                 bMaskElements, anchorsRoi, pixelStep, scaleArea);
    ncvAssertReturnNcvStat(ncvStat);
    _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);

    bool bPass = true;

    if (numDetGold != numDetections)
    {
        printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections);
        bPass = false;
    }
    else
    {
        for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
        {
            if (h_pixelMask.ptr()[i] != h_pixelMask_d.ptr()[i])
            {
                printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr()[i], h_pixelMask_d.ptr()[i]);
                bPass = false;
            }
        }
    }

    printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");
#endif

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}
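
/** \internal
 * Note on the per-node threshold test used by both detection kernels above
 * (added for exposition): conceptually each node compares the normalized
 * feature response curNodeVal / (scaleAreaPixels * pixelStdDev) against
 * nodeThreshold. Since the normalization factor is positive for valid windows,
 * the test is rearranged into the division-free form actually evaluated per
 * node:
 * \code
 * //   curNodeVal / (scaleAreaPixels * pixelStdDev) < nodeThreshold
 * //   <=>  (for scaleAreaPixels * pixelStdDev > 0)
 * //   curNodeVal < scaleAreaPixels * pixelStdDev * nodeThreshold
 * \endcode
 */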


//==============================================================================
//
// HypothesesOperations file
//
//==============================================================================


const Ncv32u NUM_GROW_THREADS = 128;


__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
{
    NcvRect32u res;
    res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
    res.y = (Ncv32u)(scale * (pixel >> 16));
    res.width = (Ncv32u)(scale * width);
    res.height = (Ncv32u)(scale * height);
    return res;
}
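
/** \internal
 * Example (added for exposition): a surviving anchor packed as (y=20 << 16) |
 * (x=12), at detection scale 2.0 with a 24x24 classifier window, maps to the
 * rectangle {x=24, y=40, width=48, height=48}:
 * \code
 * NcvRect32u r = pixelToRect((20 << 16) | 12, 24, 24, 2.0f);
 * //r.x == 24, r.y == 40, r.width == 48, r.height == 48
 * \endcode
 */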


__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
                                     NcvRect32u *hypotheses,
                                     Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
    if (elemAddr >= numElements)
    {
        return;
    }
    hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
}


NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         Ncv32u numPixelMaskDetections,
                                         NCVVector<NcvRect32u> &hypotheses,
                                         Ncv32u &totalDetections,
                                         Ncv32u totalMaxDetections,
                                         Ncv32u rectWidth,
                                         Ncv32u rectHeight,
                                         Ncv32f curScale,
                                         cudaStream_t cuStream)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);

    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);

    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

    dim3 block(NUM_GROW_THREADS);
    dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }
    growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
                                                       hypotheses.ptr() + totalDetections,
                                                       rectWidth, rectHeight, curScale);
    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    totalDetections += numDetsToCopy;
    return ncvStat;
}
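
/** \internal
 * Note on the grid shape above (example added for exposition): gridDim.x is
 * capped at 65535 on pre-CC-3.0 devices, so larger launches are folded into a
 * 2D grid and the kernel linearizes the block index as
 * blockIdx.y * 65535 + blockIdx.x. A minimal sketch of the launch math
 * (illustrative helper, not part of the NCV API):
 * \code
 * static dim3 makeGrid1D(Ncv32u numBlocks)
 * {
 *     dim3 grid(numBlocks);
 *     if (grid.x > 65535)
 *     {
 *         grid.y = (grid.x + 65534) / 65535; //ceil(numBlocks / 65535)
 *         grid.x = 65535;
 *     }
 *     return grid; //grid.x * grid.y >= numBlocks always holds
 * }
 * \endcode
 */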
1449
1450
1451 //==============================================================================
1452 //
1453 // Pipeline file
1454 //
1455 //==============================================================================
1456
1457
ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> & d_srcImg,NcvSize32u srcRoi,NCVVector<NcvRect32u> & d_dstRects,Ncv32u & dstNumRects,HaarClassifierCascadeDescriptor & haar,NCVVector<HaarStage64> & h_HaarStages,NCVVector<HaarStage64> & d_HaarStages,NCVVector<HaarClassifierNode128> & d_HaarNodes,NCVVector<HaarFeature64> & d_HaarFeatures,NcvSize32u minObjSize,Ncv32u minNeighbors,Ncv32f scaleStep,Ncv32u pixelStep,Ncv32u flags,INCVMemAllocator & gpuAllocator,INCVMemAllocator & cpuAllocator,cudaDeviceProp & devProp,cudaStream_t cuStream)1458 NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
1459 NcvSize32u srcRoi,
1460 NCVVector<NcvRect32u> &d_dstRects,
1461 Ncv32u &dstNumRects,
1462
1463 HaarClassifierCascadeDescriptor &haar,
1464 NCVVector<HaarStage64> &h_HaarStages,
1465 NCVVector<HaarStage64> &d_HaarStages,
1466 NCVVector<HaarClassifierNode128> &d_HaarNodes,
1467 NCVVector<HaarFeature64> &d_HaarFeatures,
1468
1469 NcvSize32u minObjSize,
1470 Ncv32u minNeighbors, //default 4
1471 Ncv32f scaleStep, //default 1.2f
1472 Ncv32u pixelStep, //default 1
1473 Ncv32u flags, //default NCVPipeObjDet_Default
1474
1475 INCVMemAllocator &gpuAllocator,
1476 INCVMemAllocator &cpuAllocator,
1477 cudaDeviceProp &devProp,
1478 cudaStream_t cuStream)
1479 {
1480 ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
1481 d_srcImg.memType() == gpuAllocator.memType() &&
1482 (d_srcImg.memType() == NCVMemoryTypeDevice ||
1483 d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
1484
1485 ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
1486 d_HaarStages.memType() == d_HaarFeatures.memType() &&
1487 (d_HaarStages.memType() == NCVMemoryTypeDevice ||
1488 d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
1489
1490 ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
1491
1492 ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
1493
1494 ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
1495 h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
1496 d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
1497 ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
1498 d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
1499 srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
1500 d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);
1501
1502 ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);
1503
1504 ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
1505 d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
1506 d_HaarFeatures.length() >= haar.NumFeatures &&
1507 d_HaarStages.length() == h_HaarStages.length() &&
1508 haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
1509
1510 ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
1511
1512 ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);
1513
1514 //TODO: set NPP active stream to cuStream
1515
1516 NCVStatus ncvStat;
1517 NCV_SET_SKIP_COND(gpuAllocator.isCounting());
1518
1519 Ncv32u integralWidth = d_srcImg.width() + 1;
1520 Ncv32u integralHeight = d_srcImg.height() + 1;

    NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
    ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
    ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVStatus nppStat;
    Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
    nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
    NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
    ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                       integral.ptr(), integral.pitch(),
                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
                                          d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
                                          NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                          d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);

    NCV_SKIP_COND_END

    dstNumRects = 0;

    Ncv32u lastCheckedScale = 0;
    NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
    std::vector<Ncv32u> scalesVector;

    NcvBool bFoundLargestFace = false;

    for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
    {
        Ncv32u scale = (Ncv32u)scaleIter;
        if (lastCheckedScale == scale)
        {
            continue;
        }
        lastCheckedScale = scale;

        if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
            haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
        {
            continue;
        }

        NcvSize32s srcRoi_, srcIIRoi, scaledIIRoi, searchRoi;

        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();

        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;

        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;

        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        if (searchRoi.width <= 0 || searchRoi.height <= 0)
        {
            break;
        }

        scalesVector.push_back(scale);

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }
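
    // Note on the enumeration above: scaleIter grows geometrically by scaleStep, but the
    // effective scale is its integer truncation (the decimation factor applied to the
    // integral images), so consecutive iterates that truncate to the same factor are
    // skipped and each factor is visited at most once.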

    if (bReverseTraverseScale)
    {
        std::reverse(scalesVector.begin(), scalesVector.end());
    }

    //TODO: handle _fair_scale_ flag
    for (Ncv32u i=0; i<scalesVector.size(); i++)
    {
        Ncv32u scale = scalesVector[i];

        NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
        NcvSize32u srcIIRoi;
        srcRoi_.width = d_srcImg.width();
        srcRoi_.height = d_srcImg.height();
        srcIIRoi.width = srcRoi_.width + 1;
        srcIIRoi.height = srcRoi_.height + 1;
        scaledIIRoi.width = srcIIRoi.width / scale;
        scaledIIRoi.height = srcIIRoi.height / scale;
        searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
        searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;

        NCV_SKIP_COND_BEGIN

        nppStat = nppiStDecimate_32u_C1R(
            integral.ptr(), integral.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        nppStat = nppiStDecimate_64u_C1R(
            d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);

        const NcvRect32u rect(
            HAAR_STDDEV_BORDER,
            HAAR_STDDEV_BORDER,
            haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
            haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
        nppStat = nppiStRectStdDev_32f_C1R(
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
            d_rectStdDev.ptr(), d_rectStdDev.pitch(),
            NcvSize32u(searchRoi.width, searchRoi.height), rect,
            (Ncv32f)scale*scale, true);
        ncvAssertReturnNcvStat(nppStat);

        NCV_SKIP_COND_END

        Ncv32u detectionsOnThisScale;
        ncvStat = ncvApplyHaarClassifierCascade_device(
            d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
            detectionsOnThisScale,
            haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
            searchRoi, pixelStep, (Ncv32f)scale*scale,
            gpuAllocator, cpuAllocator, devProp, cuStream);
        ncvAssertReturnNcvStat(ncvStat);

        NCV_SKIP_COND_BEGIN

        NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
        ncvStat = ncvGrowDetectionsVector_device(
            d_vecPixelMask,
            detectionsOnThisScale,
            d_hypothesesIntermediate,
            dstNumRects,
            static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
            haar.ClassifierSize.width,
            haar.ClassifierSize.height,
            (Ncv32f)scale,
            cuStream);
        ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);

        if (flags & NCVPipeObjDet_FindLargestObject)
        {
            if (dstNumRects == 0)
            {
                continue;
            }

            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

            Ncv32u numStrongHypothesesNow = dstNumRects;
            ncvStat = ncvGroupRectangles_host(
                h_hypothesesIntermediate,
                numStrongHypothesesNow,
                minNeighbors,
                RECT_SIMILARITY_PROPORTION,
                NULL);
            ncvAssertReturnNcvStat(ncvStat);

            if (numStrongHypothesesNow > 0)
            {
                NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
                for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
                {
                    if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
                    {
                        maxRect = h_hypothesesIntermediate.ptr()[j];
                    }
                }

                h_hypothesesIntermediate.ptr()[0] = maxRect;
                dstNumRects = 1;

                ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
                ncvAssertReturnNcvStat(ncvStat);

                bFoundLargestFace = true;

                break;
            }
        }

        NCV_SKIP_COND_END

        if (gpuAllocator.isCounting())
        {
            break;
        }
    }

    NCVStatus ncvRetCode = NCV_SUCCESS;

    NCV_SKIP_COND_BEGIN

    if (flags & NCVPipeObjDet_FindLargestObject)
    {
        if (!bFoundLargestFace)
        {
            dstNumRects = 0;
        }
    }
    else
    {
        //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
        if (dstNumRects != 0)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
            ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
            ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        }

        ncvStat = ncvGroupRectangles_host(
            h_hypothesesIntermediate,
            dstNumRects,
            minNeighbors,
            RECT_SIMILARITY_PROPORTION,
            NULL);
        ncvAssertReturnNcvStat(ncvStat);

        if (dstNumRects > d_dstRects.length())
        {
            ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
            dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
        }

        if (dstNumRects != 0)
        {
            ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
                                                         dstNumRects * sizeof(NcvRect32u));
            ncvAssertReturnNcvStat(ncvStat);
        }
    }

    if (flags & NCVPipeObjDet_VisualizeInPlace)
    {
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
        ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
                               d_srcImg.width(), d_srcImg.height(),
                               d_dstRects.ptr(), dstNumRects, 255, cuStream);
    }

    NCV_SKIP_COND_END

    return ncvRetCode;
}
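
// Usage sketch (hedged): callers typically drive ncvDetectObjectsMultiScale_device in two
// passes - a "counting" pass with stack allocators in measurement mode to size the memory
// pools, then the real pass with allocators backed by a single device/pinned allocation.
// The snippet below is illustrative only (the variable names and surrounding setup are
// assumptions, not part of this file); it mirrors the pattern used by the NCV samples.
#if 0
    NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
    NCVMemStackAllocator cpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));

    // Counting pass: no kernels run (NCV_SKIP_COND), only peak allocation sizes are recorded.
    ncvAssertReturnNcvStat(ncvDetectObjectsMultiScale_device(
        d_srcImg, srcRoi, d_dstRects, dstNumRects, haar,
        h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures,
        minObjSize, 4, 1.2f, 1, NCVPipeObjDet_Default,
        gpuCounter, cpuCounter, devProp, 0));

    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(),
                                      static_cast<Ncv32u>(devProp.textureAlignment));
    NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(),
                                      static_cast<Ncv32u>(devProp.textureAlignment));

    // Real pass: same call, now with fully backed allocators and the user's stream.
    ncvAssertReturnNcvStat(ncvDetectObjectsMultiScale_device(
        d_srcImg, srcRoi, d_dstRects, dstNumRects, haar,
        h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures,
        minObjSize, 4, 1.2f, 1, NCVPipeObjDet_Default,
        gpuAllocator, cpuAllocator, devProp, cuStream));
#endif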


//==============================================================================
//
// Purely Host code: classifier IO, mock-ups
//
//==============================================================================


#ifdef _SELF_TEST_
#include <float.h>
#endif


NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
                                             NCVMatrix<Ncv32f> &h_weights,
                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
                                             Ncv32u &numDetections,
                                             HaarClassifierCascadeDescriptor &haar,
                                             NCVVector<HaarStage64> &h_HaarStages,
                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                             NCVVector<HaarFeature64> &h_HaarFeatures,
                                             NcvBool bMaskElements,
                                             NcvSize32u anchorsRoi,
                                             Ncv32u pixelStep,
                                             Ncv32f scaleArea)
{
    ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
                    h_integralImage.memType() == h_pixelMask.memType() &&
                    (h_integralImage.memType() == NCVMemoryTypeHostPageable ||
                     h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
                    h_HaarStages.memType() == h_HaarFeatures.memType() &&
                    (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
                     h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
                    h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
                    h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
                    h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
                    h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    h_HaarFeatures.length() >= haar.NumFeatures &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
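
    // Node thresholds are trained on variance-normalized windows, so instead of normalizing
    // every rectangle sum the comparison below rescales the threshold by the per-window
    // stddev (h_weights) times the effective window area; scaleArea carries the decimation
    // factor of the integral image this cascade is being applied to.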

    for (Ncv32u i=0; i<anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
        {
            if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
            {
                h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
            else
            {
                for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
                {
                    Ncv32f curStageSum = 0.0f;
                    Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
                    Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();

                    if (iStage == 0)
                    {
                        if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                        {
                            break;
                        }
                        else
                        {
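                            // Seed the mask with this anchor's position packed as (y << 16) | x;
                            // entries that survive all stages are later unpacked by pixelToRect.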
                            h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
                        }
                    }
                    else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
                    {
                        break;
                    }

                    while (numRootNodesInStage--)
                    {
                        NcvBool bMoreNodesToTraverse = true;
                        Ncv32u curNodeOffset = curRootNodeOffset;

                        while (bMoreNodesToTraverse)
                        {
                            HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
                            HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc();
                            Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures();
                            Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset();

                            Ncv32f curNodeVal = 0.f;
                            for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                            {
                                HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
                                Ncv32u rectX, rectY, rectWidth, rectHeight;
                                feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
                                Ncv32f rectWeight = feature.getWeight();
                                Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
                                Ncv32u iioffsTR = iioffsTL + rectWidth;
                                Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
                                Ncv32u iioffsBR = iioffsBL + rectWidth;
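
                                // Four-corner lookup: on an integral image the sum of pixels
                                // inside the rectangle is II(BR) - II(BL) - II(TR) + II(TL).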

                                Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
                                Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
                                Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
                                Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
                                Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
                                curNodeVal += (Ncv32f)rectSum * rectWeight;
                            }

                            HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                            HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                            Ncv32f nodeThreshold = curNode.getThreshold();

                            HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                            NcvBool nextNodeIsLeaf;

                            if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
                            {
                                nextNodeDescriptor = nodeLeft;
                                nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf();
                            }
                            else
                            {
                                nextNodeDescriptor = nodeRight;
                                nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf();
                            }

                            if (nextNodeIsLeaf)
                            {
                                Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
                                curStageSum += tmpLeafValue;
                                bMoreNodesToTraverse = false;
                            }
                            else
                            {
                                curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
                            }
                        }

                        curRootNodeOffset++;
                    }

                    Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
                    if (curStageSum < tmpStageThreshold)
                    {
                        //drop
                        h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
                        break;
                    }
                }
            }
        }
    }

    std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
    Ncv32u i = 0;
    for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
    {
        if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
        {
            break;
        }
    }
    numDetections = i;

    return NCV_SUCCESS;
}


NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
                                       Ncv32u numPixelMaskDetections,
                                       NCVVector<NcvRect32u> &hypotheses,
                                       Ncv32u &totalDetections,
                                       Ncv32u totalMaxDetections,
                                       Ncv32u rectWidth,
                                       Ncv32u rectHeight,
                                       Ncv32f curScale)
{
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);

    NCVStatus ncvStat = NCV_SUCCESS;
    Ncv32u numDetsToCopy = numPixelMaskDetections;

    if (numDetsToCopy == 0)
    {
        return ncvStat;
    }

    if (totalDetections + numPixelMaskDetections > totalMaxDetections)
    {
        ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
        numDetsToCopy = totalMaxDetections - totalDetections;
    }

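    // Each surviving mask element packs its anchor as (y << 16) | x; pixelToRect unpacks
    // it and scales both the position and the classifier window size by curScale.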
    for (Ncv32u i=0; i<numDetsToCopy; i++)
    {
        hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
    }

    totalDetections += numDetsToCopy;
    return ncvStat;
}

#define RECT_X_IDX      0
#define RECT_Y_IDX      1
#define RECT_W_IDX      2
#define RECT_H_IDX      3
#define RECT_WEIGHT_IDX 4

#define CUDA_CC_SIZE_W 0
#define CUDA_CC_SIZE_H 1
2031


static NCVStatus loadFromXML(const cv::String &filename,
                             HaarClassifierCascadeDescriptor &haar,
                             std::vector<HaarStage64> &haarStages,
                             std::vector<HaarClassifierNode128> &haarClassifierNodes,
                             std::vector<HaarFeature64> &haarFeatures)
{
    const char *CUDA_CC_SIZE = "size";
    const char *CUDA_CC_STAGES = "stages";
    const char *CUDA_CC_STAGE_THRESHOLD = "stage_threshold";
    const char *CUDA_CC_TREES = "trees";
    const char *CUDA_CC_FEATURE = "feature";
    const char *CUDA_CC_RECT = "rects";
    const char *CUDA_CC_TILTED = "tilted";
    const char *CUDA_CC_THRESHOLD = "threshold";
    const char *CUDA_CC_LEFT_VAL = "left_val";
    const char *CUDA_CC_RIGHT_VAL = "right_val";
    const char *CUDA_CC_LEFT_NODE = "left_node";
    const char *CUDA_CC_RIGHT_NODE = "right_node";

    NCVStatus ncvStat;

    haar.NumStages = 0;
    haar.NumClassifierRootNodes = 0;
    haar.NumClassifierTotalNodes = 0;
    haar.NumFeatures = 0;
    haar.ClassifierSize.width = 0;
    haar.ClassifierSize.height = 0;
    haar.bHasStumpsOnly = true;
    haar.bNeedsTiltedII = false;
    Ncv32u curMaxTreeDepth = 0;

    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
    haarStages.resize(0);
    haarClassifierNodes.resize(0);
    haarFeatures.resize(0);

    cv::FileStorage fs(filename, cv::FileStorage::READ | cv::FileStorage::FORMAT_XML);

    if (!fs.isOpened())
        return NCV_FILE_ERROR;

    const cv::FileNode &root = fs.getFirstTopLevelNode();
    const cv::FileNode &fnSize = root[CUDA_CC_SIZE];

    // collect the cascade classifier window size
    haar.ClassifierSize.width = (int)fnSize[CUDA_CC_SIZE_W];
    haar.ClassifierSize.height = (int)fnSize[CUDA_CC_SIZE_H];
    CV_Assert(haar.ClassifierSize.height > 0 && haar.ClassifierSize.width > 0);

    const cv::FileNode &fnStages = root[CUDA_CC_STAGES];
    cv::FileNodeIterator it = fnStages.begin(), it_end = fnStages.end();

    for (; it != it_end; ++it) // by stages
    {
        cv::FileNode fnStage = *it;
        HaarStage64 curStage;

        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
        curStage.setStageThreshold((float)fnStage[CUDA_CC_STAGE_THRESHOLD]);

        // iterate over the trees
        const cv::FileNode &fnTrees = fnStage[CUDA_CC_TREES];
        cv::FileNodeIterator it1 = fnTrees.begin(), it1_end = fnTrees.end();

        for (; it1 != it1_end; ++it1) // by trees
        {
            cv::FileNode tree = *it1;
            Ncv32u nodeId = 0;
            HaarClassifierNode128 curNode;

            curNode.setThreshold((float)tree[0][CUDA_CC_THRESHOLD]);

            NcvBool bIsLeftNodeLeaf = false;
            NcvBool bIsRightNodeLeaf = false;

            HaarClassifierNodeDescriptor32 nodeLeft;

            cv::FileNode leftNode = tree[0][CUDA_CC_LEFT_NODE];

            if (leftNode.empty()) // no child node: this branch carries a leaf value
            {
                Ncv32f leftVal = tree[0][CUDA_CC_LEFT_VAL];
                ncvStat = nodeLeft.create(leftVal);
                ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                bIsLeftNodeLeaf = true;
            }
            else
            {
                Ncv32u leftNodeOffset = (int)tree[0][CUDA_CC_LEFT_NODE];
                nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
                haar.bHasStumpsOnly = false;
            }

            curNode.setLeftNodeDesc(nodeLeft);

            HaarClassifierNodeDescriptor32 nodeRight;
            cv::FileNode rightNode = tree[0][CUDA_CC_RIGHT_NODE];

            if (rightNode.empty())
            {
                Ncv32f rightVal = tree[0][CUDA_CC_RIGHT_VAL];
                ncvStat = nodeRight.create(rightVal);
                ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
                bIsRightNodeLeaf = true;
            }
            else
            {
                Ncv32u rightNodeOffset = (int)tree[0][CUDA_CC_RIGHT_NODE];
                nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
                haar.bHasStumpsOnly = false;
            }

            curNode.setRightNodeDesc(nodeRight);
            cv::FileNode fnFeature = tree[0][CUDA_CC_FEATURE];
            Ncv32u tiltedVal = (int)fnFeature[CUDA_CC_TILTED];
            NcvBool bTilted = (tiltedVal != 0);
            haar.bNeedsTiltedII |= bTilted; //any tilted feature anywhere in the cascade requires the tilted integral image

            cv::FileNodeIterator it2 = fnFeature[CUDA_CC_RECT].begin(), it2_end = fnFeature[CUDA_CC_RECT].end();

            Ncv32u featureId = 0;
            for (; it2 != it2_end; ++it2) // by feature
            {
                cv::FileNode rect = *it2;

                Ncv32u rectX = (int)rect[RECT_X_IDX];
                Ncv32u rectY = (int)rect[RECT_Y_IDX];
                Ncv32u rectWidth = (int)rect[RECT_W_IDX];
                Ncv32u rectHeight = (int)rect[RECT_H_IDX];

                Ncv32f rectWeight = (float)rect[RECT_WEIGHT_IDX];

                if (rectWeight == 0)
                    break;

                HaarFeature64 curFeature;
                ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
                curFeature.setWeight(rectWeight);
                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);

                haarFeatures.push_back(curFeature);
                featureId++;
            }
            HaarFeatureDescriptor32 tmpFeatureDesc;
            ncvStat = tmpFeatureDesc.create(bTilted, bIsLeftNodeLeaf, bIsRightNodeLeaf,
                                            featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
            ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
            curNode.setFeatureDesc(tmpFeatureDesc);

            if (!nodeId)
            {
                //root node
                haarClassifierNodes.push_back(curNode);
                curMaxTreeDepth = 1;
            }
            else
            {
                //other node
                h_TmpClassifierNotRootNodes.push_back(curNode);
                curMaxTreeDepth++;
            }

            nodeId++;
        }

        curStage.setNumClassifierRootNodes((Ncv32u)fnTrees.size());
        haarStages.push_back(curStage);
    }

    //fill in cascade stats
    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());

    //merge root and non-root nodes into one classifier array, rebasing child offsets
    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
    }

    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
    {
        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();

        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
        if (!featureDesc.isLeftNodeLeaf())
        {
            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
            nodeLeft.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);

        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
        if (!featureDesc.isRightNodeLeaf())
        {
            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
            nodeRight.create(newOffset);
        }
        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);

        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
    }

    return NCV_SUCCESS;
}
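
// For reference, a minimal sketch of the old-style OpenCV haar cascade XML this loader
// expects (abbreviated and from memory; real cascades have many stages and trees, and
// "cascade_name" is a placeholder):
//
//   <opencv_storage>
//   <cascade_name type_id="opencv-haar-classifier">
//     <size>20 20</size>
//     <stages>
//       <_><trees>
//         <_><_>
//           <feature>
//             <rects>
//               <_>3 7 14 4 -1.</_>
//               <_>3 9 14 2 2.</_>
//             </rects>
//             <tilted>0</tilted>
//           </feature>
//           <threshold>4.0141958743e-003</threshold>
//           <left_val>0.0337941907</left_val>
//           <right_val>-0.8378106952</right_val>
//         </_></_>
//       </trees>
//       <stage_threshold>0.8226894140</stage_threshold></_>
//     </stages>
//   </cascade_name>
//   </opencv_storage>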


#define NVBIN_HAAR_SIZERESERVED 16
#define NVBIN_HAAR_VERSION 0x1
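
// NVBIN layout, as read/written below: [0] version (Ncv32u), [4] total file size (Ncv32u),
// reserved up to byte 16, then NumStages, NumClassifierRootNodes, NumClassifierTotalNodes,
// NumFeatures (all Ncv32u), ClassifierSize (NcvSize32u), bNeedsTiltedII and bHasStumpsOnly
// (both NcvBool), followed by the packed HaarStage64, HaarClassifierNode128 and
// HaarFeature64 arrays.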


static NCVStatus loadFromNVBIN(const cv::String &filename,
                               HaarClassifierCascadeDescriptor &haar,
                               std::vector<HaarStage64> &haarStages,
                               std::vector<HaarClassifierNode128> &haarClassifierNodes,
                               std::vector<HaarFeature64> &haarFeatures)
{
    size_t readCount;
    FILE *fp = fopen(filename.c_str(), "rb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    Ncv32u fileVersion;
    readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
    Ncv32u fsize;
    readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fseek(fp, 0, SEEK_END);
    Ncv32u fsizeActual = static_cast<Ncv32u>(ftell(fp));
    ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);

    std::vector<unsigned char> fdata;
    fdata.resize(fsize);
    Ncv32u dataOffset = 0;
    fseek(fp, 0, SEEK_SET);
    readCount = fread(&fdata[0], fsize, 1, fp);
    ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
    fclose(fp);

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(Ncv32u);
    haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvSize32u);
    haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);
    haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
    dataOffset += sizeof(NcvBool);

    haarStages.resize(haar.NumStages);
    haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
    haarFeatures.resize(haar.NumFeatures);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
    dataOffset += szStages;
    memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
    dataOffset += szFeatures;

    return NCV_SUCCESS;
}


NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures)
{
    size_t readCount;
    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);

    if (fext == "nvbin")
    {
        FILE *fp = fopen(filename.c_str(), "rb");
        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
        Ncv32u fileVersion;
        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
        Ncv32u tmp;
        readCount = fread(&numStages, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&tmp, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numNodes, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        fclose(fp);
    }
    else if (fext == "xml")
    {
        HaarClassifierCascadeDescriptor haar;
        std::vector<HaarStage64> haarStages;
        std::vector<HaarClassifierNode128> haarNodes;
        std::vector<HaarFeature64> haarFeatures;

        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);

        numStages = haar.NumStages;
        numNodes = haar.NumClassifierTotalNodes;
        numFeatures = haar.NumFeatures;
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    return NCV_SUCCESS;
}


NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename,
                                   HaarClassifierCascadeDescriptor &haar,
                                   NCVVector<HaarStage64> &h_HaarStages,
                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                   NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);

    std::vector<HaarStage64> haarStages;
    std::vector<HaarClassifierNode128> haarNodes;
    std::vector<HaarFeature64> haarFeatures;

    if (fext == "nvbin")
    {
        ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else if (fext == "xml")
    {
        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);

    memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
    memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
    memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));

    return NCV_SUCCESS;
}
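
// Usage sketch (hedged): query the sizes first, then allocate pinned host vectors to match
// before loading. NCVMemNativeAllocator and NCVVectorAlloc come from NCV.hpp; the file name,
// devProp and variable names below are illustrative assumptions, not part of this API.
#if 0
    Ncv32u numStages, numNodes, numFeatures;
    ncvAssertReturnNcvStat(ncvHaarGetClassifierSize("haarcascade_frontalface_alt.xml",
                                                    numStages, numNodes, numFeatures));

    NCVMemNativeAllocator hostAlloc(NCVMemoryTypeHostPinned, static_cast<Ncv32u>(devProp.textureAlignment));
    NCVVectorAlloc<HaarStage64> h_stages(hostAlloc, numStages);
    NCVVectorAlloc<HaarClassifierNode128> h_nodes(hostAlloc, numNodes);
    NCVVectorAlloc<HaarFeature64> h_features(hostAlloc, numFeatures);

    HaarClassifierCascadeDescriptor haar;
    ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host("haarcascade_frontalface_alt.xml",
                                                    haar, h_stages, h_nodes, h_features));
#endif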


NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,
                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                 NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    Ncv32u dataOffset = 0;
    std::vector<unsigned char> fdata;
    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);

    //header
    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
    dataOffset += sizeof(Ncv32u);
    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
    dataOffset += sizeof(NcvSize32u);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
    dataOffset += sizeof(NcvBool);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
    dataOffset += sizeof(NcvBool);

    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
    dataOffset += szStages;
    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
    dataOffset += szFeatures;
    Ncv32u fsize = dataOffset;

    //TODO: CRC32 here

    //update header
    dataOffset = sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;

    FILE *fp = fopen(filename.c_str(), "wb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    fwrite(&fdata[0], fsize, 1, fp);
    fclose(fp);
    return NCV_SUCCESS;
}
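
// Hedged sketch: converting an XML cascade to NVBIN is a load followed by a store through
// the same pinned-host vectors as in the loading example above; names are illustrative.
#if 0
    ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host("cascade.xml", haar, h_stages, h_nodes, h_features));
    ncvAssertReturnNcvStat(ncvHaarStoreNVBIN_host("cascade.nvbin", haar, h_stages, h_nodes, h_features));
#endif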