/*
 * Software License Agreement (BSD License)
 *
 *  Point Cloud Library (PCL) - www.pointclouds.org
 *  Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
 *  Third party copyrights are property of their respective owners.
 *
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following
 *     disclaimer in the documentation and/or other materials provided
 *     with the distribution.
 *   * Neither the name of Willow Garage, Inc. nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *
 * $Id:  $
 * Ported to PCL by Koen Buys : Attention Work in progress!
 */

#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <cstring>
#include <cstdlib>
#include "NCV.hpp"


//==============================================================================
//
// Error handling helpers
//
//==============================================================================

static void stdDebugOutput(const std::string &msg)
{
    std::cout << msg;
}


static NCVDebugOutputHandler *debugOutputHandler = stdDebugOutput;


void ncvDebugOutput(const std::string &msg)
{
    debugOutputHandler(msg);
}


void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
{
    debugOutputHandler = func;
}

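// Example (illustrative only): an application could redirect NCV debug output
// to std::cerr by installing its own handler, e.g. a hypothetical helper:
//
//     static void cerrDebugOutput(const std::string &msg) { std::cerr << msg; }
//     ncvSetDebugOutputHandler(cerrDebugOutput);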

//==============================================================================
//
// Memory wrappers and helpers
//
//==============================================================================

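// Rounds `what` up to the nearest multiple of `alignment`. The bit-mask trick
// below is only valid when `alignment` is a power of two, e.g.
// alignUp(13, 8) == 16 and alignUp(16, 8) == 16.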
Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
{
    Ncv32u alignMask = alignment-1;
    Ncv32u inverseAlignMask = ~alignMask;
    Ncv32u res = (what + alignMask) & inverseAlignMask;
    return res;
}


void NCVMemPtr::clear()
{
    ptr = NULL;
    memtype = NCVMemoryTypeNone;
}


void NCVMemSegment::clear()
{
    begin.clear();
    size = 0;
}

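// Copies `sz` bytes between two memory regions, dispatching on the residence
// of source and destination: host-to-host copies use memcpy, any copy that
// touches device memory goes through cudaMemcpy, or cudaMemcpyAsync when a
// non-NULL stream is supplied. Unsupported residence combinations yield
// NCV_MEM_RESIDENCE_ERROR.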
NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, std::size_t sz, cudaStream_t cuStream)
{
    NCVStatus ncvStat;
    switch (dstType)
    {
    case NCVMemoryTypeHostPageable:
    case NCVMemoryTypeHostPinned:
        switch (srcType)
        {
        case NCVMemoryTypeHostPageable:
        case NCVMemoryTypeHostPinned:
            memcpy(dst, src, sz);
            ncvStat = NCV_SUCCESS;
            break;
        case NCVMemoryTypeDevice:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        default:
            ncvStat = NCV_MEM_RESIDENCE_ERROR;
        }
        break;
    case NCVMemoryTypeDevice:
        switch (srcType)
        {
        case NCVMemoryTypeHostPageable:
        case NCVMemoryTypeHostPinned:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        case NCVMemoryTypeDevice:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        default:
            ncvStat = NCV_MEM_RESIDENCE_ERROR;
        }
        break;
    default:
        ncvStat = NCV_MEM_RESIDENCE_ERROR;
    }

    return ncvStat;
}

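// 2D variant of memSegCopyHelper: copies a region of `widthbytes` x `height`
// between two pitched buffers. Host-to-host copies are done row by row with
// memcpy; copies involving device memory use cudaMemcpy2D / cudaMemcpy2DAsync.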
NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
                             const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
                             Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
{
    NCVStatus ncvStat;
    switch (dstType)
    {
    case NCVMemoryTypeHostPageable:
    case NCVMemoryTypeHostPinned:
        switch (srcType)
        {
        case NCVMemoryTypeHostPageable:
        case NCVMemoryTypeHostPinned:
            for (Ncv32u i=0; i<height; i++)
            {
                memcpy((char*)dst + i * dstPitch, (const char*)src + i * srcPitch, widthbytes);
            }
            ncvStat = NCV_SUCCESS;
            break;
        case NCVMemoryTypeDevice:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        default:
            ncvStat = NCV_MEM_RESIDENCE_ERROR;
        }
        break;
    case NCVMemoryTypeDevice:
        switch (srcType)
        {
        case NCVMemoryTypeHostPageable:
        case NCVMemoryTypeHostPinned:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        case NCVMemoryTypeDevice:
            if (cuStream != 0)
            {
                ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
            }
            else
            {
                ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
            }
            ncvStat = NCV_SUCCESS;
            break;
        default:
            ncvStat = NCV_MEM_RESIDENCE_ERROR;
        }
        break;
    default:
        ncvStat = NCV_MEM_RESIDENCE_ERROR;
    }

    return ncvStat;
}


//===================================================================
//
// NCVMemStackAllocator class members implementation
//
//===================================================================

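// Constructs a "counting" allocator (memory type NCVMemoryTypeNone) that owns
// no storage: alloc() only accumulates the aligned request sizes, so that
// maxSize() can later report the capacity a real allocation pass would need.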
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
    :
    currentSize(0),
    _maxSize(0),
    allocBegin(NULL),
    begin(NULL),
    end(NULL),
    _memType(NCVMemoryTypeNone),
    _alignment(alignment),
    bReusesMemory(false)
{
    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
}

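// Constructs a stack allocator backed by `capacity` bytes of the requested
// memory type. If `reusePtr` is non-NULL the external buffer is adopted
// instead of allocating (and is not freed in the destructor); a zero capacity
// leaves the allocator with a non-NULL sentinel pointer so that it still
// reports itself as initialized.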
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, std::size_t capacity, Ncv32u alignment, void *reusePtr)
    :
    currentSize(0),
    _maxSize(0),
    allocBegin(NULL),
    _memType(memT),
    _alignment(alignment)
{
    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
    ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");

    allocBegin = NULL;

    if (reusePtr == NULL && capacity != 0)
    {
        bReusesMemory = false;
        switch (memT)
        {
        case NCVMemoryTypeDevice:
            ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
            break;
        case NCVMemoryTypeHostPinned:
            ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
            break;
        case NCVMemoryTypeHostPageable:
            allocBegin = (Ncv8u *)malloc(capacity);
            break;
        default:;
        }
    }
    else
    {
        bReusesMemory = true;
        allocBegin = (Ncv8u *)reusePtr;
    }

    if (capacity == 0)
    {
        allocBegin = (Ncv8u *)(0x1);
    }

    if (!isCounting())
    {
        begin = allocBegin;
        end = begin + capacity;
    }
}


NCVMemStackAllocator::~NCVMemStackAllocator()
{
    if (allocBegin != NULL)
    {
        ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");

        if (!bReusesMemory && (allocBegin != (Ncv8u *)(0x1)))
        {
            switch (_memType)
            {
            case NCVMemoryTypeDevice:
                ncvAssertCUDAReturn(cudaFree(allocBegin), );
                break;
            case NCVMemoryTypeHostPinned:
                ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
                break;
            case NCVMemoryTypeHostPageable:
                free(allocBegin);
                break;
            default:;
            }
        }

        allocBegin = NULL;
    }
}

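// Carves a segment off the top of the stack. Requests are rounded up to the
// allocator's alignment; in counting mode only currentSize and the high-water
// mark (_maxSize) are updated and the returned segment carries no usable
// pointer.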
NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, std::size_t size)
{
    seg.clear();
    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);

    size = alignUp(size, this->_alignment);
    this->currentSize += size;
    this->_maxSize = std::max(this->_maxSize, this->currentSize);

    if (!isCounting())
    {
        std::size_t availSize = end - begin;
        ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY);
    }

    seg.begin.ptr = begin;
    seg.begin.memtype = this->_memType;
    seg.size = size;
    begin += size;

    return NCV_SUCCESS;
}

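// Returns a segment to the stack. Segments must be deallocated in the reverse
// order of their allocation, which is enforced by checking that the segment
// sits exactly at the current top of the stack.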
NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg)
{
    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
    ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
    ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC);
    ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER);

    currentSize -= seg.size;
    begin -= seg.size;

    seg.clear();

    ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC);

    return NCV_SUCCESS;
}


NcvBool NCVMemStackAllocator::isInitialized(void) const
{
    return (((this->_alignment & (this->_alignment-1)) == 0) && isCounting()) || (this->allocBegin != NULL);
}


NcvBool NCVMemStackAllocator::isCounting(void) const
{
    return this->_memType == NCVMemoryTypeNone;
}


NCVMemoryType NCVMemStackAllocator::memType(void) const
{
    return this->_memType;
}


Ncv32u NCVMemStackAllocator::alignment(void) const
{
    return this->_alignment;
}


size_t NCVMemStackAllocator::maxSize(void) const
{
    return this->_maxSize;
}


//===================================================================
//
// NCVMemNativeAllocator class members implementation
//
//===================================================================


NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
    :
    currentSize(0),
    _maxSize(0),
    _memType(memT),
    _alignment(alignment)
{
    ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
}


NCVMemNativeAllocator::~NCVMemNativeAllocator()
{
    ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak");
}

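// Allocates a standalone segment directly from the backing API of the
// allocator's memory type (cudaMalloc, cudaMallocHost or malloc) and tracks
// the aligned total for bookkeeping.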
NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, std::size_t size)
{
    seg.clear();
    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);

    switch (this->_memType)
    {
    case NCVMemoryTypeDevice:
        ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR);
        break;
    case NCVMemoryTypeHostPinned:
        ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR);
        break;
    case NCVMemoryTypeHostPageable:
        seg.begin.ptr = (Ncv8u *)malloc(size);
        break;
    default:;
    }

    this->currentSize += alignUp(size, this->_alignment);
    this->_maxSize = std::max(this->_maxSize, this->currentSize);

    seg.begin.memtype = this->_memType;
    seg.size = size;

    return NCV_SUCCESS;
}

NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)
{
    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
    ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
    ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);

    ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
    currentSize -= alignUp(seg.size, this->_alignment);

    switch (this->_memType)
    {
    case NCVMemoryTypeDevice:
        ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR);
        break;
    case NCVMemoryTypeHostPinned:
        ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR);
        break;
    case NCVMemoryTypeHostPageable:
        free(seg.begin.ptr);
        break;
    default:;
    }

    seg.clear();

    return NCV_SUCCESS;
}


NcvBool NCVMemNativeAllocator::isInitialized(void) const
{
    return (this->_alignment != 0);
}


NcvBool NCVMemNativeAllocator::isCounting(void) const
{
    return false;
}


NCVMemoryType NCVMemNativeAllocator::memType(void) const
{
    return this->_memType;
}


Ncv32u NCVMemNativeAllocator::alignment(void) const
{
    return this->_alignment;
}


size_t NCVMemNativeAllocator::maxSize(void) const
{
    return this->_maxSize;
}


//===================================================================
//
// Operations with rectangles
//
//===================================================================

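// Draws one-pixel-wide rectangle outlines into a host image of
// dstWidth x dstHeight pixels with row stride dstStride, clipping every edge
// against the image bounds.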
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
                                      Ncv32u dstStride,
                                      Ncv32u dstWidth,
                                      Ncv32u dstHeight,
                                      NcvRect32u *h_rects,
                                      Ncv32u numRects,
                                      T color)
{
    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects != 0, NCV_SUCCESS);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    for (Ncv32u r=0; r<numRects; r++)
    {
        NcvRect32u rect = h_rects[r];

        if (rect.x < dstWidth)
        {
            for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
            {
                h_dst[i*dstStride+rect.x] = color;
            }
        }
        if (rect.x+rect.width-1 < dstWidth)
        {
            for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
            {
                h_dst[i*dstStride+rect.x+rect.width-1] = color;
            }
        }
        if (rect.y < dstHeight)
        {
            for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
            {
                h_dst[rect.y*dstStride+j] = color;
            }
        }
        if (rect.y + rect.height - 1 < dstHeight)
        {
            for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
            {
                h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
            }
        }
    }

    return NCV_SUCCESS;
}


NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                               Ncv32u dstStride,
                               Ncv32u dstWidth,
                               Ncv32u dstHeight,
                               NcvRect32u *h_rects,
                               Ncv32u numRects,
                               Ncv8u color)
{
    return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}


NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
                                Ncv32u dstStride,
                                Ncv32u dstWidth,
                                Ncv32u dstHeight,
                                NcvRect32u *h_rects,
                                Ncv32u numRects,
                                Ncv32u color)
{
    return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}


const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;

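// CUDA kernel: each rectangle is served by four thread blocks, one per edge.
// Bit 0 of the linear block id selects vertical vs. horizontal edges and bit 1
// selects which of the two edges to draw; the NUMTHREADS_DRAWRECTS threads of
// a block then walk along the edge in chunks, clipped to the image bounds.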
template <class T>
__global__ void drawRects(T *d_dst,
                          Ncv32u dstStride,
                          Ncv32u dstWidth,
                          Ncv32u dstHeight,
                          NcvRect32u *d_rects,
                          Ncv32u numRects,
                          T color)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    if (blockId >= numRects * 4)
    {
        return;
    }

    NcvRect32u curRect = d_rects[blockId >> 2];
    NcvBool bVertical = blockId & 0x1;
    NcvBool bTopLeft = blockId & 0x2;

    Ncv32u pt0x, pt0y;
    if (bVertical)
    {
        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
        pt0y = curRect.y;

        if (pt0x < dstWidth)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptY < pt0y + curRect.height && ptY < dstHeight)
                {
                    d_dst[ptY * dstStride + pt0x] = color;
                }
            }
        }
    }
    else
    {
        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = curRect.x;
        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;

        if (pt0y < dstHeight)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptX < pt0x + curRect.width && ptX < dstWidth)
                {
                    d_dst[pt0y * dstStride + ptX] = color;
                }
            }
        }
    }
}

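// Launches drawRects with one block per rectangle edge (numRects * 4 blocks).
// When the block count exceeds the 65535 limit of a grid dimension, the grid
// is folded into two dimensions and the kernel recovers the linear block id
// from blockIdx.y * 65535 + blockIdx.x. Note that the launch goes to the
// default stream; cuStream is not forwarded to the kernel.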
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
                                        Ncv32u dstStride,
                                        Ncv32u dstWidth,
                                        Ncv32u dstHeight,
                                        NcvRect32u *d_rects,
                                        Ncv32u numRects,
                                        T color,
                                        cudaStream_t cuStream)
{
    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    if (numRects == 0)
    {
        return NCV_SUCCESS;
    }

    dim3 grid(numRects * 4);
    dim3 block(NUMTHREADS_DRAWRECTS);
    if (grid.x > 65535)
    {
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }

    drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);

    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

    return NCV_SUCCESS;
}


NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
                                 Ncv32u dstHeight,
                                 NcvRect32u *d_rects,
                                 Ncv32u numRects,
                                 Ncv8u color,
                                 cudaStream_t cuStream)
{
    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}


NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
                                  Ncv32u dstHeight,
                                  NcvRect32u *d_rects,
                                  Ncv32u numRects,
                                  Ncv32u color,
                                  cudaStream_t cuStream)
{
    return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}