1 /*
2 * Software License Agreement (BSD License)
3 *
4 *  Copyright (c) 2011, Willow Garage, Inc.
5 *  All rights reserved.
6 *
7 *  Redistribution and use in source and binary forms, with or without
8 *  modification, are permitted provided that the following conditions
9 *  are met:
10 *
11 *   * Redistributions of source code must retain the above copyright
12 *     notice, this list of conditions and the following disclaimer.
13 *   * Redistributions in binary form must reproduce the above
14 *     copyright notice, this list of conditions and the following
15 *     disclaimer in the documentation and/or other materials provided
16 *     with the distribution.
17 *   * Neither the name of Willow Garage, Inc. nor the names of its
18 *     contributors may be used to endorse or promote products derived
19 *     from this software without specific prior written permission.
20 *
21 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 *  POSSIBILITY OF SUCH DAMAGE.
33 *
34 *  Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com)
35 */
36 
37 #ifndef PCL_DEVICE_UTILS_BLOCK_HPP_
38 #define PCL_DEVICE_UTILS_BLOCK_HPP_
39 
40 namespace pcl
41 {
42 	namespace device
43 	{
44 		struct Block
45 		{
idpcl::device::Block46 			static __device__ __forceinline__ unsigned int id()
47 			{
48 				return blockIdx.x;
49 			}
50 
stridepcl::device::Block51 			static __device__ __forceinline__ unsigned int stride()
52 			{
53 				return blockDim.x * blockDim.y * blockDim.z;
54 			}
55 
syncpcl::device::Block56 			static __device__ __forceinline__ void sync()
57 			{
58 				__syncthreads();
59 			}
60 
flattenedThreadIdpcl::device::Block61 			static __device__ __forceinline__ int flattenedThreadId()
62 			{
63 				return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
64 			}
65 
66 			template<typename It, typename T>
fillpcl::device::Block67 			static __device__ __forceinline__ void fill(It beg, It end, const T& value)
68 			{
69 				int STRIDE = stride();
70 				It t = beg + flattenedThreadId();
71 
72 				for(; t < end; t += STRIDE)
73 					*t = value;
74 			}
75 
76 			template<typename OutIt, typename T>
yotapcl::device::Block77 			static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
78 			{
79 				int STRIDE = stride();
80 				int tid = flattenedThreadId();
81 				value += tid;
82 
83 				for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
84 					*t = value;
85 			}
86 
87 			template<typename InIt, typename OutIt>
copypcl::device::Block88 			static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
89 			{
90 				int STRIDE = stride();
91 				InIt  t = beg + flattenedThreadId();
92 				OutIt o = out + (t - beg);
93 
94 				for(; t < end; t += STRIDE, o += STRIDE)
95 					*o = *t;
96 			}
97 
98 			template<typename InIt, typename OutIt, class UnOp>
transformpcl::device::Block99 			static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
100 			{
101 				int STRIDE = stride();
102 				InIt  t = beg + flattenedThreadId();
103 				OutIt o = out + (t - beg);
104 
105 				for(; t < end; t += STRIDE, o += STRIDE)
106 					*o = op(*t);
107 			}
108 
109 			template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
transformpcl::device::Block110 			static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
111 			{
112 				int STRIDE = stride();
113 				InIt1 t1 = beg1 + flattenedThreadId();
114 				InIt2 t2 = beg2 + flattenedThreadId();
115 				OutIt o  = out + (t1 - beg1);
116 
117 				for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE)
118 					*o = op(*t1, *t2);
119 			}
120 
121 			template<int CTA_SIZE, typename T, class BinOp>
reducepcl::device::Block122 			static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
123 			{
124 				int tid = flattenedThreadId();
125 				T val =  buffer[tid];
126 
127 				if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
128 				if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
129 				if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
130 				if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
131 
132 				if (tid < 32)
133 				{
134 					if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
135 					if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
136 					if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
137 					if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
138 					if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
139 					if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
140 				}
141 			}
142 
143 			template<int CTA_SIZE, typename T, class BinOp>
reducepcl::device::Block144 			static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
145 			{
146 				int tid = flattenedThreadId();
147 				T val =  buffer[tid] = init;
148 				__syncthreads();
149 
150 				if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
151 				if (CTA_SIZE >=  512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
152 				if (CTA_SIZE >=  256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
153 				if (CTA_SIZE >=  128) { if (tid <  64) buffer[tid] = val = op(val, buffer[tid +  64]); __syncthreads(); }
154 
155 				if (tid < 32)
156 				{
157 					if (CTA_SIZE >=   64) { buffer[tid] = val = op(val, buffer[tid +  32]); }
158 					if (CTA_SIZE >=   32) { buffer[tid] = val = op(val, buffer[tid +  16]); }
159 					if (CTA_SIZE >=   16) { buffer[tid] = val = op(val, buffer[tid +   8]); }
160 					if (CTA_SIZE >=    8) { buffer[tid] = val = op(val, buffer[tid +   4]); }
161 					if (CTA_SIZE >=    4) { buffer[tid] = val = op(val, buffer[tid +   2]); }
162 					if (CTA_SIZE >=    2) { buffer[tid] = val = op(val, buffer[tid +   1]); }
163 				}
164 				__syncthreads();
165 				return buffer[0];
166 			}
167 
168 			template <typename T, class BinOp>
reduce_npcl::device::Block169 			static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
170 			{
171 				int ftid = flattenedThreadId();
172 				int sft = stride();
173 
174 				if (sft < n)
175 				{
176 					for (unsigned int i = sft + ftid; i < n; i += sft)
177 						data[ftid] = op(data[ftid], data[i]);
178 
179 					__syncthreads();
180 
181 					n = sft;
182 				}
183 
184 				while (n > 1)
185 				{
186 					unsigned int half = n/2;
187 
188 					if (ftid < half)
189 						data[ftid] = op(data[ftid], data[n - ftid - 1]);
190 
191 					__syncthreads();
192 
193 					n = n - half;
194 				}
195 			}
196 		};
197 	}
198 }
199 
200 #endif /* PCL_DEVICE_UTILS_BLOCK_HPP_ */
201 
202