/*
 * Software License Agreement (BSD License)
 *
 * Copyright (c) 2011, Willow Garage, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *  * Neither the name of Willow Garage, Inc. nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
33 * 34 * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 */ 36 37 #ifndef PCL_DEVICE_UTILS_BLOCK_HPP_ 38 #define PCL_DEVICE_UTILS_BLOCK_HPP_ 39 40 namespace pcl 41 { 42 namespace device 43 { 44 struct Block 45 { idpcl::device::Block46 static __device__ __forceinline__ unsigned int id() 47 { 48 return blockIdx.x; 49 } 50 stridepcl::device::Block51 static __device__ __forceinline__ unsigned int stride() 52 { 53 return blockDim.x * blockDim.y * blockDim.z; 54 } 55 syncpcl::device::Block56 static __device__ __forceinline__ void sync() 57 { 58 __syncthreads(); 59 } 60 flattenedThreadIdpcl::device::Block61 static __device__ __forceinline__ int flattenedThreadId() 62 { 63 return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; 64 } 65 66 template<typename It, typename T> fillpcl::device::Block67 static __device__ __forceinline__ void fill(It beg, It end, const T& value) 68 { 69 int STRIDE = stride(); 70 It t = beg + flattenedThreadId(); 71 72 for(; t < end; t += STRIDE) 73 *t = value; 74 } 75 76 template<typename OutIt, typename T> yotapcl::device::Block77 static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value) 78 { 79 int STRIDE = stride(); 80 int tid = flattenedThreadId(); 81 value += tid; 82 83 for(OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE) 84 *t = value; 85 } 86 87 template<typename InIt, typename OutIt> copypcl::device::Block88 static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out) 89 { 90 int STRIDE = stride(); 91 InIt t = beg + flattenedThreadId(); 92 OutIt o = out + (t - beg); 93 94 for(; t < end; t += STRIDE, o += STRIDE) 95 *o = *t; 96 } 97 98 template<typename InIt, typename OutIt, class UnOp> transformpcl::device::Block99 static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op) 100 { 101 int STRIDE = stride(); 102 InIt t = beg + flattenedThreadId(); 103 OutIt o = out + (t - beg); 104 105 for(; t < end; t += STRIDE, 
o += STRIDE) 106 *o = op(*t); 107 } 108 109 template<typename InIt1, typename InIt2, typename OutIt, class BinOp> transformpcl::device::Block110 static __device__ __forceinline__ void transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op) 111 { 112 int STRIDE = stride(); 113 InIt1 t1 = beg1 + flattenedThreadId(); 114 InIt2 t2 = beg2 + flattenedThreadId(); 115 OutIt o = out + (t1 - beg1); 116 117 for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, o += STRIDE) 118 *o = op(*t1, *t2); 119 } 120 121 template<int CTA_SIZE, typename T, class BinOp> reducepcl::device::Block122 static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op) 123 { 124 int tid = flattenedThreadId(); 125 T val = buffer[tid]; 126 127 if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); } 128 if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); } 129 if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); } 130 if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); } 131 132 if (tid < 32) 133 { 134 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); } 135 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); } 136 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); } 137 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); } 138 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); } 139 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); } 140 } 141 } 142 143 template<int CTA_SIZE, typename T, class BinOp> reducepcl::device::Block144 static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op) 145 { 146 int tid = flattenedThreadId(); 147 T val = buffer[tid] = init; 148 __syncthreads(); 149 150 if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); 
__syncthreads(); } 151 if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); } 152 if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); } 153 if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); } 154 155 if (tid < 32) 156 { 157 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); } 158 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); } 159 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); } 160 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); } 161 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); } 162 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); } 163 } 164 __syncthreads(); 165 return buffer[0]; 166 } 167 168 template <typename T, class BinOp> reduce_npcl::device::Block169 static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op) 170 { 171 int ftid = flattenedThreadId(); 172 int sft = stride(); 173 174 if (sft < n) 175 { 176 for (unsigned int i = sft + ftid; i < n; i += sft) 177 data[ftid] = op(data[ftid], data[i]); 178 179 __syncthreads(); 180 181 n = sft; 182 } 183 184 while (n > 1) 185 { 186 unsigned int half = n/2; 187 188 if (ftid < half) 189 data[ftid] = op(data[ftid], data[n - ftid - 1]); 190 191 __syncthreads(); 192 193 n = n - half; 194 } 195 } 196 }; 197 } 198 } 199 200 #endif /* PCL_DEVICE_UTILS_BLOCK_HPP_ */ 201 202