/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file reduce.cuh
 * \brief helper functions to do reduction
 * \author Tianqi Chen
 */
#ifndef MSHADOW_CUDA_REDUCE_CUH_
#define MSHADOW_CUDA_REDUCE_CUH_

namespace mshadow {
namespace cuda {
/*!
 * \brief reduce over the dimension x
 * \tparam Reducer reducer
 * \tparam x_bits dimension = 1 << x_bits
 * \tparam DType content data type
 * \param buf shared memory buffer holding 1 << x_bits values;
 *        the reduced result is left in buf[0]
 */
template<typename Reducer, int x_bits, typename DType>
inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]);
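/*!
 * A minimal usage sketch (illustrative only): it assumes a hypothetical
 * row-sum kernel launched with blockDim.x == (1 << x_bits), and uses the
 * mshadow sum reducer mshadow::red::sum; any type satisfying the Reducer
 * interface would be used the same way.
 *
 * \code
 * template<int x_bits, typename DType>
 * __global__ void RowSumKernel(const DType *in, DType *out, int ncol) {
 *   __shared__ DType buf[1 << x_bits];
 *   // each thread accumulates a strided slice of its row into shared memory
 *   DType acc = 0;
 *   for (int i = threadIdx.x; i < ncol; i += (1 << x_bits)) {
 *     acc += in[blockIdx.x * ncol + i];
 *   }
 *   buf[threadIdx.x] = acc;
 *   __syncthreads();
 *   // tree-reduce the 1 << x_bits partial sums; the result ends up in buf[0]
 *   Reduce1D<mshadow::red::sum, x_bits>(buf);
 *   if (threadIdx.x == 0) out[blockIdx.x] = buf[0];
 * }
 * \endcode
 */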
/*!
 * \brief reduce over the dimension x
 * \tparam Reducer reducer
 * \tparam xmax_bits maximum buffer size = 1 << xmax_bits
 * \tparam DType content data type
 * \param buf shared memory buffer; the reduced result is left in buf[0]
 * \param xsize size of the x dimension; need not be a power of two
 */
template<typename Reducer, int xmax_bits, typename DType>
inline __device__ void
Reduce1DNotAlign(volatile DType buf[1 << xmax_bits], int xsize);
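/*!
 * The corresponding sketch for Reduce1DNotAlign (illustrative only): same
 * hypothetical kernel shape as above, except that only x_size entries of
 * buf hold valid partial results and x_size need not be a power of two.
 * Every thread in the block must still make the call, since the reduction
 * synchronizes internally.
 *
 * \code
 * // buf[0 .. x_size) was filled by threads 0 .. x_size - 1
 * __syncthreads();
 * Reduce1DNotAlign<mshadow::red::sum, xmax_bits>(buf, x_size);
 * if (threadIdx.x == 0) out[blockIdx.x] = buf[0];
 * \endcode
 */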
// ==================================================
//  implementations afterwards,
//  no need to read if only use the functions
// --------------------------------------------------
#ifdef __DEVICE_EMULATION__
#define __syncwarp() __syncthreads()
#else
#if CUDA_VERSION < 9000
#define __syncwarp()
#endif
#endif

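// ReduceX performs a standard shared-memory tree reduction: at each step the
// lower half of the active threads folds the upper half into it, halving the
// active range until buf[0] holds the reduced value.  Once the active range
// fits within a single warp, a warp-level barrier (or a full barrier on old
// CUDA) is enough to order the intermediate reads and writes.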
template<typename Reducer, int x_bits, typename DType>
inline __device__ void ReduceX(volatile DType buf[], int tid) {
  if (x_bits >= 10) {
    if (tid < 512) Reducer::Reduce(buf[tid], buf[tid + 512]);
    __syncthreads();
  }
  if (x_bits >= 9) {
    if (tid < 256) Reducer::Reduce(buf[tid], buf[tid + 256]);
    __syncthreads();
  }
  if (x_bits >= 8) {
    if (tid < 128) Reducer::Reduce(buf[tid], buf[tid + 128]);
    __syncthreads();
  }
  if (x_bits >= 7) {
    if (tid < 64) Reducer::Reduce(buf[tid], buf[tid + 64]);
    __syncthreads();
  }
  if (x_bits >= 6) {
    if (tid < 32) Reducer::Reduce(buf[tid], buf[tid + 32]);
    __syncthreads();
  }
  // in-warp optimization: the active threads now fit within a single warp
  if (x_bits >= 5) {
    if (tid < 16) Reducer::Reduce(buf[tid], buf[tid + 16]);
#if MSHADOW_OLD_CUDA
    __syncthreads();
#else
    __syncwarp();
#endif
  }
  if (x_bits >= 4) {
    if (tid < 8) Reducer::Reduce(buf[tid], buf[tid + 8]);
    __syncwarp();
  }
  if (x_bits >= 3) {
    if (tid < 4) Reducer::Reduce(buf[tid], buf[tid + 4]);
    __syncwarp();
  }
  if (x_bits >= 2) {
    if (tid < 2) Reducer::Reduce(buf[tid], buf[tid + 2]);
    __syncwarp();
  }
  if (x_bits >= 1) {
    if (tid < 1) Reducer::Reduce(buf[tid], buf[tid + 1]);
    __syncwarp();
  }
}
template<typename Reducer, int x_bits, typename DType>
inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]) {
  ReduceX<Reducer, x_bits>(buf, threadIdx.x);
}
// Reduce with an upper bound on the buffer size: pick the largest handled
// power of two not exceeding x_size, fold the overhanging tail
// [1 << x_bits, x_size) onto the front of the buffer, then run the aligned
// ReduceX on the remaining 1 << x_bits entries.
#define __RD_NON_ALIGN(els, x_bits)                                     \
  els                                                                   \
  if (xmax_bits >= x_bits && x_size >= (1 << x_bits)) {                 \
    if (tid < (1 << x_bits) && tid + (1 << x_bits) < x_size) {          \
      Reducer::Reduce(buf[tid], buf[tid + (1 << x_bits)]);              \
    }                                                                   \
    __syncthreads();                                                    \
    ReduceX<Reducer, x_bits>(buf, tid);                                 \
  }

template<typename Reducer, int xmax_bits, typename DType>
inline __device__ void Reduce1DNotAlign(volatile DType buf[], int x_size) {
  int tid = threadIdx.x;
  __RD_NON_ALIGN(, 8)
  __RD_NON_ALIGN(else, 7)
  __RD_NON_ALIGN(else, 6)
  __RD_NON_ALIGN(else, 5)
  __RD_NON_ALIGN(else, 4)
  __RD_NON_ALIGN(else, 3)
  __RD_NON_ALIGN(else, 2)
  __RD_NON_ALIGN(else, 1)
}
}  // namespace cuda
}  // namespace mshadow
#endif  // MSHADOW_CUDA_REDUCE_CUH_