1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 /*!
21 * \file reduce.cuh
22 * \brief helper functions to do reduction
23 * \author Tianqi Chen
24 */
25 #ifndef MSHADOW_CUDA_REDUCE_CUH_
26 #define MSHADOW_CUDA_REDUCE_CUH_
27
28 namespace mshadow {
29 namespace cuda {
30 /*
31 * \brief reduce over the dimension x
32 * \tparam Reducer reducer
33 * \tparam x_bits dimension = 1<<x_bits
34 * \tparam DType content data type
35 */
36 template<typename Reducer, int x_bits, typename DType>
37 inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]);
38 /*
39 * \brief reduce over the dimension x
40 * \tparam Reducer reducer
41 * \tparam xmax_bits maximum size of buffer
42 * \tparam DType content data type
43 * \param xsize size of x dimension, not sure if aligned
44 */
45 template<typename Reducer, int xmax_bits, typename DType>
46 inline __device__ void
47 Reduce1DNotAlign(volatile DType buf[1 << xmax_bits], int xsize);
48 // ===============================================x===
49 // implementations afterwards,
50 // no need to read if only use the functions
51 // --------------------------------------------------
52 #ifdef __DEVICE_EMULATION__
53 #define __syncwarp() __syncthreads()
54 #else
55 #if CUDA_VERSION < 9000
56 #define __syncwarp()
57 #endif
58 #endif
59
/*!
 * \brief in-place tree reduction of the first (1 << x_bits) elements of buf
 *        into buf[0], folding pairs with Reducer::Reduce.
 *
 * Must be called by all threads of the block with tid = threadIdx.x (see
 * Reduce1D), after all writes to buf have been made visible (callers are
 * expected to synchronize before calling).  NOTE(review): buf is presumably
 * shared memory — the volatile qualifier plus warp-level barriers only make
 * the in-warp phase safe for memory visible to the whole warp; confirm with
 * callers.
 *
 * x_bits is a compile-time template parameter, so every `if (x_bits >= k)`
 * test is uniform across the block and the __syncthreads() inside those
 * branches cannot deadlock on divergence.
 */
template<typename Reducer, int x_bits, typename DType>
inline __device__ void ReduceX(volatile DType buf[], int tid) {
  // Block-wide halving phase: at each step the lower half of the active
  // range folds in the upper half, followed by a full block barrier.
  if (x_bits >= 10) {
    if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]);
    __syncthreads();
  }
  if (x_bits >= 9) {
    if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]);
    __syncthreads();
  }
  if (x_bits >= 8) {
    if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]);
    __syncthreads();
  }
  if (x_bits >= 7) {
    if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]);
    __syncthreads();
  }
  if (x_bits >= 6) {
    if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]);
    __syncthreads();
  }
  // in warp optimization: once no more than 32 threads remain active they
  // all belong to warp 0, so a warp-level barrier suffices.  On CUDA < 9.0
  // __syncwarp() is defined away above, relying on the volatile qualifier
  // and pre-Volta implicit warp-synchronous execution.
  if (x_bits >= 5) {
    if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]);
#if MSHADOW_OLD_CUDA
    // old devices: fall back to a full block barrier for safety
    __syncthreads();
#else
    __syncwarp();
#endif
  }
  if (x_bits >= 4) {
    if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]);
    __syncwarp();
  }
  if (x_bits >= 3) {
    if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]);
    __syncwarp();
  }
  if (x_bits >= 2) {
    if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]);
    __syncwarp();
  }
  if (x_bits >= 1) {
    if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]);
    __syncwarp();
  }
}
/*!
 * \brief reduce the (1 << x_bits)-element buffer buf into buf[0].
 *
 * Thin wrapper over ReduceX that supplies threadIdx.x as the lane index;
 * must be invoked by every thread of the block.
 */
template<typename Reducer, int x_bits, typename DType>
inline __device__ void Reduce1D(volatile DType buf[1 << x_bits]) {
  const int tid = threadIdx.x;
  ReduceX<Reducer, x_bits, DType>(buf, tid);
}
// reduce with an upper bound
/*
 * __RD_NON_ALIGN(els, x_bits) emits one link of an if / else-if chain:
 * when x_size has at least (1 << x_bits) elements, it first folds the
 * ragged tail [1 << x_bits, x_size) onto the aligned front, synchronizes
 * the block, then hands the now power-of-two-sized front to ReduceX.
 * `els` is either empty (first link) or the literal token `else`.
 * NOTE(review): the macro is not #undef'd after use, so it leaks out of
 * this header — confirm no other translation unit collides with the name.
 */
#define __RD_NON_ALIGN(els, x_bits) \
  els \
  if (xmax_bits >= x_bits && x_size >= (1 << x_bits)) { \
    if (tid < (1 << x_bits) && tid + (1 << x_bits) < x_size) { \
      Reducer::Reduce(buf[tid] , buf[tid + (1 << x_bits)]); \
    } \
    __syncthreads(); \
    ReduceX<Reducer, x_bits>(buf, tid); \
  } \

/*!
 * \brief reduce the first x_size elements of buf into buf[0], where
 *        x_size need not be a power of two (bounded by 1 << xmax_bits).
 * Must be invoked by every thread of the block.
 */
template<typename Reducer, int xmax_bits, typename DType>
inline __device__ void Reduce1DNotAlign(volatile DType buf[], int x_size) {
  int tid = threadIdx.x;
  // Probe power-of-two sizes from largest (256) down to smallest (2);
  // the first match folds the tail and performs the aligned reduction.
  __RD_NON_ALIGN(, 8)
  __RD_NON_ALIGN(else, 7)
  __RD_NON_ALIGN(else, 6)
  __RD_NON_ALIGN(else, 5)
  __RD_NON_ALIGN(else, 4)
  __RD_NON_ALIGN(else, 3)
  __RD_NON_ALIGN(else, 2)
  __RD_NON_ALIGN(else, 1)
}
135 } // namespace cuda
136 } // namespace mshadow
137 #endif // MSHADOW_CUDA_REDUCE_CUH_
138
139