#pragma once

#include "cuda_execution_policy.hpp"

#include <iterator>       // std::iterator_traits
#include <type_traits>    // std::invoke_result_t, std::is_same

namespace tf {

// default warp size
inline constexpr unsigned CUDA_WARP_SIZE = 32;

// empty type
struct cudaEmpty { };

// ----------------------------------------------------------------------------
// iterator unrolling
// ----------------------------------------------------------------------------

// Template unrolled looping construct.
template<unsigned i, unsigned count, bool valid = (i < count)>
struct cudaIterate {
  template<typename F>
  __device__ static void eval(F f) {
    f(i);
    cudaIterate<i + 1, count>::eval(f);
  }
};

template<unsigned i, unsigned count>
struct cudaIterate<i, count, false> {
  template<typename F>
  __device__ static void eval(F) { }
};

template<unsigned begin, unsigned end, typename F>
__device__ void cuda_iterate(F f) {
  cudaIterate<begin, end>::eval(f);
}

template<unsigned count, typename F>
__device__ void cuda_iterate(F f) {
  cuda_iterate<0, count>(f);
}

template<unsigned count, typename T>
__device__ T reduce(const T(&x)[count]) {
  T y;
  cuda_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; });
  return y;
}

template<unsigned count, typename T>
__device__ void fill(T(&x)[count], T val) {
  cuda_iterate<count>([&](auto i) { x[i] = val; });
}
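
// Example (illustrative sketch, not part of the library): summing a
// per-thread register array with the unrolled constructs above; the
// function name and array size are hypothetical.
//
// __device__ float sum_of_four(float (&r)[4]) {
//   fill(r, 1.0f);       // r = {1, 1, 1, 1}
//   return reduce(r);    // 4.0f, fully unrolled at compile time
// }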

// Invoke unconditionally.
template<unsigned nt, unsigned vt, typename F>
__device__ void cuda_strided_iterate(F f, unsigned tid) {
  cuda_iterate<vt>([=](auto i) { f(i, nt * i + tid); });
}

// Check range.
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F>
__device__ void cuda_strided_iterate(F f, unsigned tid, unsigned count) {
  // Unroll the first vt0 elements of each thread.
  if(vt0 > 1 && count >= nt * vt0) {
    cuda_strided_iterate<nt, vt0>(f, tid);  // No checking
  } else {
    cuda_iterate<vt0>([=](auto i) {
      auto j = nt * i + tid;
      if(j < count) f(i, j);
    });
  }

  // TODO: the loop below is empty when vt0 == vt
  cuda_iterate<vt0, vt>([=](auto i) {
    auto j = nt * i + tid;
    if(j < count) f(i, j);
  });
}

template<unsigned vt, typename F>
__device__ void cuda_thread_iterate(F f, unsigned tid) {
  cuda_iterate<vt>([=](auto i) { f(i, vt * tid + i); });
}
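
// Example (illustrative, hypothetical kernel fragment): with nt = 128
// threads and vt = 4 values per thread, cuda_strided_iterate visits
// elements tid, tid + 128, tid + 256, tid + 384 (coalesced across the
// block), while cuda_thread_iterate visits 4 * tid .. 4 * tid + 3
// (contiguous per thread).
//
// __device__ void saxpy_tile(float a, const float* x, float* y,
//                            unsigned tid, unsigned count) {
//   cuda_strided_iterate<128, 4>([=](auto, auto j) {
//     y[j] = a * x[j] + y[j];
//   }, tid, count);
// }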

// ----------------------------------------------------------------------------
// cudaRange
// ----------------------------------------------------------------------------

// cudaRange
struct cudaRange {
  unsigned begin, end;
  __device__ unsigned size() const { return end - begin; }
  __device__ unsigned count() const { return size(); }
  __device__ bool valid() const { return end > begin; }
};

inline __device__ cudaRange cuda_get_tile(unsigned b, unsigned nv, unsigned count) {
  return cudaRange { nv * b, min(count, nv * (b + 1)) };
}
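
// Example (illustrative): block b of a kernel launched over count items,
// with nv = nt * vt items per block, owns the half-open tile below; the
// last tile may be partial.
//
// auto tile = cuda_get_tile(blockIdx.x, nt * vt, count);
// // tile.begin == nv * b, tile.end == min(count, nv * (b + 1))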

// ----------------------------------------------------------------------------
// cudaArray
// ----------------------------------------------------------------------------

template<typename T, unsigned size>
struct cudaArray {
  T data[size];

  __device__ T operator[](unsigned i) const { return data[i]; }
  __device__ T& operator[](unsigned i) { return data[i]; }

  cudaArray() = default;
  cudaArray(const cudaArray&) = default;
  cudaArray& operator=(const cudaArray&) = default;

  // Fill the array with x.
  __device__ cudaArray(T x) {
    cuda_iterate<size>([&](unsigned i) { data[i] = x; });
  }
};

template<typename T>
struct cudaArray<T, 0> {
  __device__ T operator[](unsigned) const { return T(); }
  // A zero-sized array has no valid index, so this is never dereferenced;
  // it exists only to keep the interface well-formed.
  __device__ T& operator[](unsigned) { return *(T*)nullptr; }
};

template<typename T, typename V, unsigned size>
struct cudaKVArray {
  cudaArray<T, size> keys;
  cudaArray<V, size> vals;
};
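
// Example (illustrative): cudaArray is a fixed-size array held in
// registers; the converting constructor broadcasts one value to every
// slot.
//
// cudaArray<int, 4> a(-1);   // a.data == {-1, -1, -1, -1}
// a[2] = 7;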

// ----------------------------------------------------------------------------
// thread reg <-> global mem
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I>
__device__ auto cuda_mem_to_reg_strided(I mem, unsigned tid, unsigned count) {
  using T = typename std::iterator_traits<I>::value_type;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt, vt0>(
    [&](auto i, auto j) { x[i] = mem[j]; }, tid, count
  );
  return x;
}

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t>
__device__ void cuda_reg_to_mem_strided(
  cudaArray<T, vt> x, unsigned tid, unsigned count, it_t mem) {

  cuda_strided_iterate<nt, vt, vt0>(
    [=](auto i, auto j) { mem[j] = x[i]; }, tid, count
  );
}

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O>
__device__ auto cuda_transform_mem_to_reg_strided(
  I mem, unsigned tid, unsigned count, O op
) {
  using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt, vt0>(
    [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count
  );
  return x;
}
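
// Example (illustrative, hypothetical kernel fragment): a block-wide
// load-transform-store pass over one tile, kept entirely in registers
// (in, out, nt, vt, tid, and count are assumed to be in scope).
//
// auto tile = cuda_get_tile(blockIdx.x, nt * vt, count);
// auto x = cuda_transform_mem_to_reg_strided<nt, vt>(
//   in + tile.begin, tid, tile.count(), [](int v) { return 2 * v; }
// );
// cuda_reg_to_mem_strided<nt, vt>(x, tid, tile.count(), out + tile.begin);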

// ----------------------------------------------------------------------------
// thread reg <-> shared
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ void cuda_reg_to_shared_thread(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "reg_to_shared_thread must have at least nt * vt storage");

  cuda_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid);

  if(sync) __syncthreads();
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ auto cuda_shared_to_reg_thread(
  const T (&shared)[shared_size], unsigned tid, bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "shared_to_reg_thread must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_thread_iterate<vt>([&](auto i, auto j) {
    x[i] = shared[j];
  }, tid);

  if(sync) __syncthreads();

  return x;
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ void cuda_reg_to_shared_strided(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "reg_to_shared_strided must have at least nt * vt storage");

  cuda_strided_iterate<nt, vt>(
    [&](auto i, auto j) { shared[j] = x[i]; }, tid
  );

  if(sync) __syncthreads();
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ auto cuda_shared_to_reg_strided(
  const T (&shared)[shared_size], unsigned tid, bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "shared_to_reg_strided must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);
  if(sync) __syncthreads();

  return x;
}
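
// Example (illustrative): the classic register transpose between thread
// order and strided order through shared memory (nt, vt, x, and tid are
// assumed to be in scope; nt and vt must be compile-time constants).
//
// __shared__ float smem[nt * vt];
// cuda_reg_to_shared_thread<nt, vt>(x, tid, smem);         // thread order in
// auto y = cuda_shared_to_reg_strided<nt, vt>(smem, tid);  // strided order out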

template<
  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
  unsigned shared_size
>
__device__ void cuda_reg_to_mem_thread(
  cudaArray<T, vt> x, unsigned tid,
  unsigned count, it_t mem, T (&shared)[shared_size]
) {
  cuda_reg_to_shared_thread<nt>(x, tid, shared);
  auto y = cuda_shared_to_reg_strided<nt, vt>(shared, tid);
  cuda_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
}

template<
  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
  unsigned shared_size
>
__device__ auto cuda_mem_to_reg_thread(
  it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
) {

  auto x = cuda_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
  cuda_reg_to_shared_strided<nt, vt>(x, tid, shared);
  auto y = cuda_shared_to_reg_thread<nt, vt>(shared, tid);
  return y;
}
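
// Example (illustrative): load a tile with coalesced global reads, but
// leave each thread holding vt contiguous elements in registers, as
// required by per-thread sequential algorithms such as a local merge
// (in, tile, nt, vt, and tid are assumed to be in scope).
//
// __shared__ float smem[nt * vt];
// auto x = cuda_mem_to_reg_thread<nt, vt>(
//   in + tile.begin, tid, tile.count(), smem
// );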

template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_shared_gather(
  const T(&data)[S], cudaArray<unsigned, vt> indices, bool sync = true
) {

  static_assert(S >= nt * vt,
    "shared_gather must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });

  if(sync) __syncthreads();

  return x;
}

// ----------------------------------------------------------------------------
// thread reg <-> reg
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_reg_thread_to_strided(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[S]
) {
  cuda_reg_to_shared_thread<nt>(x, tid, shared);
  return cuda_shared_to_reg_strided<nt, vt>(shared, tid);
}

template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_reg_strided_to_thread(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[S]
) {
  cuda_reg_to_shared_strided<nt>(x, tid, shared);
  return cuda_shared_to_reg_thread<nt, vt>(shared, tid);
}

// ----------------------------------------------------------------------------
// cudaLoadStoreIterator
// ----------------------------------------------------------------------------

template<typename L, typename S, typename T, typename I>
struct cudaLoadStoreIterator : std::iterator_traits<const T*> {

  L load;
  S store;
  I base;

  cudaLoadStoreIterator(L load_, S store_, I base_) :
    load(load_), store(store_), base(base_) { }

  struct assign_t {
    L load;
    S store;
    I index;

    __device__ assign_t& operator=(T rhs) {
      static_assert(!std::is_same<S, cudaEmpty>::value,
        "load_iterator is being stored to.");
      store(rhs, index);
      return *this;
    }
    __device__ operator T() const {
      static_assert(!std::is_same<L, cudaEmpty>::value,
        "store_iterator is being loaded from.");
      return load(index);
    }
  };

  __device__ assign_t operator[](I index) const {
    return assign_t { load, store, base + index };
  }
  __device__ assign_t operator*() const {
    return assign_t { load, store, base };
  }

  __device__ cudaLoadStoreIterator operator+(I offset) const {
    cudaLoadStoreIterator cp = *this;
    cp += offset;
    return cp;
  }

  __device__ cudaLoadStoreIterator& operator+=(I offset) {
    base += offset;
    return *this;
  }

  __device__ cudaLoadStoreIterator operator-(I offset) const {
    cudaLoadStoreIterator cp = *this;
    cp -= offset;
    return cp;
  }

  __device__ cudaLoadStoreIterator& operator-=(I offset) {
    base -= offset;
    return *this;
  }
};

//template<typename T>
//struct trivial_load_functor {
//  template<typename I>
//  __device__ T operator()(I index) const {
//    return T();
//  }
//};

//template<typename T>
//struct trivial_store_functor {
//  template<typename I>
//  __device__ void operator()(T v, I index) const { }
//};

template <typename T, typename I = unsigned, typename L, typename S>
auto cuda_make_load_store_iterator(L load, S store, I base = 0) {
  return cudaLoadStoreIterator<L, S, T, I>(load, store, base);
}

template <typename T, typename I = unsigned, typename L>
auto cuda_make_load_iterator(L load, I base = 0) {
  return cuda_make_load_store_iterator<T>(load, cudaEmpty(), base);
}

template <typename T, typename I = unsigned, typename S>
auto cuda_make_store_iterator(S store, I base = 0) {
  return cuda_make_load_store_iterator<T>(cudaEmpty(), store, base);
}
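
// Example (illustrative): a counting iterator built from a load-only
// lambda; reading position i yields i, while writing through it fails to
// compile thanks to the static_assert in assign_t. Constructing it on the
// host with a __device__ lambda requires nvcc's --extended-lambda flag.
//
// auto counting = cuda_make_load_iterator<int>(
//   [] __device__ (unsigned i) { return static_cast<int>(i); }
// );
// // int v = counting[42];  // v == 42 (device code only)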

// ----------------------------------------------------------------------------
// swap
// ----------------------------------------------------------------------------

template<typename T>
__device__ void cuda_swap(T& a, T& b) {
  auto c = a;
  a = b;
  b = c;
}

// ----------------------------------------------------------------------------
// launch kernel
// ----------------------------------------------------------------------------

template<typename F, typename... args_t>
__global__ void cuda_kernel(F f, args_t... args) {
  f(threadIdx.x, blockIdx.x, args...);
}
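
// Example (illustrative): cuda_kernel forwards (threadIdx.x, blockIdx.x)
// to any callable, so a simple kernel can be written inline on the host
// (num_blocks, block_size, ptr, and n are assumed; __device__ lambdas
// require nvcc's --extended-lambda flag).
//
// cuda_kernel<<<num_blocks, block_size>>>(
//   [=] __device__ (unsigned tid, unsigned bid, float* p, unsigned n) {
//     auto i = bid * blockDim.x + tid;
//     if(i < n) p[i] = 0.0f;
//   },
//   ptr, n
// );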

// ----------------------------------------------------------------------------
// operators
// ----------------------------------------------------------------------------

template <typename T>
struct cuda_plus {
  __device__ T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct cuda_minus {
  __device__ T operator()(T a, T b) const { return a - b; }
};

template <typename T>
struct cuda_multiplies {
  __device__ T operator()(T a, T b) const { return a * b; }
};

template <typename T>
struct cuda_maximum {
  __device__ T operator()(T a, T b) const { return a > b ? a : b; }
};

template <typename T>
struct cuda_minimum {
  __device__ T operator()(T a, T b) const { return a < b ? a : b; }
};

template <typename T>
struct cuda_less {
  __device__ bool operator()(T a, T b) const { return a < b; }
};

template <typename T>
struct cuda_greater {
  __device__ bool operator()(T a, T b) const { return a > b; }
};
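
// Example (illustrative): the function objects compose with the register
// helpers above, e.g. a per-thread max over a register array.
//
// __device__ float max_of_four(const float (&r)[4]) {
//   cuda_maximum<float> mx;
//   float m = r[0];
//   cuda_iterate<1, 4>([&](auto i) { m = mx(m, r[i]); });
//   return m;
// }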

} // end of namespace tf -----------------------------------------------------