#pragma once

#include "cuda_execution_policy.hpp"

#include <iterator>
#include <type_traits>

namespace tf {

// default warp size
inline constexpr unsigned CUDA_WARP_SIZE = 32;

// empty type
struct cudaEmpty { };

// ----------------------------------------------------------------------------
// iterator unrolling
// ----------------------------------------------------------------------------

// Template unrolled looping construct.
template<unsigned i, unsigned count, bool valid = (i < count)>
struct cudaIterate {
  template<typename F>
  __device__ static void eval(F f) {
    f(i);
    cudaIterate<i + 1, count>::eval(f);
  }
};

template<unsigned i, unsigned count>
struct cudaIterate<i, count, false> {
  template<typename F>
  __device__ static void eval(F) { }
};

template<unsigned begin, unsigned end, typename F>
__device__ void cuda_iterate(F f) {
  cudaIterate<begin, end>::eval(f);
}

template<unsigned count, typename F>
__device__ void cuda_iterate(F f) {
  cuda_iterate<0, count>(f);
}

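// Illustrative sketch (not part of the library): cuda_iterate<4>(f) expands
// at compile time to the unrolled sequence f(0); f(1); f(2); f(3). For
// example, in device code:
//
//   unsigned acc = 0;
//   cuda_iterate<4>([&](auto i) { acc += i; });  // acc == 0 + 1 + 2 + 3 == 6
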
template<unsigned count, typename T>
__device__ T reduce(const T(&x)[count]) {
  T y;
  cuda_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; });
  return y;
}

template<unsigned count, typename T>
__device__ void fill(T(&x)[count], T val) {
  cuda_iterate<count>([&](auto i) { x[i] = val; });
}

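// Illustrative sketch: both helpers operate on a per-thread register array
// whose size is known at compile time, e.g. in device code:
//
//   float r[4];
//   fill(r, 1.0f);        // r == {1, 1, 1, 1}
//   float s = reduce(r);  // s == 4.0f (sum of the four elements)
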
// Invoke unconditionally.
template<unsigned nt, unsigned vt, typename F>
__device__ void cuda_strided_iterate(F f, unsigned tid) {
  cuda_iterate<vt>([=](auto i) { f(i, nt * i + tid); });
}

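// Illustrative note: with nt = 128 threads per block and vt = 4 values per
// thread, thread tid visits the indices tid, 128 + tid, 256 + tid, 384 + tid,
// so consecutive threads touch consecutive addresses in every pass
// (coalesced access when the indices address global memory).
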
// Check range.
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F>
__device__ void cuda_strided_iterate(F f, unsigned tid, unsigned count) {
  // Unroll the first vt0 elements of each thread.
  if(vt0 > 1 && count >= nt * vt0) {
    cuda_strided_iterate<nt, vt0>(f, tid);    // no range checking
  } else {
    cuda_iterate<vt0>([=](auto i) {
      auto j = nt * i + tid;
      if(j < count) f(i, j);
    });
  }

  // Iterate the remaining vt - vt0 elements with range checking
  // (this loop is empty when vt0 == vt).
  cuda_iterate<vt0, vt>([=](auto i) {
    auto j = nt * i + tid;
    if(j < count) f(i, j);
  });
}

template<unsigned vt, typename F>
__device__ void cuda_thread_iterate(F f, unsigned tid) {
  cuda_iterate<vt>([=](auto i) { f(i, vt * tid + i); });
}

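// Illustrative note: cuda_thread_iterate assigns each thread a contiguous
// chunk instead. With vt = 4, thread tid visits 4*tid, 4*tid + 1, 4*tid + 2,
// 4*tid + 3 (thread order), whereas cuda_strided_iterate interleaves threads
// (strided order). The shared-memory helpers below convert between the two
// orders.
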
// ----------------------------------------------------------------------------
// cudaRange
// ----------------------------------------------------------------------------

// half-open index range [begin, end)
struct cudaRange {
  unsigned begin, end;
  __device__ unsigned size() const { return end - begin; }
  __device__ unsigned count() const { return size(); }
  __device__ bool valid() const { return end > begin; }
};

inline __device__ cudaRange cuda_get_tile(unsigned b, unsigned nv, unsigned count) {
  return cudaRange { nv * b, min(count, nv * (b + 1)) };
}

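// Illustrative sketch: with nv = nt * vt = 512 items per block and
// count = 1200 items in total, cuda_get_tile(2, 512, 1200) yields the
// partial tile cudaRange{1024, 1200} of size 176, while blocks 0 and 1
// receive full tiles of size 512.
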
// ----------------------------------------------------------------------------
// cudaArray
// ----------------------------------------------------------------------------

// fixed-size array intended to live in registers
template<typename T, unsigned size>
struct cudaArray {
  T data[size];

  __device__ T operator[](unsigned i) const { return data[i]; }
  __device__ T& operator[](unsigned i) { return data[i]; }

  cudaArray() = default;
  cudaArray(const cudaArray&) = default;
  cudaArray& operator=(const cudaArray&) = default;

  // Fill the array with x.
  __device__ cudaArray(T x) {
    cuda_iterate<size>([&](unsigned i) { data[i] = x; });
  }
};

// zero-sized specialization; its operator[] is never expected to be called
template<typename T>
struct cudaArray<T, 0> {
  __device__ T operator[](unsigned) const { return T(); }
  __device__ T& operator[](unsigned) { return *(T*)nullptr; }
};

// pair of register arrays holding keys and values
template<typename T, typename V, unsigned size>
struct cudaKVArray {
  cudaArray<T, size> keys;
  cudaArray<V, size> vals;
};

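// Illustrative sketch: cudaArray supports a broadcast constructor, e.g. in
// device code:
//
//   cudaArray<int, 4> a(7);  // a[0] == a[1] == a[2] == a[3] == 7
//   a[2] = 9;
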
// ----------------------------------------------------------------------------
// thread reg <-> global mem
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I>
__device__ auto cuda_mem_to_reg_strided(I mem, unsigned tid, unsigned count) {
  using T = typename std::iterator_traits<I>::value_type;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt, vt0>(
    [&](auto i, auto j) { x[i] = mem[j]; }, tid, count
  );
  return x;
}

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t>
__device__ void cuda_reg_to_mem_strided(
  cudaArray<T, vt> x, unsigned tid, unsigned count, it_t mem) {

  cuda_strided_iterate<nt, vt, vt0>(
    [=](auto i, auto j) { mem[j] = x[i]; }, tid, count
  );
}

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O>
__device__ auto cuda_transform_mem_to_reg_strided(
  I mem, unsigned tid, unsigned count, O op
) {
  using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt, vt0>(
    [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count
  );
  return x;
}
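
// Illustrative sketch of a strided load / transform / store round trip
// inside a kernel body; nt, vt, tid, count, in, and out are assumed to be
// supplied by the surrounding kernel:
//
//   auto x = cuda_transform_mem_to_reg_strided<nt, vt>(
//     in, tid, count, [](int v) { return 2 * v; }
//   );
//   cuda_reg_to_mem_strided<nt, vt>(x, tid, count, out);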

// ----------------------------------------------------------------------------
// thread reg <-> shared
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ void cuda_reg_to_shared_thread(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "reg_to_shared_thread must have at least nt * vt storage");

  cuda_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid);

  if(sync) __syncthreads();
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ auto cuda_shared_to_reg_thread(
  const T (&shared)[shared_size], unsigned tid, bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "shared_to_reg_thread must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_thread_iterate<vt>([&](auto i, auto j) {
    x[i] = shared[j];
  }, tid);

  if(sync) __syncthreads();

  return x;
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ void cuda_reg_to_shared_strided(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "reg_to_shared_strided must have at least nt * vt storage");

  cuda_strided_iterate<nt, vt>(
    [&](auto i, auto j) { shared[j] = x[i]; }, tid
  );

  if(sync) __syncthreads();
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ auto cuda_shared_to_reg_strided(
  const T (&shared)[shared_size], unsigned tid, bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "shared_to_reg_strided must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);
  if(sync) __syncthreads();

  return x;
}

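// Illustrative sketch: the four helpers above stage a register array through
// a caller-provided __shared__ buffer of at least nt * vt elements in order
// to reorder it, e.g. from thread order back to strided order:
//
//   cuda_reg_to_shared_thread<nt, vt>(x, tid, shared);         // write thread order
//   auto y = cuda_shared_to_reg_strided<nt, vt>(shared, tid);  // read back strided
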
template<
  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
  unsigned shared_size
>
__device__ void cuda_reg_to_mem_thread(
  cudaArray<T, vt> x, unsigned tid,
  unsigned count, it_t mem, T (&shared)[shared_size]
) {
  cuda_reg_to_shared_thread<nt>(x, tid, shared);
  auto y = cuda_shared_to_reg_strided<nt, vt>(shared, tid);
  cuda_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
}

template<
  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
  unsigned shared_size
>
__device__ auto cuda_mem_to_reg_thread(
  it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
) {

  auto x = cuda_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
  cuda_reg_to_shared_strided<nt, vt>(x, tid, shared);
  auto y = cuda_shared_to_reg_thread<nt, vt>(shared, tid);
  return y;
}

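// Illustrative note: cuda_mem_to_reg_thread combines a coalesced strided
// load with a transpose through shared memory, so each thread ends up
// holding vt consecutive elements in thread order; this is the usual
// preparation step before a per-thread serial scan, merge, or sort pass.
// cuda_reg_to_mem_thread performs the inverse transpose before storing.
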
template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_shared_gather(
  const T(&data)[S], cudaArray<unsigned, vt> indices, bool sync = true
) {

  static_assert(S >= nt * vt,
    "shared_gather must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });

  if(sync) __syncthreads();

  return x;
}

// ----------------------------------------------------------------------------
// reg <-> reg
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_reg_thread_to_strided(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[S]
) {
  cuda_reg_to_shared_thread<nt>(x, tid, shared);
  return cuda_shared_to_reg_strided<nt, vt>(shared, tid);
}

template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_reg_strided_to_thread(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[S]
) {
  cuda_reg_to_shared_strided<nt>(x, tid, shared);
  return cuda_shared_to_reg_thread<nt, vt>(shared, tid);
}

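// Illustrative note: the two helpers above simply compose the shared-memory
// staging functions, e.g. cuda_reg_thread_to_strided<nt>(x, tid, shared) is
// cuda_reg_to_shared_thread followed by cuda_shared_to_reg_strided.
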
// ----------------------------------------------------------------------------
// cudaLoadStoreIterator
// ----------------------------------------------------------------------------

template<typename L, typename S, typename T, typename I>
struct cudaLoadStoreIterator : std::iterator_traits<const T*> {

  L load;
  S store;
  I base;

  cudaLoadStoreIterator(L load_, S store_, I base_) :
    load(load_), store(store_), base(base_) { }

  struct assign_t {
    L load;
    S store;
    I index;

    __device__ assign_t& operator=(T rhs) {
      static_assert(!std::is_same<S, cudaEmpty>::value,
        "load_iterator is being stored to.");
      store(rhs, index);
      return *this;
    }
    __device__ operator T() const {
      static_assert(!std::is_same<L, cudaEmpty>::value,
        "store_iterator is being loaded from.");
      return load(index);
    }
  };

  __device__ assign_t operator[](I index) const {
    return assign_t { load, store, base + index };
  }
  __device__ assign_t operator*() const {
    return assign_t { load, store, base };
  }

  __device__ cudaLoadStoreIterator operator+(I offset) const {
    cudaLoadStoreIterator cp = *this;
    cp += offset;
    return cp;
  }

  __device__ cudaLoadStoreIterator& operator+=(I offset) {
    base += offset;
    return *this;
  }

  __device__ cudaLoadStoreIterator operator-(I offset) const {
    cudaLoadStoreIterator cp = *this;
    cp -= offset;
    return cp;
  }

  __device__ cudaLoadStoreIterator& operator-=(I offset) {
    base -= offset;
    return *this;
  }
};

//template<typename T>
//struct trivial_load_functor {
//  template<typename I>
//  __device__ T operator()(I index) const {
//    return T();
//  }
//};

//template<typename T>
//struct trivial_store_functor {
//  template<typename I>
//  __device__ void operator()(T v, I index) const { }
//};

template <typename T, typename I = unsigned, typename L, typename S>
auto cuda_make_load_store_iterator(L load, S store, I base = 0) {
  return cudaLoadStoreIterator<L, S, T, I>(load, store, base);
}

template <typename T, typename I = unsigned, typename L>
auto cuda_make_load_iterator(L load, I base = 0) {
  return cuda_make_load_store_iterator<T>(load, cudaEmpty(), base);
}

template <typename T, typename I = unsigned, typename S>
auto cuda_make_store_iterator(S store, I base = 0) {
  return cuda_make_load_store_iterator<T>(cudaEmpty(), store, base);
}

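// Illustrative sketch: wrap a lambda as a read-only iterator that generates
// values on the fly (when created in host code, the lambda needs the
// __device__ annotation and the --extended-lambda nvcc flag):
//
//   auto squares = cuda_make_load_iterator<int>(
//     [] __device__ (unsigned i) { return int(i * i); }
//   );
//   // in device code, squares[3] invokes the lambda and yields 9
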
// ----------------------------------------------------------------------------
// swap
// ----------------------------------------------------------------------------

template<typename T>
__device__ void cuda_swap(T& a, T& b) {
  auto c = a;
  a = b;
  b = c;
}

// ----------------------------------------------------------------------------
// launch kernel
// ----------------------------------------------------------------------------

template<typename F, typename... args_t>
__global__ void cuda_kernel(F f, args_t... args) {
  f(threadIdx.x, blockIdx.x, args...);
}

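// Illustrative sketch: cuda_kernel forwards (threadIdx.x, blockIdx.x,
// args...) to the callable. A launch could look as follows, where
// num_blocks, block_size, and the device pointer data are assumed to be
// set up by the caller:
//
//   cuda_kernel<<<num_blocks, block_size>>>(
//     [=] __device__ (unsigned tid, unsigned bid, int* d) {
//       d[bid * blockDim.x + tid] += 1;
//     },
//     data
//   );
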
// ----------------------------------------------------------------------------
// operators
// ----------------------------------------------------------------------------

template <typename T>
struct cuda_plus {
  __device__ T operator()(T a, T b) const { return a + b; }
};

template <typename T>
struct cuda_minus {
  __device__ T operator()(T a, T b) const { return a - b; }
};

template <typename T>
struct cuda_multiplies {
  __device__ T operator()(T a, T b) const { return a * b; }
};

template <typename T>
struct cuda_maximum {
  __device__ T operator()(T a, T b) const { return a > b ? a : b; }
};

template <typename T>
struct cuda_minimum {
  __device__ T operator()(T a, T b) const { return a < b ? a : b; }
};

template <typename T>
struct cuda_less {
  __device__ T operator()(T a, T b) const { return a < b; }
};

template <typename T>
struct cuda_greater {
  __device__ T operator()(T a, T b) const { return a > b; }
};

}  // end of namespace tf -----------------------------------------------------