#pragma once

#include <iterator>
#include <type_traits>
#include <functional>

#include "cuda_execution_policy.hpp"

namespace tf {

// default warp size
inline constexpr unsigned CUDA_WARP_SIZE = 32;

// empty type
struct cudaEmpty { };

// ----------------------------------------------------------------------------
// iterator unrolling
// ----------------------------------------------------------------------------

// Template unrolled looping construct.
template<unsigned i, unsigned count, bool valid = (i < count)>
struct cudaIterate {
  template<typename F>
  __device__ static void eval(F f) {
    f(i);
    cudaIterate<i + 1, count>::eval(f);
  }
};

template<unsigned i, unsigned count>
struct cudaIterate<i, count, false> {
  template<typename F>
  __device__ static void eval(F) { }
};

template<unsigned begin, unsigned end, typename F>
__device__ void cuda_iterate(F f) {
  cudaIterate<begin, end>::eval(f);
}

template<unsigned count, typename F>
__device__ void cuda_iterate(F f) {
  cuda_iterate<0, count>(f);
}

template<unsigned count, typename T>
__device__ T reduce(const T(&x)[count]) {
  T y;
  cuda_iterate<count>([&](auto i) { y = i ? x[i] + y : x[i]; });
  return y;
}

template<unsigned count, typename T>
__device__ void fill(T(&x)[count], T val) {
  cuda_iterate<count>([&](auto i) { x[i] = val; });
}

// Invoke unconditionally.
template<unsigned nt, unsigned vt, typename F>
__device__ void cuda_strided_iterate(F f, unsigned tid) {
  cuda_iterate<vt>([=](auto i) { f(i, nt * i + tid); });
}

// Check range.
template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename F>
__device__ void cuda_strided_iterate(F f, unsigned tid, unsigned count) {
  // Unroll the first vt0 elements of each thread.
  if(vt0 > 1 && count >= nt * vt0) {
    cuda_strided_iterate<nt, vt0>(f, tid);    // No checking
  } else {
    cuda_iterate<vt0>([=](auto i) {
      auto j = nt * i + tid;
      if(j < count) f(i, j);
    });
  }

  // TODO: seems dummy when vt0 == vt
  cuda_iterate<vt0, vt>([=](auto i) {
    auto j = nt * i + tid;
    if(j < count) f(i, j);
  });
}

template<unsigned vt, typename F>
__device__ void cuda_thread_iterate(F f, unsigned tid) {
  cuda_iterate<vt>([=](auto i) { f(i, vt * tid + i); });
}

// ----------------------------------------------------------------------------
// cudaRange
// ----------------------------------------------------------------------------

// cudaRange
struct cudaRange {
  unsigned begin, end;
  __device__ unsigned size() const { return end - begin; }
  __device__ unsigned count() const { return size(); }
  __device__ bool valid() const { return end > begin; }
};

inline __device__ cudaRange cuda_get_tile(
  unsigned b, unsigned nv, unsigned count
) {
  return cudaRange { nv * b, min(count, nv * (b + 1)) };
}

// ----------------------------------------------------------------------------
// cudaArray
// ----------------------------------------------------------------------------

template<typename T, unsigned size>
struct cudaArray {

  T data[size];

  __device__ T operator[](unsigned i) const { return data[i]; }
  __device__ T& operator[](unsigned i) { return data[i]; }

  cudaArray() = default;
  cudaArray(const cudaArray&) = default;
  cudaArray& operator=(const cudaArray&) = default;

  // Fill the array with x.
  __device__ cudaArray(T x) {
    cuda_iterate<size>([&](unsigned i) { data[i] = x; });
  }
};

template<typename T>
struct cudaArray<T, 0> {
  __device__ T operator[](unsigned) const { return T(); }
  __device__ T& operator[](unsigned) { return *(T*)nullptr; }
};

template<typename K, typename V, unsigned size>
struct cudaKVArray {
  cudaArray<K, size> keys;
  cudaArray<V, size> vals;
};
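// Illustrative usage sketch (not part of the original interface): a block of
// nt threads squares `count` elements, each thread touching vt items in the
// strided pattern j = nt * i + tid, so consecutive threads access consecutive
// addresses. The function name and parameters below are hypothetical.
//
//   template<unsigned nt, unsigned vt>
//   __device__ void square_tile(float* buf, unsigned tid, unsigned count) {
//     cudaArray<float, vt> x;
//     // gather with bounds checking against count
//     cuda_strided_iterate<nt, vt>(
//       [&](auto i, auto j) { x[i] = buf[j] * buf[j]; }, tid, count
//     );
//     // scatter back using the same strided mapping
//     cuda_strided_iterate<nt, vt>(
//       [=](auto i, auto j) { buf[j] = x[i]; }, tid, count
//     );
//   }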
// ----------------------------------------------------------------------------
// thread reg <-> global mem
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I>
__device__ auto cuda_mem_to_reg_strided(I mem, unsigned tid, unsigned count) {
  using T = typename std::iterator_traits<I>::value_type;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt, vt0>(
    [&](auto i, auto j) { x[i] = mem[j]; }, tid, count
  );
  return x;
}

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t>
__device__ void cuda_reg_to_mem_strided(
  cudaArray<T, vt> x, unsigned tid, unsigned count, it_t mem
) {
  cuda_strided_iterate<nt, vt, vt0>(
    [=](auto i, auto j) { mem[j] = x[i]; }, tid, count
  );
}

template<unsigned nt, unsigned vt, unsigned vt0 = vt, typename I, typename O>
__device__ auto cuda_transform_mem_to_reg_strided(
  I mem, unsigned tid, unsigned count, O op
) {
  using T = std::invoke_result_t<O, typename std::iterator_traits<I>::value_type>;
  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt, vt0>(
    [&](auto i, auto j) { x[i] = op(mem[j]); }, tid, count
  );
  return x;
}

// ----------------------------------------------------------------------------
// thread reg <-> shared
// ----------------------------------------------------------------------------

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ void cuda_reg_to_shared_thread(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "reg_to_shared_thread must have at least nt * vt storage");

  cuda_thread_iterate<vt>([&](auto i, auto j) { shared[j] = x[i]; }, tid);

  if(sync) __syncthreads();
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ auto cuda_shared_to_reg_thread(
  const T (&shared)[shared_size], unsigned tid, bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "shared_to_reg_thread must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_thread_iterate<vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);

  if(sync) __syncthreads();

  return x;
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ void cuda_reg_to_shared_strided(
  cudaArray<T, vt> x, unsigned tid, T (&shared)[shared_size], bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "reg_to_shared_strided must have at least nt * vt storage");

  cuda_strided_iterate<nt, vt>(
    [&](auto i, auto j) { shared[j] = x[i]; }, tid
  );

  if(sync) __syncthreads();
}

template<unsigned nt, unsigned vt, typename T, unsigned shared_size>
__device__ auto cuda_shared_to_reg_strided(
  const T (&shared)[shared_size], unsigned tid, bool sync = true
) {

  static_assert(shared_size >= nt * vt,
    "shared_to_reg_strided must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_strided_iterate<nt, vt>([&](auto i, auto j) { x[i] = shared[j]; }, tid);

  if(sync) __syncthreads();

  return x;
}

template<
  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
  unsigned shared_size
>
__device__ void cuda_reg_to_mem_thread(
  cudaArray<T, vt> x, unsigned tid, unsigned count, it_t mem,
  T (&shared)[shared_size]
) {
  cuda_reg_to_shared_thread<nt, vt>(x, tid, shared);
  auto y = cuda_shared_to_reg_strided<nt, vt>(shared, tid);
  cuda_reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
}

template<
  unsigned nt, unsigned vt, unsigned vt0 = vt, typename T, typename it_t,
  unsigned shared_size
>
__device__ auto cuda_mem_to_reg_thread(
  it_t mem, unsigned tid, unsigned count, T (&shared)[shared_size]
) {
  auto x = cuda_mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
  cuda_reg_to_shared_strided<nt, vt>(x, tid, shared);
  auto y = cuda_shared_to_reg_thread<nt, vt>(shared, tid);
  return y;
}

template<unsigned nt, unsigned vt, typename T, unsigned S>
__device__ auto cuda_shared_gather(
  const T (&data)[S], cudaArray<unsigned, vt> indices, bool sync = true
) {

  static_assert(S >= nt * vt,
    "shared_gather must have at least nt * vt storage");

  cudaArray<T, vt> x;
  cuda_iterate<vt>([&](auto i) { x[i] = data[indices[i]]; });

  if(sync) __syncthreads();

  return x;
}
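// Illustrative usage sketch (assumed, not part of the original interface):
// load a tile of nt * vt elements so that each thread ends up holding vt
// consecutive items in registers, using shared memory to transpose from the
// coalesced strided load order. The kernel name and parameters below are
// hypothetical; `in` points at the first element of the tile to process.
//
//   template<unsigned nt, unsigned vt>
//   __global__ void tile_kernel(const int* in, unsigned count) {
//     __shared__ int smem[nt * vt];
//     auto tid = threadIdx.x;
//     // strided (coalesced) global load -> shared -> thread order
//     auto x = cuda_mem_to_reg_thread<nt, vt>(in, tid, count, smem);
//     // for in-range indices, x[i] now holds element vt * tid + i of the tile
//   }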
"shared_gather must have at least nt * vt storage"); cudaArray x; cuda_iterate([&](auto i) { x[i] = data[indices[i]]; }); if(sync) __syncthreads(); return x; } // ---------------------------------------------------------------------------- // reg<->reg // ---------------------------------------------------------------------------- template __device__ auto cuda_reg_thread_to_strided( cudaArray x, unsigned tid, T (&shared)[S] ) { cuda_reg_to_shared_thread(x, tid, shared); return cuda_shared_to_reg_strided(shared, tid); } template __device__ auto cuda_reg_strided_to_thread( cudaArray x, unsigned tid, T (&shared)[S] ) { cuda_reg_to_shared_strided(x, tid, shared); return cuda_shared_to_reg_thread(shared, tid); } // ---------------------------------------------------------------------------- // cudaLoadStoreIterator // ---------------------------------------------------------------------------- template struct cudaLoadStoreIterator : std::iterator_traits { L load; S store; I base; cudaLoadStoreIterator(L load_, S store_, I base_) : load(load_), store(store_), base(base_) { } struct assign_t { L load; S store; I index; __device__ assign_t& operator=(T rhs) { static_assert(!std::is_same::value, "load_iterator is being stored to."); store(rhs, index); return *this; } __device__ operator T() const { static_assert(!std::is_same::value, "store_iterator is being loaded from."); return load(index); } }; __device__ assign_t operator[](I index) const { return assign_t { load, store, base + index }; } __device__ assign_t operator*() const { return assign_t { load, store, base }; } __device__ cudaLoadStoreIterator operator+(I offset) const { cudaLoadStoreIterator cp = *this; cp += offset; return cp; } __device__ cudaLoadStoreIterator& operator+=(I offset) { base += offset; return *this; } __device__ cudaLoadStoreIterator operator-(I offset) const { cudaLoadStoreIterator cp = *this; cp -= offset; return cp; } __device__ cudaLoadStoreIterator& operator-=(I offset) { base -= offset; return *this; } }; //template //struct trivial_load_functor { // template // __device__ T operator()(I index) const { // return T(); // } //}; //template //struct trivial_store_functor { // template // __device__ void operator()(T v, I index) const { } //}; template auto cuda_make_load_store_iterator(L load, S store, I base = 0) { return cudaLoadStoreIterator(load, store, base); } template auto cuda_make_load_iterator(L load, I base = 0) { return cuda_make_load_store_iterator(load, cudaEmpty(), base); } template auto cuda_make_store_iterator(S store, I base = 0) { return cuda_make_load_store_iterator(cudaEmpty(), store, base); } // ---------------------------------------------------------------------------- // swap // ---------------------------------------------------------------------------- template __device__ void cuda_swap(T& a, T& b) { auto c = a; a = b; b = c; } // ---------------------------------------------------------------------------- // launch kernel // ---------------------------------------------------------------------------- template __global__ void cuda_kernel(F f, args_t... 
// ----------------------------------------------------------------------------
// operators
// ----------------------------------------------------------------------------

template<typename T>
struct cuda_plus : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a + b; }
};

template<typename T>
struct cuda_minus : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a - b; }
};

template<typename T>
struct cuda_multiplies : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a * b; }
};

template<typename T>
struct cuda_maximum : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a > b ? a : b; }
};

template<typename T>
struct cuda_minimum : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a < b ? a : b; }
};

template<typename T>
struct cuda_less : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a < b; }
};

template<typename T>
struct cuda_greater : public std::binary_function<T, T, T> {
  __device__ T operator()(T a, T b) const { return a > b; }
};

}  // end of namespace tf -----------------------------------------------------
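// Illustrative note (assumed, not part of the original interface): the
// functors above mirror their <functional> counterparts but are callable from
// device code, e.g. as the binary operator of a small register-array
// reduction. The function below is a hypothetical sketch.
//
//   __device__ int max4(const int (&x)[4]) {
//     tf::cuda_maximum<int> mx;
//     int y = x[0];
//     // unrolled over indices 1, 2, 3 at compile time
//     tf::cuda_iterate<1, 4>([&](auto i) { y = mx(y, x[i]); });
//     return y;
//   }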