1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
13 
14 namespace Eigen {
15 namespace internal {
16 
17 namespace {
18 
get_random_seed()19 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
20 #if defined(EIGEN_GPU_COMPILE_PHASE)
21   // We don't support 3d kernels since we currently only use 1 and
22   // 2d kernels.
23   gpu_assert(threadIdx.z == 0);
24   return blockIdx.x * blockDim.x + threadIdx.x
25          + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
26 #else
27   // Rely on Eigen's random implementation.
28   return random<uint64_t>();
29 #endif
30 }
31 
PCG_XSH_RS_generator(uint64_t * state,uint64_t stream)32 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
33   // TODO: Unify with the implementation in the non blocking thread pool.
34   uint64_t current = *state;
35   // Update the internal state
36   *state = current * 6364136223846793005ULL + (stream << 1 | 1);
37   // Generate the random output (using the PCG-XSH-RS scheme)
38   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
39 }
40 
PCG_XSH_RS_state(uint64_t seed)41 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
42   seed = seed ? seed : get_random_seed();
43   return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
44 }
45 
46 }  // namespace
47 
48 
49 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
RandomToTypeUniform(uint64_t * state,uint64_t stream)50 T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
51   unsigned rnd = PCG_XSH_RS_generator(state, stream);
52   return static_cast<T>(rnd);
53 }
54 
55 
56 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
57 Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
58   // Generate 10 random bits for the mantissa, merge with exponent.
59   unsigned rnd = PCG_XSH_RS_generator(state, stream);
60   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
61   Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
62   // Return the final result
63   return result - Eigen::half(1.0f);
64 }
65 
66 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
67 Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
68 
69   // Generate 7 random bits for the mantissa, merge with exponent.
70   unsigned rnd = PCG_XSH_RS_generator(state, stream);
71   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
72   Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
73   // Return the final result
74   return result - Eigen::bfloat16(1.0f);
75 }
76 
77 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
78 float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
79   typedef union {
80     uint32_t raw;
81     float fp;
82   } internal;
83   internal result;
84   // Generate 23 random bits for the mantissa mantissa
85   const unsigned rnd = PCG_XSH_RS_generator(state, stream);
86   result.raw = rnd & 0x7fffffu;
87   // Set the exponent
88   result.raw |= (static_cast<uint32_t>(127) << 23);
89   // Return the final result
90   return result.fp - 1.0f;
91 }
92 
93 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
94 double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
95   typedef union {
96     uint64_t raw;
97     double dp;
98   } internal;
99   internal result;
100   result.raw = 0;
101   // Generate 52 random bits for the mantissa
102   // First generate the upper 20 bits
103   unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
104   // The generate the lower 32 bits
105   unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
106   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
107   // Set the exponent
108   result.raw |= (static_cast<uint64_t>(1023) << 52);
109   // Return the final result
110   return result.dp - 1.0;
111 }
112 
113 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
114 std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
115   return std::complex<float>(RandomToTypeUniform<float>(state, stream),
116                              RandomToTypeUniform<float>(state, stream));
117 }
118 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
119 std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
120   return std::complex<double>(RandomToTypeUniform<double>(state, stream),
121                               RandomToTypeUniform<double>(state, stream));
122 }
123 
124 template <typename T> class UniformRandomGenerator {
125  public:
126   static const bool PacketAccess = true;
127 
128   // Uses the given "seed" if non-zero, otherwise uses a random seed.
129   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
130       uint64_t seed = 0) {
131     m_state = PCG_XSH_RS_state(seed);
132     #ifdef EIGEN_USE_SYCL
133     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
134     // Therefor, we need two step to initializate the m_state.
135     // IN SYCL, the constructor of the functor is s called on the CPU
136     // and we get the clock seed here from the CPU. However, This seed is
137     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
138     // and only  available on the Operator() function (which is called on the GPU).
139     // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
140     // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
141     // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
142     // similar to CUDA Therefore, the thread Id injection is not available at this stage.
143     //However when the operator() is called the thread ID will be avilable. So inside the opeator,
144     // we add the thrreadID, BlockId,... (which is equivalent of i)
145     //to the seed and construct the unique m_state per thead similar to cuda.
146     m_exec_once =false;
147    #endif
148   }
UniformRandomGenerator(const UniformRandomGenerator & other)149   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
150       const UniformRandomGenerator& other) {
151     m_state = other.m_state;
152     #ifdef EIGEN_USE_SYCL
153      m_exec_once =other.m_exec_once;
154     #endif
155   }
156 
157   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
operator()158   T operator()(Index i) const {
159     #ifdef EIGEN_USE_SYCL
160       if(!m_exec_once) {
161       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
162       // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
163        m_state += (i * 6364136223846793005ULL);
164        m_exec_once =true;
165       }
166     #endif
167     T result = RandomToTypeUniform<T>(&m_state, i);
168     return result;
169   }
170 
171   template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
packetOp(Index i)172   Packet packetOp(Index i) const {
173     const int packetSize = internal::unpacket_traits<Packet>::size;
174     EIGEN_ALIGN_MAX T values[packetSize];
175       #ifdef EIGEN_USE_SYCL
176       if(!m_exec_once) {
177       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
178        m_state += (i * 6364136223846793005ULL);
179        m_exec_once =true;
180       }
181     #endif
182     EIGEN_UNROLL_LOOP
183     for (int j = 0; j < packetSize; ++j) {
184       values[j] = RandomToTypeUniform<T>(&m_state, i);
185     }
186     return internal::pload<Packet>(values);
187   }
188 
189  private:
190   mutable uint64_t m_state;
191   #ifdef EIGEN_USE_SYCL
192   mutable bool m_exec_once;
193   #endif
194 };
195 
196 template <typename Scalar>
197 struct functor_traits<UniformRandomGenerator<Scalar> > {
198   enum {
199     // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
200     Cost = 12 * NumTraits<Scalar>::AddCost *
201            ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
202     PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
203   };
204 };
205 
206 
207 
208 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
209 T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
210   // Use the ratio of uniform method to generate numbers following a normal
211   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
212   // details.
213   T u, v, q;
214   do {
215     u = RandomToTypeUniform<T>(state, stream);
216     v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
217     const T x = u - T(0.449871);
218     const T y = numext::abs(v) + T(0.386595);
219     q = x*x + y * (T(0.196)*y - T(0.25472)*x);
220   } while (q > T(0.27597) &&
221            (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
222 
223   return v/u;
224 }
225 
226 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
227 std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
228   return std::complex<float>(RandomToTypeNormal<float>(state, stream),
229                              RandomToTypeNormal<float>(state, stream));
230 }
231 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
232 std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
233   return std::complex<double>(RandomToTypeNormal<double>(state, stream),
234                               RandomToTypeNormal<double>(state, stream));
235 }
236 
237 
238 template <typename T> class NormalRandomGenerator {
239  public:
240   static const bool PacketAccess = true;
241 
242   // Uses the given "seed" if non-zero, otherwise uses a random seed.
243   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
244     m_state = PCG_XSH_RS_state(seed);
245     #ifdef EIGEN_USE_SYCL
246     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
247     // Therefor, we need two steps to initializate the m_state.
248     // IN SYCL, the constructor of the functor is s called on the CPU
249     // and we get the clock seed here from the CPU. However, This seed is
250     //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
251     // and only  available on the Operator() function (which is called on the GPU).
252     // Therefore, the thread Id injection is not available at this stage. However when the operator()
253     //is called the thread ID will be avilable. So inside the opeator,
254     // we add the thrreadID, BlockId,... (which is equivalent of i)
255     //to the seed and construct the unique m_state per thead similar to cuda.
256     m_exec_once =false;
257    #endif
258   }
259   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
260       const NormalRandomGenerator& other) {
261     m_state = other.m_state;
262 #ifdef EIGEN_USE_SYCL
263     m_exec_once=other.m_exec_once;
264 #endif
265   }
266 
267  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
268   T operator()(Index i) const {
269     #ifdef EIGEN_USE_SYCL
270     if(!m_exec_once) {
271       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
272       m_state += (i * 6364136223846793005ULL);
273       m_exec_once =true;
274     }
275     #endif
276     T result = RandomToTypeNormal<T>(&m_state, i);
277     return result;
278   }
279 
280   template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
281   Packet packetOp(Index i) const {
282     const int packetSize = internal::unpacket_traits<Packet>::size;
283     EIGEN_ALIGN_MAX T values[packetSize];
284     #ifdef EIGEN_USE_SYCL
285     if(!m_exec_once) {
286       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
287       m_state += (i * 6364136223846793005ULL);
288       m_exec_once =true;
289     }
290     #endif
291     EIGEN_UNROLL_LOOP
292     for (int j = 0; j < packetSize; ++j) {
293       values[j] = RandomToTypeNormal<T>(&m_state, i);
294     }
295     return internal::pload<Packet>(values);
296   }
297 
298  private:
299   mutable uint64_t m_state;
300    #ifdef EIGEN_USE_SYCL
301   mutable bool m_exec_once;
302   #endif
303 };
304 
305 
306 template <typename Scalar>
307 struct functor_traits<NormalRandomGenerator<Scalar> > {
308   enum {
309     // On average, we need to generate about 3 random numbers
310     // 15 mul, 8 add, 1.5 logs
311     Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
312            15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
313            3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
314     PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
315   };
316 };
317 
318 
319 } // end namespace internal
320 } // end namespace Eigen
321 
322 #endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
323