1*e4b17023SJohn Marino // -*- C++ -*- 2*e4b17023SJohn Marino 3*e4b17023SJohn Marino // Copyright (C) 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. 4*e4b17023SJohn Marino // 5*e4b17023SJohn Marino // This file is part of the GNU ISO C++ Library. This library is free 6*e4b17023SJohn Marino // software; you can redistribute it and/or modify it under the terms 7*e4b17023SJohn Marino // of the GNU General Public License as published by the Free Software 8*e4b17023SJohn Marino // Foundation; either version 3, or (at your option) any later 9*e4b17023SJohn Marino // version. 10*e4b17023SJohn Marino 11*e4b17023SJohn Marino // This library is distributed in the hope that it will be useful, but 12*e4b17023SJohn Marino // WITHOUT ANY WARRANTY; without even the implied warranty of 13*e4b17023SJohn Marino // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14*e4b17023SJohn Marino // General Public License for more details. 15*e4b17023SJohn Marino 16*e4b17023SJohn Marino // Under Section 7 of GPL version 3, you are granted additional 17*e4b17023SJohn Marino // permissions described in the GCC Runtime Library Exception, version 18*e4b17023SJohn Marino // 3.1, as published by the Free Software Foundation. 19*e4b17023SJohn Marino 20*e4b17023SJohn Marino // You should have received a copy of the GNU General Public License and 21*e4b17023SJohn Marino // a copy of the GCC Runtime Library Exception along with this program; 22*e4b17023SJohn Marino // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 23*e4b17023SJohn Marino // <http://www.gnu.org/licenses/>. 24*e4b17023SJohn Marino 25*e4b17023SJohn Marino /** @file parallel/random_shuffle.h 26*e4b17023SJohn Marino * @brief Parallel implementation of std::random_shuffle(). 27*e4b17023SJohn Marino * This file is a GNU parallel extension to the Standard C++ Library. 28*e4b17023SJohn Marino */ 29*e4b17023SJohn Marino 30*e4b17023SJohn Marino // Written by Johannes Singler. 31*e4b17023SJohn Marino 32*e4b17023SJohn Marino #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 33*e4b17023SJohn Marino #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1 34*e4b17023SJohn Marino 35*e4b17023SJohn Marino #include <limits> 36*e4b17023SJohn Marino #include <bits/stl_numeric.h> 37*e4b17023SJohn Marino #include <parallel/parallel.h> 38*e4b17023SJohn Marino #include <parallel/random_number.h> 39*e4b17023SJohn Marino 40*e4b17023SJohn Marino namespace __gnu_parallel 41*e4b17023SJohn Marino { 42*e4b17023SJohn Marino /** @brief Type to hold the index of a bin. 43*e4b17023SJohn Marino * 44*e4b17023SJohn Marino * Since many variables of this type are allocated, it should be 45*e4b17023SJohn Marino * chosen as small as possible. 46*e4b17023SJohn Marino */ 47*e4b17023SJohn Marino typedef unsigned short _BinIndex; 48*e4b17023SJohn Marino 49*e4b17023SJohn Marino /** @brief Data known to every thread participating in 50*e4b17023SJohn Marino __gnu_parallel::__parallel_random_shuffle(). */ 51*e4b17023SJohn Marino template<typename _RAIter> 52*e4b17023SJohn Marino struct _DRandomShufflingGlobalData 53*e4b17023SJohn Marino { 54*e4b17023SJohn Marino typedef std::iterator_traits<_RAIter> _TraitsType; 55*e4b17023SJohn Marino typedef typename _TraitsType::value_type _ValueType; 56*e4b17023SJohn Marino typedef typename _TraitsType::difference_type _DifferenceType; 57*e4b17023SJohn Marino 58*e4b17023SJohn Marino /** @brief Begin iterator of the __source. */ 59*e4b17023SJohn Marino _RAIter& _M_source; 60*e4b17023SJohn Marino 61*e4b17023SJohn Marino /** @brief Temporary arrays for each thread. */ 62*e4b17023SJohn Marino _ValueType** _M_temporaries; 63*e4b17023SJohn Marino 64*e4b17023SJohn Marino /** @brief Two-dimensional array to hold the thread-bin distribution. 65*e4b17023SJohn Marino * 66*e4b17023SJohn Marino * Dimensions (_M_num_threads + 1) __x (_M_num_bins + 1). */ 67*e4b17023SJohn Marino _DifferenceType** _M_dist; 68*e4b17023SJohn Marino 69*e4b17023SJohn Marino /** @brief Start indexes of the threads' __chunks. */ 70*e4b17023SJohn Marino _DifferenceType* _M_starts; 71*e4b17023SJohn Marino 72*e4b17023SJohn Marino /** @brief Number of the thread that will further process the 73*e4b17023SJohn Marino corresponding bin. */ 74*e4b17023SJohn Marino _ThreadIndex* _M_bin_proc; 75*e4b17023SJohn Marino 76*e4b17023SJohn Marino /** @brief Number of bins to distribute to. */ 77*e4b17023SJohn Marino int _M_num_bins; 78*e4b17023SJohn Marino 79*e4b17023SJohn Marino /** @brief Number of bits needed to address the bins. */ 80*e4b17023SJohn Marino int _M_num_bits; 81*e4b17023SJohn Marino 82*e4b17023SJohn Marino /** @brief Constructor. */ _DRandomShufflingGlobalData_DRandomShufflingGlobalData83*e4b17023SJohn Marino _DRandomShufflingGlobalData(_RAIter& __source) 84*e4b17023SJohn Marino : _M_source(__source) { } 85*e4b17023SJohn Marino }; 86*e4b17023SJohn Marino 87*e4b17023SJohn Marino /** @brief Local data for a thread participating in 88*e4b17023SJohn Marino __gnu_parallel::__parallel_random_shuffle(). 89*e4b17023SJohn Marino */ 90*e4b17023SJohn Marino template<typename _RAIter, typename _RandomNumberGenerator> 91*e4b17023SJohn Marino struct _DRSSorterPU 92*e4b17023SJohn Marino { 93*e4b17023SJohn Marino /** @brief Number of threads participating in total. */ 94*e4b17023SJohn Marino int _M_num_threads; 95*e4b17023SJohn Marino 96*e4b17023SJohn Marino /** @brief Begin index for bins taken care of by this thread. */ 97*e4b17023SJohn Marino _BinIndex _M_bins_begin; 98*e4b17023SJohn Marino 99*e4b17023SJohn Marino /** @brief End index for bins taken care of by this thread. */ 100*e4b17023SJohn Marino _BinIndex __bins_end; 101*e4b17023SJohn Marino 102*e4b17023SJohn Marino /** @brief Random _M_seed for this thread. */ 103*e4b17023SJohn Marino uint32_t _M_seed; 104*e4b17023SJohn Marino 105*e4b17023SJohn Marino /** @brief Pointer to global data. */ 106*e4b17023SJohn Marino _DRandomShufflingGlobalData<_RAIter>* _M_sd; 107*e4b17023SJohn Marino }; 108*e4b17023SJohn Marino 109*e4b17023SJohn Marino /** @brief Generate a random number in @c [0,2^__logp). 110*e4b17023SJohn Marino * @param __logp Logarithm (basis 2) of the upper range __bound. 111*e4b17023SJohn Marino * @param __rng Random number generator to use. 112*e4b17023SJohn Marino */ 113*e4b17023SJohn Marino template<typename _RandomNumberGenerator> 114*e4b17023SJohn Marino inline int __random_number_pow2(int __logp,_RandomNumberGenerator & __rng)115*e4b17023SJohn Marino __random_number_pow2(int __logp, _RandomNumberGenerator& __rng) 116*e4b17023SJohn Marino { return __rng.__genrand_bits(__logp); } 117*e4b17023SJohn Marino 118*e4b17023SJohn Marino /** @brief Random shuffle code executed by each thread. 119*e4b17023SJohn Marino * @param __pus Array of thread-local data records. */ 120*e4b17023SJohn Marino template<typename _RAIter, typename _RandomNumberGenerator> 121*e4b17023SJohn Marino void __parallel_random_shuffle_drs_pu(_DRSSorterPU<_RAIter,_RandomNumberGenerator> * __pus)122*e4b17023SJohn Marino __parallel_random_shuffle_drs_pu(_DRSSorterPU<_RAIter, 123*e4b17023SJohn Marino _RandomNumberGenerator>* __pus) 124*e4b17023SJohn Marino { 125*e4b17023SJohn Marino typedef std::iterator_traits<_RAIter> _TraitsType; 126*e4b17023SJohn Marino typedef typename _TraitsType::value_type _ValueType; 127*e4b17023SJohn Marino typedef typename _TraitsType::difference_type _DifferenceType; 128*e4b17023SJohn Marino 129*e4b17023SJohn Marino _ThreadIndex __iam = omp_get_thread_num(); 130*e4b17023SJohn Marino _DRSSorterPU<_RAIter, _RandomNumberGenerator>* __d = &__pus[__iam]; 131*e4b17023SJohn Marino _DRandomShufflingGlobalData<_RAIter>* __sd = __d->_M_sd; 132*e4b17023SJohn Marino 133*e4b17023SJohn Marino // Indexing: _M_dist[bin][processor] 134*e4b17023SJohn Marino _DifferenceType __length = (__sd->_M_starts[__iam + 1] 135*e4b17023SJohn Marino - __sd->_M_starts[__iam]); 136*e4b17023SJohn Marino _BinIndex* __oracles = new _BinIndex[__length]; 137*e4b17023SJohn Marino _DifferenceType* __dist = new _DifferenceType[__sd->_M_num_bins + 1]; 138*e4b17023SJohn Marino _BinIndex* __bin_proc = new _BinIndex[__sd->_M_num_bins]; 139*e4b17023SJohn Marino _ValueType** __temporaries = new _ValueType*[__d->_M_num_threads]; 140*e4b17023SJohn Marino 141*e4b17023SJohn Marino // Compute oracles and count appearances. 142*e4b17023SJohn Marino for (_BinIndex __b = 0; __b < __sd->_M_num_bins + 1; ++__b) 143*e4b17023SJohn Marino __dist[__b] = 0; 144*e4b17023SJohn Marino int __num_bits = __sd->_M_num_bits; 145*e4b17023SJohn Marino 146*e4b17023SJohn Marino _RandomNumber __rng(__d->_M_seed); 147*e4b17023SJohn Marino 148*e4b17023SJohn Marino // First main loop. 149*e4b17023SJohn Marino for (_DifferenceType __i = 0; __i < __length; ++__i) 150*e4b17023SJohn Marino { 151*e4b17023SJohn Marino _BinIndex __oracle = __random_number_pow2(__num_bits, __rng); 152*e4b17023SJohn Marino __oracles[__i] = __oracle; 153*e4b17023SJohn Marino 154*e4b17023SJohn Marino // To allow prefix (partial) sum. 155*e4b17023SJohn Marino ++(__dist[__oracle + 1]); 156*e4b17023SJohn Marino } 157*e4b17023SJohn Marino 158*e4b17023SJohn Marino for (_BinIndex __b = 0; __b < __sd->_M_num_bins + 1; ++__b) 159*e4b17023SJohn Marino __sd->_M_dist[__b][__iam + 1] = __dist[__b]; 160*e4b17023SJohn Marino 161*e4b17023SJohn Marino # pragma omp barrier 162*e4b17023SJohn Marino 163*e4b17023SJohn Marino # pragma omp single 164*e4b17023SJohn Marino { 165*e4b17023SJohn Marino // Sum up bins, __sd->_M_dist[__s + 1][__d->_M_num_threads] now 166*e4b17023SJohn Marino // contains the total number of items in bin __s 167*e4b17023SJohn Marino for (_BinIndex __s = 0; __s < __sd->_M_num_bins; ++__s) 168*e4b17023SJohn Marino __gnu_sequential::partial_sum(__sd->_M_dist[__s + 1], 169*e4b17023SJohn Marino __sd->_M_dist[__s + 1] 170*e4b17023SJohn Marino + __d->_M_num_threads + 1, 171*e4b17023SJohn Marino __sd->_M_dist[__s + 1]); 172*e4b17023SJohn Marino } 173*e4b17023SJohn Marino 174*e4b17023SJohn Marino # pragma omp barrier 175*e4b17023SJohn Marino 176*e4b17023SJohn Marino _SequenceIndex __offset = 0, __global_offset = 0; 177*e4b17023SJohn Marino for (_BinIndex __s = 0; __s < __d->_M_bins_begin; ++__s) 178*e4b17023SJohn Marino __global_offset += __sd->_M_dist[__s + 1][__d->_M_num_threads]; 179*e4b17023SJohn Marino 180*e4b17023SJohn Marino # pragma omp barrier 181*e4b17023SJohn Marino 182*e4b17023SJohn Marino for (_BinIndex __s = __d->_M_bins_begin; __s < __d->__bins_end; ++__s) 183*e4b17023SJohn Marino { 184*e4b17023SJohn Marino for (int __t = 0; __t < __d->_M_num_threads + 1; ++__t) 185*e4b17023SJohn Marino __sd->_M_dist[__s + 1][__t] += __offset; 186*e4b17023SJohn Marino __offset = __sd->_M_dist[__s + 1][__d->_M_num_threads]; 187*e4b17023SJohn Marino } 188*e4b17023SJohn Marino 189*e4b17023SJohn Marino __sd->_M_temporaries[__iam] = static_cast<_ValueType*> 190*e4b17023SJohn Marino (::operator new(sizeof(_ValueType) * __offset)); 191*e4b17023SJohn Marino 192*e4b17023SJohn Marino # pragma omp barrier 193*e4b17023SJohn Marino 194*e4b17023SJohn Marino // Draw local copies to avoid false sharing. 195*e4b17023SJohn Marino for (_BinIndex __b = 0; __b < __sd->_M_num_bins + 1; ++__b) 196*e4b17023SJohn Marino __dist[__b] = __sd->_M_dist[__b][__iam]; 197*e4b17023SJohn Marino for (_BinIndex __b = 0; __b < __sd->_M_num_bins; ++__b) 198*e4b17023SJohn Marino __bin_proc[__b] = __sd->_M_bin_proc[__b]; 199*e4b17023SJohn Marino for (_ThreadIndex __t = 0; __t < __d->_M_num_threads; ++__t) 200*e4b17023SJohn Marino __temporaries[__t] = __sd->_M_temporaries[__t]; 201*e4b17023SJohn Marino 202*e4b17023SJohn Marino _RAIter __source = __sd->_M_source; 203*e4b17023SJohn Marino _DifferenceType __start = __sd->_M_starts[__iam]; 204*e4b17023SJohn Marino 205*e4b17023SJohn Marino // Distribute according to oracles, second main loop. 206*e4b17023SJohn Marino for (_DifferenceType __i = 0; __i < __length; ++__i) 207*e4b17023SJohn Marino { 208*e4b17023SJohn Marino _BinIndex __target_bin = __oracles[__i]; 209*e4b17023SJohn Marino _ThreadIndex __target_p = __bin_proc[__target_bin]; 210*e4b17023SJohn Marino 211*e4b17023SJohn Marino // Last column [__d->_M_num_threads] stays unchanged. 212*e4b17023SJohn Marino ::new(&(__temporaries[__target_p][__dist[__target_bin + 1]++])) 213*e4b17023SJohn Marino _ValueType(*(__source + __i + __start)); 214*e4b17023SJohn Marino } 215*e4b17023SJohn Marino 216*e4b17023SJohn Marino delete[] __oracles; 217*e4b17023SJohn Marino delete[] __dist; 218*e4b17023SJohn Marino delete[] __bin_proc; 219*e4b17023SJohn Marino delete[] __temporaries; 220*e4b17023SJohn Marino 221*e4b17023SJohn Marino # pragma omp barrier 222*e4b17023SJohn Marino 223*e4b17023SJohn Marino // Shuffle bins internally. 224*e4b17023SJohn Marino for (_BinIndex __b = __d->_M_bins_begin; __b < __d->__bins_end; ++__b) 225*e4b17023SJohn Marino { 226*e4b17023SJohn Marino _ValueType* __begin = 227*e4b17023SJohn Marino (__sd->_M_temporaries[__iam] 228*e4b17023SJohn Marino + (__b == __d->_M_bins_begin 229*e4b17023SJohn Marino ? 0 : __sd->_M_dist[__b][__d->_M_num_threads])), 230*e4b17023SJohn Marino *__end = (__sd->_M_temporaries[__iam] 231*e4b17023SJohn Marino + __sd->_M_dist[__b + 1][__d->_M_num_threads]); 232*e4b17023SJohn Marino 233*e4b17023SJohn Marino __sequential_random_shuffle(__begin, __end, __rng); 234*e4b17023SJohn Marino std::copy(__begin, __end, __sd->_M_source + __global_offset 235*e4b17023SJohn Marino + (__b == __d->_M_bins_begin 236*e4b17023SJohn Marino ? 0 : __sd->_M_dist[__b][__d->_M_num_threads])); 237*e4b17023SJohn Marino } 238*e4b17023SJohn Marino 239*e4b17023SJohn Marino for (_SequenceIndex __i = 0; __i < __offset; ++__i) 240*e4b17023SJohn Marino __sd->_M_temporaries[__iam][__i].~_ValueType(); 241*e4b17023SJohn Marino ::operator delete(__sd->_M_temporaries[__iam]); 242*e4b17023SJohn Marino } 243*e4b17023SJohn Marino 244*e4b17023SJohn Marino /** @brief Round up to the next greater power of 2. 245*e4b17023SJohn Marino * @param __x _Integer to round up */ 246*e4b17023SJohn Marino template<typename _Tp> 247*e4b17023SJohn Marino _Tp __round_up_to_pow2(_Tp __x)248*e4b17023SJohn Marino __round_up_to_pow2(_Tp __x) 249*e4b17023SJohn Marino { 250*e4b17023SJohn Marino if (__x <= 1) 251*e4b17023SJohn Marino return 1; 252*e4b17023SJohn Marino else 253*e4b17023SJohn Marino return (_Tp)1 << (__rd_log2(__x - 1) + 1); 254*e4b17023SJohn Marino } 255*e4b17023SJohn Marino 256*e4b17023SJohn Marino /** @brief Main parallel random shuffle step. 257*e4b17023SJohn Marino * @param __begin Begin iterator of sequence. 258*e4b17023SJohn Marino * @param __end End iterator of sequence. 259*e4b17023SJohn Marino * @param __n Length of sequence. 260*e4b17023SJohn Marino * @param __num_threads Number of threads to use. 261*e4b17023SJohn Marino * @param __rng Random number generator to use. 262*e4b17023SJohn Marino */ 263*e4b17023SJohn Marino template<typename _RAIter, typename _RandomNumberGenerator> 264*e4b17023SJohn Marino void __parallel_random_shuffle_drs(_RAIter __begin,_RAIter __end,typename std::iterator_traits<_RAIter>::difference_type __n,_ThreadIndex __num_threads,_RandomNumberGenerator & __rng)265*e4b17023SJohn Marino __parallel_random_shuffle_drs(_RAIter __begin, _RAIter __end, 266*e4b17023SJohn Marino typename std::iterator_traits 267*e4b17023SJohn Marino <_RAIter>::difference_type __n, 268*e4b17023SJohn Marino _ThreadIndex __num_threads, 269*e4b17023SJohn Marino _RandomNumberGenerator& __rng) 270*e4b17023SJohn Marino { 271*e4b17023SJohn Marino typedef std::iterator_traits<_RAIter> _TraitsType; 272*e4b17023SJohn Marino typedef typename _TraitsType::value_type _ValueType; 273*e4b17023SJohn Marino typedef typename _TraitsType::difference_type _DifferenceType; 274*e4b17023SJohn Marino 275*e4b17023SJohn Marino _GLIBCXX_CALL(__n) 276*e4b17023SJohn Marino 277*e4b17023SJohn Marino const _Settings& __s = _Settings::get(); 278*e4b17023SJohn Marino 279*e4b17023SJohn Marino if (__num_threads > __n) 280*e4b17023SJohn Marino __num_threads = static_cast<_ThreadIndex>(__n); 281*e4b17023SJohn Marino 282*e4b17023SJohn Marino _BinIndex __num_bins, __num_bins_cache; 283*e4b17023SJohn Marino 284*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 285*e4b17023SJohn Marino // Try the L1 cache first. 286*e4b17023SJohn Marino 287*e4b17023SJohn Marino // Must fit into L1. 288*e4b17023SJohn Marino __num_bins_cache = 289*e4b17023SJohn Marino std::max<_DifferenceType>(1, __n / (__s.L1_cache_size_lb 290*e4b17023SJohn Marino / sizeof(_ValueType))); 291*e4b17023SJohn Marino __num_bins_cache = __round_up_to_pow2(__num_bins_cache); 292*e4b17023SJohn Marino 293*e4b17023SJohn Marino // No more buckets than TLB entries, power of 2 294*e4b17023SJohn Marino // Power of 2 and at least one element per bin, at most the TLB size. 295*e4b17023SJohn Marino __num_bins = std::min<_DifferenceType>(__n, __num_bins_cache); 296*e4b17023SJohn Marino 297*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 298*e4b17023SJohn Marino // 2 TLB entries needed per bin. 299*e4b17023SJohn Marino __num_bins = std::min<_DifferenceType>(__s.TLB_size / 2, __num_bins); 300*e4b17023SJohn Marino #endif 301*e4b17023SJohn Marino __num_bins = __round_up_to_pow2(__num_bins); 302*e4b17023SJohn Marino 303*e4b17023SJohn Marino if (__num_bins < __num_bins_cache) 304*e4b17023SJohn Marino { 305*e4b17023SJohn Marino #endif 306*e4b17023SJohn Marino // Now try the L2 cache 307*e4b17023SJohn Marino // Must fit into L2 308*e4b17023SJohn Marino __num_bins_cache = static_cast<_BinIndex> 309*e4b17023SJohn Marino (std::max<_DifferenceType>(1, __n / (__s.L2_cache_size 310*e4b17023SJohn Marino / sizeof(_ValueType)))); 311*e4b17023SJohn Marino __num_bins_cache = __round_up_to_pow2(__num_bins_cache); 312*e4b17023SJohn Marino 313*e4b17023SJohn Marino // No more buckets than TLB entries, power of 2. 314*e4b17023SJohn Marino __num_bins = static_cast<_BinIndex> 315*e4b17023SJohn Marino (std::min(__n, static_cast<_DifferenceType>(__num_bins_cache))); 316*e4b17023SJohn Marino // Power of 2 and at least one element per bin, at most the TLB size. 317*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 318*e4b17023SJohn Marino // 2 TLB entries needed per bin. 319*e4b17023SJohn Marino __num_bins = std::min(static_cast<_DifferenceType>(__s.TLB_size / 2), 320*e4b17023SJohn Marino __num_bins); 321*e4b17023SJohn Marino #endif 322*e4b17023SJohn Marino __num_bins = __round_up_to_pow2(__num_bins); 323*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 324*e4b17023SJohn Marino } 325*e4b17023SJohn Marino #endif 326*e4b17023SJohn Marino 327*e4b17023SJohn Marino __num_bins = __round_up_to_pow2( 328*e4b17023SJohn Marino std::max<_BinIndex>(__num_threads, __num_bins)); 329*e4b17023SJohn Marino 330*e4b17023SJohn Marino if (__num_threads <= 1) 331*e4b17023SJohn Marino { 332*e4b17023SJohn Marino _RandomNumber __derived_rng( 333*e4b17023SJohn Marino __rng(std::numeric_limits<uint32_t>::max())); 334*e4b17023SJohn Marino __sequential_random_shuffle(__begin, __end, __derived_rng); 335*e4b17023SJohn Marino return; 336*e4b17023SJohn Marino } 337*e4b17023SJohn Marino 338*e4b17023SJohn Marino _DRandomShufflingGlobalData<_RAIter> __sd(__begin); 339*e4b17023SJohn Marino _DRSSorterPU<_RAIter, _RandomNumber >* __pus; 340*e4b17023SJohn Marino _DifferenceType* __starts; 341*e4b17023SJohn Marino 342*e4b17023SJohn Marino # pragma omp parallel num_threads(__num_threads) 343*e4b17023SJohn Marino { 344*e4b17023SJohn Marino _ThreadIndex __num_threads = omp_get_num_threads(); 345*e4b17023SJohn Marino # pragma omp single 346*e4b17023SJohn Marino { 347*e4b17023SJohn Marino __pus = new _DRSSorterPU<_RAIter, _RandomNumber>[__num_threads]; 348*e4b17023SJohn Marino 349*e4b17023SJohn Marino __sd._M_temporaries = new _ValueType*[__num_threads]; 350*e4b17023SJohn Marino __sd._M_dist = new _DifferenceType*[__num_bins + 1]; 351*e4b17023SJohn Marino __sd._M_bin_proc = new _ThreadIndex[__num_bins]; 352*e4b17023SJohn Marino for (_BinIndex __b = 0; __b < __num_bins + 1; ++__b) 353*e4b17023SJohn Marino __sd._M_dist[__b] = new _DifferenceType[__num_threads + 1]; 354*e4b17023SJohn Marino for (_BinIndex __b = 0; __b < (__num_bins + 1); ++__b) 355*e4b17023SJohn Marino { 356*e4b17023SJohn Marino __sd._M_dist[0][0] = 0; 357*e4b17023SJohn Marino __sd._M_dist[__b][0] = 0; 358*e4b17023SJohn Marino } 359*e4b17023SJohn Marino __starts = __sd._M_starts = new _DifferenceType[__num_threads + 1]; 360*e4b17023SJohn Marino int __bin_cursor = 0; 361*e4b17023SJohn Marino __sd._M_num_bins = __num_bins; 362*e4b17023SJohn Marino __sd._M_num_bits = __rd_log2(__num_bins); 363*e4b17023SJohn Marino 364*e4b17023SJohn Marino _DifferenceType __chunk_length = __n / __num_threads, 365*e4b17023SJohn Marino __split = __n % __num_threads, 366*e4b17023SJohn Marino __start = 0; 367*e4b17023SJohn Marino _DifferenceType __bin_chunk_length = __num_bins / __num_threads, 368*e4b17023SJohn Marino __bin_split = __num_bins % __num_threads; 369*e4b17023SJohn Marino for (_ThreadIndex __i = 0; __i < __num_threads; ++__i) 370*e4b17023SJohn Marino { 371*e4b17023SJohn Marino __starts[__i] = __start; 372*e4b17023SJohn Marino __start += (__i < __split 373*e4b17023SJohn Marino ? (__chunk_length + 1) : __chunk_length); 374*e4b17023SJohn Marino int __j = __pus[__i]._M_bins_begin = __bin_cursor; 375*e4b17023SJohn Marino 376*e4b17023SJohn Marino // Range of bins for this processor. 377*e4b17023SJohn Marino __bin_cursor += (__i < __bin_split 378*e4b17023SJohn Marino ? (__bin_chunk_length + 1) 379*e4b17023SJohn Marino : __bin_chunk_length); 380*e4b17023SJohn Marino __pus[__i].__bins_end = __bin_cursor; 381*e4b17023SJohn Marino for (; __j < __bin_cursor; ++__j) 382*e4b17023SJohn Marino __sd._M_bin_proc[__j] = __i; 383*e4b17023SJohn Marino __pus[__i]._M_num_threads = __num_threads; 384*e4b17023SJohn Marino __pus[__i]._M_seed = __rng(std::numeric_limits<uint32_t>::max()); 385*e4b17023SJohn Marino __pus[__i]._M_sd = &__sd; 386*e4b17023SJohn Marino } 387*e4b17023SJohn Marino __starts[__num_threads] = __start; 388*e4b17023SJohn Marino } //single 389*e4b17023SJohn Marino // Now shuffle in parallel. 390*e4b17023SJohn Marino __parallel_random_shuffle_drs_pu(__pus); 391*e4b17023SJohn Marino } // parallel 392*e4b17023SJohn Marino 393*e4b17023SJohn Marino delete[] __starts; 394*e4b17023SJohn Marino delete[] __sd._M_bin_proc; 395*e4b17023SJohn Marino for (int __s = 0; __s < (__num_bins + 1); ++__s) 396*e4b17023SJohn Marino delete[] __sd._M_dist[__s]; 397*e4b17023SJohn Marino delete[] __sd._M_dist; 398*e4b17023SJohn Marino delete[] __sd._M_temporaries; 399*e4b17023SJohn Marino 400*e4b17023SJohn Marino delete[] __pus; 401*e4b17023SJohn Marino } 402*e4b17023SJohn Marino 403*e4b17023SJohn Marino /** @brief Sequential cache-efficient random shuffle. 404*e4b17023SJohn Marino * @param __begin Begin iterator of sequence. 405*e4b17023SJohn Marino * @param __end End iterator of sequence. 406*e4b17023SJohn Marino * @param __rng Random number generator to use. 407*e4b17023SJohn Marino */ 408*e4b17023SJohn Marino template<typename _RAIter, typename _RandomNumberGenerator> 409*e4b17023SJohn Marino void __sequential_random_shuffle(_RAIter __begin,_RAIter __end,_RandomNumberGenerator & __rng)410*e4b17023SJohn Marino __sequential_random_shuffle(_RAIter __begin, _RAIter __end, 411*e4b17023SJohn Marino _RandomNumberGenerator& __rng) 412*e4b17023SJohn Marino { 413*e4b17023SJohn Marino typedef std::iterator_traits<_RAIter> _TraitsType; 414*e4b17023SJohn Marino typedef typename _TraitsType::value_type _ValueType; 415*e4b17023SJohn Marino typedef typename _TraitsType::difference_type _DifferenceType; 416*e4b17023SJohn Marino 417*e4b17023SJohn Marino _DifferenceType __n = __end - __begin; 418*e4b17023SJohn Marino const _Settings& __s = _Settings::get(); 419*e4b17023SJohn Marino 420*e4b17023SJohn Marino _BinIndex __num_bins, __num_bins_cache; 421*e4b17023SJohn Marino 422*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 423*e4b17023SJohn Marino // Try the L1 cache first, must fit into L1. 424*e4b17023SJohn Marino __num_bins_cache = std::max<_DifferenceType> 425*e4b17023SJohn Marino (1, __n / (__s.L1_cache_size_lb / sizeof(_ValueType))); 426*e4b17023SJohn Marino __num_bins_cache = __round_up_to_pow2(__num_bins_cache); 427*e4b17023SJohn Marino 428*e4b17023SJohn Marino // No more buckets than TLB entries, power of 2 429*e4b17023SJohn Marino // Power of 2 and at least one element per bin, at most the TLB size 430*e4b17023SJohn Marino __num_bins = std::min(__n, (_DifferenceType)__num_bins_cache); 431*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 432*e4b17023SJohn Marino // 2 TLB entries needed per bin 433*e4b17023SJohn Marino __num_bins = std::min((_DifferenceType)__s.TLB_size / 2, __num_bins); 434*e4b17023SJohn Marino #endif 435*e4b17023SJohn Marino __num_bins = __round_up_to_pow2(__num_bins); 436*e4b17023SJohn Marino 437*e4b17023SJohn Marino if (__num_bins < __num_bins_cache) 438*e4b17023SJohn Marino { 439*e4b17023SJohn Marino #endif 440*e4b17023SJohn Marino // Now try the L2 cache, must fit into L2. 441*e4b17023SJohn Marino __num_bins_cache = static_cast<_BinIndex> 442*e4b17023SJohn Marino (std::max<_DifferenceType>(1, __n / (__s.L2_cache_size 443*e4b17023SJohn Marino / sizeof(_ValueType)))); 444*e4b17023SJohn Marino __num_bins_cache = __round_up_to_pow2(__num_bins_cache); 445*e4b17023SJohn Marino 446*e4b17023SJohn Marino // No more buckets than TLB entries, power of 2 447*e4b17023SJohn Marino // Power of 2 and at least one element per bin, at most the TLB size. 448*e4b17023SJohn Marino __num_bins = static_cast<_BinIndex> 449*e4b17023SJohn Marino (std::min(__n, static_cast<_DifferenceType>(__num_bins_cache))); 450*e4b17023SJohn Marino 451*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 452*e4b17023SJohn Marino // 2 TLB entries needed per bin 453*e4b17023SJohn Marino __num_bins = std::min<_DifferenceType>(__s.TLB_size / 2, __num_bins); 454*e4b17023SJohn Marino #endif 455*e4b17023SJohn Marino __num_bins = __round_up_to_pow2(__num_bins); 456*e4b17023SJohn Marino #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 457*e4b17023SJohn Marino } 458*e4b17023SJohn Marino #endif 459*e4b17023SJohn Marino 460*e4b17023SJohn Marino int __num_bits = __rd_log2(__num_bins); 461*e4b17023SJohn Marino 462*e4b17023SJohn Marino if (__num_bins > 1) 463*e4b17023SJohn Marino { 464*e4b17023SJohn Marino _ValueType* __target = 465*e4b17023SJohn Marino static_cast<_ValueType*>(::operator new(sizeof(_ValueType) * __n)); 466*e4b17023SJohn Marino _BinIndex* __oracles = new _BinIndex[__n]; 467*e4b17023SJohn Marino _DifferenceType* __dist0 = new _DifferenceType[__num_bins + 1], 468*e4b17023SJohn Marino * __dist1 = new _DifferenceType[__num_bins + 1]; 469*e4b17023SJohn Marino 470*e4b17023SJohn Marino for (int __b = 0; __b < __num_bins + 1; ++__b) 471*e4b17023SJohn Marino __dist0[__b] = 0; 472*e4b17023SJohn Marino 473*e4b17023SJohn Marino _RandomNumber __bitrng(__rng(0xFFFFFFFF)); 474*e4b17023SJohn Marino 475*e4b17023SJohn Marino for (_DifferenceType __i = 0; __i < __n; ++__i) 476*e4b17023SJohn Marino { 477*e4b17023SJohn Marino _BinIndex __oracle = __random_number_pow2(__num_bits, __bitrng); 478*e4b17023SJohn Marino __oracles[__i] = __oracle; 479*e4b17023SJohn Marino 480*e4b17023SJohn Marino // To allow prefix (partial) sum. 481*e4b17023SJohn Marino ++(__dist0[__oracle + 1]); 482*e4b17023SJohn Marino } 483*e4b17023SJohn Marino 484*e4b17023SJohn Marino // Sum up bins. 485*e4b17023SJohn Marino __gnu_sequential::partial_sum(__dist0, __dist0 + __num_bins + 1, 486*e4b17023SJohn Marino __dist0); 487*e4b17023SJohn Marino 488*e4b17023SJohn Marino for (int __b = 0; __b < __num_bins + 1; ++__b) 489*e4b17023SJohn Marino __dist1[__b] = __dist0[__b]; 490*e4b17023SJohn Marino 491*e4b17023SJohn Marino // Distribute according to oracles. 492*e4b17023SJohn Marino for (_DifferenceType __i = 0; __i < __n; ++__i) 493*e4b17023SJohn Marino ::new(&(__target[(__dist0[__oracles[__i]])++])) 494*e4b17023SJohn Marino _ValueType(*(__begin + __i)); 495*e4b17023SJohn Marino 496*e4b17023SJohn Marino for (int __b = 0; __b < __num_bins; ++__b) 497*e4b17023SJohn Marino __sequential_random_shuffle(__target + __dist1[__b], 498*e4b17023SJohn Marino __target + __dist1[__b + 1], __rng); 499*e4b17023SJohn Marino 500*e4b17023SJohn Marino // Copy elements back. 501*e4b17023SJohn Marino std::copy(__target, __target + __n, __begin); 502*e4b17023SJohn Marino 503*e4b17023SJohn Marino delete[] __dist0; 504*e4b17023SJohn Marino delete[] __dist1; 505*e4b17023SJohn Marino delete[] __oracles; 506*e4b17023SJohn Marino 507*e4b17023SJohn Marino for (_DifferenceType __i = 0; __i < __n; ++__i) 508*e4b17023SJohn Marino __target[__i].~_ValueType(); 509*e4b17023SJohn Marino ::operator delete(__target); 510*e4b17023SJohn Marino } 511*e4b17023SJohn Marino else 512*e4b17023SJohn Marino __gnu_sequential::random_shuffle(__begin, __end, __rng); 513*e4b17023SJohn Marino } 514*e4b17023SJohn Marino 515*e4b17023SJohn Marino /** @brief Parallel random public call. 516*e4b17023SJohn Marino * @param __begin Begin iterator of sequence. 517*e4b17023SJohn Marino * @param __end End iterator of sequence. 518*e4b17023SJohn Marino * @param __rng Random number generator to use. 519*e4b17023SJohn Marino */ 520*e4b17023SJohn Marino template<typename _RAIter, typename _RandomNumberGenerator> 521*e4b17023SJohn Marino inline void 522*e4b17023SJohn Marino __parallel_random_shuffle(_RAIter __begin, _RAIter __end, 523*e4b17023SJohn Marino _RandomNumberGenerator __rng = _RandomNumber()) 524*e4b17023SJohn Marino { 525*e4b17023SJohn Marino typedef std::iterator_traits<_RAIter> _TraitsType; 526*e4b17023SJohn Marino typedef typename _TraitsType::difference_type _DifferenceType; 527*e4b17023SJohn Marino _DifferenceType __n = __end - __begin; 528*e4b17023SJohn Marino __parallel_random_shuffle_drs(__begin, __end, __n, 529*e4b17023SJohn Marino __get_max_threads(), __rng); 530*e4b17023SJohn Marino } 531*e4b17023SJohn Marino } 532*e4b17023SJohn Marino 533*e4b17023SJohn Marino #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */ 534