/*
    Copyright 2005-2014 Intel Corporation.  All Rights Reserved.

    This file is part of Threading Building Blocks. Threading Building Blocks is free software;
    you can redistribute it and/or modify it under the terms of the GNU General Public License
    version 2  as  published  by  the  Free Software Foundation.  Threading Building Blocks is
    distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
    implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    See  the GNU General Public License for more details.   You should have received a copy of
    the  GNU General Public License along with Threading Building Blocks; if not, write to the
    Free Software Foundation, Inc.,  51 Franklin St,  Fifth Floor,  Boston,  MA 02110-1301 USA

    As a special exception,  you may use this file  as part of a free software library without
    restriction.  Specifically,  if other files instantiate templates  or use macros or inline
    functions from this file, or you compile this file and link it with other files to produce
    an executable,  this file does not by itself cause the resulting executable to be covered
    by the GNU General Public License. This exception does not however invalidate any other
    reasons why the executable file might be covered by the GNU General Public License.
*/

#ifndef __TBB_machine_H
#define __TBB_machine_H

/** This header provides a basic platform abstraction layer by hooking up appropriate
    architecture/OS/compiler specific headers from the /include/tbb/machine directory.
    If a plug-in header does not implement all the required APIs, it must indicate
    the missing ones by setting one or more of the following macros:

    __TBB_USE_GENERIC_PART_WORD_CAS
    __TBB_USE_GENERIC_PART_WORD_FETCH_ADD
    __TBB_USE_GENERIC_PART_WORD_FETCH_STORE
    __TBB_USE_GENERIC_FETCH_ADD
    __TBB_USE_GENERIC_FETCH_STORE
    __TBB_USE_GENERIC_DWORD_FETCH_ADD
    __TBB_USE_GENERIC_DWORD_FETCH_STORE
    __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE
    __TBB_USE_GENERIC_FULL_FENCED_LOAD_STORE
    __TBB_USE_GENERIC_RELAXED_LOAD_STORE
    __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE

    In this case tbb_machine.h will add the missing functionality based on a minimal set
    of APIs that all plug-in headers are required to implement, as described below.
    Note that these generic implementations may be sub-optimal for a particular
    architecture, and thus should be relied upon only after careful evaluation
    or as a last resort.

    Additionally, __TBB_64BIT_ATOMICS can be set to 0 on a 32-bit architecture to
    indicate that the port is not going to support double word atomics. It may also
    be set to 1 explicitly, though normally this is not necessary as tbb_machine.h
    will set it automatically.

    The __TBB_ENDIANNESS macro can be defined by the implementation as well.
    It is used only if __TBB_USE_GENERIC_PART_WORD_CAS is set (or for testing),
    and must specify the layout of aligned 16-bit and 32-bit data anywhere within a process
    (while the details of unaligned 16-bit or 32-bit data or of 64-bit data are irrelevant).
    The layout must be the same at all relevant memory locations within the current process;
    in case of page-specific endianness, one endianness must be kept "out of sight".
    Possible settings, reflecting hardware and possibly O.S. convention, are:
    -  __TBB_ENDIAN_BIG for big-endian data,
    -  __TBB_ENDIAN_LITTLE for little-endian data,
    -  __TBB_ENDIAN_DETECT for run-time detection, usable only if the layout is guaranteed to be exactly one of the above,
    -  __TBB_ENDIAN_UNSUPPORTED to prevent undefined behavior if none of the above applies.

    Prerequisites for each architecture port
    ----------------------------------------
    The following functions and macros have no generic implementation. Therefore they must be
    implemented in each machine architecture specific header, either as a conventional
    function or as a function-like macro.

    __TBB_WORDSIZE
        This is the size of a machine word in bytes, e.g. for 32-bit systems it
        should be defined as 4.

    __TBB_Yield()
        Signals the OS that the current thread is willing to relinquish the remainder
        of its time quantum.

    __TBB_full_memory_fence()
        Must prevent all memory operations from being reordered across it (both
        by hardware and compiler). All such fences must be totally ordered (or
        sequentially consistent).

    __TBB_machine_cmpswp4( volatile void *ptr, int32_t value, int32_t comparand )
        Must be provided if __TBB_USE_FENCED_ATOMICS is not set.

    __TBB_machine_cmpswp8( volatile void *ptr, int64_t value, int64_t comparand )
        Must be provided for 64-bit architectures if __TBB_USE_FENCED_ATOMICS is not set,
        and for 32-bit architectures if __TBB_64BIT_ATOMICS is set.

    __TBB_machine_<op><S><fence>(...), where
        <op> = {cmpswp, fetchadd, fetchstore}
        <S> = {1, 2, 4, 8}
        <fence> = {full_fence, acquire, release, relaxed}
        Must be provided if __TBB_USE_FENCED_ATOMICS is set.

    __TBB_control_consistency_helper()
        Bridges the memory-semantics gap between architectures providing only
        implicit C++0x "consume" semantics (like Power Architecture) and those
        also implicitly obeying control dependencies (like IA-64 architecture).
        It must be used only in conditional code where the condition is itself
        data-dependent, and will then make subsequent code behave as if the
        original data dependency were acquired.
        It needs only a compiler fence where implied by the architecture
        either specifically (like IA-64 architecture) or because generally stronger
        "acquire" semantics are enforced (like x86).
        It is always valid, though potentially suboptimal, to replace
        control with acquire on the load and then remove the helper.

    __TBB_acquire_consistency_helper(), __TBB_release_consistency_helper()
        Must be provided if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE is set.
        Enforce acquire and release semantics in generic implementations of fenced
        store and load operations. Depending on the particular architecture/compiler
        combination they may be a hardware fence, a compiler fence, both, or nothing.
 **/
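
/* Illustrative sketch (not part of TBB): a hypothetical minimal port header built on GCC
   atomic builtins could look roughly like the code below. It defines the mandatory
   primitives listed above and requests generic implementations for everything else; the
   particular builtins and the __TBB_compiler_fence helper are assumptions made for the
   sake of the example only.

       #include <sched.h>

       #define __TBB_WORDSIZE 4
       #define __TBB_Yield()  sched_yield()

       #define __TBB_compiler_fence()             __asm__ __volatile__("": : :"memory")
       #define __TBB_full_memory_fence()          __sync_synchronize()
       #define __TBB_control_consistency_helper() __TBB_compiler_fence()
       #define __TBB_acquire_consistency_helper() __TBB_compiler_fence()
       #define __TBB_release_consistency_helper() __TBB_compiler_fence()

       static inline int32_t __TBB_machine_cmpswp4( volatile void *ptr, int32_t value, int32_t comparand ) {
           return __sync_val_compare_and_swap( (volatile int32_t*)ptr, comparand, value );
       }

       // Let tbb_machine.h synthesize everything that is not provided natively.
       #define __TBB_USE_GENERIC_PART_WORD_CAS          1
       #define __TBB_USE_GENERIC_FETCH_ADD              1
       #define __TBB_USE_GENERIC_FETCH_STORE            1
       #define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE 1
       #define __TBB_USE_GENERIC_RELAXED_LOAD_STORE     1
       #define __TBB_64BIT_ATOMICS                      0
*/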

#include "tbb_stddef.h"

namespace tbb {
namespace internal { //< @cond INTERNAL

////////////////////////////////////////////////////////////////////////////////
// Overridable helpers declarations
//
// A machine/*.h file may choose to define these templates, otherwise it must
// request default implementation by setting appropriate __TBB_USE_GENERIC_XXX macro(s).
//
template <typename T, std::size_t S>
struct machine_load_store;

template <typename T, std::size_t S>
struct machine_load_store_relaxed;

template <typename T, std::size_t S>
struct machine_load_store_seq_cst;
//
// End of overridable helpers declarations
////////////////////////////////////////////////////////////////////////////////

template<size_t S> struct atomic_selector;

template<> struct atomic_selector<1> {
    typedef int8_t word;
    inline static word fetch_store ( volatile void* location, word value );
};

template<> struct atomic_selector<2> {
    typedef int16_t word;
    inline static word fetch_store ( volatile void* location, word value );
};

template<> struct atomic_selector<4> {
#if _MSC_VER && !_WIN64
    // Work-around that avoids spurious /Wp64 warnings
    typedef intptr_t word;
#else
    typedef int32_t word;
#endif
    inline static word fetch_store ( volatile void* location, word value );
};

template<> struct atomic_selector<8> {
    typedef int64_t word;
    inline static word fetch_store ( volatile void* location, word value );
};

}} //< namespaces internal @endcond, tbb


#define __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(M)                                        \
    inline void __TBB_machine_generic_store8##M(volatile void *ptr, int64_t value) {         \
        for(;;) {                                                                            \
            int64_t result = *(volatile int64_t *)ptr;                                       \
            if( __TBB_machine_cmpswp8##M(ptr,value,result)==result ) break;                  \
        }                                                                                    \
    }                                                                                        \

#define __TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(M)                                         \
    inline int64_t __TBB_machine_generic_load8##M(const volatile void *ptr) {                \
        /* Comparand and new value may be anything; they only need to be equal, and   */     \
        /* the value should have a low probability to be actually found in 'location'.*/     \
        const int64_t anyvalue = 2305843009213693951LL;                                      \
        return __TBB_machine_cmpswp8##M(const_cast<volatile void *>(ptr),anyvalue,anyvalue); \
    }                                                                                        \

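// Illustrative note (a sketch of the expansion, not code compiled here): instantiating
// __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(full_fence) produces, in effect, the following
// CAS loop, which emulates an atomic 64-bit store on top of __TBB_machine_cmpswp8full_fence:
//
//     inline void __TBB_machine_generic_store8full_fence(volatile void *ptr, int64_t value) {
//         for(;;) {
//             int64_t result = *(volatile int64_t *)ptr;   // snapshot of the current contents
//             if( __TBB_machine_cmpswp8full_fence(ptr,value,result)==result ) break;   // retry if it changed
//         }
//     }
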
// The set of allowed values for __TBB_ENDIANNESS (see above for details)
#define __TBB_ENDIAN_UNSUPPORTED -1
#define __TBB_ENDIAN_LITTLE       0
#define __TBB_ENDIAN_BIG          1
#define __TBB_ENDIAN_DETECT       2

#if _WIN32||_WIN64

#ifdef _MANAGED
#pragma managed(push, off)
#endif

    #if __MINGW64__ || __MINGW32__
        extern "C" __declspec(dllimport) int __stdcall SwitchToThread( void );
        #define __TBB_Yield()  SwitchToThread()
        #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT)
            #include "machine/gcc_generic.h"
        #elif __MINGW64__
            #include "machine/linux_intel64.h"
        #elif __MINGW32__
            #include "machine/linux_ia32.h"
        #endif
    #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #elif defined(_M_IX86) && !defined(__TBB_WIN32_USE_CL_BUILTINS)
        #include "machine/windows_ia32.h"
    #elif defined(_M_X64)
        #include "machine/windows_intel64.h"
    #elif defined(_XBOX)
        #include "machine/xbox360_ppc.h"
    #elif defined(_M_ARM) || defined(__TBB_WIN32_USE_CL_BUILTINS)
        #include "machine/msvc_armv7.h"
    #endif

#ifdef _MANAGED
#pragma managed(pop)
#endif

#elif __TBB_DEFINE_MIC

    #include "machine/mic_common.h"
    #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #else
        #include "machine/linux_intel64.h"
    #endif

#elif __linux__ || __FreeBSD__ || __NetBSD__

    #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/gcc_generic.h"
    #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #elif __i386__
        #include "machine/linux_ia32.h"
    #elif __x86_64__
        #include "machine/linux_intel64.h"
    #elif __ia64__
        #include "machine/linux_ia64.h"
    #elif __powerpc__
        #include "machine/mac_ppc.h"
    #elif __arm__
        #include "machine/gcc_armv7.h"
    #elif __TBB_GCC_BUILTIN_ATOMICS_PRESENT
        #include "machine/gcc_generic.h"
    #endif
    #include "machine/linux_common.h"

#elif __APPLE__
    //TODO:  TBB_USE_GCC_BUILTINS is not used for Mac, Sun, Aix
    #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #elif __i386__
        #include "machine/linux_ia32.h"
    #elif __x86_64__
        #include "machine/linux_intel64.h"
    #elif __POWERPC__
        #include "machine/mac_ppc.h"
    #endif
    #include "machine/macos_common.h"

#elif _AIX

    #include "machine/ibm_aix51.h"

#elif __sun || __SUNPRO_CC

    #define __asm__ asm
    #define __volatile__ volatile

    #if __i386  || __i386__
        #include "machine/linux_ia32.h"
    #elif __x86_64__
        #include "machine/linux_intel64.h"
    #elif __sparc
        #include "machine/sunos_sparc.h"
    #endif
    #include <sched.h>

    #define __TBB_Yield() sched_yield()

#endif /* OS selection */

#ifndef __TBB_64BIT_ATOMICS
    #define __TBB_64BIT_ATOMICS 1
#endif

//TODO: replace usage of these functions with usage of tbb::atomic, and then remove them
//TODO: map functions with W suffix to use a cast to tbb::atomic and the corresponding op, i.e. as_atomic().op()
// Special atomic functions
#if __TBB_USE_FENCED_ATOMICS
    #define __TBB_machine_cmpswp1   __TBB_machine_cmpswp1full_fence
    #define __TBB_machine_cmpswp2   __TBB_machine_cmpswp2full_fence
    #define __TBB_machine_cmpswp4   __TBB_machine_cmpswp4full_fence
    #define __TBB_machine_cmpswp8   __TBB_machine_cmpswp8full_fence

    #if __TBB_WORDSIZE==8
        #define __TBB_machine_fetchadd8             __TBB_machine_fetchadd8full_fence
        #define __TBB_machine_fetchstore8           __TBB_machine_fetchstore8full_fence
        #define __TBB_FetchAndAddWrelease(P,V)      __TBB_machine_fetchadd8release(P,V)
        #define __TBB_FetchAndIncrementWacquire(P)  __TBB_machine_fetchadd8acquire(P,1)
        #define __TBB_FetchAndDecrementWrelease(P)  __TBB_machine_fetchadd8release(P,(-1))
    #else
        #define __TBB_machine_fetchadd4             __TBB_machine_fetchadd4full_fence
        #define __TBB_machine_fetchstore4           __TBB_machine_fetchstore4full_fence
        #define __TBB_FetchAndAddWrelease(P,V)      __TBB_machine_fetchadd4release(P,V)
        #define __TBB_FetchAndIncrementWacquire(P)  __TBB_machine_fetchadd4acquire(P,1)
        #define __TBB_FetchAndDecrementWrelease(P)  __TBB_machine_fetchadd4release(P,(-1))
    #endif /* __TBB_WORDSIZE==8 */
#else /* !__TBB_USE_FENCED_ATOMICS */
    #define __TBB_FetchAndAddWrelease(P,V)      __TBB_FetchAndAddW(P,V)
    #define __TBB_FetchAndIncrementWacquire(P)  __TBB_FetchAndAddW(P,1)
    #define __TBB_FetchAndDecrementWrelease(P)  __TBB_FetchAndAddW(P,(-1))
#endif /* !__TBB_USE_FENCED_ATOMICS */

#if __TBB_WORDSIZE==4
    #define __TBB_CompareAndSwapW(P,V,C)    __TBB_machine_cmpswp4(P,V,C)
    #define __TBB_FetchAndAddW(P,V)         __TBB_machine_fetchadd4(P,V)
    #define __TBB_FetchAndStoreW(P,V)       __TBB_machine_fetchstore4(P,V)
#elif  __TBB_WORDSIZE==8
    #if __TBB_USE_GENERIC_DWORD_LOAD_STORE || __TBB_USE_GENERIC_DWORD_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_STORE
        #error These macros should only be used on 32-bit platforms.
    #endif

    #define __TBB_CompareAndSwapW(P,V,C)    __TBB_machine_cmpswp8(P,V,C)
    #define __TBB_FetchAndAddW(P,V)         __TBB_machine_fetchadd8(P,V)
    #define __TBB_FetchAndStoreW(P,V)       __TBB_machine_fetchstore8(P,V)
#else /* __TBB_WORDSIZE != 8 */
    #error Unsupported machine word size.
#endif /* __TBB_WORDSIZE */

#ifndef __TBB_Pause
    inline void __TBB_Pause(int32_t) {
        __TBB_Yield();
    }
#endif

namespace tbb {

//! Sequentially consistent full memory fence.
inline void atomic_fence () { __TBB_full_memory_fence(); }

namespace internal { //< @cond INTERNAL

//! Class that implements exponential backoff.
/** See implementation of spin_wait_while_eq for an example. */
class atomic_backoff : no_copy {
    //! Time delay, in units of "pause" instructions.
    /** Should be equal to approximately the number of "pause" instructions
        that take the same time as a context switch. */
    static const int32_t LOOPS_BEFORE_YIELD = 16;
    int32_t count;
public:
    // In many cases, an object of this type is initialized eagerly on a hot path,
    // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ }
    // For this reason, the construction cost must be very small!
    atomic_backoff() : count(1) {}
    // This constructor pauses immediately; do not use on hot paths!
    atomic_backoff( bool ) : count(1) { pause(); }

    //! Pause for a while.
    void pause() {
        if( count<=LOOPS_BEFORE_YIELD ) {
            __TBB_Pause(count);
            // Pause twice as long the next time.
            count*=2;
        } else {
            // Pause is so long that we might as well yield CPU to scheduler.
            __TBB_Yield();
        }
    }

    //! Pause if the backoff limit has not been reached yet; otherwise return false immediately without pausing.
    bool bounded_pause() {
        if( count<=LOOPS_BEFORE_YIELD ) {
            __TBB_Pause(count);
            // Pause twice as long the next time.
            count*=2;
            return true;
        } else {
            return false;
        }
    }

    void reset() {
        count = 1;
    }
};

//! Spin WHILE the value of the variable is equal to a given value
/** T and U should be comparable types. */
template<typename T, typename U>
void spin_wait_while_eq( const volatile T& location, U value ) {
    atomic_backoff backoff;
    while( location==value ) backoff.pause();
}

//! Spin UNTIL the value of the variable is equal to a given value
/** T and U should be comparable types. */
template<typename T, typename U>
void spin_wait_until_eq( const volatile T& location, const U value ) {
    atomic_backoff backoff;
    while( location!=value ) backoff.pause();
}

template <typename predicate_type>
void spin_wait_while(predicate_type condition){
    atomic_backoff backoff;
    while( condition() ) backoff.pause();
}
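
// Illustrative sketch (hypothetical usage, not part of this header): spin_wait_while accepts
// any callable predicate; the functor and variable names below are examples only (in C++03 the
// functor must be defined at namespace scope to be usable as a template argument).
//
//     struct is_busy {
//         const volatile int& state;
//         is_busy( const volatile int& s ) : state(s) {}
//         bool operator()() const { return state==1; }
//     };
//
//     volatile int some_shared_state = 1;              // hypothetical shared variable
//     spin_wait_while( is_busy(some_shared_state) );   // spins with exponential backoff while it equals 1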

////////////////////////////////////////////////////////////////////////////////
// Generic compare-and-swap applied to only a part of a machine word.
//
#ifndef __TBB_ENDIANNESS
#define __TBB_ENDIANNESS __TBB_ENDIAN_DETECT
#endif

#if __TBB_USE_GENERIC_PART_WORD_CAS && __TBB_ENDIANNESS==__TBB_ENDIAN_UNSUPPORTED
#error Generic implementation of part-word CAS may not be used with __TBB_ENDIAN_UNSUPPORTED
#endif

#if __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED
//
// This function is the only use of __TBB_ENDIANNESS.
// The following restrictions/limitations apply to this operation:
//  - T must be an integer type of at most 4 bytes for the casts and calculations to work
//  - T must also be less than 4 bytes to avoid compiler warnings when computing the mask
//      (and for the operation to be useful at all, so no workaround is applied)
//  - the architecture must consistently use either little-endian or big-endian (same for all locations)
//
// TODO: static_assert for the type requirements stated above
template<typename T>
inline T __TBB_MaskedCompareAndSwap (volatile T * const ptr, const T value, const T comparand ) {
    struct endianness{ static bool is_big_endian(){
        #if __TBB_ENDIANNESS==__TBB_ENDIAN_DETECT
            const uint32_t probe = 0x03020100;
            return (((const char*)(&probe))[0]==0x03);
        #elif __TBB_ENDIANNESS==__TBB_ENDIAN_BIG || __TBB_ENDIANNESS==__TBB_ENDIAN_LITTLE
            return __TBB_ENDIANNESS==__TBB_ENDIAN_BIG;
        #else
            #error Unexpected value of __TBB_ENDIANNESS
        #endif
    }};

    const uint32_t byte_offset            = (uint32_t) ((uintptr_t)ptr & 0x3);
    volatile uint32_t * const aligned_ptr = (uint32_t*)((uintptr_t)ptr - byte_offset );

    // location of T within uint32_t for a C++ shift operation
    const uint32_t bits_to_shift     = 8*(endianness::is_big_endian() ? (4 - sizeof(T) - (byte_offset)) : byte_offset);
    const uint32_t mask              = (((uint32_t)1<<(sizeof(T)*8)) - 1 )<<bits_to_shift;
    // for signed T, any sign extension bits in cast value/comparand are immediately clipped by mask
    const uint32_t shifted_comparand = ((uint32_t)comparand << bits_to_shift)&mask;
    const uint32_t shifted_value     = ((uint32_t)value     << bits_to_shift)&mask;

    for( atomic_backoff b;;b.pause() ) {
        const uint32_t surroundings  = *aligned_ptr & ~mask ; // may have changed during the pause
        const uint32_t big_comparand = surroundings | shifted_comparand ;
        const uint32_t big_value     = surroundings | shifted_value     ;
        // __TBB_machine_cmpswp4 presumed to have full fence.
        // Cast shuts up /Wp64 warning
        const uint32_t big_result = (uint32_t)__TBB_machine_cmpswp4( aligned_ptr, big_value, big_comparand );
        if( big_result == big_comparand                    // CAS succeeded
          || ((big_result ^ big_comparand) & mask) != 0)   // CAS failed and the bits of interest have changed
        {
            return T((big_result & mask) >> bits_to_shift);
        }
        else continue;                                     // CAS failed but the bits of interest were not changed
    }
}
#endif // __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED
////////////////////////////////////////////////////////////////////////////////
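
// Worked example (for illustration only): for a 1-byte CAS on a little-endian machine where
// the target byte lies at byte_offset 1 within its aligned 32-bit word, the code above computes
// bits_to_shift = 8 and mask = 0x0000FF00. The comparand and the new value are shifted into
// that byte lane, the other three bytes are re-read on every iteration and spliced in
// unchanged, and the whole word is then updated with __TBB_machine_cmpswp4.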

template<size_t S, typename T>
inline T __TBB_CompareAndSwapGeneric (volatile void *ptr, T value, T comparand );

template<>
inline int8_t __TBB_CompareAndSwapGeneric <1,int8_t> (volatile void *ptr, int8_t value, int8_t comparand ) {
#if __TBB_USE_GENERIC_PART_WORD_CAS
    return __TBB_MaskedCompareAndSwap<int8_t>((volatile int8_t *)ptr,value,comparand);
#else
    return __TBB_machine_cmpswp1(ptr,value,comparand);
#endif
}

template<>
inline int16_t __TBB_CompareAndSwapGeneric <2,int16_t> (volatile void *ptr, int16_t value, int16_t comparand ) {
#if __TBB_USE_GENERIC_PART_WORD_CAS
    return __TBB_MaskedCompareAndSwap<int16_t>((volatile int16_t *)ptr,value,comparand);
#else
    return __TBB_machine_cmpswp2(ptr,value,comparand);
#endif
}

template<>
inline int32_t __TBB_CompareAndSwapGeneric <4,int32_t> (volatile void *ptr, int32_t value, int32_t comparand ) {
    // Cast shuts up /Wp64 warning
    return (int32_t)__TBB_machine_cmpswp4(ptr,value,comparand);
}

#if __TBB_64BIT_ATOMICS
template<>
inline int64_t __TBB_CompareAndSwapGeneric <8,int64_t> (volatile void *ptr, int64_t value, int64_t comparand ) {
    return __TBB_machine_cmpswp8(ptr,value,comparand);
}
#endif

template<size_t S, typename T>
inline T __TBB_FetchAndAddGeneric (volatile void *ptr, T addend) {
    T result;
    for( atomic_backoff b;;b.pause() ) {
        result = *reinterpret_cast<volatile T *>(ptr);
        // __TBB_CompareAndSwapGeneric presumed to have full fence.
        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, result+addend, result )==result )
            break;
    }
    return result;
}

template<size_t S, typename T>
inline T __TBB_FetchAndStoreGeneric (volatile void *ptr, T value) {
    T result;
    for( atomic_backoff b;;b.pause() ) {
        result = *reinterpret_cast<volatile T *>(ptr);
        // __TBB_CompareAndSwapGeneric presumed to have full fence.
        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, value, result )==result )
            break;
    }
    return result;
}
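
// Note (illustrative sketch, not part of TBB): the same CAS loop can synthesize any other
// read-modify-write operation. A hypothetical generic fetch-and-or, for example, would differ
// from __TBB_FetchAndAddGeneric only in the value passed to the CAS:
//
//     template<size_t S, typename T>
//     inline T __TBB_FetchAndOrGeneric (volatile void *ptr, T bits) {
//         T result;
//         for( atomic_backoff b;;b.pause() ) {
//             result = *reinterpret_cast<volatile T *>(ptr);
//             if( __TBB_CompareAndSwapGeneric<S,T>( ptr, result|bits, result )==result )
//                 break;
//         }
//         return result;
//     }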

#if __TBB_USE_GENERIC_PART_WORD_CAS
#define __TBB_machine_cmpswp1 tbb::internal::__TBB_CompareAndSwapGeneric<1,int8_t>
#define __TBB_machine_cmpswp2 tbb::internal::__TBB_CompareAndSwapGeneric<2,int16_t>
#endif

#if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_PART_WORD_FETCH_ADD
#define __TBB_machine_fetchadd1 tbb::internal::__TBB_FetchAndAddGeneric<1,int8_t>
#define __TBB_machine_fetchadd2 tbb::internal::__TBB_FetchAndAddGeneric<2,int16_t>
#endif

#if __TBB_USE_GENERIC_FETCH_ADD
#define __TBB_machine_fetchadd4 tbb::internal::__TBB_FetchAndAddGeneric<4,int32_t>
#endif

#if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_ADD
#define __TBB_machine_fetchadd8 tbb::internal::__TBB_FetchAndAddGeneric<8,int64_t>
#endif

#if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_PART_WORD_FETCH_STORE
#define __TBB_machine_fetchstore1 tbb::internal::__TBB_FetchAndStoreGeneric<1,int8_t>
#define __TBB_machine_fetchstore2 tbb::internal::__TBB_FetchAndStoreGeneric<2,int16_t>
#endif

#if __TBB_USE_GENERIC_FETCH_STORE
#define __TBB_machine_fetchstore4 tbb::internal::__TBB_FetchAndStoreGeneric<4,int32_t>
#endif

#if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_DWORD_FETCH_STORE
#define __TBB_machine_fetchstore8 tbb::internal::__TBB_FetchAndStoreGeneric<8,int64_t>
#endif

#if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
#define __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(S)                                             \
    atomic_selector<S>::word atomic_selector<S>::fetch_store ( volatile void* location, word value ) {  \
        return __TBB_machine_fetchstore##S( location, value );                                          \
    }

__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(1)
__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(2)
__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(4)
__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(8)

#undef __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE
#endif /* __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */

#if __TBB_USE_GENERIC_DWORD_LOAD_STORE
/*TODO: find a more elegant way to handle the difference in function names*/
#if ! __TBB_USE_FENCED_ATOMICS
    /* This name forwarding is needed for the generic implementation of
     * load8/store8 defined below (via macro) to pick the right CAS function*/
    #define   __TBB_machine_cmpswp8full_fence __TBB_machine_cmpswp8
#endif
__TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(full_fence)
__TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(full_fence)

#if ! __TBB_USE_FENCED_ATOMICS
    #undef   __TBB_machine_cmpswp8full_fence
#endif

#define __TBB_machine_store8 tbb::internal::__TBB_machine_generic_store8full_fence
#define __TBB_machine_load8  tbb::internal::__TBB_machine_generic_load8full_fence
#endif /* __TBB_USE_GENERIC_DWORD_LOAD_STORE */

#if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE
/** Fenced operations use the volatile qualifier to prevent the compiler from optimizing
    them out, and on architectures with weak memory ordering to induce the compiler
    to generate code with appropriate acquire/release semantics.
    On architectures like IA32, Intel64 (and likely Sparc TSO) volatile has
    no effect on code gen, and consistency helpers serve as a compiler fence (the
    latter being true for IA64/gcc as well to fix a bug in some gcc versions).
    This code assumes that the generated instructions will operate atomically,
    which typically requires a type that can be moved in a single instruction,
    cooperation from the compiler for effective use of such an instruction,
    and appropriate alignment of the data. **/
template <typename T, size_t S>
struct machine_load_store {
    static T load_with_acquire ( const volatile T& location ) {
        T to_return = location;
        __TBB_acquire_consistency_helper();
        return to_return;
    }
    static void store_with_release ( volatile T &location, T value ) {
        __TBB_release_consistency_helper();
        location = value;
    }
};

// In general, a plain load or store generated by a 32-bit compiler is not atomic for 64-bit types.
#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
template <typename T>
struct machine_load_store<T,8> {
    static T load_with_acquire ( const volatile T& location ) {
        return (T)__TBB_machine_load8( (const volatile void*)&location );
    }
    static void store_with_release ( volatile T& location, T value ) {
        __TBB_machine_store8( (volatile void*)&location, (int64_t)value );
    }
};
#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
#endif /* __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE */

#if __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE
template <typename T, size_t S>
struct machine_load_store_seq_cst {
    static T load ( const volatile T& location ) {
        __TBB_full_memory_fence();
        return machine_load_store<T,S>::load_with_acquire( location );
    }
#if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
    static void store ( volatile T &location, T value ) {
        atomic_selector<S>::fetch_store( (volatile void*)&location, (typename atomic_selector<S>::word)value );
    }
#else /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
    static void store ( volatile T &location, T value ) {
        machine_load_store<T,S>::store_with_release( location, value );
        __TBB_full_memory_fence();
    }
#endif /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
};

#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
/** The implementation does not use the functions __TBB_machine_load8/store8 as they
    are not required to be sequentially consistent. **/
template <typename T>
struct machine_load_store_seq_cst<T,8> {
    static T load ( const volatile T& location ) {
        // Comparand and new value may be anything; they only need to be equal, and
        // the value should have a low probability to be actually found in 'location'.
        const int64_t anyvalue = 2305843009213693951LL;
        return __TBB_machine_cmpswp8( (volatile void*)const_cast<volatile T*>(&location), anyvalue, anyvalue );
    }
    static void store ( volatile T &location, T value ) {
        int64_t result = (volatile int64_t&)location;
        while ( __TBB_machine_cmpswp8((volatile void*)&location, (int64_t)value, result) != result )
            result = (volatile int64_t&)location;
    }
};
#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
#endif /*__TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE */

#if __TBB_USE_GENERIC_RELAXED_LOAD_STORE
// Relaxed operations add a volatile qualifier to prevent the compiler from optimizing them out.
/** Volatile should not incur any additional cost on IA32, Intel64, and Sparc TSO
    architectures. However, on architectures with weak memory ordering the compiler may
    generate code with acquire/release semantics for operations on volatile data. **/
template <typename T, size_t S>
struct machine_load_store_relaxed {
    static inline T load ( const volatile T& location ) {
        return location;
    }
    static inline void store ( volatile T& location, T value ) {
        location = value;
    }
};

#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
template <typename T>
struct machine_load_store_relaxed<T,8> {
    static inline T load ( const volatile T& location ) {
        return (T)__TBB_machine_load8( (const volatile void*)&location );
    }
    static inline void store ( volatile T& location, T value ) {
        __TBB_machine_store8( (volatile void*)&location, (int64_t)value );
    }
};
#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
#endif /* __TBB_USE_GENERIC_RELAXED_LOAD_STORE */

#undef __TBB_WORDSIZE // this macro must not be used outside of the atomic machinery

template<typename T>
inline T __TBB_load_with_acquire(const volatile T &location) {
    return machine_load_store<T,sizeof(T)>::load_with_acquire( location );
}
template<typename T, typename V>
inline void __TBB_store_with_release(volatile T& location, V value) {
    machine_load_store<T,sizeof(T)>::store_with_release( location, T(value) );
}
//! Overload that exists solely to avoid /Wp64 warnings.
inline void __TBB_store_with_release(volatile size_t& location, size_t value) {
    machine_load_store<size_t,sizeof(size_t)>::store_with_release( location, value );
}

template<typename T>
inline T __TBB_load_full_fence(const volatile T &location) {
    return machine_load_store_seq_cst<T,sizeof(T)>::load( location );
}
template<typename T, typename V>
inline void __TBB_store_full_fence(volatile T& location, V value) {
    machine_load_store_seq_cst<T,sizeof(T)>::store( location, T(value) );
}
//! Overload that exists solely to avoid /Wp64 warnings.
inline void __TBB_store_full_fence(volatile size_t& location, size_t value) {
    machine_load_store_seq_cst<size_t,sizeof(size_t)>::store( location, value );
}

template<typename T>
inline T __TBB_load_relaxed (const volatile T& location) {
    return machine_load_store_relaxed<T,sizeof(T)>::load( const_cast<T&>(location) );
}
template<typename T, typename V>
inline void __TBB_store_relaxed ( volatile T& location, V value ) {
    machine_load_store_relaxed<T,sizeof(T)>::store( const_cast<T&>(location), T(value) );
}
//! Overload that exists solely to avoid /Wp64 warnings.
inline void __TBB_store_relaxed ( volatile size_t& location, size_t value ) {
    machine_load_store_relaxed<size_t,sizeof(size_t)>::store( const_cast<size_t&>(location), value );
}

// The macro __TBB_TypeWithAlignmentAtLeastAsStrict(T) should expand to a type with alignment
// at least as strict as that of type T.  The type should have a trivial default constructor and
// destructor, so that arrays of that type can be declared without initializers.
// It is correct (but perhaps a waste of space) if __TBB_TypeWithAlignmentAtLeastAsStrict(T) expands
// to a type bigger than T.
// The default definition here works on machines where integers are naturally aligned and the
// strictest alignment is 64 bytes.
#ifndef __TBB_TypeWithAlignmentAtLeastAsStrict

#if __TBB_ATTRIBUTE_ALIGNED_PRESENT

#define __TBB_DefineTypeWithAlignment(PowerOf2)       \
struct __TBB_machine_type_with_alignment_##PowerOf2 { \
    uint32_t member[PowerOf2/sizeof(uint32_t)];       \
} __attribute__((aligned(PowerOf2)));
#define __TBB_alignof(T) __alignof__(T)

#elif __TBB_DECLSPEC_ALIGN_PRESENT

#define __TBB_DefineTypeWithAlignment(PowerOf2)       \
__declspec(align(PowerOf2))                           \
struct __TBB_machine_type_with_alignment_##PowerOf2 { \
    uint32_t member[PowerOf2/sizeof(uint32_t)];       \
};
#define __TBB_alignof(T) __alignof(T)

#else /* A compiler with unknown syntax for data alignment */
#error Must define __TBB_TypeWithAlignmentAtLeastAsStrict(T)
#endif

/* Now declare types aligned to useful powers of two */
// TODO: Is __TBB_DefineTypeWithAlignment(8) needed on 32-bit platforms?
__TBB_DefineTypeWithAlignment(16)
__TBB_DefineTypeWithAlignment(32)
__TBB_DefineTypeWithAlignment(64)

typedef __TBB_machine_type_with_alignment_64 __TBB_machine_type_with_strictest_alignment;

// The primary template is only declared (an incomplete type), so that use with unsupported alignments fails.
template<size_t N> struct type_with_alignment;

// Specializations for allowed alignments
template<> struct type_with_alignment<1> { char member; };
template<> struct type_with_alignment<2> { uint16_t member; };
template<> struct type_with_alignment<4> { uint32_t member; };
template<> struct type_with_alignment<8> { uint64_t member; };
template<> struct type_with_alignment<16> {__TBB_machine_type_with_alignment_16 member; };
template<> struct type_with_alignment<32> {__TBB_machine_type_with_alignment_32 member; };
template<> struct type_with_alignment<64> {__TBB_machine_type_with_alignment_64 member; };

#if __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN
//! Work-around for a bug in the GNU 3.2 and MSVC compilers.
/** The bug is that the compiler sometimes returns 0 for __alignof(T) when T has not yet been instantiated.
    The work-around forces instantiation by forcing computation of sizeof(T) before __alignof(T). */
template<size_t Size, typename T>
struct work_around_alignment_bug {
    static const size_t alignment = __TBB_alignof(T);
};
#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<tbb::internal::work_around_alignment_bug<sizeof(T),T>::alignment>
#else
#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<__TBB_alignof(T)>
#endif  /* __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN */

#endif  /* __TBB_TypeWithAlignmentAtLeastAsStrict */
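
// Illustrative sketch (hypothetical usage, not part of this header): the macro is typically
// used to obtain raw storage whose alignment is sufficient for some type T, for example:
//
//     union aligned_space_for_T {                           // T is a hypothetical user type
//         __TBB_TypeWithAlignmentAtLeastAsStrict(T) aligner; // enforces (at least) the alignment of T
//         unsigned char bytes[sizeof(T)];                    // provides the size of T
//     };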

// A class template is used here to avoid instantiating the static data in modules that don't use it.
template<typename T>
struct reverse {
    static const T byte_table[256];
};
// An efficient implementation of the reverse function utilizes a 2^8 lookup table holding the bit-reversed
// values of [0..2^8 - 1]. Those values can also be computed on the fly at a slightly higher cost.
template<typename T>
const T reverse<T>::byte_table[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

} // namespace internal @endcond
} // namespace tbb

// Preserving access to legacy APIs
using tbb::internal::__TBB_load_with_acquire;
using tbb::internal::__TBB_store_with_release;

// Mapping historically used names to the ones expected by atomic_load_store_traits
#define __TBB_load_acquire  __TBB_load_with_acquire
#define __TBB_store_release __TBB_store_with_release

#ifndef __TBB_Log2
inline intptr_t __TBB_Log2( uintptr_t x ) {
    if( x==0 ) return -1;
    intptr_t result = 0;

#if !defined(_M_ARM)
    uintptr_t tmp;
    if( sizeof(x)>4 && (tmp = ((uint64_t)x)>>32) ) { x=tmp; result += 32; }
#endif
    if( uintptr_t tmp = x>>16 ) { x=tmp; result += 16; }
    if( uintptr_t tmp = x>>8 )  { x=tmp; result += 8; }
    if( uintptr_t tmp = x>>4 )  { x=tmp; result += 4; }
    if( uintptr_t tmp = x>>2 )  { x=tmp; result += 2; }

    return (x&2)? result+1: result;
}
#endif
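
// Worked example (for illustration only): __TBB_Log2 computes floor(log2(x)).
// For x==40 (binary 101000), the x>>4 step leaves x==2 with result==4, no further shift
// applies, and the final (x&2) test adds 1, giving 5 == floor(log2(40)).
// Edge cases: __TBB_Log2(1)==0 and __TBB_Log2(0)==-1.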

#ifndef __TBB_AtomicOR
inline void __TBB_AtomicOR( volatile void *operand, uintptr_t addend ) {
    for( tbb::internal::atomic_backoff b;;b.pause() ) {
        uintptr_t tmp = *(volatile uintptr_t *)operand;
        uintptr_t result = __TBB_CompareAndSwapW(operand, tmp|addend, tmp);
        if( result==tmp ) break;
    }
}
#endif

#ifndef __TBB_AtomicAND
inline void __TBB_AtomicAND( volatile void *operand, uintptr_t addend ) {
    for( tbb::internal::atomic_backoff b;;b.pause() ) {
        uintptr_t tmp = *(volatile uintptr_t *)operand;
        uintptr_t result = __TBB_CompareAndSwapW(operand, tmp&addend, tmp);
        if( result==tmp ) break;
    }
}
#endif
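
// Illustrative sketch (hypothetical usage, not part of this header): __TBB_AtomicOR and
// __TBB_AtomicAND are typically used to set and clear individual bits in a word of flags:
//
//     volatile uintptr_t flags = 0;                  // hypothetical flags word
//     __TBB_AtomicOR ( &flags,  (uintptr_t)0x4 );    // atomically set bit 2
//     __TBB_AtomicAND( &flags, ~(uintptr_t)0x4 );    // atomically clear bit 2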

#if __TBB_PREFETCHING
#ifndef __TBB_cl_prefetch
#error This platform does not define cache management primitives required for __TBB_PREFETCHING
#endif

#ifndef __TBB_cl_evict
#define __TBB_cl_evict(p)
#endif
#endif

#ifndef __TBB_Flag
typedef unsigned char __TBB_Flag;
#endif
typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag;

#ifndef __TBB_TryLockByte
inline bool __TBB_TryLockByte( __TBB_atomic_flag &flag ) {
    return __TBB_machine_cmpswp1(&flag,1,0)==0;
}
#endif

#ifndef __TBB_LockByte
inline __TBB_Flag __TBB_LockByte( __TBB_atomic_flag& flag ) {
    tbb::internal::atomic_backoff backoff;
    while( !__TBB_TryLockByte(flag) ) backoff.pause();
    return 0;
}
#endif

#ifndef  __TBB_UnlockByte
#define __TBB_UnlockByte(addr) __TBB_store_with_release((addr),0)
#endif
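
// Illustrative sketch (hypothetical usage, not part of this header): together these byte-lock
// primitives form a minimal spin lock around a critical section:
//
//     static __TBB_atomic_flag my_lock = 0;    // hypothetical lock variable
//     __TBB_LockByte( my_lock );               // spins with exponential backoff until acquired
//     /* ... critical section ... */
//     __TBB_UnlockByte( my_lock );             // releases the lock with release semantics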

// lock primitives with TSX
#if ( __TBB_x86_32 || __TBB_x86_64 )  /* only on ia32/intel64 */
inline void __TBB_TryLockByteElidedCancel() { __TBB_machine_try_lock_elided_cancel(); }

inline bool __TBB_TryLockByteElided( __TBB_atomic_flag& flag ) {
    bool res = __TBB_machine_try_lock_elided( &flag )!=0;
    // to avoid the "lemming" effect, we need to abort the transaction
    // if  __TBB_machine_try_lock_elided returns false (i.e., someone else
    // has acquired the mutex non-speculatively).
    if( !res ) __TBB_TryLockByteElidedCancel();
    return res;
}

inline void __TBB_LockByteElided( __TBB_atomic_flag& flag )
{
    for(;;) {
        tbb::internal::spin_wait_while_eq( flag, 1 );
        if( __TBB_machine_try_lock_elided( &flag ) )
            return;
        // Another thread acquired the lock "for real".
        // To avoid the "lemming" effect, we abort the transaction.
        __TBB_TryLockByteElidedCancel();
    }
}

inline void __TBB_UnlockByteElided( __TBB_atomic_flag& flag ) {
    __TBB_machine_unlock_elided( &flag );
}
#endif

#ifndef __TBB_ReverseByte
inline unsigned char __TBB_ReverseByte(unsigned char src) {
    return tbb::internal::reverse<unsigned char>::byte_table[src];
}
#endif

template<typename T>
T __TBB_ReverseBits(T src) {
    T dst;
    unsigned char *original = (unsigned char *) &src;
    unsigned char *reversed = (unsigned char *) &dst;

    for( int i = sizeof(T)-1; i >= 0; i-- )
        reversed[i] = __TBB_ReverseByte( original[sizeof(T)-i-1] );

    return dst;
}
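
// Worked example (for illustration only): __TBB_ReverseByte(0x01) looks up byte_table[1] and
// returns 0x80. __TBB_ReverseBits reverses both the bits within each byte and the byte order,
// so __TBB_ReverseBits<uint32_t>(0x00000001) yields 0x80000000 regardless of endianness.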

#endif /* __TBB_machine_H */