/*
    Copyright 2005-2014 Intel Corporation.  All Rights Reserved.

    This file is part of Threading Building Blocks. Threading Building Blocks is free software;
    you can redistribute it and/or modify it under the terms of the GNU General Public License
    version 2 as published by the Free Software Foundation.  Threading Building Blocks is
    distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
    implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    See the GNU General Public License for more details.  You should have received a copy of
    the GNU General Public License along with Threading Building Blocks; if not, write to the
    Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    As a special exception, you may use this file as part of a free software library without
    restriction.  Specifically, if other files instantiate templates or use macros or inline
    functions from this file, or you compile this file and link it with other files to produce
    an executable, this file does not by itself cause the resulting executable to be covered
    by the GNU General Public License.  This exception does not however invalidate any other
    reasons why the executable file might be covered by the GNU General Public License.
*/

#ifndef __TBB_machine_H
#define __TBB_machine_H

/** This header provides a basic platform abstraction layer by hooking up appropriate
    architecture/OS/compiler specific headers from the /include/tbb/machine directory.
    If a plug-in header does not implement all the required APIs, it must specify
    the missing ones by setting one or more of the following macros:

    __TBB_USE_GENERIC_PART_WORD_CAS
    __TBB_USE_GENERIC_PART_WORD_FETCH_ADD
    __TBB_USE_GENERIC_PART_WORD_FETCH_STORE
    __TBB_USE_GENERIC_FETCH_ADD
    __TBB_USE_GENERIC_FETCH_STORE
    __TBB_USE_GENERIC_DWORD_FETCH_ADD
    __TBB_USE_GENERIC_DWORD_FETCH_STORE
    __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE
    __TBB_USE_GENERIC_FULL_FENCED_LOAD_STORE
    __TBB_USE_GENERIC_RELAXED_LOAD_STORE
    __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE

    In this case tbb_machine.h will add the missing functionality based on a minimal set
    of APIs that are required to be implemented by all plug-in headers, as described
    further below.
    Note that these generic implementations may be sub-optimal for a particular
    architecture, and thus should be relied upon only after careful evaluation
    or as the last resort.

    Additionally, __TBB_64BIT_ATOMICS can be set to 0 on a 32-bit architecture to
    indicate that the port is not going to support double word atomics. It may also
    be set to 1 explicitly, though normally this is not necessary as tbb_machine.h
    will set it automatically.

    The __TBB_ENDIANNESS macro can be defined by the implementation as well.
    It is used only if __TBB_USE_GENERIC_PART_WORD_CAS is set (or for testing),
    and must specify the layout of aligned 16-bit and 32-bit data anywhere within a process
    (while the details of unaligned 16-bit or 32-bit data or of 64-bit data are irrelevant).
    The layout must be the same at all relevant memory locations within the current process;
    in case of page-specific endianness, one endianness must be kept "out of sight".
    Possible settings, reflecting hardware and possibly O.S. convention, are:
    - __TBB_ENDIAN_BIG for big-endian data,
    - __TBB_ENDIAN_LITTLE for little-endian data,
    - __TBB_ENDIAN_DETECT for run-time detection iff exactly one of the above,
    - __TBB_ENDIAN_UNSUPPORTED to prevent undefined behavior if none of the above.

    Prerequisites for each architecture port
    ----------------------------------------
    The following functions and macros have no generic implementation. Therefore they must be
    implemented in each machine architecture specific header either as a conventional
    function or as a functional macro.

    __TBB_WORDSIZE
        This is the size of a machine word in bytes, i.e. for 32-bit systems it
        should be defined to 4.

    __TBB_Yield()
        Signals the OS that the current thread is willing to relinquish the remainder
        of its time quantum.

    __TBB_full_memory_fence()
        Must prevent all memory operations from being reordered across it (both
        by hardware and compiler). All such fences must be totally ordered (or
        sequentially consistent).

    __TBB_machine_cmpswp4( volatile void *ptr, int32_t value, int32_t comparand )
        Must be provided if __TBB_USE_FENCED_ATOMICS is not set.

    __TBB_machine_cmpswp8( volatile void *ptr, int64_t value, int64_t comparand )
        Must be provided for 64-bit architectures if __TBB_USE_FENCED_ATOMICS is not set,
        and for 32-bit architectures if __TBB_64BIT_ATOMICS is set.

    __TBB_machine_<op><S><fence>(...), where
        <op> = {cmpswp, fetchadd, fetchstore}
        <S> = {1, 2, 4, 8}
        <fence> = {full_fence, acquire, release, relaxed}
        Must be provided if __TBB_USE_FENCED_ATOMICS is set.

    __TBB_control_consistency_helper()
        Bridges the memory-semantics gap between architectures providing only
        implicit C++0x "consume" semantics (like Power Architecture) and those
        also implicitly obeying control dependencies (like IA-64 architecture).
        It must be used only in conditional code where the condition is itself
        data-dependent, and will then make subsequent code behave as if the
        original data dependency were acquired.
        It needs only a compiler fence where implied by the architecture
        either specifically (like IA-64 architecture) or because generally stronger
        "acquire" semantics are enforced (like x86).
        It is always valid, though potentially suboptimal, to replace
        control with acquire on the load and then remove the helper.

    __TBB_acquire_consistency_helper(), __TBB_release_consistency_helper()
        Must be provided if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE is set.
        Enforce acquire and release semantics in generic implementations of fenced
        store and load operations. Depending on the particular architecture/compiler
        combination they may be a hardware fence, a compiler fence, both, or nothing.
 **/

#include "tbb_stddef.h"

namespace tbb {
namespace internal { //< @cond INTERNAL

////////////////////////////////////////////////////////////////////////////////
// Overridable helpers declarations
//
// A machine/*.h file may choose to define these templates, otherwise it must
// request default implementation by setting appropriate __TBB_USE_GENERIC_XXX macro(s).
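//
// For illustration only (a hypothetical port, not one of the headers shipped in machine/),
// a plug-in header that defines none of these templates would instead request the generic
// fallbacks listed at the top of this file, e.g.:
//
//     #define __TBB_USE_GENERIC_PART_WORD_CAS          1
//     #define __TBB_USE_GENERIC_FETCH_ADD              1
//     #define __TBB_USE_GENERIC_FETCH_STORE            1
//     #define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE 1
//     #define __TBB_USE_GENERIC_RELAXED_LOAD_STORE     1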
//
template <typename T, std::size_t S>
struct machine_load_store;

template <typename T, std::size_t S>
struct machine_load_store_relaxed;

template <typename T, std::size_t S>
struct machine_load_store_seq_cst;
//
// End of overridable helpers declarations
////////////////////////////////////////////////////////////////////////////////

template<size_t S> struct atomic_selector;

template<> struct atomic_selector<1> {
    typedef int8_t word;
    inline static word fetch_store ( volatile void* location, word value );
};

template<> struct atomic_selector<2> {
    typedef int16_t word;
    inline static word fetch_store ( volatile void* location, word value );
};

template<> struct atomic_selector<4> {
#if _MSC_VER && !_WIN64
    // Work-around that avoids spurious /Wp64 warnings
    typedef intptr_t word;
#else
    typedef int32_t word;
#endif
    inline static word fetch_store ( volatile void* location, word value );
};

template<> struct atomic_selector<8> {
    typedef int64_t word;
    inline static word fetch_store ( volatile void* location, word value );
};

}} //< namespaces internal @endcond, tbb

#define __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(M)                                        \
    inline void __TBB_machine_generic_store8##M(volatile void *ptr, int64_t value) {         \
        for(;;) {                                                                             \
            int64_t result = *(volatile int64_t *)ptr;                                        \
            if( __TBB_machine_cmpswp8##M(ptr,value,result)==result ) break;                   \
        }                                                                                     \
    }                                                                                         \

#define __TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(M)                                          \
    inline int64_t __TBB_machine_generic_load8##M(const volatile void *ptr) {                 \
        /* Comparand and new value may be anything, they only must be equal, and */           \
        /* the value should have a low probability to be actually found in 'location'.*/      \
        const int64_t anyvalue = 2305843009213693951LL;                                       \
        return __TBB_machine_cmpswp8##M(const_cast<volatile void *>(ptr),anyvalue,anyvalue);  \
    }                                                                                         \

// The set of allowed values for __TBB_ENDIANNESS (see above for details)
#define __TBB_ENDIAN_UNSUPPORTED -1
#define __TBB_ENDIAN_LITTLE       0
#define __TBB_ENDIAN_BIG          1
#define __TBB_ENDIAN_DETECT       2

#if _WIN32||_WIN64

#ifdef _MANAGED
#pragma managed(push, off)
#endif

    #if __MINGW64__ || __MINGW32__
        extern "C" __declspec(dllimport) int __stdcall SwitchToThread( void );
        #define __TBB_Yield()  SwitchToThread()
        #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT)
            #include "machine/gcc_generic.h"
        #elif __MINGW64__
            #include "machine/linux_intel64.h"
        #elif __MINGW32__
            #include "machine/linux_ia32.h"
        #endif
    #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #elif defined(_M_IX86) && !defined(__TBB_WIN32_USE_CL_BUILTINS)
        #include "machine/windows_ia32.h"
    #elif defined(_M_X64)
        #include "machine/windows_intel64.h"
    #elif defined(_XBOX)
        #include "machine/xbox360_ppc.h"
    #elif defined(_M_ARM) || defined(__TBB_WIN32_USE_CL_BUILTINS)
        #include "machine/msvc_armv7.h"
    #endif

#ifdef _MANAGED
#pragma managed(pop)
#endif

#elif __TBB_DEFINE_MIC

    #include "machine/mic_common.h"
    #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #else
        #include "machine/linux_intel64.h"
    #endif

#elif __linux__ || __FreeBSD__ || __NetBSD__

    #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/gcc_generic.h"
    #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #elif __i386__
        #include "machine/linux_ia32.h"
    #elif __x86_64__
        #include "machine/linux_intel64.h"
    #elif __ia64__
        #include "machine/linux_ia64.h"
    #elif __powerpc__
        #include "machine/mac_ppc.h"
    #elif __arm__
        #include "machine/gcc_armv7.h"
    #elif __TBB_GCC_BUILTIN_ATOMICS_PRESENT
        #include "machine/gcc_generic.h"
    #endif
    #include "machine/linux_common.h"

#elif __APPLE__
    //TODO: TBB_USE_GCC_BUILTINS is not used for Mac, Sun, Aix
    #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT)
        #include "machine/icc_generic.h"
    #elif __i386__
        #include "machine/linux_ia32.h"
    #elif __x86_64__
        #include "machine/linux_intel64.h"
    #elif __POWERPC__
        #include "machine/mac_ppc.h"
    #endif
    #include "machine/macos_common.h"

#elif _AIX

    #include "machine/ibm_aix51.h"

#elif __sun || __SUNPRO_CC

    #define __asm__ asm
    #define __volatile__ volatile

    #if __i386 || __i386__
        #include "machine/linux_ia32.h"
    #elif __x86_64__
        #include "machine/linux_intel64.h"
    #elif __sparc
        #include "machine/sunos_sparc.h"
    #endif
    #include <sched.h>

    #define __TBB_Yield() sched_yield()

#endif /* OS selection */

#ifndef __TBB_64BIT_ATOMICS
    #define __TBB_64BIT_ATOMICS 1
#endif

//TODO: replace usage of these functions with usage of tbb::atomic, and then remove them
//TODO: map the functions with W suffix to a cast to tbb::atomic and the corresponding operation, i.e. as_atomic().op()
// Special atomic functions
#if __TBB_USE_FENCED_ATOMICS
    #define __TBB_machine_cmpswp1   __TBB_machine_cmpswp1full_fence
    #define __TBB_machine_cmpswp2   __TBB_machine_cmpswp2full_fence
    #define __TBB_machine_cmpswp4   __TBB_machine_cmpswp4full_fence
    #define __TBB_machine_cmpswp8   __TBB_machine_cmpswp8full_fence

    #if __TBB_WORDSIZE==8
        #define __TBB_machine_fetchadd8             __TBB_machine_fetchadd8full_fence
        #define __TBB_machine_fetchstore8           __TBB_machine_fetchstore8full_fence
        #define __TBB_FetchAndAddWrelease(P,V)      __TBB_machine_fetchadd8release(P,V)
        #define __TBB_FetchAndIncrementWacquire(P)  __TBB_machine_fetchadd8acquire(P,1)
        #define __TBB_FetchAndDecrementWrelease(P)  __TBB_machine_fetchadd8release(P,(-1))
    #else
        #define __TBB_machine_fetchadd4             __TBB_machine_fetchadd4full_fence
        #define __TBB_machine_fetchstore4           __TBB_machine_fetchstore4full_fence
        #define __TBB_FetchAndAddWrelease(P,V)      __TBB_machine_fetchadd4release(P,V)
        #define __TBB_FetchAndIncrementWacquire(P)  __TBB_machine_fetchadd4acquire(P,1)
        #define __TBB_FetchAndDecrementWrelease(P)  __TBB_machine_fetchadd4release(P,(-1))
    #endif /* __TBB_WORDSIZE==8 */
#else /* !__TBB_USE_FENCED_ATOMICS */
    #define __TBB_FetchAndAddWrelease(P,V)      __TBB_FetchAndAddW(P,V)
    #define __TBB_FetchAndIncrementWacquire(P)  __TBB_FetchAndAddW(P,1)
    #define __TBB_FetchAndDecrementWrelease(P)  __TBB_FetchAndAddW(P,(-1))
#endif /* !__TBB_USE_FENCED_ATOMICS */

#if __TBB_WORDSIZE==4
    #define __TBB_CompareAndSwapW(P,V,C)    __TBB_machine_cmpswp4(P,V,C)
    #define __TBB_FetchAndAddW(P,V)         __TBB_machine_fetchadd4(P,V)
    #define __TBB_FetchAndStoreW(P,V)       __TBB_machine_fetchstore4(P,V)
#elif __TBB_WORDSIZE==8
    #if __TBB_USE_GENERIC_DWORD_LOAD_STORE || __TBB_USE_GENERIC_DWORD_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_STORE
        #error These macros should only be used on 32-bit platforms.
    #endif

    #define __TBB_CompareAndSwapW(P,V,C)    __TBB_machine_cmpswp8(P,V,C)
    #define __TBB_FetchAndAddW(P,V)         __TBB_machine_fetchadd8(P,V)
    #define __TBB_FetchAndStoreW(P,V)       __TBB_machine_fetchstore8(P,V)
#else /* __TBB_WORDSIZE != 8 */
    #error Unsupported machine word size.
#endif /* __TBB_WORDSIZE */

#ifndef __TBB_Pause
    inline void __TBB_Pause(int32_t) {
        __TBB_Yield();
    }
#endif

namespace tbb {

//! Sequentially consistent full memory fence.
inline void atomic_fence () { __TBB_full_memory_fence(); }

namespace internal { //< @cond INTERNAL

//! Class that implements exponential backoff.
/** See implementation of spin_wait_while_eq for an example. */
class atomic_backoff : no_copy {
    //! Time delay, in units of "pause" instructions.
    /** Should be approximately the number of "pause" instructions
        that take the same time as a context switch. */
    static const int32_t LOOPS_BEFORE_YIELD = 16;
    int32_t count;
public:
    // In many cases, an object of this type is initialized eagerly on a hot path,
    // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ }
    // For this reason, the construction cost must be very small!
    atomic_backoff() : count(1) {}
    // This constructor pauses immediately; do not use on hot paths!
    atomic_backoff( bool ) : count(1) { pause(); }

    //! Pause for a while.
    void pause() {
        if( count<=LOOPS_BEFORE_YIELD ) {
            __TBB_Pause(count);
            // Pause twice as long the next time.
            count*=2;
        } else {
            // Pause is so long that we might as well yield CPU to the scheduler.
            __TBB_Yield();
        }
    }

    //! Pause a bounded number of times; once the backoff is saturated, return false without pausing.
    bool bounded_pause() {
        if( count<=LOOPS_BEFORE_YIELD ) {
            __TBB_Pause(count);
            // Pause twice as long the next time.
            count*=2;
            return true;
        } else {
            return false;
        }
    }

    void reset() {
        count = 1;
    }
};

//! Spin WHILE the value of the variable is equal to a given value
/** T and U should be comparable types. */
template<typename T, typename U>
void spin_wait_while_eq( const volatile T& location, U value ) {
    atomic_backoff backoff;
    while( location==value ) backoff.pause();
}

//! Spin UNTIL the value of the variable is equal to a given value
/** T and U should be comparable types. */
template<typename T, typename U>
void spin_wait_until_eq( const volatile T& location, const U value ) {
    atomic_backoff backoff;
    while( location!=value ) backoff.pause();
}

template <typename predicate_type>
void spin_wait_while(predicate_type condition){
    atomic_backoff backoff;
    while( condition() ) backoff.pause();
}

////////////////////////////////////////////////////////////////////////////////
// Generic compare-and-swap applied to only a part of a machine word.
//
#ifndef __TBB_ENDIANNESS
#define __TBB_ENDIANNESS __TBB_ENDIAN_DETECT
#endif

#if __TBB_USE_GENERIC_PART_WORD_CAS && __TBB_ENDIANNESS==__TBB_ENDIAN_UNSUPPORTED
#error Generic implementation of part-word CAS may not be used with __TBB_ENDIAN_UNSUPPORTED
#endif

#if __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED
//
// This function is the only use of __TBB_ENDIANNESS.
// The following restrictions/limitations apply for this operation:
//  - T must be an integer type of at most 4 bytes for the casts and calculations to work
//  - T must also be less than 4 bytes to avoid compiler warnings when computing the mask
//      (and for the operation to be useful at all, so no workaround is applied)
//  - the architecture must consistently use either little-endian or big-endian (same for all locations)
//
// TODO: static_assert for the type requirements stated above
template<typename T>
inline T __TBB_MaskedCompareAndSwap (volatile T * const ptr, const T value, const T comparand ) {
    struct endianness{ static bool is_big_endian(){
        #if __TBB_ENDIANNESS==__TBB_ENDIAN_DETECT
            const uint32_t probe = 0x03020100;
            return (((const char*)(&probe))[0]==0x03);
        #elif __TBB_ENDIANNESS==__TBB_ENDIAN_BIG || __TBB_ENDIANNESS==__TBB_ENDIAN_LITTLE
            return __TBB_ENDIANNESS==__TBB_ENDIAN_BIG;
        #else
            #error Unexpected value of __TBB_ENDIANNESS
        #endif
    }};

    const uint32_t byte_offset            = (uint32_t) ((uintptr_t)ptr & 0x3);
    volatile uint32_t * const aligned_ptr = (uint32_t*)((uintptr_t)ptr - byte_offset );

    // location of T within uint32_t for a C++ shift operation
    const uint32_t bits_to_shift     = 8*(endianness::is_big_endian() ? (4 - sizeof(T) - (byte_offset)) : byte_offset);
    const uint32_t mask              = (((uint32_t)1<<(sizeof(T)*8)) - 1 )<<bits_to_shift;
    // for signed T, any sign extension bits in cast value/comparand are immediately clipped by mask
    const uint32_t shifted_comparand = ((uint32_t)comparand << bits_to_shift)&mask;
    const uint32_t shifted_value     = ((uint32_t)value     << bits_to_shift)&mask;

    for( atomic_backoff b;;b.pause() ) {
        const uint32_t surroundings  = *aligned_ptr & ~mask ; // may have changed during the pause
        const uint32_t big_comparand = surroundings | shifted_comparand ;
        const uint32_t big_value     = surroundings | shifted_value     ;
        // __TBB_machine_cmpswp4 presumed to have full fence.
        // Cast shuts up /Wp64 warning
        const uint32_t big_result = (uint32_t)__TBB_machine_cmpswp4( aligned_ptr, big_value, big_comparand );
        if( big_result == big_comparand                    // CAS succeeded
          || ((big_result ^ big_comparand) & mask) != 0)   // CAS failed and the bits of interest have changed
        {
            return T((big_result & mask) >> bits_to_shift);
        }
        else continue;                                     // CAS failed but the bits of interest were not changed
    }
}
#endif // __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED
////////////////////////////////////////////////////////////////////////////////

template<size_t S, typename T>
inline T __TBB_CompareAndSwapGeneric (volatile void *ptr, T value, T comparand );

template<>
inline int8_t __TBB_CompareAndSwapGeneric <1,int8_t> (volatile void *ptr, int8_t value, int8_t comparand ) {
#if __TBB_USE_GENERIC_PART_WORD_CAS
    return __TBB_MaskedCompareAndSwap<int8_t>((volatile int8_t *)ptr,value,comparand);
#else
    return __TBB_machine_cmpswp1(ptr,value,comparand);
#endif
}

template<>
inline int16_t __TBB_CompareAndSwapGeneric <2,int16_t> (volatile void *ptr, int16_t value, int16_t comparand ) {
#if __TBB_USE_GENERIC_PART_WORD_CAS
    return __TBB_MaskedCompareAndSwap<int16_t>((volatile int16_t *)ptr,value,comparand);
#else
    return __TBB_machine_cmpswp2(ptr,value,comparand);
#endif
}

template<>
inline int32_t __TBB_CompareAndSwapGeneric <4,int32_t> (volatile void *ptr, int32_t value, int32_t comparand ) {
    // Cast shuts up /Wp64 warning
    return (int32_t)__TBB_machine_cmpswp4(ptr,value,comparand);
}

#if __TBB_64BIT_ATOMICS
template<>
inline int64_t __TBB_CompareAndSwapGeneric <8,int64_t> (volatile void *ptr, int64_t value, int64_t comparand ) {
    return __TBB_machine_cmpswp8(ptr,value,comparand);
}
#endif

template<size_t S, typename T>
inline T __TBB_FetchAndAddGeneric (volatile void *ptr, T addend) {
    T result;
    for( atomic_backoff b;;b.pause() ) {
        result = *reinterpret_cast<volatile T *>(ptr);
        // __TBB_CompareAndSwapGeneric presumed to have full fence.
        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, result+addend, result )==result )
            break;
    }
    return result;
}

template<size_t S, typename T>
inline T __TBB_FetchAndStoreGeneric (volatile void *ptr, T value) {
    T result;
    for( atomic_backoff b;;b.pause() ) {
        result = *reinterpret_cast<volatile T *>(ptr);
        // __TBB_CompareAndSwapGeneric presumed to have full fence.
        if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, value, result )==result )
            break;
    }
    return result;
}

#if __TBB_USE_GENERIC_PART_WORD_CAS
#define __TBB_machine_cmpswp1 tbb::internal::__TBB_CompareAndSwapGeneric<1,int8_t>
#define __TBB_machine_cmpswp2 tbb::internal::__TBB_CompareAndSwapGeneric<2,int16_t>
#endif

#if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_PART_WORD_FETCH_ADD
#define __TBB_machine_fetchadd1 tbb::internal::__TBB_FetchAndAddGeneric<1,int8_t>
#define __TBB_machine_fetchadd2 tbb::internal::__TBB_FetchAndAddGeneric<2,int16_t>
#endif

#if __TBB_USE_GENERIC_FETCH_ADD
#define __TBB_machine_fetchadd4 tbb::internal::__TBB_FetchAndAddGeneric<4,int32_t>
#endif

#if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_ADD
#define __TBB_machine_fetchadd8 tbb::internal::__TBB_FetchAndAddGeneric<8,int64_t>
#endif

#if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_PART_WORD_FETCH_STORE
#define __TBB_machine_fetchstore1 tbb::internal::__TBB_FetchAndStoreGeneric<1,int8_t>
#define __TBB_machine_fetchstore2 tbb::internal::__TBB_FetchAndStoreGeneric<2,int16_t>
#endif

#if __TBB_USE_GENERIC_FETCH_STORE
#define __TBB_machine_fetchstore4 tbb::internal::__TBB_FetchAndStoreGeneric<4,int32_t>
#endif

#if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_DWORD_FETCH_STORE
#define __TBB_machine_fetchstore8 tbb::internal::__TBB_FetchAndStoreGeneric<8,int64_t>
#endif

#if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
#define __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(S)                                            \
    atomic_selector<S>::word atomic_selector<S>::fetch_store ( volatile void* location, word value ) { \
        return __TBB_machine_fetchstore##S( location, value );                                         \
    }

__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(1)
__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(2)
__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(4)
__TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(8)

#undef __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE
#endif /* __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */

#if __TBB_USE_GENERIC_DWORD_LOAD_STORE
/*TODO: find a more elegant way to handle the difference in function names*/
#if ! __TBB_USE_FENCED_ATOMICS
    /* This name forwarding is needed for the generic implementation of
     * load8/store8 defined below (via macro) to pick the right CAS function*/
    #define __TBB_machine_cmpswp8full_fence __TBB_machine_cmpswp8
#endif
__TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(full_fence)
__TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(full_fence)

#if ! __TBB_USE_FENCED_ATOMICS
    #undef __TBB_machine_cmpswp8full_fence
#endif

#define __TBB_machine_store8 tbb::internal::__TBB_machine_generic_store8full_fence
#define __TBB_machine_load8  tbb::internal::__TBB_machine_generic_load8full_fence
#endif /* __TBB_USE_GENERIC_DWORD_LOAD_STORE */

#if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE
/** Fenced operations use the volatile qualifier to prevent the compiler from optimizing
    them out, and on architectures with weak memory ordering to induce the compiler
    to generate code with appropriate acquire/release semantics.
    On architectures like IA32 and Intel64 (and likely Sparc TSO) volatile has
    no effect on code generation, and the consistency helpers serve as a compiler fence (the
    latter being true for IA64/gcc as well to fix a bug in some gcc versions).
    This code assumes that the generated instructions will operate atomically,
    which typically requires a type that can be moved in a single instruction,
    cooperation from the compiler for effective use of such an instruction,
    and appropriate alignment of the data. **/
template <typename T, size_t S>
struct machine_load_store {
    static T load_with_acquire ( const volatile T& location ) {
        T to_return = location;
        __TBB_acquire_consistency_helper();
        return to_return;
    }
    static void store_with_release ( volatile T &location, T value ) {
        __TBB_release_consistency_helper();
        location = value;
    }
};

// In general, plain loads and stores generated by a 32-bit compiler are not atomic for 64-bit types.
#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
template <typename T>
struct machine_load_store<T,8> {
    static T load_with_acquire ( const volatile T& location ) {
        return (T)__TBB_machine_load8( (const volatile void*)&location );
    }
    static void store_with_release ( volatile T& location, T value ) {
        __TBB_machine_store8( (volatile void*)&location, (int64_t)value );
    }
};
#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
#endif /* __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE */

#if __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE
template <typename T, size_t S>
struct machine_load_store_seq_cst {
    static T load ( const volatile T& location ) {
        __TBB_full_memory_fence();
        return machine_load_store<T,S>::load_with_acquire( location );
    }
#if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE
    static void store ( volatile T &location, T value ) {
        atomic_selector<S>::fetch_store( (volatile void*)&location, (typename atomic_selector<S>::word)value );
    }
#else /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
    static void store ( volatile T &location, T value ) {
        machine_load_store<T,S>::store_with_release( location, value );
        __TBB_full_memory_fence();
    }
#endif /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */
};

#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
/** The implementation does not use the functions __TBB_machine_load8/store8 as they
    are not required to be sequentially consistent. **/
template <typename T>
struct machine_load_store_seq_cst<T,8> {
    static T load ( const volatile T& location ) {
        // Comparand and new value may be anything, they only must be equal, and
        // the value should have a low probability to be actually found in 'location'.
        const int64_t anyvalue = 2305843009213693951LL;
        return __TBB_machine_cmpswp8( (volatile void*)const_cast<volatile T*>(&location), anyvalue, anyvalue );
    }
    static void store ( volatile T &location, T value ) {
        int64_t result = (volatile int64_t&)location;
        while ( __TBB_machine_cmpswp8((volatile void*)&location, (int64_t)value, result) != result )
            result = (volatile int64_t&)location;
    }
};
#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
#endif /* __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE */

#if __TBB_USE_GENERIC_RELAXED_LOAD_STORE
// Relaxed operations add the volatile qualifier to prevent the compiler from optimizing them out.
/** Volatile should not incur any additional cost on IA32, Intel64, and Sparc TSO
    architectures. However, on architectures with weak memory ordering the compiler may
    generate code with acquire/release semantics for operations on volatile data. **/
template <typename T, size_t S>
struct machine_load_store_relaxed {
    static inline T load ( const volatile T& location ) {
        return location;
    }
    static inline void store ( volatile T& location, T value ) {
        location = value;
    }
};

#if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS
template <typename T>
struct machine_load_store_relaxed<T,8> {
    static inline T load ( const volatile T& location ) {
        return (T)__TBB_machine_load8( (const volatile void*)&location );
    }
    static inline void store ( volatile T& location, T value ) {
        __TBB_machine_store8( (volatile void*)&location, (int64_t)value );
    }
};
#endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
#endif /* __TBB_USE_GENERIC_RELAXED_LOAD_STORE */

#undef __TBB_WORDSIZE // this macro must not be used outside of the atomic machinery

template<typename T>
inline T __TBB_load_with_acquire(const volatile T &location) {
    return machine_load_store<T,sizeof(T)>::load_with_acquire( location );
}
template<typename T, typename V>
inline void __TBB_store_with_release(volatile T& location, V value) {
    machine_load_store<T,sizeof(T)>::store_with_release( location, T(value) );
}
//! Overload that exists solely to avoid /Wp64 warnings.
inline void __TBB_store_with_release(volatile size_t& location, size_t value) {
    machine_load_store<size_t,sizeof(size_t)>::store_with_release( location, value );
}

template<typename T>
inline T __TBB_load_full_fence(const volatile T &location) {
    return machine_load_store_seq_cst<T,sizeof(T)>::load( location );
}
template<typename T, typename V>
inline void __TBB_store_full_fence(volatile T& location, V value) {
    machine_load_store_seq_cst<T,sizeof(T)>::store( location, T(value) );
}
//! Overload that exists solely to avoid /Wp64 warnings.
inline void __TBB_store_full_fence(volatile size_t& location, size_t value) {
    machine_load_store_seq_cst<size_t,sizeof(size_t)>::store( location, value );
}

template<typename T>
inline T __TBB_load_relaxed (const volatile T& location) {
    return machine_load_store_relaxed<T,sizeof(T)>::load( const_cast<T&>(location) );
}
template<typename T, typename V>
inline void __TBB_store_relaxed ( volatile T& location, V value ) {
    machine_load_store_relaxed<T,sizeof(T)>::store( const_cast<T&>(location), T(value) );
}
//! Overload that exists solely to avoid /Wp64 warnings.
inline void __TBB_store_relaxed ( volatile size_t& location, size_t value ) {
    machine_load_store_relaxed<size_t,sizeof(size_t)>::store( const_cast<size_t&>(location), value );
}

// Macro __TBB_TypeWithAlignmentAtLeastAsStrict(T) should be a type with alignment at least as
// strict as type T.  The type should have a trivial default constructor and destructor, so that
// arrays of that type can be declared without initializers.
// It is correct (but perhaps a waste of space) if __TBB_TypeWithAlignmentAtLeastAsStrict(T) expands
// to a type bigger than T.
// The default definition here works on machines where integers are naturally aligned and the
// strictest alignment is 64.
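// Illustrative usage sketch (the wrapper type below is hypothetical and not part of TBB):
// a buffer intended to hold a T is typically padded with this macro so that it gets at
// least T's alignment, e.g.
//
//     struct storage_for_T {
//         __TBB_TypeWithAlignmentAtLeastAsStrict(T) aligner; // enforces alignment of T
//         char bytes[sizeof(T)];                             // raw storage for a T object
//     };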
#ifndef __TBB_TypeWithAlignmentAtLeastAsStrict

#if __TBB_ATTRIBUTE_ALIGNED_PRESENT

#define __TBB_DefineTypeWithAlignment(PowerOf2)       \
struct __TBB_machine_type_with_alignment_##PowerOf2 { \
    uint32_t member[PowerOf2/sizeof(uint32_t)];       \
} __attribute__((aligned(PowerOf2)));
#define __TBB_alignof(T) __alignof__(T)

#elif __TBB_DECLSPEC_ALIGN_PRESENT

#define __TBB_DefineTypeWithAlignment(PowerOf2)       \
__declspec(align(PowerOf2))                           \
struct __TBB_machine_type_with_alignment_##PowerOf2 { \
    uint32_t member[PowerOf2/sizeof(uint32_t)];       \
};
#define __TBB_alignof(T) __alignof(T)

#else /* A compiler with unknown syntax for data alignment */
#error Must define __TBB_TypeWithAlignmentAtLeastAsStrict(T)
#endif

/* Now declare types aligned to useful powers of two */
// TODO: Is __TBB_DefineTypeWithAlignment(8) needed on 32 bit platforms?
__TBB_DefineTypeWithAlignment(16)
__TBB_DefineTypeWithAlignment(32)
__TBB_DefineTypeWithAlignment(64)

typedef __TBB_machine_type_with_alignment_64 __TBB_machine_type_with_strictest_alignment;

// Primary template is a declaration of incomplete type so that it fails with unknown alignments
template<size_t N> struct type_with_alignment;

// Specializations for allowed alignments
template<> struct type_with_alignment<1> { char member; };
template<> struct type_with_alignment<2> { uint16_t member; };
template<> struct type_with_alignment<4> { uint32_t member; };
template<> struct type_with_alignment<8> { uint64_t member; };
template<> struct type_with_alignment<16> {__TBB_machine_type_with_alignment_16 member; };
template<> struct type_with_alignment<32> {__TBB_machine_type_with_alignment_32 member; };
template<> struct type_with_alignment<64> {__TBB_machine_type_with_alignment_64 member; };

#if __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN
//! Work around for bug in GNU 3.2 and MSVC compilers.
/** Bug is that compiler sometimes returns 0 for __alignof(T) when T has not yet been instantiated.
    The work-around forces instantiation by forcing computation of sizeof(T) before __alignof(T). */
template<size_t Size, typename T>
struct work_around_alignment_bug {
    static const size_t alignment = __TBB_alignof(T);
};
#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<tbb::internal::work_around_alignment_bug<sizeof(T),T>::alignment>
#else
#define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<__TBB_alignof(T)>
#endif /* __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN */

#endif /* __TBB_TypeWithAlignmentAtLeastAsStrict */

// Template class here is to avoid instantiation of the static data for modules that don't use it
template<typename T>
struct reverse {
    static const T byte_table[256];
};
// An efficient implementation of the reverse function utilizes a 2^8 lookup table holding the bit-reversed
// values of [0..2^8 - 1]. Those values can also be computed on the fly at a slightly higher cost.
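// For reference, a byte can also be reversed on the fly with three shift/mask steps
// (an illustrative alternative to the table, not used by this header):
//     b = (unsigned char)( (b & 0xF0) >> 4 | (b & 0x0F) << 4 );
//     b = (unsigned char)( (b & 0xCC) >> 2 | (b & 0x33) << 2 );
//     b = (unsigned char)( (b & 0xAA) >> 1 | (b & 0x55) << 1 );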
template<typename T>
const T reverse<T>::byte_table[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

} // namespace internal @endcond
} // namespace tbb

// Preserving access to legacy APIs
using tbb::internal::__TBB_load_with_acquire;
using tbb::internal::__TBB_store_with_release;

// Mapping historically used names to the ones expected by atomic_load_store_traits
#define __TBB_load_acquire  __TBB_load_with_acquire
#define __TBB_store_release __TBB_store_with_release

#ifndef __TBB_Log2
inline intptr_t __TBB_Log2( uintptr_t x ) {
    if( x==0 ) return -1;
    intptr_t result = 0;

#if !defined(_M_ARM)
    uintptr_t tmp;
    if( sizeof(x)>4 && (tmp = ((uint64_t)x)>>32) ) { x=tmp; result += 32; }
#endif
    if( uintptr_t tmp = x>>16 ) { x=tmp; result += 16; }
    if( uintptr_t tmp = x>>8 )  { x=tmp; result += 8; }
    if( uintptr_t tmp = x>>4 )  { x=tmp; result += 4; }
    if( uintptr_t tmp = x>>2 )  { x=tmp; result += 2; }

    return (x&2)? result+1: result;
}
#endif

#ifndef __TBB_AtomicOR
inline void __TBB_AtomicOR( volatile void *operand, uintptr_t addend ) {
    for( tbb::internal::atomic_backoff b;;b.pause() ) {
        uintptr_t tmp = *(volatile uintptr_t *)operand;
        uintptr_t result = __TBB_CompareAndSwapW(operand, tmp|addend, tmp);
        if( result==tmp ) break;
    }
}
#endif

#ifndef __TBB_AtomicAND
inline void __TBB_AtomicAND( volatile void *operand, uintptr_t addend ) {
    for( tbb::internal::atomic_backoff b;;b.pause() ) {
        uintptr_t tmp = *(volatile uintptr_t *)operand;
        uintptr_t result = __TBB_CompareAndSwapW(operand, tmp&addend, tmp);
        if( result==tmp ) break;
    }
}
#endif

#if __TBB_PREFETCHING
#ifndef __TBB_cl_prefetch
#error This platform does not define cache management primitives required for __TBB_PREFETCHING
#endif

#ifndef __TBB_cl_evict
#define __TBB_cl_evict(p)
#endif
#endif

#ifndef __TBB_Flag
typedef unsigned char __TBB_Flag;
#endif
typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag;

#ifndef __TBB_TryLockByte
inline bool __TBB_TryLockByte( __TBB_atomic_flag &flag ) {
    return __TBB_machine_cmpswp1(&flag,1,0)==0;
}
#endif

#ifndef __TBB_LockByte
inline __TBB_Flag __TBB_LockByte( __TBB_atomic_flag& flag ) {
    tbb::internal::atomic_backoff backoff;
    while( !__TBB_TryLockByte(flag) ) backoff.pause();
    return 0;
}
#endif

#ifndef __TBB_UnlockByte
#define __TBB_UnlockByte(addr) __TBB_store_with_release((addr),0)
#endif

// lock primitives with TSX
#if ( __TBB_x86_32 || __TBB_x86_64 )  /* only on ia32/intel64 */
inline void __TBB_TryLockByteElidedCancel() { __TBB_machine_try_lock_elided_cancel(); }

inline bool __TBB_TryLockByteElided( __TBB_atomic_flag& flag ) {
    bool res = __TBB_machine_try_lock_elided( &flag )!=0;
    // to avoid the "lemming" effect, we need to abort the transaction
    // if __TBB_machine_try_lock_elided returns false (i.e., someone else
    // has acquired the mutex non-speculatively).
    if( !res ) __TBB_TryLockByteElidedCancel();
    return res;
}

inline void __TBB_LockByteElided( __TBB_atomic_flag& flag )
{
    for(;;) {
        tbb::internal::spin_wait_while_eq( flag, 1 );
        if( __TBB_machine_try_lock_elided( &flag ) )
            return;
        // Another thread acquired the lock "for real".
        // To avoid the "lemming" effect, we abort the transaction.
        __TBB_TryLockByteElidedCancel();
    }
}

inline void __TBB_UnlockByteElided( __TBB_atomic_flag& flag ) {
    __TBB_machine_unlock_elided( &flag );
}
#endif

#ifndef __TBB_ReverseByte
inline unsigned char __TBB_ReverseByte(unsigned char src) {
    return tbb::internal::reverse<unsigned char>::byte_table[src];
}
#endif

template<typename T>
T __TBB_ReverseBits(T src) {
    T dst;
    unsigned char *original = (unsigned char *) &src;
    unsigned char *reversed = (unsigned char *) &dst;

    for( int i = sizeof(T)-1; i >= 0; i-- )
        reversed[i] = __TBB_ReverseByte( original[sizeof(T)-i-1] );

    return dst;
}

#endif /* __TBB_machine_H */
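/* Illustrative usage sketch of the byte-lock primitives defined above (these are internal
   helpers; the example only documents how they fit together and is not part of the public API):

       __TBB_atomic_flag my_lock = 0;     // shared flag, zero means "unlocked"
       __TBB_LockByte( my_lock );         // spins with exponential backoff until acquired
       // ... critical section ...
       __TBB_UnlockByte( my_lock );       // store-with-release of 0 releases the lock
*/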