1 /*
2     Copyright (c) 2005-2017 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 
16 
17 
18 
19 */
20 
21 #if !defined(__TBB_machine_H) || defined(__TBB_machine_gcc_power_H)
22 #error Do not #include this internal file directly; use public TBB headers instead.
23 #endif
24 
25 #define __TBB_machine_gcc_power_H
26 
27 #include <stdint.h>
28 #include <unistd.h>
29 
30 // TODO: rename to gcc_power.h?
31 // This file is for Power Architecture with compilers supporting GNU inline-assembler syntax (currently GNU g++ and IBM XL).
32 // Note that XL V9.0 (sometimes?) has trouble dealing with empty input and/or clobber lists, so they should be avoided.
33 
// Select the native word size in bytes: 8 when one of the 64-bit PowerPC
// target macros below is set, 4 otherwise.
#if __powerpc64__ || __ppc64__
    // IBM XL documents __powerpc64__ (and __PPC64__).
    // Apple documents __ppc64__ (with __ppc__ only on 32-bit).
    #define __TBB_WORDSIZE 8
#else
    #define __TBB_WORDSIZE 4
#endif
41 
// Traditionally Power Architecture is big-endian.
// Little-endian could be just an address manipulation (compatibility with TBB not verified),
// or normal little-endian (on more recent systems). Embedded PowerPC systems may support
// page-specific endianness, but then one endianness must be hidden from TBB so that it still sees only one.
//
// Selection order used below:
//   1. big-endian if __BIG_ENDIAN__ is set or __BYTE_ORDER__ equals __ORDER_BIG_ENDIAN__;
//   2. little-endian if __LITTLE_ENDIAN__ is set or __BYTE_ORDER__ equals __ORDER_LITTLE_ENDIAN__;
//   3. unsupported if __BYTE_ORDER__ is defined but matched neither of the above;
//   4. otherwise __TBB_ENDIAN_DETECT (presumably resolved elsewhere in TBB -- confirm).
#if __BIG_ENDIAN__ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__)
    #define __TBB_ENDIANNESS __TBB_ENDIAN_BIG
#elif __LITTLE_ENDIAN__ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__)
    #define __TBB_ENDIANNESS __TBB_ENDIAN_LITTLE
#elif defined(__BYTE_ORDER__)
    #define __TBB_ENDIANNESS __TBB_ENDIAN_UNSUPPORTED
#else
    #define __TBB_ENDIANNESS __TBB_ENDIAN_DETECT
#endif
55 
// On Power Architecture, (lock-free) 64-bit atomics require 64-bit hardware:
//   - 64-bit builds: always enabled;
//   - __bgp__ builds: always disabled;
//   - other 32-bit builds: disabled unless __TBB_64BIT_ATOMICS was predefined by the user.
#if __TBB_WORDSIZE==8
    // Do not change the following definition, because TBB itself will use 64-bit atomics in 64-bit builds.
    #define __TBB_64BIT_ATOMICS 1
#elif __bgp__
    // Do not change the following definition, because this is known 32-bit hardware.
    #define __TBB_64BIT_ATOMICS 0
#else
    // To enable 64-bit atomics in 32-bit builds, set the value below to 1 instead of 0.
    // You must make certain that the program will only use them on actual 64-bit hardware
    // (which typically means that the entire program is only executed on such hardware),
    // because their implementation involves machine instructions that are illegal elsewhere.
    // The setting can be chosen independently per compilation unit,
    // which also means that TBB itself does not need to be rebuilt.
    // Alternatively (but only for the current architecture and TBB version),
    // override the default as a predefined macro when invoking the compiler.
    #ifndef __TBB_64BIT_ATOMICS
    #define __TBB_64BIT_ATOMICS 0
    #endif
#endif
76 
/** Atomic 32-bit compare-and-swap implemented as a lwarx/stwcx. reservation loop.

    If the word at ptr equals comparand, value is stored there; otherwise memory is not written.
    The loop retries whenever stwcx. reports that the reservation was lost.
    The sequence is preceded by "sync" and followed by "isync".

    @param ptr       address of the 32-bit word to operate on
    @param value     word stored when the comparison succeeds
    @param comparand word the current contents are compared against
    @return the word read by the last lwarx; it equals comparand exactly when the store took place */
inline int32_t __TBB_machine_cmpswp4 (volatile void *ptr, int32_t value, int32_t comparand )
{
    int32_t result;

    __asm__ __volatile__("sync\n"                        /* barrier ahead of the atomic sequence */
                         "0:\n\t"                        /* retry target */
                         "lwarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
                         "cmpw %[res],%[cmp]\n\t"        /* compare against comparand */
                         "bne- 1f\n\t"                   /* exit if not same */
                         "stwcx. %[val],0,%[ptr]\n\t"    /* store new value */
                         "bne- 0b\n"                     /* retry if reservation lost */
                         "1:\n\t"                        /* the exit */
                         "isync"                         /* barrier after the atomic sequence */
                         : [res]"=&r"(result)            /* early-clobber: written while inputs are still needed */
                         , "+m"(* (int32_t*) ptr)        /* redundant with "memory" */
                         : [ptr]"r"(ptr)
                         , [val]"r"(value)
                         , [cmp]"r"(comparand)
                         : "memory"                      /* compiler full fence */
                         , "cr0"                         /* clobbered by cmp and/or stwcx. */
                         );
    return result;
}
100 
101 #if __TBB_WORDSIZE==8
102 
/** Atomic 64-bit compare-and-swap for 64-bit builds, implemented as a ldarx/stdcx. reservation loop.

    If the doubleword at ptr equals comparand, value is stored there; otherwise memory is not written.
    The loop retries whenever stdcx. reports that the reservation was lost.
    The sequence is preceded by "sync" and followed by "isync".

    @param ptr       address of the 64-bit doubleword to operate on
    @param value     doubleword stored when the comparison succeeds
    @param comparand doubleword the current contents are compared against
    @return the doubleword read by the last ldarx; it equals comparand exactly when the store took place */
inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
{
    int64_t result;
    __asm__ __volatile__("sync\n"                        /* barrier ahead of the atomic sequence */
                         "0:\n\t"                        /* retry target */
                         "ldarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
                         "cmpd %[res],%[cmp]\n\t"        /* compare against comparand */
                         "bne- 1f\n\t"                   /* exit if not same */
                         "stdcx. %[val],0,%[ptr]\n\t"    /* store new value */
                         "bne- 0b\n"                     /* retry if reservation lost */
                         "1:\n\t"                        /* the exit */
                         "isync"                         /* barrier after the atomic sequence */
                         : [res]"=&r"(result)            /* early-clobber: written while inputs are still needed */
                         , "+m"(* (int64_t*) ptr)        /* redundant with "memory" */
                         : [ptr]"r"(ptr)
                         , [val]"r"(value)
                         , [cmp]"r"(comparand)
                         : "memory"                      /* compiler full fence */
                         , "cr0"                         /* clobbered by cmp and/or stdcx. */
                         );
    return result;
}
125 
126 #elif __TBB_64BIT_ATOMICS /* && __TBB_WORDSIZE==4 */
127 
/** Atomic 64-bit compare-and-swap for 32-bit builds that opted in to 64-bit atomics.

    Same ldarx/stdcx. loop as the 64-bit build, except that value and comparand are read
    from memory with "ld" and the result is written back to memory with "std" instead of
    being passed as register operands. NOTE(review): presumably because a 32-bit build
    cannot bind an int64_t to a single register operand -- confirm.
    The sequence is preceded by "sync" and followed by "isync".

    @param ptr       address of the 64-bit doubleword to operate on
    @param value     doubleword stored when the comparison succeeds
    @param comparand doubleword the current contents are compared against
    @return the doubleword read by the last ldarx; it equals comparand exactly when the store took place */
inline int64_t __TBB_machine_cmpswp8 (volatile void *ptr, int64_t value, int64_t comparand )
{
    int64_t result;
    int64_t value_register, comparand_register, result_register; // dummy variables to allocate registers
    __asm__ __volatile__("sync\n\t"                      /* barrier ahead of the atomic sequence */
                         "ld %[val],%[valm]\n\t"         /* fetch value from its memory operand */
                         "ld %[cmp],%[cmpm]\n"           /* fetch comparand from its memory operand */
                         "0:\n\t"                        /* retry target */
                         "ldarx %[res],0,%[ptr]\n\t"     /* load w/ reservation */
                         "cmpd %[res],%[cmp]\n\t"        /* compare against comparand */
                         "bne- 1f\n\t"                   /* exit if not same */
                         "stdcx. %[val],0,%[ptr]\n\t"    /* store new value */
                         "bne- 0b\n"                     /* retry if reservation lost */
                         "1:\n\t"                        /* the exit */
                         "std %[res],%[resm]\n\t"        /* hand the loaded doubleword back through memory */
                         "isync"                         /* barrier after the atomic sequence */
                         : [resm]"=m"(result)
                         , [res] "=&r"(   result_register)
                         , [val] "=&r"(    value_register)
                         , [cmp] "=&r"(comparand_register)
                         , "+m"(* (int64_t*) ptr)        /* redundant with "memory" */
                         : [ptr] "r"(ptr)
                         , [valm]"m"(value)
                         , [cmpm]"m"(comparand)
                         : "memory"                      /* compiler full fence */
                         , "cr0"                         /* clobbered by cmpd and/or stdcx. */
                         );
    return result;
}
157 
158 #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
159 
// Generates partial specializations machine_load_store<T,S> and
// machine_load_store_relaxed<T,S> for operand size S.
//   S    - operand size selecting the specialization (instantiated below with 1, 2, 4 and 8)
//   ldx  - string literal with the load mnemonic for that size
//   stx  - string literal with the store mnemonic for that size
//   cmpx - string literal with the compare mnemonic used by load_with_acquire
//
// load_with_acquire:  ldx, then cmpx of the loaded register with itself, a "bne-" back to
//                     the compare, and "isync"; the asm carries a "memory" clobber.
//                     NOTE(review): the self-compare is always equal, so the branch looks
//                     like it exists only to order isync after the load -- confirm against
//                     the Power ISA acquire idiom.
// store_with_release: "lwsync" followed by stx; the asm carries a "memory" clobber.
// relaxed load/store: a bare ldx / stx with no "memory" clobber.
// All variants use the "b" constraint for the address because register 0 cannot be
// used as the base register in the D-form "0(reg)" addressing employed here.
#define __TBB_MACHINE_DEFINE_LOAD_STORE(S,ldx,stx,cmpx)                                                       \
    template <typename T>                                                                                     \
    struct machine_load_store<T,S> {                                                                          \
        static inline T load_with_acquire(const volatile T& location) {                                       \
            T result;                                                                                         \
            __asm__ __volatile__(ldx " %[res],0(%[ptr])\n"                                                    \
                                 "0:\n\t"                                                                     \
                                 cmpx " %[res],%[res]\n\t"                                                    \
                                 "bne- 0b\n\t"                                                                \
                                 "isync"                                                                      \
                                 : [res]"=r"(result)                                                          \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
                                 , "m"(location)       /* redundant with "memory" */                          \
                                 : "memory"            /* compiler acquire fence */                           \
                                 , "cr0"               /* clobbered by cmpw/cmpd */);                         \
            return result;                                                                                    \
        }                                                                                                     \
        static inline void store_with_release(volatile T &location, T value) {                                \
            __asm__ __volatile__("lwsync\n\t"                                                                 \
                                 stx " %[val],0(%[ptr])"                                                      \
                                 : "=m"(location)      /* redundant with "memory" */                          \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
                                 , [val]"r"(value)                                                            \
                                 : "memory"/*compiler release fence*/ /*(cr0 not affected)*/);                \
        }                                                                                                     \
    };                                                                                                        \
                                                                                                              \
    template <typename T>                                                                                     \
    struct machine_load_store_relaxed<T,S> {                                                                  \
        static inline T load (const __TBB_atomic T& location) {                                               \
            T result;                                                                                         \
            __asm__ __volatile__(ldx " %[res],0(%[ptr])"                                                      \
                                 : [res]"=r"(result)                                                          \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
                                 , "m"(location)                                                              \
                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/                            \
            return result;                                                                                    \
        }                                                                                                     \
        static inline void store (__TBB_atomic T &location, T value) {                                        \
            __asm__ __volatile__(stx " %[val],0(%[ptr])"                                                      \
                                 : "=m"(location)                                                             \
                                 : [ptr]"b"(&location) /* cannot use register 0 here */                       \
                                 , [val]"r"(value)                                                            \
                                 ); /*(no compiler fence)*/ /*(cr0 not affected)*/                            \
        }                                                                                                     \
    };
206 
207 namespace tbb {
208 namespace internal {
    // Load/store specializations for 1-, 2- and 4-byte operands. The sub-word loads
    // (lbz, lhz) zero-extend into the register, and all three sizes use the 32-bit
    // compare (cmpw) in the load_with_acquire sequence.
    __TBB_MACHINE_DEFINE_LOAD_STORE(1,"lbz","stb","cmpw")
    __TBB_MACHINE_DEFINE_LOAD_STORE(2,"lhz","sth","cmpw")
    __TBB_MACHINE_DEFINE_LOAD_STORE(4,"lwz","stw","cmpw")
212 
213 #if __TBB_WORDSIZE==8
214 
215     __TBB_MACHINE_DEFINE_LOAD_STORE(8,"ld" ,"std","cmpd")
216 
217 #elif __TBB_64BIT_ATOMICS /* && __TBB_WORDSIZE==4 */
218 
    // 8-byte acquire/release load and store for 32-bit builds that opted in to 64-bit atomics.
    // The 64-bit value travels through a single register that is filled/drained with ld/std,
    // while the C++-visible value is exchanged with the asm through memory operands.
    // NOTE(review): presumably because a 32-bit build cannot bind a 64-bit T to one
    // register operand -- confirm.
    template <typename T>
    struct machine_load_store<T,8> {
        // Loads location with "ld", copies the register to the local result with "std",
        // then runs the self-compare / "bne-" / "isync" sequence; "memory" is clobbered.
        static inline T load_with_acquire(const volatile T& location) {
            T result;
            T result_register; // dummy variable to allocate a register
            __asm__ __volatile__("ld %[res],0(%[ptr])\n\t"
                                 "std %[res],%[resm]\n"
                                 "0:\n\t"
                                 "cmpd %[res],%[res]\n\t"
                                 "bne- 0b\n\t"
                                 "isync"
                                 : [resm]"=m"(result)
                                 , [res]"=&r"(result_register)
                                 : [ptr]"b"(&location) /* cannot use register 0 here */
                                 , "m"(location)       /* redundant with "memory" */
                                 : "memory"            /* compiler acquire fence */
                                 , "cr0"               /* clobbered by cmpd */);
            return result;
        }

        // Issues "lwsync", reads value from its memory operand with "ld", and writes it
        // to location with "std"; "memory" is clobbered.
        static inline void store_with_release(volatile T &location, T value) {
            T value_register; // dummy variable to allocate a register
            __asm__ __volatile__("lwsync\n\t"
                                 "ld %[val],%[valm]\n\t"
                                 "std %[val],0(%[ptr])"
                                 : "=m"(location)      /* redundant with "memory" */
                                 , [val]"=&r"(value_register)
                                 : [ptr]"b"(&location) /* cannot use register 0 here */
                                 , [valm]"m"(value)
                                 : "memory"/*compiler release fence*/ /*(cr0 not affected)*/);
        }
    };
251 
252     struct machine_load_store_relaxed<T,8> {
253         static inline T load (const volatile T& location) {
254             T result;
255             T result_register; // dummy variable to allocate a register
256             __asm__ __volatile__("ld %[res],0(%[ptr])\n\t"
257                                  "std %[res],%[resm]"
258                                  : [resm]"=m"(result)
259                                  , [res]"=&r"(result_register)
260                                  : [ptr]"b"(&location) /* cannot use register 0 here */
261                                  , "m"(location)
262                                  ); /*(no compiler fence)*/ /*(cr0 not affected)*/
263             return result;
264         }
265 
266         static inline void store (volatile T &location, T value) {
267             T value_register; // dummy variable to allocate a register
268             __asm__ __volatile__("ld %[val],%[valm]\n\t"
269                                  "std %[val],0(%[ptr])"
270                                  : "=m"(location)
271                                  , [val]"=&r"(value_register)
272                                  : [ptr]"b"(&location) /* cannot use register 0 here */
273                                  , [valm]"m"(value)
274                                  ); /*(no compiler fence)*/ /*(cr0 not affected)*/
275         }
276     };
277     #define __TBB_machine_load_store_relaxed_8
278 
279 #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */
280 
281 }} // namespaces internal, tbb
282 
283 #undef __TBB_MACHINE_DEFINE_LOAD_STORE
284 
// Request generic implementations for the operations this file does not define itself.
// NOTE(review): these are presumably supplied by the including machine header on top of
// the compare-and-swap and load/store primitives defined above -- confirm.
#define __TBB_USE_GENERIC_PART_WORD_CAS                     1
#define __TBB_USE_GENERIC_FETCH_ADD                         1
#define __TBB_USE_GENERIC_FETCH_STORE                       1
#define __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE 1

// Fence macros: each emits a single instruction ("isync" / "sync") and clobbers "memory"
// so that the compiler does not move memory accesses across it.
#define __TBB_control_consistency_helper() __asm__ __volatile__("isync": : :"memory")
#define __TBB_full_memory_fence()          __asm__ __volatile__( "sync": : :"memory")
292 
__TBB_machine_lg(uintptr_t x)293 static inline intptr_t __TBB_machine_lg( uintptr_t x ) {
294     __TBB_ASSERT(x, "__TBB_Log2(0) undefined");
295     // cntlzd/cntlzw starts counting at 2^63/2^31 (ignoring any higher-order bits), and does not affect cr0
296 #if __TBB_WORDSIZE==8
297     __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
298     return 63-static_cast<intptr_t>(x);
299 #else
300     __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
301     return 31-static_cast<intptr_t>(x);
302 #endif
303 }
304 #define __TBB_Log2(V) __TBB_machine_lg(V)
305 
306 // Assumes implicit alignment for any 32-bit value
307 typedef uint32_t __TBB_Flag;
308 #define __TBB_Flag __TBB_Flag
309 
__TBB_machine_trylockbyte(__TBB_atomic __TBB_Flag & flag)310 inline bool __TBB_machine_trylockbyte( __TBB_atomic __TBB_Flag &flag ) {
311     return __TBB_machine_cmpswp4(&flag,1,0)==0;
312 }
313 #define __TBB_TryLockByte(P) __TBB_machine_trylockbyte(P)
314