1 //------------------------------------------------------------------------------
2 // GB_atomics.h: definitions for atomic operations
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // All atomic operations used by SuiteSparse:GraphBLAS appear in this file.
11 
12 // These atomic operations assume either an ANSI C11 compiler that supports
13 // OpenMP 3.1 or later, or Microsoft Visual Studio on 64-bit Windows (which
14 // only supports OpenMP 2.0).  SuiteSparse:GraphBLAS is not supported on 32-bit
15 // Windows.
16 
17 #ifndef GB_ATOMICS_H
18 #define GB_ATOMICS_H
19 #include "GB.h"
20 
21 //------------------------------------------------------------------------------
22 // determine the architecture
23 //------------------------------------------------------------------------------
24 
25 #if __x86_64__
26 
27     // on the x86, atomic updates can be more aggresive.  The MIN, MAX, EQ,
28     // XNOR, and ANY monoids are implemented with atomic compare/exchange.
29 
30     #define GB_X86_64 1
31 
32 #else
33 
34     // on the ARM, Power8/9, and others, only use built-in #pragma omp atomic
35     // updates.  Do not use atomic compare/exchange.
36 
37     #define GB_X86_64 0
38 
39 #endif
40 
41 //------------------------------------------------------------------------------
42 // atomic updates
43 //------------------------------------------------------------------------------
44 
45 // Whenever possible, the OpenMP pragma is used with a clause (as introduced in
46 // OpenMP 3.1), as follow:
47 //
48 //      #pragma omp atomic [clause]
49 //
50 // where [clause] is read, write, update, or capture.
51 //
52 // Microsoft Visual Studio only supports OpenMP 2.0, which does not have the
53 // [clause].  Without the [clause], #pragma omp atomic is like
54 // #pragma omp atomic update, but the expression can only be one of:
55 //
56 //      x binop= expression
57 //      x++
58 //      ++x
59 //      x--
60 //      --x
61 //
62 // where binop is one of these operators:    + * - / & ^ | << >>
63 //
64 // OpenMP 3.0 and later support additional options for the "update" clause,
65 // but SuiteSparse:GraphBLAS uses only this form:
66 //
67 //      x binop= expression
68 //
69 // where binop is:  + * & ^ |
70 //
71 // This atomic update is used for the PLUS, TIMES, LAND, LXOR, and LOR monoids,
72 // when applied to the built-in types.  For PLUS and TIMES, these are the 10
73 // types INTx, UINTx, FP32, FP64 (for x = 8, 16, 32, and 64).  For the boolean
74 // monoids, only the BOOL type is used.
75 //
76 // As a result, the atomic updates are the same for gcc and icc (which support
77 // OpenMP 3.0 or later) with the "update" clause.  For MS Visual Studio, the
78 // "update" clause is removed since it supports OpenMP 2.0.
79 
80 #if ( _OPENMP >= 201307 )
81 
82     // OpenMP 4.0 or later
83     #define GB_ATOMIC_UPDATE GB_PRAGMA (omp atomic update seq_cst)
84 
85 #elif ( _OPENMP >= 201107 )
86 
87     // OpenMP 3.1
88     #define GB_ATOMIC_UPDATE GB_PRAGMA (omp atomic update)
89 
90 #elif ( _OPENMP >= 199810 )
91 
92     // OpenMP 1.0 to 3.0: no optional clauses, "update" is assumed
93     #define GB_ATOMIC_UPDATE GB_PRAGMA (omp atomic)
94 
95 #else
96 
97     // no OpenMP at all
98     #define GB_ATOMIC_UPDATE
99 
100 #endif
101 
102 //------------------------------------------------------------------------------
103 // atomic read and write
104 //------------------------------------------------------------------------------
105 
106 // In Microsoft Visual Studio, simple reads and writes to properly aligned
107 // 64-bit values are already atomic on 64-bit Windows for any architecture
108 // supported by Windows (any Intel or ARM architecture). See:
109 // https://docs.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access
110 // SuiteSparse:GraphBLAS is not supported on 32-bit Windows.  Thus, there
111 // is no need for atomic reads/writes when compiling GraphBLAS on Windows
112 // with MS Visual Studio.
113 
114 // ARM, Power8/9, and others need the explicit atomic read/write.
115 // x86: no atomic read/write is needed.
116 
117 #if GB_X86_64
118 
119     // x86: no atomic read/write is needed.
120     #define GB_ATOMIC_READ
121     #define GB_ATOMIC_WRITE
122 
123 #elif ( _OPENMP >= 201811 )
124 
125     // OpenMP 5.0 or later
126     #define GB_ATOMIC_READ    GB_PRAGMA (omp atomic read acquire)
127     #define GB_ATOMIC_WRITE   GB_PRAGMA (omp atomic write release)
128 
129 #elif ( _OPENMP >= 201307 )
130 
131     // OpenMP 4.0 and 4.5
132     #define GB_ATOMIC_READ    GB_PRAGMA (omp atomic read seq_cst)
133     #define GB_ATOMIC_WRITE   GB_PRAGMA (omp atomic write seq_cst)
134 
135 #elif ( _OPENMP >= 201107 )
136 
137     // OpenMP 3.1
138     #define GB_ATOMIC_READ    GB_PRAGMA (omp atomic read)
139     #define GB_ATOMIC_WRITE   GB_PRAGMA (omp atomic write)
140 
141 #else
142 
143     // OpenMP 3.0 or earlier, or no OpenMP at all
144     #define GB_ATOMIC_READ
145     #define GB_ATOMIC_WRITE
146 
147 #endif
148 
149 //------------------------------------------------------------------------------
150 // flush
151 //------------------------------------------------------------------------------
152 
153 #if defined ( _OPENMP )
154 
155     // All versions of OpenMP have the #pragma omp flush
156     #define GB_OMP_FLUSH GB_PRAGMA (omp flush)
157 
158 #else
159 
160     // no OpenMP at all
161     #define GB_OMP_FLUSH
162 
163 #endif
164 
165 //------------------------------------------------------------------------------
166 // atomic capture
167 //------------------------------------------------------------------------------
168 
169 // An atomic capture loads the prior value of the target into a thread-local
170 // result, and then overwrites the target with the new value.  The target is a
171 // value that is shared between threads.  The value and result arguments are
172 // thread-local.  SuiteSparse:GraphBLAS uses four atomic capture methods,
173 // defined below, of the form:
174 //
175 //      { result = target ; target = value ; }      for int64_t and int8_t
176 //      { result = target ; target |= value ; }     for int64_t
177 //      { result = target++ ; }                     for int64_t
178 //
179 // OpenMP 3.1 and later supports atomic captures with a "capture" clause:
180 //
181 //      #pragma omp atomic capture
182 //      { result = target ; target = value ; }
183 //
184 // or with a binary operator
185 //
186 //      #pragma omp atomic capture
187 //      { result = target ; target binop= value ; }
188 //
189 // MS Visual Studio supports only OpenMP 2.0, and does not support any
190 // "capture" clause.  Thus, on Windows, _InterlockedExchange* and
191 // _InterlockedOr* functions are used instead, as described here:
192 //
193 // https://docs.microsoft.com/en-us/cpp/intrinsics/interlockedexchange-intrinsic-functions
194 // https://docs.microsoft.com/en-us/cpp/intrinsics/interlockedor-intrinsic-functions
195 
196 #if ( _OPENMP >= 201307 )
197 
198     // OpenMP 4.0 or later
199     #define GB_ATOMIC_CAPTURE GB_PRAGMA (omp atomic capture seq_cst)
200 
201 #elif ( _OPENMP >= 201107 )
202 
203     // OpenMP 3.1
204     #define GB_ATOMIC_CAPTURE GB_PRAGMA (omp atomic capture)
205 
206 #elif ( _OPENMP >= 199810 )
207 
208     // OpenMP 1.0 to 3.0: generate an intentional compile-time error if any
209     // attempt is made to use the atomic capture.
210     #define GB_ATOMIC_CAPTURE atomic capture not available
211 
212 #else
213 
214     // no OpenMP at all
215     #define GB_ATOMIC_CAPTURE
216 
217 #endif
218 
219     //--------------------------------------------------------------------------
220     // atomic capture for int64_t
221     //--------------------------------------------------------------------------
222 
223     // int64_t result, target, value ;
224     // do this atomically: { result = target ; target = value ; }
225 
226     #if GB_MICROSOFT
227 
228         #define GB_ATOMIC_CAPTURE_INT64(result, target, value)          \
229         {                                                               \
230             result = _InterlockedExchange64                             \
231                 ((int64_t volatile *) (&(target)), value) ;             \
232         }
233 
234     #else
235 
236         #define GB_ATOMIC_CAPTURE_INT64(result, target, value)          \
237         {                                                               \
238             GB_ATOMIC_CAPTURE                                           \
239             {                                                           \
240                 result = target ;                                       \
241                 target = value ;                                        \
242             }                                                           \
243         }
244 
245     #endif
246 
247     //--------------------------------------------------------------------------
248     // atomic capture for int8_t
249     //--------------------------------------------------------------------------
250 
251     // int8_t result, target, value ;
252     // do this atomically: { result = target ; target = value ; }
253 
254     #if GB_MICROSOFT
255 
256         #define GB_ATOMIC_CAPTURE_INT8(result, target, value)           \
257         {                                                               \
258             result = _InterlockedExchange8                              \
259                 ((char volatile *) &(target), value) ;                  \
260         }
261 
262     #else
263 
264         #define GB_ATOMIC_CAPTURE_INT8(result, target, value)           \
265         {                                                               \
266             GB_ATOMIC_CAPTURE                                           \
267             {                                                           \
268                 result = target ;                                       \
269                 target = value ;                                        \
270             }                                                           \
271         }
272 
273     #endif
274 
275     //--------------------------------------------------------------------------
276     // atomic capture with bitwise OR, for int64_t
277     //--------------------------------------------------------------------------
278 
279     // int64_t result, target, value ;
280     // do this atomically: { result = target ; target |= value ; }
281 
282     #if GB_MICROSOFT
283 
284         #define GB_ATOMIC_CAPTURE_INT64_OR(result, target, value)       \
285         {                                                               \
286             result = _InterlockedOr64                                   \
287                 ((int64_t volatile *) (&(target)), value) ;             \
288         }
289 
290     #else
291 
292         #define GB_ATOMIC_CAPTURE_INT64_OR(result, target, value)       \
293         {                                                               \
294             GB_ATOMIC_CAPTURE                                           \
295             {                                                           \
296                 result = target ;                                       \
297                 target |= value ;                                       \
298             }                                                           \
299         }
300 
301     #endif
302 
303     //--------------------------------------------------------------------------
304     // atomic post-increment
305     //--------------------------------------------------------------------------
306 
307     // Increment an int64_t value and return the value prior to being
308     // incremented:
309     //
310     //      int64_t result = target++ ;
311     //
312     // See
313     // https://docs.microsoft.com/en-us/cpp/intrinsics/interlockedincrement-intrinsic-functions?view=msvc-160
314     // The MS Visual Studio version computes result = ++target, so result must
315     // be decremented by one.
316 
317     #if GB_MICROSOFT
318 
319         #define GB_ATOMIC_CAPTURE_INC64(result,target)                  \
320         {                                                               \
321             result = _InterlockedIncrement64                            \
322                 ((int64_t volatile *) (&(target))) - 1 ;                \
323         }
324 
325     #else
326 
327         #define GB_ATOMIC_CAPTURE_INC64(result,target)                  \
328         {                                                               \
329             GB_ATOMIC_CAPTURE                                           \
330             result = (target)++ ;                                       \
331         }
332 
333     #endif
334 
335 //------------------------------------------------------------------------------
336 // atomic compare-and-exchange
337 //------------------------------------------------------------------------------
338 
339 // Atomic compare-and-exchange is used to implement the MAX, MIN and EQ
340 // monoids, for the fine-grain saxpy-style matrix multiplication.  Ideally,
341 // OpenMP would be used for these atomic operation, but they are not supported.
342 // So compiler-specific functions are used instead.
343 
344 // In gcc, icc, and clang, the atomic compare-and-exchange function
345 // __atomic_compare_exchange computes the following, as a single atomic
346 // operation, where type_t is any 8, 16, 32, or 64 bit scalar type.  In
347 // SuiteSparse:GraphBLAS, type_t can be bool, int8_t, uint8_t, int16_t,
348 // uint16_t, int32_t, uint32_t, int64_t, uint64_t, float, or double.
349 //
350 //      bool __atomic_compare_exchange
351 //      (
352 //          type_t *target,         // input/output
353 //          type_t *expected,       // input/output
354 //          type_t *desired,        // input only, even though it is a pointer
355 //          bool weak,              // true, for SuiteSparse:GraphBLAS
356 //          int success_memorder,   // __ATOMIC_SEQ_CST is used here
357 //          int failure_memorder    // __ATOMIC_SEQ_CST is used here
358 //      )
359 //      {
360 //          bool result ;
361 //          if (*target == *expected)
362 //          {
363 //              *target = *desired ;
364 //              result = true ;
365 //          }
366 //          else
367 //          {
368 //              *expected = *target ;
369 //              result = false ;
370 //          }
371 //          return (result) ;
372 //      }
373 //
374 // The generic __atomic_compare_exchange function in gcc (also supported by
375 // icc) computes the above for any of these 8, 16, 32, or 64-bit scalar types
376 // needed in SuiteSparse:GraphBLAS.  SuiteSparse:GraphBLAS does not require the
377 // 'expected = target' assignment if the result is false.  It ignores the
378 // value of 'expected' after the operation completes.   The target, expected,
379 // and desired parameters are all provided as pointers:
380 //
381 // See https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
382 
383 // Microsoft Visual Studio provides similar but not identical functionality in
384 // the _InterlockedCompareExchange functions, but they are named differently
385 // for different types.  Only int8_t, int16_t, int32_t, and int64_t types are
386 // supported.  For the int64_t case, the following is performed atomically:
387 //
388 //      int64_t _InterlockedCompareExchange64
389 //      (
390 //          int64_t volatile *target,       // input/output
391 //          int64_t desired                 // input only
392 //          int64_t expected
393 //      )
394 //      {
395 //          int64_t result = *target ;
396 //          if (*target == expected)
397 //          {
398 //              target = desired ;
399 //          }
400 //          return (result) ;
401 //      }
402 //
403 // It does not assign "expected = target" if the test is false, but
404 // SuiteSparse:GraphBLAS does not require this action.  It does not return a
405 // boolean result, but instead returns the original value of (*target).
406 // However, this can be compared with the expected value to obtain the
407 // same boolean result as __atomic_compare_exchange.
408 //
409 // Type punning is used to extend these signed integer types to unsigned
410 // integers of the same number of bytes, and to float and double.
411 
412 #if GB_MICROSOFT
413 
414     //--------------------------------------------------------------------------
415     // GB_PUN: type punning
416     //--------------------------------------------------------------------------
417 
418     // With type punning, a value is treated as a different type, but with no
419     // typecasting.  The address of the variable is first typecasted to a (type
420     // *) pointer, and then the pointer is dereferenced.  Type punning is only
421     // needed to extend the atomic compare/exchange functions for Microsoft
422     // Visual Studio.
423 
424     #define GB_PUN(type,value) (*((type *) (&(value))))
425 
426     //--------------------------------------------------------------------------
427     // compare/exchange for MS Visual Studio
428     //--------------------------------------------------------------------------
429 
430     // bool, int8_t, and uint8_t
431     #define GB_ATOMIC_COMPARE_EXCHANGE_8(target, expected, desired)         \
432     (                                                                       \
433         GB_PUN (int8_t, expected) ==                                        \
434             _InterlockedCompareExchange8 ((int8_t volatile *) (target),     \
435                 GB_PUN (int8_t, desired), GB_PUN (int8_t, expected))        \
436     )
437 
438     // int16_t and uint16_t
439     #define GB_ATOMIC_COMPARE_EXCHANGE_16(target, expected, desired)        \
440     (                                                                       \
441         GB_PUN (int16_t, expected) ==                                       \
442             _InterlockedCompareExchange16 ((int16_t volatile *) (target),   \
443             GB_PUN (int16_t, desired), GB_PUN (int16_t, expected))          \
444     )
445 
446     // float, int32_t, and uint32_t
447     #define GB_ATOMIC_COMPARE_EXCHANGE_32(target, expected, desired)        \
448     (                                                                       \
449         GB_PUN (int32_t, expected) ==                                       \
450             _InterlockedCompareExchange ((int32_t volatile *) (target),     \
451             GB_PUN (int32_t, desired), GB_PUN (int32_t, expected))          \
452     )
453 
454     // double, int64_t, and uint64_t
455     #define GB_ATOMIC_COMPARE_EXCHANGE_64(target, expected, desired)        \
456     (                                                                       \
457         GB_PUN (int64_t, expected) ==                                       \
458             _InterlockedCompareExchange64 ((int64_t volatile *) (target),   \
459             GB_PUN (int64_t, desired), GB_PUN (int64_t, expected))          \
460     )
461 
462 #else
463 
464     //--------------------------------------------------------------------------
465     // compare/exchange for gcc, icc, and clang on x86 and Power8/9
466     //--------------------------------------------------------------------------
467 
468     // the compare/exchange function is generic for any type
469     #define GB_ATOMIC_COMPARE_EXCHANGE_X(target, expected, desired)     \
470         __atomic_compare_exchange (target, &expected, &desired,         \
471             true, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)                   \
472 
473     // bool, int8_t, and uint8_t
474     #define GB_ATOMIC_COMPARE_EXCHANGE_8(target, expected, desired)     \
475             GB_ATOMIC_COMPARE_EXCHANGE_X(target, expected, desired)
476 
477     // int16_t and uint16_t
478     #define GB_ATOMIC_COMPARE_EXCHANGE_16(target, expected, desired)    \
479             GB_ATOMIC_COMPARE_EXCHANGE_X (target, expected, desired)
480 
481     // float, int32_t, and uint32_t
482     #define GB_ATOMIC_COMPARE_EXCHANGE_32(target, expected, desired)    \
483             GB_ATOMIC_COMPARE_EXCHANGE_X (target, expected, desired)
484 
485     // double, int64_t, and uint64_t
486     #define GB_ATOMIC_COMPARE_EXCHANGE_64(target, expected, desired)    \
487             GB_ATOMIC_COMPARE_EXCHANGE_X (target, expected, desired)
488 
489 
490 #endif
491 
492 #endif
493 
494