1## ---------------------------------------------------------------------
2##
3## Copyright (C) 2012 - 2020 by the deal.II authors
4##
5## This file is part of the deal.II library.
6##
7## The deal.II library is free software; you can use it, redistribute
8## it, and/or modify it under the terms of the GNU Lesser General
9## Public License as published by the Free Software Foundation; either
10## version 2.1 of the License, or (at your option) any later version.
11## The full text of the license can be found in the file LICENSE.md at
12## the top level directory of deal.II.
13##
14## ---------------------------------------------------------------------
15
16
17########################################################################
18#                                                                      #
19#                   Platform and CPU specific tests:                   #
20#                                                                      #
21########################################################################
22
23#
24# This file sets up
25#
26#   DEAL_II_WORDS_BIGENDIAN
27#   DEAL_II_HAVE_SSE2                    *)
28#   DEAL_II_HAVE_AVX                     *)
29#   DEAL_II_HAVE_AVX512                  *)
30#   DEAL_II_HAVE_ALTIVEC                 *)
31#   DEAL_II_HAVE_OPENMP_SIMD             *)
32#   DEAL_II_VECTORIZATION_WIDTH_IN_BITS
33#   DEAL_II_OPENMP_SIMD_PRAGMA
34#
35# *)
# It is possible to manually set the values marked with *) above when
# platform introspection is disabled with
# DEAL_II_ALLOW_PLATFORM_INTROSPECTION=OFF.
39#
40
41
42#
43# Determine the Endianness of the platform:
44#
if(NOT CMAKE_C_COMPILER_WORKS)
  # Without a usable C compiler the endianness test cannot be run, so fall
  # back to the little endian default.
  message(STATUS
    "No suitable C compiler was found! Assuming little endian platform."
    )
  set(DEAL_II_WORDS_BIGENDIAN "0")
else()
  include(TestBigEndian)

  # Run the test between the CLEAR_CMAKE_REQUIRED() and
  # RESET_CMAKE_REQUIRED() helpers (defined elsewhere in the project;
  # presumably they clear and restore the CMAKE_REQUIRED_* variables).
  clear_cmake_required()
  test_big_endian(DEAL_II_WORDS_BIGENDIAN)
  reset_cmake_required()
endif()
57
58
59#
60# Check whether the compiler allows for vectorization and that
61# vectorization actually works on the given CPU. For this test, we use
62# compiler intrinsics similar to what is used in the deal.II library and
63# check whether the arithmetic operations are correctly performed on
64# examples where all numbers are exactly represented as floating point
65# numbers.
66#
67# - Matthias Maier, rewritten 2012
68#
69
if(DEAL_II_ALLOW_PLATFORM_INTROSPECTION)
  #
  # Take care that the following tests are rerun if the
  # CMAKE_REQUIRED_FLAGS change.
  #
  unset_if_changed(CHECK_CPU_FEATURES_FLAGS_SAVED "${CMAKE_REQUIRED_FLAGS}"
    DEAL_II_HAVE_SSE2 DEAL_II_HAVE_AVX DEAL_II_HAVE_AVX512 DEAL_II_HAVE_ALTIVEC
    )

  #
  # SSE2: compute (a + b) * b with a = (1, 0) and b = (2.25, 2.25) using
  # 128 bit intrinsics and compare against the exactly representable
  # results 7.3125 and 5.0625. The test program returns 0 on success.
  #
  check_cxx_source_runs(
    "
    #include <x86intrin.h>
    int main()
    {
    __m128d a, b;
    const unsigned int vector_bytes = sizeof(__m128d);
    const int n_vectors = vector_bytes/sizeof(double);
    __m128d * data =
      reinterpret_cast<__m128d*>(_mm_malloc (2*vector_bytes, vector_bytes));
    double * ptr = reinterpret_cast<double*>(&a);
    ptr[0] = static_cast<volatile double>(1.0);
    for (int i=1; i<n_vectors; ++i)
      ptr[i] = 0.0;
    b = _mm_set1_pd (static_cast<volatile double>(2.25));
    data[0] = _mm_add_pd (a, b);
    data[1] = _mm_mul_pd (b, data[0]);
    ptr = reinterpret_cast<double*>(&data[1]);
    int return_value = 0;
    if (ptr[0] != 7.3125)
      return_value = 1;
    for (int i=1; i<n_vectors; ++i)
      if (ptr[i] != 5.0625)
        return_value = 1;
    _mm_free (data);
    return return_value;
    }
    "
    DEAL_II_HAVE_SSE2)

  #
  # AVX: the same arithmetic check with 256 bit intrinsics; the source
  # refuses to compile unless __AVX__ is defined.
  #
  # clang-3.6.0 has a bug in operator+ on two VectorizedArray components as
  # defined in deal.II. Therefore, the test for AVX needs to also test for
  # operator+ to be correctly implemented.
  #
  check_cxx_source_runs(
    "
    #ifndef __AVX__
    #error \"__AVX__ flag not set, no support for AVX\"
    #endif
    #include <x86intrin.h>
    class VectorizedArray
    {
    public:
      VectorizedArray &
      operator += (const VectorizedArray &vec)
      {
        data = _mm256_add_pd (data, vec.data);
        return *this;
      }
      __m256d data;
    };
    inline
    VectorizedArray
    operator + (const VectorizedArray &u, const VectorizedArray &v)
    {
      VectorizedArray tmp = u;
      return tmp+=v;
    }
    int main()
    {
      __m256d a, b;
      const unsigned int vector_bytes = sizeof(__m256d);
      const int n_vectors = vector_bytes/sizeof(double);
      __m256d * data =
        reinterpret_cast<__m256d*>(_mm_malloc (2*vector_bytes, vector_bytes));
      double * ptr = reinterpret_cast<double*>(&a);
      ptr[0] = static_cast<volatile double>(1.0);
      for (int i=1; i<n_vectors; ++i)
        ptr[i] = 0.0;
      b = _mm256_set1_pd (static_cast<volatile double>(2.25));
      data[0] = _mm256_add_pd (a, b);
      data[1] = _mm256_mul_pd (b, data[0]);
      ptr = reinterpret_cast<double*>(&data[1]);
      int return_value = 0;
      if (ptr[0] != 7.3125)
        return_value = 1;
      for (int i=1; i<n_vectors; ++i)
        if (ptr[i] != 5.0625)
          return_value = 1;
      VectorizedArray c, d, e;
      c.data = b;
      d.data = b;
      e = c + d;
      ptr = reinterpret_cast<double*>(&e.data);
      for (int i=0; i<n_vectors; ++i)
        if (ptr[i] != 4.5)
          return_value = 1;
      _mm_free (data);
      return return_value;
    }
    "
    DEAL_II_HAVE_AVX)

  #
  # AVX512: the same arithmetic check with 512 bit intrinsics; the source
  # refuses to compile unless __AVX512F__ is defined.
  #
  check_cxx_source_runs(
    "
    #ifndef __AVX512F__
    #error \"__AVX512F__ flag not set, no support for AVX512\"
    #endif
    #include <x86intrin.h>
    int main()
    {
      __m512d a, b;
      const unsigned int vector_bytes = sizeof(__m512d);
      const int n_vectors = vector_bytes/sizeof(double);
      __m512d * data =
        reinterpret_cast<__m512d*>(_mm_malloc (2*vector_bytes, vector_bytes));
      double * ptr = reinterpret_cast<double*>(&a);
      ptr[0] = static_cast<volatile double>(1.0);
      for (int i=1; i<n_vectors; ++i)
        ptr[i] = 0.0;
      const volatile double x = 2.25;
      b = _mm512_set1_pd(x);
      data[0] = _mm512_add_pd (a, b);
      data[1] = _mm512_mul_pd (b, data[0]);
      ptr = reinterpret_cast<double*>(&data[1]);
      int return_value = 0;
      if (ptr[0] != 7.3125)
        return_value = 1;
      for (int i=1; i<n_vectors; ++i)
        if (ptr[i] != 5.0625)
          return_value = 1;
      _mm_free (data);
      return return_value;
    }
    "
    DEAL_II_HAVE_AVX512)

  #
  # Altivec: the same arithmetic check with __vector double, followed by a
  # vec_abs/vec_mul and a vec_vsx_st/vec_vsx_ld round trip. Each failing
  # comparison adds a distinct power of two to the return value.
  #
  check_cxx_source_runs(
    "
    #ifndef __ALTIVEC__
    #error \"__ALTIVEC__ flag not set, no support for Altivec\"
    #endif
    #include <altivec.h>
    #undef vector
    #undef pixel
    #undef bool
    int main()
    {
    __vector double a, b, data1, data2;
    const int n_vectors = sizeof(a)/sizeof(double);
    double * ptr = reinterpret_cast<double*>(&a);
    ptr[0] = static_cast<volatile double>(1.0);
    for (int i=1; i<n_vectors; ++i)
      ptr[i] = 0.0;
    b = vec_splats (static_cast<volatile double>(2.25));
    data1 = vec_add (a, b);
    data2 = vec_mul (b, data1);
    ptr = reinterpret_cast<double*>(&data2);
    int return_value = 0;
    if (ptr[0] != 7.3125)
      return_value += 1;
    for (int i=1; i<n_vectors; ++i)
      if (ptr[i] != 5.0625)
        return_value += 2;
    b = vec_splats (static_cast<volatile double>(-1.0));
    data1 = vec_abs(vec_mul (b, data2));
    vec_vsx_st(data1, 0, ptr);
    b = vec_vsx_ld(0, ptr);
    ptr = reinterpret_cast<double*>(&b);
    if (ptr[0] != 7.3125)
      return_value += 4;
    for (int i=1; i<n_vectors; ++i)
      if (ptr[i] != 5.0625)
        return_value += 8;
    return return_value;
    }
    "
    DEAL_II_HAVE_ALTIVEC)

  #
  # OpenMP 4.0 can be used for vectorization. Only the vectorization
  # instructions are allowed, the threading must be done through TBB.
  #

  #
  # Choosing the right compiler flag is a bit of a mess. Reset _keyword
  # first: no keyword is selected for Intel compilers up to version 14, and
  # without the reset we would either reuse a stale value or probe the
  # malformed flag "--simd".
  #
  set(_keyword "")
  if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "15" )
      set(_keyword "qopenmp")
    elseif("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "14" )
      set(_keyword "openmp")
    endif()
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    set(_keyword "openmp")
  else()
    set(_keyword "fopenmp")
  endif()

  # Only probe the compiler if a flag keyword is known for it.
  if(_keyword)
    check_cxx_compiler_flag("-${_keyword}-simd" DEAL_II_HAVE_OPENMP_SIMD)
  endif()

endif() # if DEAL_II_ALLOW_PLATFORM_INTROSPECTION
272
273
274#
# Choose DEAL_II_VECTORIZATION_WIDTH_IN_BITS depending on the SSE2, AVX,
# AVX512 and Altivec support (that was autodetected or manually specified).
277#
278
#
# Start without vectorization and let each available x86 instruction set
# override the previous choice in ascending order of register width, so
# that the widest one wins.
#
set(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 0)
if(DEAL_II_HAVE_SSE2)
  set(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 128)
endif()
if(DEAL_II_HAVE_AVX)
  set(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 256)
endif()
if(DEAL_II_HAVE_AVX512)
  set(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 512)
endif()

# Altivec takes precedence over whatever was selected above.
if(DEAL_II_HAVE_ALTIVEC)
  set(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 128)
endif()

#
# We need to disable SIMD vectorization for CUDA device code.
# Otherwise, nvcc compilers from version 9 on will emit an error message like:
# "[...] contains a vector, which is not supported in device code"
#
if(DEAL_II_WITH_CUDA)
  set(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 0)
endif()
302
303#
304# If we have OpenMP SIMD support (i.e. DEAL_II_HAVE_OPENMP_SIMD is true)
305# populate DEAL_II_OPENMP_SIMD_PRAGMA.
306#
307
# Default: an (almost) empty pragma, i.e. no OpenMP SIMD annotation.
set(DEAL_II_OPENMP_SIMD_PRAGMA " ")
if(DEAL_II_HAVE_OPENMP_SIMD)
  #
  # Determine the flag keyword locally instead of relying on the _keyword
  # variable that is only set inside the DEAL_II_ALLOW_PLATFORM_INTROSPECTION
  # block: DEAL_II_HAVE_OPENMP_SIMD may be set manually with introspection
  # disabled, in which case _keyword is unset and the flag would expand to
  # the malformed "--simd".
  #
  set(_openmp_simd_keyword "")
  if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "15")
      set(_openmp_simd_keyword "qopenmp")
    elseif("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "14")
      set(_openmp_simd_keyword "openmp")
    endif()
  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    set(_openmp_simd_keyword "openmp")
  else()
    set(_openmp_simd_keyword "fopenmp")
  endif()

  if(_openmp_simd_keyword)
    add_flags(DEAL_II_CXX_FLAGS "-${_openmp_simd_keyword}-simd")
    # Intel is special: the keyword is additionally passed to the linker.
    if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
      add_flags(DEAL_II_LINKER_FLAGS "-${_openmp_simd_keyword}")
    endif()
    set(DEAL_II_OPENMP_SIMD_PRAGMA "_Pragma(\"omp simd\")")
  else()
    # No flag is known for this compiler (Intel up to version 14): leave the
    # pragma empty rather than emitting an invalid compiler flag.
    message(WARNING
      "DEAL_II_HAVE_OPENMP_SIMD is set, but no OpenMP SIMD compiler flag is "
      "known for this compiler. OpenMP SIMD support is not enabled."
      )
  endif()
endif()
317