## ---------------------------------------------------------------------
##
## Copyright (C) 2012 - 2020 by the deal.II authors
##
## This file is part of the deal.II library.
##
## The deal.II library is free software; you can use it, redistribute
## it, and/or modify it under the terms of the GNU Lesser General
## Public License as published by the Free Software Foundation; either
## version 2.1 of the License, or (at your option) any later version.
## The full text of the license can be found in the file LICENSE.md at
## the top level directory of deal.II.
##
## ---------------------------------------------------------------------


########################################################################
#                                                                      #
#                   Platform and CPU specific tests:                   #
#                                                                      #
########################################################################

#
# This file sets up
#
#   DEAL_II_WORDS_BIGENDIAN
#   DEAL_II_HAVE_SSE2                    *)
#   DEAL_II_HAVE_AVX                     *)
#   DEAL_II_HAVE_AVX512                  *)
#   DEAL_II_HAVE_ALTIVEC                 *)
#   DEAL_II_HAVE_OPENMP_SIMD             *)
#   DEAL_II_VECTORIZATION_WIDTH_IN_BITS
#   DEAL_II_OPENMP_SIMD_PRAGMA
#
# *)
# It is possible to manually set the above values to their corresponding
# values, when platform introspection is disabled with
# DEAL_II_ALLOW_PLATFORM_INTROSPECTION=OFF.
#


#
# Determine the Endianness of the platform:
#
# Without a working C compiler the TestBigEndian module cannot be used;
# in that case we fall back to assuming a little endian platform.
#
IF(CMAKE_C_COMPILER_WORKS)
  INCLUDE(TestBigEndian)

  CLEAR_CMAKE_REQUIRED()
  TEST_BIG_ENDIAN(DEAL_II_WORDS_BIGENDIAN)
  RESET_CMAKE_REQUIRED()
ELSE()
  MESSAGE(STATUS
    "No suitable C compiler was found! Assuming little endian platform."
    )
  SET(DEAL_II_WORDS_BIGENDIAN "0")
ENDIF()


#
# Check whether the compiler allows for vectorization and that
# vectorization actually works on the given CPU. For this test, we use
# compiler intrinsics similar to what is used in the deal.II library and
# check whether the arithmetic operations are correctly performed on
# examples where all numbers are exactly represented as floating point
# numbers.
#
# - Matthias Maier, rewritten 2012
#

IF(DEAL_II_ALLOW_PLATFORM_INTROSPECTION)
  #
  # Take care that the following tests are rerun if the
  # CMAKE_REQUIRED_FLAGS changes.
  #
  UNSET_IF_CHANGED(CHECK_CPU_FEATURES_FLAGS_SAVED "${CMAKE_REQUIRED_FLAGS}"
    DEAL_II_HAVE_SSE2 DEAL_II_HAVE_AVX DEAL_II_HAVE_AVX512 DEAL_II_HAVE_ALTIVEC
    )

  #
  # SSE2: add and multiply two 128 bit vectors of doubles and compare
  # against the exactly representable expected results.
  #
  CHECK_CXX_SOURCE_RUNS(
    "
    #include <x86intrin.h>
    int main()
    {
    __m128d a, b;
    const unsigned int vector_bytes = sizeof(__m128d);
    const int n_vectors = vector_bytes/sizeof(double);
    __m128d * data =
      reinterpret_cast<__m128d*>(_mm_malloc (2*vector_bytes, vector_bytes));
    double * ptr = reinterpret_cast<double*>(&a);
    ptr[0] = static_cast<volatile double>(1.0);
    for (int i=1; i<n_vectors; ++i)
      ptr[i] = 0.0;
    b = _mm_set1_pd (static_cast<volatile double>(2.25));
    data[0] = _mm_add_pd (a, b);
    data[1] = _mm_mul_pd (b, data[0]);
    ptr = reinterpret_cast<double*>(&data[1]);
    int return_value = 0;
    if (ptr[0] != 7.3125)
      return_value = 1;
    for (int i=1; i<n_vectors; ++i)
      if (ptr[i] != 5.0625)
        return_value = 1;
    _mm_free (data);
    return return_value;
    }
    "
    DEAL_II_HAVE_SSE2)

  #
  # clang-3.6.0 has a bug in operator+ on two VectorizedArray components as
  # defined in deal.II. Therefore, the test for AVX needs to also test for
  # operator+ to be correctly implemented.
  #
  CHECK_CXX_SOURCE_RUNS(
    "
    #ifndef __AVX__
    #error \"__AVX__ flag not set, no support for AVX\"
    #endif
    #include <x86intrin.h>
    class VectorizedArray
    {
    public:
      VectorizedArray &
      operator += (const VectorizedArray &vec)
      {
        data = _mm256_add_pd (data, vec.data);
        return *this;
      }
      __m256d data;
    };
    inline
    VectorizedArray
    operator + (const VectorizedArray &u, const VectorizedArray &v)
    {
      VectorizedArray tmp = u;
      return tmp+=v;
    }
    int main()
    {
      __m256d a, b;
      const unsigned int vector_bytes = sizeof(__m256d);
      const int n_vectors = vector_bytes/sizeof(double);
      __m256d * data =
        reinterpret_cast<__m256d*>(_mm_malloc (2*vector_bytes, vector_bytes));
      double * ptr = reinterpret_cast<double*>(&a);
      ptr[0] = static_cast<volatile double>(1.0);
      for (int i=1; i<n_vectors; ++i)
        ptr[i] = 0.0;
      b = _mm256_set1_pd (static_cast<volatile double>(2.25));
      data[0] = _mm256_add_pd (a, b);
      data[1] = _mm256_mul_pd (b, data[0]);
      ptr = reinterpret_cast<double*>(&data[1]);
      int return_value = 0;
      if (ptr[0] != 7.3125)
        return_value = 1;
      for (int i=1; i<n_vectors; ++i)
        if (ptr[i] != 5.0625)
          return_value = 1;
      VectorizedArray c, d, e;
      c.data = b;
      d.data = b;
      e = c + d;
      ptr = reinterpret_cast<double*>(&e.data);
      for (int i=0; i<n_vectors; ++i)
        if (ptr[i] != 4.5)
          return_value = 1;
      _mm_free (data);
      return return_value;
    }
    "
    DEAL_II_HAVE_AVX)

  #
  # AVX512: same arithmetic check as above on 512 bit vectors. The test
  # refuses to compile unless the compiler defines __AVX512F__.
  #
  CHECK_CXX_SOURCE_RUNS(
    "
    #ifndef __AVX512F__
    #error \"__AVX512F__ flag not set, no support for AVX512\"
    #endif
    #include <x86intrin.h>
    int main()
    {
      __m512d a, b;
      const unsigned int vector_bytes = sizeof(__m512d);
      const int n_vectors = vector_bytes/sizeof(double);
      __m512d * data =
        reinterpret_cast<__m512d*>(_mm_malloc (2*vector_bytes, vector_bytes));
      double * ptr = reinterpret_cast<double*>(&a);
      ptr[0] = static_cast<volatile double>(1.0);
      for (int i=1; i<n_vectors; ++i)
        ptr[i] = 0.0;
      const volatile double x = 2.25;
      b = _mm512_set1_pd(x);
      data[0] = _mm512_add_pd (a, b);
      data[1] = _mm512_mul_pd (b, data[0]);
      ptr = reinterpret_cast<double*>(&data[1]);
      int return_value = 0;
      if (ptr[0] != 7.3125)
        return_value = 1;
      for (int i=1; i<n_vectors; ++i)
        if (ptr[i] != 5.0625)
          return_value = 1;
      _mm_free (data);
      return return_value;
    }
    "
    DEAL_II_HAVE_AVX512)

  #
  # Altivec: arithmetic check plus a vec_vsx_st / vec_vsx_ld round trip.
  # The exit code encodes which of the four comparisons failed (bits 1, 2,
  # 4 and 8).
  #
  CHECK_CXX_SOURCE_RUNS(
    "
    #ifndef __ALTIVEC__
    #error \"__ALTIVEC__ flag not set, no support for Altivec\"
    #endif
    #include <altivec.h>
    #undef vector
    #undef pixel
    #undef bool
    int main()
    {
      __vector double a, b, data1, data2;
      const int n_vectors = sizeof(a)/sizeof(double);
      double * ptr = reinterpret_cast<double*>(&a);
      ptr[0] = static_cast<volatile double>(1.0);
      for (int i=1; i<n_vectors; ++i)
        ptr[i] = 0.0;
      b = vec_splats (static_cast<volatile double>(2.25));
      data1 = vec_add (a, b);
      data2 = vec_mul (b, data1);
      ptr = reinterpret_cast<double*>(&data2);
      int return_value = 0;
      if (ptr[0] != 7.3125)
        return_value += 1;
      for (int i=1; i<n_vectors; ++i)
        if (ptr[i] != 5.0625)
          return_value += 2;
      b = vec_splats (static_cast<volatile double>(-1.0));
      data1 = vec_abs(vec_mul (b, data2));
      vec_vsx_st(data1, 0, ptr);
      b = vec_vsx_ld(0, ptr);
      ptr = reinterpret_cast<double*>(&b);
      if (ptr[0] != 7.3125)
        return_value += 4;
      for (int i=1; i<n_vectors; ++i)
        if (ptr[i] != 5.0625)
          return_value += 8;
      return return_value;
    }
    "
    DEAL_II_HAVE_ALTIVEC)

ENDIF() # IF DEAL_II_ALLOW_PLATFORM_INTROSPECTION


#
# OpenMP 4.0 can be used for vectorization. Only the vectorization
# instructions are allowed, the threading must be done through TBB.
#

#
# Choosing the right compiler flag is a bit of a mess:
#
# The keyword is determined unconditionally (i.e., also when platform
# introspection is disabled) because it is needed further down to
# assemble the compiler flag for a manually specified
# DEAL_II_HAVE_OPENMP_SIMD. It is left empty if no suitable flag is
# known for the compiler (Intel compilers up to and including version 14).
#
IF(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
  IF("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "15" )
    SET(_keyword "qopenmp")
  ELSEIF("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "14" )
    SET(_keyword "openmp")
  ELSE()
    # Explicitly reset the variable so that we neither pick up a stale
    # value nor end up testing the malformed flag "--simd".
    SET(_keyword "")
  ENDIF()
ELSEIF(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  SET(_keyword "openmp")
ELSE()
  SET(_keyword "fopenmp")
ENDIF()

IF(DEAL_II_ALLOW_PLATFORM_INTROSPECTION AND _keyword)
  CHECK_CXX_COMPILER_FLAG("-${_keyword}-simd" DEAL_II_HAVE_OPENMP_SIMD)
ENDIF()


#
# Choose DEAL_II_COMPILER_VECTORIZATION level depending on AVX support
# (that was autodetected or manually specified).
#

IF(DEAL_II_HAVE_AVX512)
  SET(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 512)
ELSEIF(DEAL_II_HAVE_AVX)
  SET(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 256)
ELSEIF(DEAL_II_HAVE_SSE2)
  SET(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 128)
ELSE()
  SET(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 0)
ENDIF()

IF(DEAL_II_HAVE_ALTIVEC)
  SET(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 128)
ENDIF()

#
# We need to disable SIMD vectorization for CUDA device code.
# Otherwise, nvcc compilers from version 9 on will emit an error message like:
# "[...] contains a vector, which is not supported in device code"
#

IF(DEAL_II_WITH_CUDA)
  SET(DEAL_II_VECTORIZATION_WIDTH_IN_BITS 0)
ENDIF()

#
# If we have OpenMP SIMD support (i.e. DEAL_II_HAVE_OPENMP_SIMD is true)
# populate DEAL_II_OPENMP_SIMD_PRAGMA.
#

SET(DEAL_II_OPENMP_SIMD_PRAGMA " ")
IF(DEAL_II_HAVE_OPENMP_SIMD)
  IF(_keyword)
    ADD_FLAGS(DEAL_II_CXX_FLAGS "-${_keyword}-simd")
    # Intel is special:
    IF(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
      ADD_FLAGS(DEAL_II_LINKER_FLAGS "-${_keyword}")
    ENDIF()
    SET(DEAL_II_OPENMP_SIMD_PRAGMA "_Pragma(\"omp simd\")")
  ELSE()
    # DEAL_II_HAVE_OPENMP_SIMD was requested but we do not know which
    # compiler flag enables it; leave the pragma empty instead of adding
    # a malformed flag.
    MESSAGE(STATUS
      "DEAL_II_HAVE_OPENMP_SIMD is set, but no OpenMP SIMD compiler flag is known for this compiler. Disabling the OpenMP SIMD pragma."
      )
  ENDIF()
ENDIF()