1 ///////////////////////////////////////////////////////////////////////
2 // File:        simddetect.cpp
3 // Description: Architecture detector.
4 // Author:      Stefan Weil (based on code from Ray Smith)
5 //
6 // (C) Copyright 2014, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 ///////////////////////////////////////////////////////////////////////
17 
18 #ifdef HAVE_CONFIG_H
19 #  include "config_auto.h" // for HAVE_AVX, ...
20 #endif
21 #include <numeric> // for std::inner_product
22 #include "dotproduct.h"
23 #include "intsimdmatrix.h" // for IntSimdMatrix
24 #include "params.h"        // for STRING_VAR
25 #include "simddetect.h"
26 #include "tprintf.h" // for tprintf
27 
28 #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29 // The GNU compiler g++ fails to compile with the Accelerate framework
30 // (tested with versions 10 and 11), so unconditionally disable it.
31 #undef HAVE_FRAMEWORK_ACCELERATE
32 #endif
33 
34 #if defined(HAVE_FRAMEWORK_ACCELERATE)
35 
36 // Use Apple Accelerate framework.
37 // https://developer.apple.com/documentation/accelerate/simd
38 
39 #include <Accelerate/Accelerate.h>
40 
41 #endif
42 
43 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44 #  define HAS_CPUID
45 #endif
46 
47 #if defined(HAS_CPUID)
48 #  if defined(__GNUC__)
49 #    include <cpuid.h>
50 #  elif defined(_WIN32)
51 #    include <intrin.h>
52 #  endif
53 #endif
54 
55 #if defined(HAVE_NEON) && !defined(__aarch64__)
56 #  ifdef ANDROID
57 #    include <cpu-features.h>
58 #  else
59 /* Assume linux */
60 #    include <asm/hwcap.h>
61 #    include <sys/auxv.h>
62 #  endif
63 #endif
64 
65 namespace tesseract {
66 
// Currently selected dot product implementation. It is assigned by
// SetDotProduct(), which is called from the SIMDDetect constructor and from
// SIMDDetect::Update().
// Computes and returns the dot product of the two n-vectors u and v.
// Note: because the order of addition is different among the different dot
// product functions, the results can (and do) vary slightly (although they
// agree to within about 4e-15). This produces different results when running
// training, despite all random inputs being precisely equal.
// To get consistent results, use just one of these dot product functions.
// On a test multi-layer network, serial is 57% slower than SSE, and AVX
// is about 8% faster than SSE. This suggests that the time is memory
// bandwidth constrained and could benefit from holding the reused vector
// in AVX registers.
DotProductFunction DotProduct;

// Config variable naming the dot product implementation to use. The default
// "auto" keeps the implementation chosen by autodetection; other values are
// evaluated in SIMDDetect::Update().
static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");

// The detector instance. Its constructor runs the CPU feature detection and
// installs the initial dot product implementation.
SIMDDetect SIMDDetect::detector;
82 
// Definitions of the static feature flags. Flags without an initializer have
// static storage duration and therefore start out as false; they are set by
// the SIMDDetect constructor.
#if defined(__aarch64__)
// ARMv8 always has NEON.
bool SIMDDetect::neon_available_ = true;
#elif defined(HAVE_NEON)
// If true, then Neon has been detected.
bool SIMDDetect::neon_available_;
#else
// If true, then AVX has been detected.
bool SIMDDetect::avx_available_;
// If true, then AVX2 has been detected.
bool SIMDDetect::avx2_available_;
// If true, then AVX-512 Foundation (AVX512F) has been detected.
bool SIMDDetect::avx512F_available_;
// If true, then AVX-512 Byte and Word (AVX512BW) has been detected.
bool SIMDDetect::avx512BW_available_;
// If true, then FMA has been detected.
bool SIMDDetect::fma_available_;
// If true, then SSE4.1 has been detected.
bool SIMDDetect::sse_available_;
#endif
100 
101 #if defined(HAVE_FRAMEWORK_ACCELERATE)
// Computes and returns the dot product of the two n-vectors u and v using
// the vDSP routines of the Apple Accelerate framework. Both vectors are read
// with a stride of one element. vDSP_dotpr is the single precision variant
// (used when FAST_FLOAT is defined), vDSP_dotprD the double precision one.
static TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n) {
  const int unit_stride = 1;
  TFloat result = 0;
#if defined(FAST_FLOAT)
  vDSP_dotpr(u, unit_stride, v, unit_stride, &result, n);
#else
  vDSP_dotprD(u, unit_stride, v, unit_stride, &result, n);
#endif
  return result;
}
112 #endif
113 
114 // Computes and returns the dot product of the two n-vectors u and v.
DotProductGeneric(const TFloat * u,const TFloat * v,int n)115 static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
116   TFloat total = 0;
117   for (int k = 0; k < n; ++k) {
118     total += u[k] * v[k];
119   }
120   return total;
121 }
122 
123 // Compute dot product using std::inner_product.
DotProductStdInnerProduct(const TFloat * u,const TFloat * v,int n)124 static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
125   return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
126 }
127 
SetDotProduct(DotProductFunction f,const IntSimdMatrix * m=nullptr)128 static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
129   DotProduct = f;
130   IntSimdMatrix::intSimdMatrix = m;
131 }
132 
133 // Constructor.
134 // Tests the architecture in a system-dependent way to detect AVX, SSE and
135 // any other available SIMD equipment.
136 // __GNUC__ is also defined by compilers that include GNU extensions such as
137 // clang.
SIMDDetect()138 SIMDDetect::SIMDDetect() {
139   // The fallback is a generic dot product calculation.
140   SetDotProduct(DotProductGeneric);
141 
142 #if defined(HAS_CPUID)
143 #  if defined(__GNUC__)
144   unsigned int eax, ebx, ecx, edx;
145   if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
146     // Note that these tests all use hex because the older compilers don't have
147     // the newer flags.
148 #    if defined(HAVE_SSE4_1)
149     sse_available_ = (ecx & 0x00080000) != 0;
150 #    endif
151 #    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
152     auto xgetbv = []() {
153       uint32_t xcr0;
154       __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
155       return xcr0;
156     };
157     if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
158       // OSXSAVE bit is set, XMM state and YMM state are fine.
159 #      if defined(HAVE_FMA)
160       fma_available_ = (ecx & 0x00001000) != 0;
161 #      endif
162 #      if defined(HAVE_AVX)
163       avx_available_ = (ecx & 0x10000000) != 0;
164       if (avx_available_) {
165         // There is supposed to be a __get_cpuid_count function, but this is all
166         // there is in my cpuid.h. It is a macro for an asm statement and cannot
167         // be used inside an if.
168         __cpuid_count(7, 0, eax, ebx, ecx, edx);
169         avx2_available_ = (ebx & 0x00000020) != 0;
170         avx512F_available_ = (ebx & 0x00010000) != 0;
171         avx512BW_available_ = (ebx & 0x40000000) != 0;
172       }
173 #      endif
174     }
175 #    endif
176   }
177 #  elif defined(_WIN32)
178   int cpuInfo[4];
179   int max_function_id;
180   __cpuid(cpuInfo, 0);
181   max_function_id = cpuInfo[0];
182   if (max_function_id >= 1) {
183     __cpuid(cpuInfo, 1);
184 #    if defined(HAVE_SSE4_1)
185     sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
186 #    endif
187 #    if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
188     if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
189       // OSXSAVE bit is set, XMM state and YMM state are fine.
190 #      if defined(HAVE_FMA)
191       fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
192 #      endif
193 #      if defined(HAVE_AVX)
194       avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
195 #      endif
196 #      if defined(HAVE_AVX2)
197       if (max_function_id >= 7) {
198         __cpuid(cpuInfo, 7);
199         avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
200         avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
201         avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
202       }
203 #      endif
204     }
205 #    endif
206   }
207 #  else
208 #    error "I don't know how to test for SIMD with this compiler"
209 #  endif
210 #endif
211 
212 #if defined(HAVE_NEON) && !defined(__aarch64__)
213 #  ifdef ANDROID
214   {
215     AndroidCpuFamily family = android_getCpuFamily();
216     if (family == ANDROID_CPU_FAMILY_ARM)
217       neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
218   }
219 #  else
220   /* Assume linux */
221   neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
222 #  endif
223 #endif
224 
225   // Select code for calculation of dot product based on autodetection.
226   if (false) {
227     // This is a dummy to support conditional compilation.
228 #if defined(HAVE_AVX2)
229   } else if (avx2_available_) {
230     // AVX2 detected.
231     SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
232 #endif
233 #if defined(HAVE_AVX)
234   } else if (avx_available_) {
235     // AVX detected.
236     SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
237 #endif
238 #if defined(HAVE_SSE4_1)
239   } else if (sse_available_) {
240     // SSE detected.
241     SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
242 #endif
243 #if defined(HAVE_NEON) || defined(__aarch64__)
244   } else if (neon_available_) {
245     // NEON detected.
246     SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
247 #endif
248   }
249 
250   const char *dotproduct_env = getenv("DOTPRODUCT");
251   if (dotproduct_env != nullptr) {
252     // Override automatic settings by value from environment variable.
253     dotproduct = dotproduct_env;
254     Update();
255   }
256 }
257 
Update()258 void SIMDDetect::Update() {
259   // Select code for calculation of dot product based on the
260   // value of the config variable if that value is not empty.
261   const char *dotproduct_method = "generic";
262   if (dotproduct == "auto") {
263     // Automatic detection. Nothing to be done.
264   } else if (dotproduct == "generic") {
265     // Generic code selected by config variable.
266     SetDotProduct(DotProductGeneric);
267     dotproduct_method = "generic";
268   } else if (dotproduct == "native") {
269     // Native optimized code selected by config variable.
270     SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);
271     dotproduct_method = "native";
272 #if defined(HAVE_AVX2)
273   } else if (dotproduct == "avx2") {
274     // AVX2 selected by config variable.
275     SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
276     dotproduct_method = "avx2";
277 #endif
278 #if defined(HAVE_AVX)
279   } else if (dotproduct == "avx") {
280     // AVX selected by config variable.
281     SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
282     dotproduct_method = "avx";
283 #endif
284 #if defined(HAVE_FMA)
285   } else if (dotproduct == "fma") {
286     // FMA selected by config variable.
287     SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
288     dotproduct_method = "fma";
289 #endif
290 #if defined(HAVE_SSE4_1)
291   } else if (dotproduct == "sse") {
292     // SSE selected by config variable.
293     SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
294     dotproduct_method = "sse";
295 #endif
296 #if defined(HAVE_FRAMEWORK_ACCELERATE)
297   } else if (dotproduct == "accelerate") {
298     SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
299 #endif
300 #if defined(HAVE_NEON) || defined(__aarch64__)
301   } else if (dotproduct == "neon" && neon_available_) {
302     // NEON selected by config variable.
303     SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
304     dotproduct_method = "neon";
305 #endif
306   } else if (dotproduct == "std::inner_product") {
307     // std::inner_product selected by config variable.
308     SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
309     dotproduct_method = "std::inner_product";
310   } else {
311     // Unsupported value of config variable.
312     tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
313             dotproduct.c_str());
314     tprintf(
315         "Supported values for dotproduct: auto generic native"
316 #if defined(HAVE_AVX2)
317         " avx2"
318 #endif
319 #if defined(HAVE_AVX)
320         " avx"
321 #endif
322 #if defined(HAVE_FMA)
323         " fma"
324 #endif
325 #if defined(HAVE_SSE4_1)
326         " sse"
327 #endif
328 #if defined(HAVE_FRAMEWORK_ACCELERATE)
329         " accelerate"
330 #endif
331         " std::inner_product.\n");
332   }
333 
334   dotproduct.set_value(dotproduct_method);
335 }
336 
337 } // namespace tesseract
338