1 ///////////////////////////////////////////////////////////////////////
2 // File: simddetect.cpp
3 // Description: Architecture detector.
4 // Author: Stefan Weil (based on code from Ray Smith)
5 //
6 // (C) Copyright 2014, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 ///////////////////////////////////////////////////////////////////////
17
18 #ifdef HAVE_CONFIG_H
19 # include "config_auto.h" // for HAVE_AVX, ...
20 #endif
21 #include <numeric> // for std::inner_product
22 #include "dotproduct.h"
23 #include "intsimdmatrix.h" // for IntSimdMatrix
24 #include "params.h" // for STRING_VAR
25 #include "simddetect.h"
26 #include "tprintf.h" // for tprintf
27
28 #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29 // The GNU compiler g++ fails to compile with the Accelerate framework
30 // (tested with versions 10 and 11), so unconditionally disable it.
31 #undef HAVE_FRAMEWORK_ACCELERATE
32 #endif
33
34 #if defined(HAVE_FRAMEWORK_ACCELERATE)
35
36 // Use Apple Accelerate framework.
37 // https://developer.apple.com/documentation/accelerate/simd
38
39 #include <Accelerate/Accelerate.h>
40
41 #endif
42
43 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44 # define HAS_CPUID
45 #endif
46
47 #if defined(HAS_CPUID)
48 # if defined(__GNUC__)
49 # include <cpuid.h>
50 # elif defined(_WIN32)
51 # include <intrin.h>
52 # endif
53 #endif
54
55 #if defined(HAVE_NEON) && !defined(__aarch64__)
56 # ifdef ANDROID
57 # include <cpu-features.h>
58 # else
59 /* Assume linux */
60 # include <asm/hwcap.h>
61 # include <sys/auxv.h>
62 # endif
63 #endif
64
65 namespace tesseract {
66
67 // Computes and returns the dot product of the two n-vectors u and v.
68 // Note: because the order of addition is different among the different dot
69 // product functions, the results can (and do) vary slightly (although they
70 // agree to within about 4e-15). This produces different results when running
71 // training, despite all random inputs being precisely equal.
72 // To get consistent results, use just one of these dot product functions.
73 // On a test multi-layer network, serial is 57% slower than SSE, and AVX
74 // is about 8% faster than SSE. This suggests that the time is memory
75 // bandwidth constrained and could benefit from holding the reused vector
76 // in AVX registers.
77 DotProductFunction DotProduct;
78
79 static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
80
81 SIMDDetect SIMDDetect::detector;
82
83 #if defined(__aarch64__)
84 // ARMv8 always has NEON.
85 bool SIMDDetect::neon_available_ = true;
86 #elif defined(HAVE_NEON)
87 // If true, then Neon has been detected.
88 bool SIMDDetect::neon_available_;
89 #else
90 // If true, then AVX has been detected.
91 bool SIMDDetect::avx_available_;
92 bool SIMDDetect::avx2_available_;
93 bool SIMDDetect::avx512F_available_;
94 bool SIMDDetect::avx512BW_available_;
95 // If true, then FMA has been detected.
96 bool SIMDDetect::fma_available_;
97 // If true, then SSe4.1 has been detected.
98 bool SIMDDetect::sse_available_;
99 #endif
100
101 #if defined(HAVE_FRAMEWORK_ACCELERATE)
// Computes and returns the dot product of the two n-vectors u and v using
// the vDSP routines of the Apple Accelerate framework: vDSP_dotpr when
// FAST_FLOAT is defined, vDSP_dotprD otherwise.
static TFloat DotProductAccelerate(const TFloat *u, const TFloat *v, int n) {
  // Both vectors are read with a stride of one element.
  const int unit_stride = 1;
  TFloat result = 0;
#  if defined(FAST_FLOAT)
  vDSP_dotpr(u, unit_stride, v, unit_stride, &result, n);
#  else
  vDSP_dotprD(u, unit_stride, v, unit_stride, &result, n);
#  endif
  return result;
}
112 #endif
113
114 // Computes and returns the dot product of the two n-vectors u and v.
DotProductGeneric(const TFloat * u,const TFloat * v,int n)115 static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
116 TFloat total = 0;
117 for (int k = 0; k < n; ++k) {
118 total += u[k] * v[k];
119 }
120 return total;
121 }
122
123 // Compute dot product using std::inner_product.
DotProductStdInnerProduct(const TFloat * u,const TFloat * v,int n)124 static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
125 return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
126 }
127
SetDotProduct(DotProductFunction f,const IntSimdMatrix * m=nullptr)128 static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
129 DotProduct = f;
130 IntSimdMatrix::intSimdMatrix = m;
131 }
132
133 // Constructor.
134 // Tests the architecture in a system-dependent way to detect AVX, SSE and
135 // any other available SIMD equipment.
136 // __GNUC__ is also defined by compilers that include GNU extensions such as
137 // clang.
SIMDDetect()138 SIMDDetect::SIMDDetect() {
139 // The fallback is a generic dot product calculation.
140 SetDotProduct(DotProductGeneric);
141
142 #if defined(HAS_CPUID)
143 # if defined(__GNUC__)
144 unsigned int eax, ebx, ecx, edx;
145 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
146 // Note that these tests all use hex because the older compilers don't have
147 // the newer flags.
148 # if defined(HAVE_SSE4_1)
149 sse_available_ = (ecx & 0x00080000) != 0;
150 # endif
151 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
152 auto xgetbv = []() {
153 uint32_t xcr0;
154 __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
155 return xcr0;
156 };
157 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
158 // OSXSAVE bit is set, XMM state and YMM state are fine.
159 # if defined(HAVE_FMA)
160 fma_available_ = (ecx & 0x00001000) != 0;
161 # endif
162 # if defined(HAVE_AVX)
163 avx_available_ = (ecx & 0x10000000) != 0;
164 if (avx_available_) {
165 // There is supposed to be a __get_cpuid_count function, but this is all
166 // there is in my cpuid.h. It is a macro for an asm statement and cannot
167 // be used inside an if.
168 __cpuid_count(7, 0, eax, ebx, ecx, edx);
169 avx2_available_ = (ebx & 0x00000020) != 0;
170 avx512F_available_ = (ebx & 0x00010000) != 0;
171 avx512BW_available_ = (ebx & 0x40000000) != 0;
172 }
173 # endif
174 }
175 # endif
176 }
177 # elif defined(_WIN32)
178 int cpuInfo[4];
179 int max_function_id;
180 __cpuid(cpuInfo, 0);
181 max_function_id = cpuInfo[0];
182 if (max_function_id >= 1) {
183 __cpuid(cpuInfo, 1);
184 # if defined(HAVE_SSE4_1)
185 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
186 # endif
187 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
188 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
189 // OSXSAVE bit is set, XMM state and YMM state are fine.
190 # if defined(HAVE_FMA)
191 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
192 # endif
193 # if defined(HAVE_AVX)
194 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
195 # endif
196 # if defined(HAVE_AVX2)
197 if (max_function_id >= 7) {
198 __cpuid(cpuInfo, 7);
199 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
200 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
201 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
202 }
203 # endif
204 }
205 # endif
206 }
207 # else
208 # error "I don't know how to test for SIMD with this compiler"
209 # endif
210 #endif
211
212 #if defined(HAVE_NEON) && !defined(__aarch64__)
213 # ifdef ANDROID
214 {
215 AndroidCpuFamily family = android_getCpuFamily();
216 if (family == ANDROID_CPU_FAMILY_ARM)
217 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
218 }
219 # else
220 /* Assume linux */
221 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
222 # endif
223 #endif
224
225 // Select code for calculation of dot product based on autodetection.
226 if (false) {
227 // This is a dummy to support conditional compilation.
228 #if defined(HAVE_AVX2)
229 } else if (avx2_available_) {
230 // AVX2 detected.
231 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
232 #endif
233 #if defined(HAVE_AVX)
234 } else if (avx_available_) {
235 // AVX detected.
236 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
237 #endif
238 #if defined(HAVE_SSE4_1)
239 } else if (sse_available_) {
240 // SSE detected.
241 SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
242 #endif
243 #if defined(HAVE_NEON) || defined(__aarch64__)
244 } else if (neon_available_) {
245 // NEON detected.
246 SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
247 #endif
248 }
249
250 const char *dotproduct_env = getenv("DOTPRODUCT");
251 if (dotproduct_env != nullptr) {
252 // Override automatic settings by value from environment variable.
253 dotproduct = dotproduct_env;
254 Update();
255 }
256 }
257
Update()258 void SIMDDetect::Update() {
259 // Select code for calculation of dot product based on the
260 // value of the config variable if that value is not empty.
261 const char *dotproduct_method = "generic";
262 if (dotproduct == "auto") {
263 // Automatic detection. Nothing to be done.
264 } else if (dotproduct == "generic") {
265 // Generic code selected by config variable.
266 SetDotProduct(DotProductGeneric);
267 dotproduct_method = "generic";
268 } else if (dotproduct == "native") {
269 // Native optimized code selected by config variable.
270 SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);
271 dotproduct_method = "native";
272 #if defined(HAVE_AVX2)
273 } else if (dotproduct == "avx2") {
274 // AVX2 selected by config variable.
275 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
276 dotproduct_method = "avx2";
277 #endif
278 #if defined(HAVE_AVX)
279 } else if (dotproduct == "avx") {
280 // AVX selected by config variable.
281 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
282 dotproduct_method = "avx";
283 #endif
284 #if defined(HAVE_FMA)
285 } else if (dotproduct == "fma") {
286 // FMA selected by config variable.
287 SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
288 dotproduct_method = "fma";
289 #endif
290 #if defined(HAVE_SSE4_1)
291 } else if (dotproduct == "sse") {
292 // SSE selected by config variable.
293 SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
294 dotproduct_method = "sse";
295 #endif
296 #if defined(HAVE_FRAMEWORK_ACCELERATE)
297 } else if (dotproduct == "accelerate") {
298 SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
299 #endif
300 #if defined(HAVE_NEON) || defined(__aarch64__)
301 } else if (dotproduct == "neon" && neon_available_) {
302 // NEON selected by config variable.
303 SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
304 dotproduct_method = "neon";
305 #endif
306 } else if (dotproduct == "std::inner_product") {
307 // std::inner_product selected by config variable.
308 SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
309 dotproduct_method = "std::inner_product";
310 } else {
311 // Unsupported value of config variable.
312 tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
313 dotproduct.c_str());
314 tprintf(
315 "Supported values for dotproduct: auto generic native"
316 #if defined(HAVE_AVX2)
317 " avx2"
318 #endif
319 #if defined(HAVE_AVX)
320 " avx"
321 #endif
322 #if defined(HAVE_FMA)
323 " fma"
324 #endif
325 #if defined(HAVE_SSE4_1)
326 " sse"
327 #endif
328 #if defined(HAVE_FRAMEWORK_ACCELERATE)
329 " accelerate"
330 #endif
331 " std::inner_product.\n");
332 }
333
334 dotproduct.set_value(dotproduct_method);
335 }
336
337 } // namespace tesseract
338