1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Copyright (C) 2018 Intel Corporation.
5 ** Contact: https://www.qt.io/licensing/
6 **
7 ** This file is part of the QtCore module of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** Commercial License Usage
11 ** Licensees holding valid commercial Qt licenses may use this file in
12 ** accordance with the commercial license agreement provided with the
13 ** Software or, alternatively, in accordance with the terms contained in
14 ** a written agreement between you and The Qt Company. For licensing terms
15 ** and conditions see https://www.qt.io/terms-conditions. For further
16 ** information use the contact form at https://www.qt.io/contact-us.
17 **
18 ** GNU Lesser General Public License Usage
19 ** Alternatively, this file may be used under the terms of the GNU Lesser
20 ** General Public License version 3 as published by the Free Software
21 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
22 ** packaging of this file. Please review the following information to
23 ** ensure the GNU Lesser General Public License version 3 requirements
24 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25 **
26 ** GNU General Public License Usage
27 ** Alternatively, this file may be used under the terms of the GNU
28 ** General Public License version 2.0 or (at your option) the GNU General
29 ** Public license version 3 or any later version approved by the KDE Free
30 ** Qt Foundation. The licenses are as published by the Free Software
31 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32 ** included in the packaging of this file. Please review the following
33 ** information to ensure the GNU General Public License requirements will
34 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35 ** https://www.gnu.org/licenses/gpl-3.0.html.
36 **
37 ** $QT_END_LICENSE$
38 **
39 ****************************************************************************/
40 
41 #ifndef QSIMD_P_H
42 #define QSIMD_P_H
43 
44 //
45 //  W A R N I N G
46 //  -------------
47 //
48 // This file is not part of the Qt API.  It exists purely as an
49 // implementation detail.  This header file may change from version to
50 // version without notice, or even be removed.
51 //
52 // We mean it.
53 //
54 
55 #include <QtCore/private/qglobal_p.h>
56 
57 /*
58  * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
59  * They mean the compiler supports the necessary flags and the headers
60  * for the x86 and ARM intrinsics:
 *  - GCC: the -mXXX or -march=YYY flag is necessary before #include
 *    up to 4.8; GCC >= 4.9 can include unconditionally
63  *  - Intel CC: #include can happen unconditionally
64  *  - MSVC: #include can happen unconditionally
65  *  - RVCT: ???
66  *
67  * We will try to include all headers possible under this configuration.
68  *
69  * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
70  * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
71  *
72  * Supported XXX are:
73  *   Flag    | Arch |  GCC  | Intel CC |  MSVC  |
74  *  ARM_NEON | ARM  | I & C | None     |   ?    |
75  *  SSE2     | x86  | I & C | I & C    | I & C  |
76  *  SSE3     | x86  | I & C | I & C    | I only |
77  *  SSSE3    | x86  | I & C | I & C    | I only |
78  *  SSE4_1   | x86  | I & C | I & C    | I only |
79  *  SSE4_2   | x86  | I & C | I & C    | I only |
80  *  AVX      | x86  | I & C | I & C    | I & C  |
81  *  AVX2     | x86  | I & C | I & C    | I only |
82  *  AVX512xx | x86  | I & C | I & C    | I only |
83  * I = intrinsics; C = code generation
84  *
85  * Code can use the following constructs to determine compiler support & status:
86  * - #ifdef __XXX__      (e.g: #ifdef __AVX__  or #ifdef __ARM_NEON__)
87  *   If this test passes, then the compiler is already generating code for that
88  *   given sub-architecture. The intrinsics for that sub-architecture are
89  *   #included and can be used without restriction or runtime check.
90  *
91  * - #if QT_COMPILER_SUPPORTS(XXX)
92  *   If this test passes, then the compiler is able to generate code for that
93  *   given sub-architecture in another translation unit, given the right set of
94  *   flags. Use of the intrinsics is not guaranteed. This is useful with
95  *   runtime detection (see below).
96  *
97  * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
98  *   If this test passes, then the compiler is able to generate code for that
99  *   given sub-architecture in this translation unit, even if it is not doing
100  *   that now (it might be). Individual functions may be tagged with
101  *   QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 *   sub-arch. Only inside such functions is the use of the intrinsics
103  *   guaranteed to work. This is useful with runtime detection (see below).
104  *
105  * Runtime detection of a CPU sub-architecture can be done with the
106  * qCpuHasFeature(XXX) function. There are two strategies for generating
107  * optimized code like that:
108  *
109  * 1) place the optimized code in a different translation unit (C or assembly
110  * sources) and pass the correct flags to the compiler to enable support. Those
111  * sources must not include qglobal.h, which means they cannot include this
112  * file either. The dispatcher function would look like this:
113  *
114  *      void foo()
115  *      {
116  *      #if QT_COMPILER_SUPPORTS(XXX)
117  *          if (qCpuHasFeature(XXX)) {
118  *              foo_optimized_xxx();
119  *              return;
120  *          }
121  *      #endif
122  *          foo_plain();
123  *      }
124  *
125  * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
126  * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
127  * other Qt code. The dispatcher function would look like this:
128  *
129  *      void foo()
130  *      {
131  *      #if QT_COMPILER_SUPPORTS_HERE(XXX)
132  *          if (qCpuHasFeature(XXX)) {
133  *              foo_optimized_xxx();
134  *              return;
135  *          }
136  *      #endif
137  *          foo_plain();
138  *      }
139  */
140 
141 #if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
142 #include <intrin.h>
143 #endif
144 
/*
 * QT_COMPILER_SUPPORTS(x) is non-zero in an #if expression when the macro
 * QT_COMPILER_SUPPORTS_<x> is defined to a non-zero value. The trailing "- 0"
 * keeps the expression well-formed when that macro is defined but empty; an
 * undefined macro evaluates to 0 in #if.
 */
#define QT_COMPILER_SUPPORTS(x)     (QT_COMPILER_SUPPORTS_ ## x - 0)

/*
 * Per-architecture definitions of:
 *  - QT_COMPILER_SUPPORTS_HERE(x): #if test for using sub-architecture x in
 *    this translation unit. ARM tests __ARM_FEATURE_<x>; x86 with
 *    QT_COMPILER_SUPPORTS_SIMD_ALWAYS tests __<x>__ or QT_COMPILER_SUPPORTS(x);
 *    every other configuration tests __<x>__ only.
 *  - QT_FUNCTION_TARGET(x): expands to a GCC-style target attribute built from
 *    QT_FUNCTION_TARGET_STRING_<x> where this header enables it, and to
 *    nothing otherwise.
 *
 * The helper feature macros defined below are given the value 1 (not left
 * empty) because QT_COMPILER_SUPPORTS_HERE() expands them inside an #if
 * expression, where an empty expansion would leave "()" and fail to compile.
 */
#if defined(Q_PROCESSOR_ARM)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ARM_FEATURE_ ## x)
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
     // NOTE(review): this test runs before the later NEON section of this header
     // defines __ARM_NEON__ for compilers that only provide __ARM_NEON, so the
     // alias is not created here on such targets - confirm that is intended.
#  if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__)
#    define __ARM_FEATURE_NEON 1         // also support QT_COMPILER_SUPPORTS_HERE(NEON)
#  endif
#elif defined(Q_PROCESSOR_MIPS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
     // Map the compiler's lower-case DSP macros to the __XXX__ spelling that
     // QT_COMPILER_SUPPORTS_HERE() looks for (32-bit MIPS only).
#  if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSP__ 1
#  endif
#  if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSPR2__ 1
#  endif
#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#else
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
#endif
179 
180 #ifdef Q_PROCESSOR_X86
181 /* -- x86 intrinsic support -- */
182 
// Normalize the SSE/AVX feature macros and include the intrinsics headers.
// Provide __SSE__ and __SSE2__ under MSVC when building for x64 (_M_X64) or
// when _M_IX86_FP is 2 or higher.
#  if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
#    define __SSE__                         1
#    define __SSE2__                        1
#  endif

// The intrinsics header is only included once __SSE2__ is known to be defined
// (either by the compiler or by the block above).
#  ifdef __SSE2__
// #include the intrinsics
#    include <immintrin.h>
#  endif

#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
// GCC 4.4 and Clang 2.8 added a few more intrinsics there
#    include <x86intrin.h>
#  endif

// Under MSVC with AVX enabled, define the SSE3..SSE4.2 macros as well, and
// make sure __AVX__ itself exists when only _M_AVX was set.
#  if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__))
// Visual Studio defines __AVX__ when /arch:AVX is passed, but not the earlier macros
// See: https://msdn.microsoft.com/en-us/library/b0084kay.aspx
#    define __SSE3__                        1
#    define __SSSE3__                       1
// no Intel CPU supports SSE4a, so don't define it
#    define __SSE4_1__                      1
#    define __SSE4_2__                      1
#    ifndef __AVX__
#      define __AVX__                       1
#    endif
#  endif
211 
// Feature macros implied by a wider instruction set. The first three groups
// apply only with QT_COMPILER_SUPPORTS_SIMD_ALWAYS under the Intel compiler or
// MSVC; the BMI2 group applies only under the Intel compiler.
#  if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// POPCNT instructions:
// All processors that support SSE4.2 support POPCNT
// (but neither MSVC nor the Intel compiler define this macro)
#    define __POPCNT__                      1
#  endif

// AVX intrinsics
#  if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// AES, PCLMULQDQ instructions:
// All processors that support AVX support PCLMULQDQ
// (but neither MSVC nor the Intel compiler define this macro)
// Only __PCLMUL__ is defined here; no AES macro is set.
#    define __PCLMUL__                      1
#  endif

#  if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// F16C & RDRAND instructions:
// All processors that support AVX2 support F16C & RDRAND:
// (but neither MSVC nor the Intel compiler define these macros)
#    define __F16C__                        1
#    define __RDRND__                       1
#  endif

#  if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL)
// BMI2 instructions:
// All processors that support BMI support BMI2 (and AVX2)
// (this fallback is applied for the Intel compiler only; it needs __BMI__ to
// be defined already and does not depend on QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
#    define __BMI2__                        1
#  endif
241 
242 #  include "qsimd_x86_p.h"
243 
244 // Haswell sub-architecture
245 //
246 // The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
247 // BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
248 // sub-target for us. The first AMD processor with AVX2 support (Zen) has the
249 // same features.
250 //
251 // macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
252 // ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
// Target string picked up by QT_FUNCTION_TARGET(ARCH_HASWELL) via token pasting.
#  define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL    "arch=haswell"
// Define __haswell__ only when every feature macro in the list below is
// already defined, i.e. the whole set is enabled for this translation unit.
#  if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
    defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
#    define __haswell__       1
#  endif
258 
259 // This constant does not include all CPU features found in a Haswell, only
260 // those that we'd have optimized code for.
// Note: this is declared "static const" rather than constexpr, as this file
// may be compiled in C mode.
262 QT_BEGIN_NAMESPACE
263 static const quint64 CpuFeatureArchHaswell    = 0
264         | CpuFeatureSSE2
265         | CpuFeatureSSE3
266         | CpuFeatureSSSE3
267         | CpuFeatureSSE4_1
268         | CpuFeatureSSE4_2
269         | CpuFeatureFMA
270         | CpuFeaturePOPCNT
271         | CpuFeatureAVX
272         | CpuFeatureF16C
273         | CpuFeatureAVX2
274         | CpuFeatureBMI
275         | CpuFeatureBMI2;
276 QT_END_NAMESPACE
277 
278 #endif  /* Q_PROCESSOR_X86 */
279 
280 // Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html
281 // This should be tweaked with an "upper version" of clang once we know which release fixes the
282 // issue. At that point we can rely on __ARM_FEATURE_CRC32 again.
// Drop __ARM_FEATURE_CRC32 for Clang on Darwin so that none of the CRC32
// checks later in this header fire (see the compiler issue referenced above).
#if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined (__ARM_FEATURE_CRC32)
#  undef __ARM_FEATURE_CRC32
#endif

// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
// Either spelling of the compiler macro (__ARM_NEON or __ARM_NEON__) enables
// the NEON header; __ARM_NEON__ is then guaranteed to be defined afterwards.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define QT_FUNCTION_TARGET_STRING_NEON      "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
// It is defined without a value here, so it is only suitable for
// defined()/#ifdef checks.
#define __ARM_NEON__
#endif
#endif
// AArch64/ARM64
// CRC32 intrinsics: <arm_acle.h> is included for any ARMv8 target that reports
// __ARM_FEATURE_CRC32; the function-target string exists on 64-bit only.
#if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32)
#if defined(Q_PROCESSOR_ARM_64)
// only available on aarch64
#define QT_FUNCTION_TARGET_STRING_CRC32      "+crc"
#endif
#  include <arm_acle.h>
#endif
305 
306 #ifdef __cplusplus
307 #include <qatomic.h>
308 
309 QT_BEGIN_NAMESPACE
310 
311 #ifndef Q_PROCESSOR_X86
// Bit flags for CPU features on non-x86 targets. Each enumerator occupies one
// bit so values can be OR-ed into a feature mask.
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON          = 1 << 1,
    CpuFeatureARM_NEON      = CpuFeatureNEON,   // alternate spelling of the same bit
    CpuFeatureCRC32         = 1 << 2,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP           = 1 << 1,
    CpuFeatureDSPR2         = 1 << 2,
#endif

    // Bit 0 is not a feature: it only marks that CPU detection was initialised.
    QSimdInitialized        = 1 << 0
};
325 
326 static const quint64 qCompilerCpuFeatures = 0
327 #if defined __ARM_NEON__
328         | CpuFeatureNEON
329 #endif
330 #if defined __ARM_FEATURE_CRC32
331         | CpuFeatureCRC32
332 #endif
333 #if defined __mips_dsp
334         | CpuFeatureDSP
335 #endif
336 #if defined __mips_dspr2
337         | CpuFeatureDSPR2
338 #endif
339         ;
340 #endif
341 
// Storage for the CPU feature mask read by qCpuFeatures(). With 64-bit atomics
// it is a single 64-bit word; otherwise it is two 32-bit words, where
// qCpuFeatures() treats element 0 as the low half and element 1 as the high half.
#ifdef Q_ATOMIC_INT64_IS_SUPPORTED
extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1];
#else
extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2];
#endif
// Returns the feature mask; qCpuFeatures() calls this when the stored mask
// reads as zero and assumes the result is non-zero.
// NOTE(review): presumably also fills in qt_cpu_features - defined elsewhere, confirm.
Q_CORE_EXPORT quint64 qDetectCpuFeatures();
348 
349 #if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
350 Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
351 #else
qRandomCpu(void *,qsizetype)352 static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
353 {
354     return 0;
355 }
356 #endif
357 
qCpuFeatures()358 static inline quint64 qCpuFeatures()
359 {
360     quint64 features = qt_cpu_features[0].loadRelaxed();
361 #ifndef Q_ATOMIC_INT64_IS_SUPPORTED
362     features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32;
363 #endif
364     if (Q_UNLIKELY(features == 0)) {
365         features = qDetectCpuFeatures();
366         Q_ASSUME(features != 0);
367     }
368     return features;
369 }
370 
// qCpuHasFeature(XXX) is true when every bit of CpuFeatureXXX is present either
// in the compile-time baseline (qCompilerCpuFeatures) or in the runtime mask
// from qCpuFeatures(). The compile-time test comes first, so || short-circuits
// and qCpuFeatures() is not called when the baseline already covers the feature.
#define qCpuHasFeature(feature)     (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
                                     || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
373 
qHasHwrng()374 inline bool qHasHwrng()
375 {
376 #if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND)
377     return qCpuHasFeature(RDRND);
378 #else
379     return false;
380 #endif
381 }
382 
// Loop headers for the scalar prologue before an aligned SIMD loop. The caller
// supplies the loop body and an already-initialised index i; i is advanced by
// the loop and is not reset.
//
// The bound is computed from the address in 4-byte units: (ptr >> 2) & 0x3 is
// the position of ptr within a 16-byte line, and (4 - position) & 0x3 is the
// number of 4-byte steps needed to reach the next 16-byte boundary (0 when
// already aligned). That count is clamped to length.
// NOTE(review): the ">> 2" implies ptr addresses 4-byte elements - confirm at call sites.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)

// Same as above for a 32-byte boundary: position within a 32-byte line is
// (ptr >> 2) & 0x7 and at most 7 steps are taken.
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
388 
389 QT_END_NAMESPACE
390 
391 #endif // __cplusplus
392 
// Loop header for the scalar tail after a SIMD loop. The caller supplies the
// loop body. The loop continues from the current value of i, stops when i
// reaches length, and runs at most max iterations (counted by the private
// counter _i). i is advanced by the loop and keeps its value afterwards.
//
// Every macro argument is parenthesized so that expressions with operators of
// lower precedence than < or && (for example "cond ? 3 : 7" passed as max) are
// evaluated as a unit instead of being re-associated with the loop condition.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
395 
396 #endif // QSIMD_P_H
397