1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Copyright (C) 2018 Intel Corporation.
5 ** Contact: https://www.qt.io/licensing/
6 **
7 ** This file is part of the QtCore module of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** Commercial License Usage
11 ** Licensees holding valid commercial Qt licenses may use this file in
12 ** accordance with the commercial license agreement provided with the
13 ** Software or, alternatively, in accordance with the terms contained in
14 ** a written agreement between you and The Qt Company. For licensing terms
15 ** and conditions see https://www.qt.io/terms-conditions. For further
16 ** information use the contact form at https://www.qt.io/contact-us.
17 **
18 ** GNU Lesser General Public License Usage
19 ** Alternatively, this file may be used under the terms of the GNU Lesser
20 ** General Public License version 3 as published by the Free Software
21 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
22 ** packaging of this file. Please review the following information to
23 ** ensure the GNU Lesser General Public License version 3 requirements
24 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25 **
26 ** GNU General Public License Usage
27 ** Alternatively, this file may be used under the terms of the GNU
28 ** General Public License version 2.0 or (at your option) the GNU General
29 ** Public license version 3 or any later version approved by the KDE Free
30 ** Qt Foundation. The licenses are as published by the Free Software
31 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32 ** included in the packaging of this file. Please review the following
33 ** information to ensure the GNU General Public License requirements will
34 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35 ** https://www.gnu.org/licenses/gpl-3.0.html.
36 **
37 ** $QT_END_LICENSE$
38 **
39 ****************************************************************************/
40
41 #ifndef QSIMD_P_H
42 #define QSIMD_P_H
43
44 //
45 // W A R N I N G
46 // -------------
47 //
48 // This file is not part of the Qt API. It exists purely as an
49 // implementation detail. This header file may change from version to
50 // version without notice, or even be removed.
51 //
52 // We mean it.
53 //
54
55 #include <QtCore/private/qglobal_p.h>
56
57 /*
58 * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
59 * They mean the compiler supports the necessary flags and the headers
60 * for the x86 and ARM intrinsics:
61 * - GCC: the -mXXX or march=YYY flag is necessary before #include
62 * up to 4.8; GCC >= 4.9 can include unconditionally
63 * - Intel CC: #include can happen unconditionally
64 * - MSVC: #include can happen unconditionally
65 * - RVCT: ???
66 *
67 * We will try to include all headers possible under this configuration.
68 *
69 * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
70 * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
71 *
72 * Supported XXX are:
73 * Flag | Arch | GCC | Intel CC | MSVC |
74 * ARM_NEON | ARM | I & C | None | ? |
75 * SSE2 | x86 | I & C | I & C | I & C |
76 * SSE3 | x86 | I & C | I & C | I only |
77 * SSSE3 | x86 | I & C | I & C | I only |
78 * SSE4_1 | x86 | I & C | I & C | I only |
79 * SSE4_2 | x86 | I & C | I & C | I only |
80 * AVX | x86 | I & C | I & C | I & C |
81 * AVX2 | x86 | I & C | I & C | I only |
82 * AVX512xx | x86 | I & C | I & C | I only |
83 * I = intrinsics; C = code generation
84 *
85 * Code can use the following constructs to determine compiler support & status:
86 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
87 * If this test passes, then the compiler is already generating code for that
88 * given sub-architecture. The intrinsics for that sub-architecture are
89 * #included and can be used without restriction or runtime check.
90 *
91 * - #if QT_COMPILER_SUPPORTS(XXX)
92 * If this test passes, then the compiler is able to generate code for that
93 * given sub-architecture in another translation unit, given the right set of
94 * flags. Use of the intrinsics is not guaranteed. This is useful with
95 * runtime detection (see below).
96 *
97 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
98 * If this test passes, then the compiler is able to generate code for that
99 * given sub-architecture in this translation unit, even if it is not doing
100 * that now (it might be). Individual functions may be tagged with
101 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 * sub-arch. Only inside such functions is the use of the intrinsics
103 * guaranteed to work. This is useful with runtime detection (see below).
104 *
105 * Runtime detection of a CPU sub-architecture can be done with the
106 * qCpuHasFeature(XXX) function. There are two strategies for generating
107 * optimized code like that:
108 *
109 * 1) place the optimized code in a different translation unit (C or assembly
110 * sources) and pass the correct flags to the compiler to enable support. Those
111 * sources must not include qglobal.h, which means they cannot include this
112 * file either. The dispatcher function would look like this:
113 *
114 * void foo()
115 * {
116 * #if QT_COMPILER_SUPPORTS(XXX)
117 * if (qCpuHasFeature(XXX)) {
118 * foo_optimized_xxx();
119 * return;
120 * }
121 * #endif
122 * foo_plain();
123 * }
124 *
125 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
126 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
127 * other Qt code. The dispatcher function would look like this:
128 *
129 * void foo()
130 * {
131 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
132 * if (qCpuHasFeature(XXX)) {
133 * foo_optimized_xxx();
134 * return;
135 * }
136 * #endif
137 * foo_plain();
138 * }
139 */
140
141 #if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
142 #include <intrin.h>
143 #endif
144
/*
 * QT_COMPILER_SUPPORTS(x)
 *   Evaluates QT_COMPILER_SUPPORTS_x. The trailing "- 0" keeps the expression
 *   well-formed when that macro is defined but empty; inside #if an undefined
 *   macro evaluates to 0.
 */
#define QT_COMPILER_SUPPORTS(x)     (QT_COMPILER_SUPPORTS_ ## x - 0)

/*
 * QT_COMPILER_SUPPORTS_HERE(x)
 *   Per-architecture test built by token-pasting x into a feature macro name.
 * QT_FUNCTION_TARGET(x)
 *   Expands to a GCC-style __target__ attribute using
 *   QT_FUNCTION_TARGET_STRING_x where that is enabled below, and to nothing
 *   everywhere else.
 *
 * Helper feature macros defined in this section are given the value 1 rather
 * than being left empty: QT_COMPILER_SUPPORTS_HERE() pastes them into an
 * expression, and an empty definition would expand to "()", which is not a
 * valid #if expression.
 */
#if defined(Q_PROCESSOR_ARM)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ARM_FEATURE_ ## x)
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
   // NOTE(review): only __ARM_NEON__ is checked here; this header defines
   // __ARM_NEON__ for compilers that provide just __ARM_NEON further down,
   // i.e. after this test has already been evaluated. Confirm that is intended.
#  if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__)
#    define __ARM_FEATURE_NEON 1          // also support QT_COMPILER_SUPPORTS_HERE(NEON)
#  endif
#elif defined(Q_PROCESSOR_MIPS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
   // Map the compiler's lower-case DSP macros onto the __XXX__ spelling that
   // QT_COMPILER_SUPPORTS_HERE() expects (32-bit MIPS only).
#  if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSP__ 1
#  endif
#  if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSPR2__ 1
#  endif
#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
   // Either the feature is already enabled for this translation unit, or the
   // compiler is known to support it.
#  define QT_COMPILER_SUPPORTS_HERE(x)    ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#else
   // Generic fallback: only features already enabled by the compiler flags count.
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
#endif
179
180 #ifdef Q_PROCESSOR_X86
181 /* -- x86 intrinsic support -- */
182
// This section normalises the GCC-style __XXX__ feature macros for compilers
// that do not define them (MSVC, Intel) and pulls in the intrinsics headers.
// The order matters: later tests rely on macros defined by earlier ones.
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
# define __SSE__ 1
# define __SSE2__ 1
# endif

# ifdef __SSE2__
// #include the intrinsics
# include <immintrin.h>
# endif

# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
// GCC 4.4 and Clang 2.8 added a few more intrinsics there
# include <x86intrin.h>
# endif

# if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__))
// Visual Studio defines __AVX__ when /arch:AVX is passed, but not the earlier macros
// See: https://msdn.microsoft.com/en-us/library/b0084kay.aspx
# define __SSE3__ 1
# define __SSSE3__ 1
// no Intel CPU supports SSE4a, so don't define it
# define __SSE4_1__ 1
# define __SSE4_2__ 1
# ifndef __AVX__
# define __AVX__ 1
# endif
# endif

# if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// POPCNT instructions:
// All processors that support SSE4.2 support POPCNT
// (but neither MSVC nor the Intel compiler define this macro)
# define __POPCNT__ 1
# endif

// AVX intrinsics
# if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// AES, PCLMULQDQ instructions:
// All processors that support AVX support PCLMULQDQ
// (but neither MSVC nor the Intel compiler define this macro)
// Note: only __PCLMUL__ is defined here; no AES macro is set by this block.
# define __PCLMUL__ 1
# endif

# if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// F16C & RDRAND instructions:
// All processors that support AVX2 support F16C & RDRAND:
// (but neither MSVC nor the Intel compiler define these macros)
# define __F16C__ 1
# define __RDRND__ 1
# endif

// Unlike the tests above, this one applies to the Intel compiler only and
// does not depend on QT_COMPILER_SUPPORTS_SIMD_ALWAYS.
# if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL)
// BMI2 instructions:
// All processors that support BMI support BMI2 (and AVX2)
// (but neither MSVC nor the Intel compiler define this macro)
# define __BMI2__ 1
# endif
241
242 # include "qsimd_x86_p.h"
243
244 // Haswell sub-architecture
245 //
246 // The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
247 // BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
248 // sub-target for us. The first AMD processor with AVX2 support (Zen) has the
249 // same features.
250 //
251 // macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
252 // ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
// Target string selected by QT_FUNCTION_TARGET(ARCH_HASWELL), which pastes its
// argument onto QT_FUNCTION_TARGET_STRING_.
# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell"
// Define __haswell__ only when every feature macro tested below is already
// defined for this translation unit, so that QT_COMPILER_SUPPORTS_HERE(haswell)
// style checks can treat the sub-architecture like a single feature.
# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
# define __haswell__ 1
# endif
258
259 // This constant does not include all CPU features found in a Haswell, only
260 // those that we'd have optimized code for.
// Note: this is declared 'static const' rather than Q_CONSTEXPR, as this file may be compiled in C mode.
262 QT_BEGIN_NAMESPACE
263 static const quint64 CpuFeatureArchHaswell = 0
264 | CpuFeatureSSE2
265 | CpuFeatureSSE3
266 | CpuFeatureSSSE3
267 | CpuFeatureSSE4_1
268 | CpuFeatureSSE4_2
269 | CpuFeatureFMA
270 | CpuFeaturePOPCNT
271 | CpuFeatureAVX
272 | CpuFeatureF16C
273 | CpuFeatureAVX2
274 | CpuFeatureBMI
275 | CpuFeatureBMI2;
276 QT_END_NAMESPACE
277
278 #endif /* Q_PROCESSOR_X86 */
279
280 // Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html
281 // This should be tweaked with an "upper version" of clang once we know which release fixes the
282 // issue. At that point we can rely on __ARM_FEATURE_CRC32 again.
// On Clang targeting Darwin, drop __ARM_FEATURE_CRC32 so that every later
// check in this header (and in its includers) sees the feature as unavailable.
#if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined (__ARM_FEATURE_CRC32)
# undef __ARM_FEATURE_CRC32
#endif
286
287 // NEON intrinsics
288 // note: as of GCC 4.9, does not support function targets for ARM
// Accept either spelling of the NEON macro, include the intrinsics, and make
// sure __ARM_NEON__ ends up defined so the rest of the code can test one name.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
// NOTE(review): defined without a value, so it is only usable with
// #ifdef/defined() tests, not inside an arithmetic #if expression.
#define __ARM_NEON__
#endif
#endif
297 // AArch64/ARM64
// CRC32 support: requires an ARMv8 target for which the compiler reports
// __ARM_FEATURE_CRC32 (which may have been #undef'ed earlier in this header).
#if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32)
#if defined(Q_PROCESSOR_ARM_64)
// only available on aarch64
#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
#endif
// The header is included for every ARMv8 target with CRC32, not just AArch64.
# include <arm_acle.h>
#endif
305
306 #ifdef __cplusplus
307 #include <qatomic.h>
308
309 QT_BEGIN_NAMESPACE
310
311 #ifndef Q_PROCESSOR_X86
// Feature bits for the non-x86 architectures. Bit 0 is reserved for the
// "detection has run" marker; the architecture-specific flags start at bit 1.
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON      = 1 << 1,
    CpuFeatureCRC32     = 1 << 2,
    // alternative spelling of the NEON flag
    CpuFeatureARM_NEON  = CpuFeatureNEON,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP       = 1 << 1,
    CpuFeatureDSPR2     = 1 << 2,
#endif

    // used only to indicate that the CPU detection was initialised
    QSimdInitialized    = 1 << 0
};
325
326 static const quint64 qCompilerCpuFeatures = 0
327 #if defined __ARM_NEON__
328 | CpuFeatureNEON
329 #endif
330 #if defined __ARM_FEATURE_CRC32
331 | CpuFeatureCRC32
332 #endif
333 #if defined __mips_dsp
334 | CpuFeatureDSP
335 #endif
336 #if defined __mips_dspr2
337 | CpuFeatureDSPR2
338 #endif
339 ;
340 #endif
341
342 #ifdef Q_ATOMIC_INT64_IS_SUPPORTED
343 extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1];
344 #else
345 extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2];
346 #endif
347 Q_CORE_EXPORT quint64 qDetectCpuFeatures();
348
349 #if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
350 Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
351 #else
qRandomCpu(void *,qsizetype)352 static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
353 {
354 return 0;
355 }
356 #endif
357
qCpuFeatures()358 static inline quint64 qCpuFeatures()
359 {
360 quint64 features = qt_cpu_features[0].loadRelaxed();
361 #ifndef Q_ATOMIC_INT64_IS_SUPPORTED
362 features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32;
363 #endif
364 if (Q_UNLIKELY(features == 0)) {
365 features = qDetectCpuFeatures();
366 Q_ASSUME(features != 0);
367 }
368 return features;
369 }
370
// qCpuHasFeature(feature): true when every bit of CpuFeature<feature> is set
// either in qCompilerCpuFeatures or in the mask returned by qCpuFeatures().
// The compile-time operand comes first, so qCpuFeatures() is only called when
// that first comparison is false (short-circuit ||).
#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
    || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
373
qHasHwrng()374 inline bool qHasHwrng()
375 {
376 #if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND)
377 return qCpuHasFeature(RDRND);
378 #else
379 return false;
380 #endif
381 }
382
// Loop headers for the scalar "prologue" that runs before an aligned SIMD loop.
// Each expands to a for-statement with no init clause (the caller initialises
// i) that advances i up to min(length, number of steps to the next boundary).
// The address is shifted right by 2, i.e. steps are counted in units of
// 4 bytes: up to 3 steps to reach a 16-byte boundary.
// NOTE(review): assumes ptr addresses 4-byte elements and is itself 4-byte
// aligned - confirm against callers.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)

// Same as above, but up to 7 steps of 4 bytes to reach a 32-byte boundary.
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
388
389 QT_END_NAMESPACE
390
391 #endif // __cplusplus
392
// Loop header for the scalar "epilogue" that handles the elements left over
// after a SIMD loop: continues from the current value of i and runs while
// i < length, for at most max iterations.
//   i      - loop variable owned by the caller (must be a modifiable lvalue)
//   length - end bound for i
//   max    - upper bound on the number of iterations
// Every argument is parenthesised so that expressions such as "mask & 3" or
// "a ? b : c" keep their meaning after expansion. The internal counter is
// named _i; do not pass an expression that refers to a variable of that name.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
395
396 #endif // QSIMD_P_H
397