1 /****************************************************************************
2  * Copyright (C) 2014-2017 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  ****************************************************************************/
23 
24 #ifndef __SWR_OS_H__
25 #define __SWR_OS_H__
26 
27 #include <cstddef>
28 #include "core/knobs.h"
29 
30 #if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
31 
// Windows build: exported symbols are marked dllexport and API entry points
// use the __cdecl calling convention.
#define SWR_VISIBLE __declspec(dllexport)
#define SWR_API __cdecl
34 
35 #ifndef NOMINMAX
36 #undef UNICODE
37 #define NOMINMAX
38 #include <windows.h>
39 #undef NOMINMAX
40 #define UNICODE
41 #else
42 #undef UNICODE
43 #include <windows.h>
44 #define UNICODE
45 #endif
46 #include <intrin.h>
47 #include <cstdint>
48 
// Windows.h defines MemoryFence as _mm_mfence, which collides with
// llvm::sys::MemoryFence; drop the macro whenever it is present.
#ifdef MemoryFence
#undef MemoryFence
#endif
53 
// OSALIGN(declaration, bytes): attach an alignment requirement to a
// declaration.  MSVC spells it as a prefix, GCC-compatible compilers as a
// suffix attribute.
#ifdef _MSC_VER
#define OSALIGN(DECL, BYTES) __declspec(align(BYTES)) DECL
#elif defined(__GNUC__)
#define OSALIGN(DECL, BYTES) DECL __attribute__((aligned(BYTES)))
#endif
59 
#ifdef _DEBUG
// Debug builds are compiled with inline function expansion switched on so
// that __forceinline functions still get inlined; the SIMD function wrappers
// (see simdlib.hpp) rely on that even in Debug.  The inline_depth(0) pragma
// then turns expansion back off for ordinary INLINE / inline functions while
// leaving __forceinline functions alone.
#define INLINE inline
#pragma inline_depth(0)
#else
// Release builds: mapping INLINE to __forceinline increases compile time
// dramatically and provides almost no measurable benefit, so it stays plain
// `inline` until there is a compelling use-case.
// #define INLINE __forceinline
#define INLINE inline
#endif
#if !defined(FORCEINLINE)
#define FORCEINLINE __forceinline
#endif
79 
// Break into the debugger (MSVC intrinsic).
#define DEBUGBREAK __debugbreak()

// Save the current warning state, then disable the listed warning numbers.
// Pair every use with PRAGMA_WARNING_POP().
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
    __pragma(warning(push));             \
    __pragma(warning(disable : __VA_ARGS__));

// Restore the warning state saved by PRAGMA_WARNING_PUSH_DISABLE().
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
87 
AlignedMalloc(size_t _Size,size_t _Alignment)88 static inline void* AlignedMalloc(size_t _Size, size_t _Alignment)
89 {
90     return _aligned_malloc(_Size, _Alignment);
91 }
92 
AlignedFree(void * p)93 static inline void AlignedFree(void* p)
94 {
95     return _aligned_free(p);
96 }
97 
// size_t-width aliases for the bit-scan and popcount intrinsics: the 64-bit
// variants on Win64, the 32-bit variants otherwise.
#ifdef _WIN64
#define _mm_popcount_sizeT _mm_popcnt_u64
#define BitScanForwardSizeT BitScanForward64
#define BitScanReverseSizeT BitScanReverse64
#else
#define _mm_popcount_sizeT _mm_popcnt_u32
#define BitScanForwardSizeT BitScanForward
#define BitScanReverseSizeT BitScanReverse
#endif
107 
108 #if !defined(_WIN64)
109 extern "C" {
/// Find the least-significant set bit of a 64-bit mask.  This definition is
/// only compiled when _WIN64 is not defined.
/// @param Index - receives the zero-based position of the lowest set bit;
///                left untouched when Mask is 0
/// @param Mask  - value to scan
/// @returns 1 if a set bit was found, 0 if Mask is 0
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
    if (Mask == 0)
        return 0;
#ifdef __GNUC__
    *Index = __builtin_ctzll(Mask);
#else
    // Walk upwards from bit 0 and stop at the first set bit.  Without the
    // break the loop keeps overwriting *Index and ends on the HIGHEST set bit.
    *Index = 0;
    for (int i = 0; i < 64; ++i)
    {
        if ((1ULL << i) & Mask)
        {
            *Index = i;
            break;
        }
    }
#endif
    return 1;
}
124 
/// Find the most-significant set bit of a 64-bit mask.  This definition is
/// only compiled when _WIN64 is not defined.
/// @param Index - receives the zero-based position of the highest set bit;
///                left untouched when Mask is 0
/// @param Mask  - value to scan
/// @returns 1 if a set bit was found, 0 if Mask is 0
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
    if (Mask == 0)
        return 0;
#ifdef __GNUC__
    *Index = 63 - __builtin_clzll(Mask);
#else
    // Walk downwards from bit 63 and stop at the first set bit.  Without the
    // break the loop keeps overwriting *Index and ends on the LOWEST set bit.
    *Index = 0;
    for (int i = 63; i >= 0; --i)
    {
        if ((1ULL << i) & Mask)
        {
            *Index = i;
            break;
        }
    }
#endif
    return 1;
}
139 }
140 #endif
141 
142 #elif defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
143 
// Non-Windows build: exported symbols get default ELF visibility and no
// calling-convention keyword is needed on API entry points.
#define SWR_VISIBLE __attribute__((visibility("default")))
#define SWR_API
146 
147 #include <stdlib.h>
148 #include <string.h>
149 #include <x86intrin.h>
150 #include <stdint.h>
151 #include <sys/types.h>
152 #include <unistd.h>
153 #include <sys/stat.h>
154 #include <stdio.h>
155 #include <limits.h>
156 
// Win32-style type names used by code shared with the Windows build.
using VOID   = void;
using LPVOID = void*;
using INT    = int;
using UINT   = unsigned int;
using HANDLE = void*;
using LONG   = int;
using DWORD  = unsigned int;

// Force the Win32 boolean constants to plain 0 / 1, replacing any earlier
// definition.
#undef TRUE
#define TRUE 1

#undef FALSE
#define FALSE 0

// Win32 path-length limit expressed through the POSIX constant.
#define MAX_PATH PATH_MAX
172 
// OSALIGN(declaration, bytes): append a GCC-style alignment attribute.
#define OSALIGN(DECL, BYTES) DECL __attribute__((aligned(BYTES)))
// INLINE / FORCEINLINE only get a default here if nothing defined them first;
// FORCEINLINE simply falls back to INLINE on this platform.
#if !defined(INLINE)
#define INLINE __inline
#endif
#if !defined(FORCEINLINE)
#define FORCEINLINE INLINE
#endif
// Break into the debugger via the x86 breakpoint instruction.
#define DEBUGBREAK asm("int $3")
181 
#ifndef __CYGWIN__

// MSVC calling-convention keywords expand to nothing unless already defined.
#if !defined(__cdecl)
#define __cdecl
#endif
#if !defined(__stdcall)
#define __stdcall
#endif

// Emulate MSVC's __declspec(...).  On GCC-compatible compilers (excluding the
// Intel compiler) each supported specifier is token-pasted onto a
// __declspec_<name> macro that expands to the matching GCC attribute, or to
// nothing where no equivalent is provided.  Other compilers drop
// __declspec(...) entirely.
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define __declspec(x) __declspec_##x
#define __declspec_align(y) __attribute__((aligned(y)))
#define __declspec_deprecated __attribute__((deprecated))
#define __declspec_noinline __attribute__((__noinline__))
#define __declspec_nothrow __attribute__((nothrow))
#define __declspec_thread __thread
#define __declspec_dllexport
#define __declspec_dllimport
#define __declspec_novtable
#else
#define __declspec(X)
#endif

#endif
206 
// GCC version folded into one comparable integer: major * 10000 +
// minor * 100 + patchlevel (e.g. 4.9.2 -> 40902).
#define GCC_VERSION ((__GNUC__ * 100 + __GNUC_MINOR__) * 100 + __GNUC_PATCHLEVEL__)
208 
209 #if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
/// Read the processor time-stamp counter with the RDTSC instruction.
/// @returns the 64-bit counter value (EDX:EAX)
inline uint64_t __rdtsc()
{
    // RDTSC returns two 32-bit halves in EAX / EDX.  They must be unsigned:
    // a signed 32-bit `long` low half would be sign-extended when widened to
    // 64 bits and corrupt the upper half whenever bit 31 is set.
    uint32_t low, high;
    asm volatile("rdtsc" : "=a"(low), "=d"(high));
    return (static_cast<uint64_t>(high) << 32) | low;
}
216 #endif
217 
218 #if !defined(__clang__) && !defined(__INTEL_COMPILER)
219 // Intrinsic not defined in gcc < 10
220 #if (__GNUC__) && (GCC_VERSION < 100000)
_mm256_storeu2_m128i(__m128i * hi,__m128i * lo,__m256i a)221 static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
222 {
223     _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
224     _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
225 }
226 #endif
227 
// gcc releases before 4.9 lack the _mm*_undefined_* intrinsics; substitute
// the zero-initialising equivalents there.
#if (__GNUC__) && (GCC_VERSION < 40900)
#define _mm256_undefined_ps _mm256_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#endif
233 #endif
234 
/// Find the least-significant set bit of a 64-bit mask.
/// @param Index - receives the zero-based bit position; untouched if Mask is 0
/// @param Mask  - value to scan
/// @returns 1 if a set bit was found, 0 if Mask is 0
inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
{
    const bool anyBitSet = (Mask != 0);
    if (anyBitSet)
    {
        *Index = __builtin_ctzll(Mask);
    }
    return anyBitSet ? 1 : 0;
}
242 
/// Find the least-significant set bit of a 32-bit mask.
/// @param Index - receives the zero-based bit position; untouched if Mask is 0
/// @param Mask  - value to scan
/// @returns 1 if a set bit was found, 0 if Mask is 0
inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
{
    const bool anyBitSet = (Mask != 0);
    if (anyBitSet)
    {
        *Index = __builtin_ctz(Mask);
    }
    return anyBitSet ? 1 : 0;
}
250 
/// Find the most-significant set bit of a 64-bit mask.
/// @param Index - receives the zero-based bit position; untouched if Mask is 0
/// @param Mask  - value to scan
/// @returns 1 if a set bit was found, 0 if Mask is 0
inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
{
    const bool anyBitSet = (Mask != 0);
    if (anyBitSet)
    {
        // clz counts leading zeros, so the top set bit sits at 63 - clz.
        const int leadingZeros = __builtin_clzll(Mask);
        *Index = 63 - leadingZeros;
    }
    return anyBitSet ? 1 : 0;
}
258 
/// Find the most-significant set bit of a 32-bit mask.
/// @param Index - receives the zero-based bit position; untouched if Mask is 0
/// @param Mask  - value to scan
/// @returns 1 if a set bit was found, 0 if Mask is 0
inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
{
    const bool anyBitSet = (Mask != 0);
    if (anyBitSet)
    {
        // clz counts leading zeros, so the top set bit sits at 31 - clz.
        const int leadingZeros = __builtin_clz(Mask);
        *Index = 31 - leadingZeros;
    }
    return anyBitSet ? 1 : 0;
}
266 
/// Allocate a block of memory with the requested alignment.
/// @param size      - number of bytes to allocate
/// @param alignment - required alignment in bytes; power-of-two values
///                    smaller than sizeof(void*) are raised to sizeof(void*)
/// @returns pointer to the block, or nullptr if posix_memalign fails;
///          release it with AlignedFree()
inline void* AlignedMalloc(size_t size, size_t alignment)
{
    // posix_memalign() requires the alignment to be a power of two AND a
    // multiple of sizeof(void*), so small alignments such as 1, 2 or 4 would
    // otherwise be rejected.  A pointer-size-aligned block satisfies any
    // smaller power-of-two alignment, so round those requests up.
    const bool isPowerOfTwo = (alignment != 0) && ((alignment & (alignment - 1)) == 0);
    if (isPowerOfTwo && alignment < sizeof(void*))
    {
        alignment = sizeof(void*);
    }

    void* ret = nullptr;
    if (posix_memalign(&ret, alignment, size) != 0)
    {
        return nullptr;
    }
    return ret;
}
276 
/// Release a block obtained from AlignedMalloc().
/// @param p - pointer passed straight to free()
static inline void AlignedFree(void* p)
{
    ::free(p);
}
281 
// Number of elements in a C array.
#define _countof(a) (sizeof(a) / sizeof(*(a)))

// Windows CRT / Win32 names mapped onto their POSIX counterparts.
#define sprintf_s sprintf
#define GetCurrentProcessId getpid

/// Backing implementation of the strcpy_s shim: copy at most size - 1
/// characters of src into dst and always NUL-terminate.
/// A bare strncpy(dst, src, size) leaves dst unterminated whenever src holds
/// size or more characters; this keeps the same zero-padding behaviour for
/// shorter strings but guarantees a terminator.
/// @param dst  - destination buffer
/// @param size - capacity of dst in bytes; nothing is written when 0
/// @param src  - NUL-terminated source string
/// @returns dst
static inline char* SwrStrCopyTerminated(char* dst, size_t size, const char* src)
{
    if (size == 0)
    {
        return dst;
    }
    strncpy(dst, src, size - 1);
    dst[size - 1] = '\0';
    return dst;
}
#define strcpy_s(dst, size, src) SwrStrCopyTerminated((dst), (size), (src))

// Win32 Interlocked* operations expressed through the GCC __sync builtins.
// Note the argument order swap for compare-exchange: Win32 takes
// (Dest, Exchange, Comparand), the builtin takes (ptr, oldval, newval).
#define InterlockedCompareExchange(Dest, Exchange, Comparand) \
    __sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
#define InterlockedAdd(Addend, Value) __sync_add_and_fetch(Addend, Value)
#define InterlockedAdd64(Addend, Value) __sync_add_and_fetch(Addend, Value)
// Compiler-only barrier: an empty asm statement with a "memory" clobber.
#define _ReadWriteBarrier() asm volatile("" ::: "memory")

// MSVC warning-control pragmas have no effect on this platform.
#define PRAGMA_WARNING_PUSH_DISABLE(...)
#define PRAGMA_WARNING_POP()

#define ZeroMemory(dst, size) memset(dst, 0, size)
302 #else
303 
304 #error Unsupported OS/system.
305 
306 #endif
307 
// Thread-local storage specifier.
#define THREAD thread_local

// Universal types: byte arrays whose sizeof is exactly 1 KiB / 1 MiB / 1 GiB.
using KILOBYTE = uint8_t[1024];
using MEGABYTE = KILOBYTE[1024];
using GIGABYTE = MEGABYTE[1024];

// Alignment shorthands built on OSALIGN: 64 bytes, and the SIMD / SIMD16
// widths taken from the knobs.
#define OSALIGNLINE(DECL) OSALIGN(DECL, 64)
#define OSALIGNSIMD(DECL) OSALIGN(DECL, KNOB_SIMD_BYTES)
#define OSALIGNSIMD16(DECL) OSALIGN(DECL, KNOB_SIMD16_BYTES)
318 
319 #include "common/swr_assert.h"
320 
// ATTR_UNUSED suppresses unused-entity warnings on GCC-compatible compilers
// and expands to nothing elsewhere.
#if defined(__GNUC__)
#define ATTR_UNUSED __attribute__((unused))
#else
#define ATTR_UNUSED
#endif

// SWR_FUNC(ret, Name, args...) emits two declarations for an API entry point:
// a function-pointer typedef named PFN<Name> and the prototype of Name itself,
// both using the SWR_API calling convention.
#define SWR_FUNC(RetT, Name, /* args */...)        \
    typedef RetT(SWR_API* PFN##Name)(__VA_ARGS__); \
    RetT SWR_API Name(__VA_ARGS__);
330 
331 // Defined in os.cpp
332 void SWR_API SetCurrentThreadName(const char* pThreadName);
333 void SWR_API CreateDirectoryPath(const std::string& path);
334 
335 /// Execute Command (block until finished)
336 /// @returns process exit value
337 int SWR_API
338     ExecCmd(const std::string& cmd,                ///< (In) Command line string
339             const char*  pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
340             std::string* pOptStdOut     = nullptr,   ///< (Optional Out) Standard Output text
341             std::string* pOptStdErr     = nullptr,   ///< (Optional Out) Standard Error text
342             const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
343 
344 
345 /// Helper for setting up FP state
346 /// @returns old csr state
SetOptimalVectorCSR()347 static INLINE uint32_t SetOptimalVectorCSR()
348 {
349     uint32_t oldCSR = _mm_getcsr();
350 
351     uint32_t newCSR = (oldCSR & ~(_MM_ROUND_MASK | _MM_DENORMALS_ZERO_MASK | _MM_FLUSH_ZERO_MASK));
352     newCSR |= (_MM_ROUND_NEAREST | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
353     _mm_setcsr(newCSR);
354 
355     return oldCSR;
356 }
357 
358 /// Set Vector CSR state.
359 /// @param csrState - should be value returned from SetOptimalVectorCSR()
RestoreVectorCSR(uint32_t csrState)360 static INLINE void RestoreVectorCSR(uint32_t csrState)
361 {
362     _mm_setcsr(csrState);
363 }
364 
365 #endif //__SWR_OS_H__
366