1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // Copyright (c) Microsoft Corporation.
10 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 
12 // Copyright 2018 Ulf Adams
13 // Copyright (c) Microsoft Corporation. All rights reserved.
14 
15 // Boost Software License - Version 1.0 - August 17th, 2003
16 
17 // Permission is hereby granted, free of charge, to any person or organization
18 // obtaining a copy of the software and accompanying documentation covered by
19 // this license (the "Software") to use, reproduce, display, distribute,
20 // execute, and transmit the Software, and to prepare derivative works of the
21 // Software, and to permit third-parties to whom the Software is furnished to
22 // do so, all subject to the following:
23 
24 // The copyright notices in the Software and this entire statement, including
25 // the above license grant, this restriction and the following disclaimer,
26 // must be included in all copies of the Software, in whole or in part, and
27 // all derivative works of the Software, unless such copies or derivative
28 // works are solely in the form of machine-executable object code generated by
29 // a source language processor.
30 
31 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
34 // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
35 // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
36 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
37 // DEALINGS IN THE SOFTWARE.
38 
39 #ifndef _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
40 #define _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
41 
// Avoid formatting to keep the changes to the original code minimal.
43 // clang-format off
44 
45 #include "__config"
46 
47 _LIBCPP_BEGIN_NAMESPACE_STD
48 
49 #if defined(_M_X64) && defined(_MSC_VER)
50 #define _LIBCPP_INTRINSIC128 1
51 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
52   return _umul128(__a, __b, __productHi);
53 }
54 
55 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // taken modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values; in that case, this function needs to be adjusted.
63   _LIBCPP_ASSERT(__dist < 64, "");
64   return __shiftright128(__lo, __hi, static_cast<unsigned char>(__dist));
65 }
66 
67 // ^^^ intrinsics available ^^^ / vvv __int128 available vvv
68 #elif defined(__SIZEOF_INT128__) && ( \
69     (defined(__clang__) && !defined(_MSC_VER)) || \
70     (defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__)))
71 #define _LIBCPP_INTRINSIC128 1
// We have unsigned __int128 support in Clang and GCC.
73 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
74   auto __temp = __a * (unsigned __int128)__b;
75   *__productHi = __temp >> 64;
76   return static_cast<uint64_t>(__temp);
77 }
78 
79 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
80   // In the current implementation of the double-precision version
81   // of Ryu, the shift value is always < 64.
82   // (The shift value is in the range [49, 58].)
83   // Check this here in case a future change requires larger shift
  // values; in that case, this function needs to be adjusted.
85   _LIBCPP_ASSERT(__dist < 64, "");
86   auto __temp = __lo | ((unsigned __int128)__hi << 64);
  // For x64 128-bit shifts using the `shrd` instruction and two 64-bit
  // registers, the shift value is taken modulo 64, so the `& 63` is free.
89   return static_cast<uint64_t>(__temp >> (__dist & 63));
90 }
91 #else // ^^^ __int128 available ^^^ / vvv intrinsics unavailable vvv
92 
93 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_ALWAYS_INLINE uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
94   // TRANSITION, VSO-634761
95   // The casts here help MSVC to avoid calls to the __allmul library function.
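  // Portable 64x64->128 multiplication via four 32x32->64 partial products
  // (descriptive note, not from the original sources):
  //   __a * __b == (__aHi * 2^32 + __aLo) * (__bHi * 2^32 + __bLo)
  //             == __b11 * 2^64 + (__b01 + __b10) * 2^32 + __b00
  // __mid1 and __mid2 accumulate the cross terms together with the carries
  // out of the low 32 bits; none of the partial sums can overflow 64 bits.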
96   const uint32_t __aLo = static_cast<uint32_t>(__a);
97   const uint32_t __aHi = static_cast<uint32_t>(__a >> 32);
98   const uint32_t __bLo = static_cast<uint32_t>(__b);
99   const uint32_t __bHi = static_cast<uint32_t>(__b >> 32);
100 
101   const uint64_t __b00 = static_cast<uint64_t>(__aLo) * __bLo;
102   const uint64_t __b01 = static_cast<uint64_t>(__aLo) * __bHi;
103   const uint64_t __b10 = static_cast<uint64_t>(__aHi) * __bLo;
104   const uint64_t __b11 = static_cast<uint64_t>(__aHi) * __bHi;
105 
106   const uint32_t __b00Lo = static_cast<uint32_t>(__b00);
107   const uint32_t __b00Hi = static_cast<uint32_t>(__b00 >> 32);
108 
109   const uint64_t __mid1 = __b10 + __b00Hi;
110   const uint32_t __mid1Lo = static_cast<uint32_t>(__mid1);
111   const uint32_t __mid1Hi = static_cast<uint32_t>(__mid1 >> 32);
112 
113   const uint64_t __mid2 = __b01 + __mid1Lo;
114   const uint32_t __mid2Lo = static_cast<uint32_t>(__mid2);
115   const uint32_t __mid2Hi = static_cast<uint32_t>(__mid2 >> 32);
116 
117   const uint64_t __pHi = __b11 + __mid1Hi + __mid2Hi;
118   const uint64_t __pLo = (static_cast<uint64_t>(__mid2Lo) << 32) | __b00Lo;
119 
120   *__productHi = __pHi;
121   return __pLo;
122 }
123 
124 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
125   // We don't need to handle the case __dist >= 64 here (see above).
126   _LIBCPP_ASSERT(__dist < 64, "");
127 #ifdef _LIBCPP_64_BIT
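  // Note (descriptive, not from the original sources): __dist must be
  // nonzero here, since otherwise (64 - __dist) == 64 and the left shift
  // below would be undefined behavior.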
128   _LIBCPP_ASSERT(__dist > 0, "");
129   return (__hi << (64 - __dist)) | (__lo >> __dist);
130 #else // ^^^ 64-bit ^^^ / vvv 32-bit vvv
131   // Avoid a 64-bit shift by taking advantage of the range of shift values.
132   _LIBCPP_ASSERT(__dist >= 32, "");
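  // (Descriptive note, not from the original sources.) Since
  // 32 <= __dist < 64, the low 32 bits of __lo are shifted out entirely:
  //   __lo >> __dist == (__lo >> 32) >> (__dist - 32)
  // and the right-hand shift can be performed on a uint32_t.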
133   return (__hi << (64 - __dist)) | (static_cast<uint32_t>(__lo >> 32) >> (__dist - 32));
134 #endif // ^^^ 32-bit ^^^
135 }
136 
137 #endif // ^^^ intrinsics unavailable ^^^
138 
139 #ifndef _LIBCPP_64_BIT
140 
141 // Returns the high 64 bits of the 128-bit product of __a and __b.
142 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __umulh(const uint64_t __a, const uint64_t __b) {
143   // Reuse the __ryu_umul128 implementation.
144   // Optimizers will likely eliminate the instructions used to compute the
145   // low part of the product.
146   uint64_t __hi;
147   (void) __ryu_umul128(__a, __b, &__hi);
148   return __hi;
149 }
150 
151 // On 32-bit platforms, compilers typically generate calls to library
152 // functions for 64-bit divisions, even if the divisor is a constant.
153 //
154 // TRANSITION, LLVM-37932
155 //
// The functions here perform division-by-constant using multiplications
// in the same way as 64-bit compilers do.
158 //
159 // NB:
160 // The multipliers and shift values are the ones generated by clang x64
161 // for expressions like x/5, x/10, etc.
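//
// As a sketch of the scheme (illustrative commentary, not from the original
// sources): 0xCCCCCCCCCCCCCCCD == (2^66 + 1) / 5, so for every 64-bit __x,
//   __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 2
//     == floor(__x * (2^66 + 1) / (5 * 2^66))
//     == __x / 5,
// because the error term __x / (5 * 2^66) is below 1/20, while __x / 5 is
// always at least 1/5 away from the next larger integer.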
162 
163 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
164   return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 2;
165 }
166 
167 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
168   return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 3;
169 }
170 
171 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
172   return __umulh(__x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
173 }
174 
175 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
176   return __umulh(__x, 0xABCC77118461CEFDu) >> 26;
177 }
178 
179 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
180   return __umulh(__x >> 9, 0x44B82FA09B5A53u) >> 11;
181 }
182 
183 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
184   // Avoid 64-bit math as much as possible.
185   // Returning static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x)) would
186   // perform 32x64-bit multiplication and 64-bit subtraction.
  // __x and 1000000000 * __div1e9(__x) are guaranteed to differ by
  // less than 10^9, so their difference fits in 32 bits and the
  // subtraction can be performed modulo 2^32, i.e. after truncating
  // both sides to uint32_t.
  // We can also simplify static_cast<uint32_t>(1000000000 * __div1e9(__x)):
  // truncating before multiplying instead of after is fine, as the
  // highest 32 bits of __div1e9(__x) can't affect the lowest 32 bits.
193   return static_cast<uint32_t>(__x) - 1000000000 * static_cast<uint32_t>(__div1e9(__x));
194 }
195 
196 #else // ^^^ 32-bit ^^^ / vvv 64-bit vvv
197 
198 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
199   return __x / 5;
200 }
201 
202 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
203   return __x / 10;
204 }
205 
206 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
207   return __x / 100;
208 }
209 
210 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
211   return __x / 100000000;
212 }
213 
214 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
215   return __x / 1000000000;
216 }
217 
218 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
219   return static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x));
220 }
221 
222 #endif // ^^^ 64-bit ^^^
223 
224 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __pow5Factor(uint64_t __value) {
225   uint32_t __count = 0;
226   for (;;) {
227     _LIBCPP_ASSERT(__value != 0, "");
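    // (Descriptive note, not from the original sources.) The remainder below
    // is computed modulo 2^32: 0 <= __value - 5 * __q < 5, so the truncated
    // 32-bit difference equals the true remainder.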
228     const uint64_t __q = __div5(__value);
229     const uint32_t __r = static_cast<uint32_t>(__value) - 5 * static_cast<uint32_t>(__q);
230     if (__r != 0) {
231       break;
232     }
233     __value = __q;
234     ++__count;
235   }
236   return __count;
237 }
238 
239 // Returns true if __value is divisible by 5^__p.
240 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf5(const uint64_t __value, const uint32_t __p) {
241   // I tried a case distinction on __p, but there was no performance difference.
242   return __pow5Factor(__value) >= __p;
243 }
244 
245 // Returns true if __value is divisible by 2^__p.
246 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf2(const uint64_t __value, const uint32_t __p) {
247   _LIBCPP_ASSERT(__value != 0, "");
248   _LIBCPP_ASSERT(__p < 64, "");
249   // __builtin_ctzll doesn't appear to be faster here.
250   return (__value & ((1ull << __p) - 1)) == 0;
251 }
252 
253 _LIBCPP_END_NAMESPACE_STD
254 
255 // clang-format on
256 
257 #endif // _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
258