/*
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 *
 * Copyright (c) 2020 Andrey Semashev
 */
/*!
 * \file   find_address_sse2.cpp
 *
 * This file contains the SSE2 implementation of the \c find_address algorithm
 */

#include <boost/predef/architecture/x86.h>
#include <boost/atomic/detail/int_sizes.hpp>

#if BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8 || BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 4)

#include <cstddef>
#include <emmintrin.h>

#include <boost/cstdint.hpp>
#include <boost/atomic/detail/config.hpp>
#include <boost/atomic/detail/intptr.hpp>
#include "find_address.hpp"
#include "x86_vector_tools.hpp"
#include "bit_operation_tools.hpp"

#include <boost/atomic/detail/header.hpp>

namespace boost {
namespace atomics {
namespace detail {

#if BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8
namespace {
BOOST_FORCEINLINE __m128i mm_pand_si128(__m128i mm1, __m128i mm2)
{
    // As of 2020, gcc, clang and icc prefer to generate andps instead of pand if the surrounding
    // instructions pertain to the FP domain, even if we use the _mm_and_si128 intrinsic. In our
    // algorithm implementation, the FP instructions happen to be shufps, which is not actually
    // restricted to the FP domain (it is implemented in a separate MMX EU in Pentium 4 or
    // a shuffle EU in the INT domain in Core 2; on AMD K8/K10 all SSE instructions are implemented in
    // the FADD, FMUL and FMISC EUs regardless of INT/FP data types, and shufps is implemented in FADD/FMUL).
    // In other words, there should be no domain bypass penalty between shufps and pand.
    //
    // This would usually not pose a problem, since andps and pand have the same latency and throughput
    // on most architectures of that age (before SSE4.1). However, it is possible that a newer architecture
    // runs the SSE2 code path (e.g. because some unusual compiler doesn't support SSE4.1 or because
    // a hypervisor blocks SSE4.1 detection), and there pand may have better throughput. For example,
    // Sandy Bridge can execute 3 pand instructions per cycle, but only one andps. For this reason
    // we prefer to generate pand and not andps.
#if defined(__GNUC__)
    __asm__("pand %1, %0\n\t" : "+x" (mm1) : "x" (mm2));
#else
    mm1 = _mm_and_si128(mm1, mm2);
#endif
    return mm1;
}

} // namespace
#endif // BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8

//! SSE2 implementation of the \c find_address algorithm
std::size_t find_address_sse2(const volatile void* addr, const volatile void* const* addrs, std::size_t size)
{
#if BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8

    if (size < 12u)
        return find_address_generic(addr, addrs, size);

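    // Each 64-bit pointer occupies two 32-bit lanes of an __m128i, and _mm_cmpeq_epi32 compares
    // 32-bit lanes independently, so a pointer matches only if both of its halves compare equal.
    // The element count is rounded up to an even number; the addrs array is assumed to be
    // 16-byte aligned and padded so that the resulting over-read is safe. If no match is found,
    // the returned position is >= size.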
    const __m128i mm_addr = mm_set1_epiptr((uintptr_t)addr);
    std::size_t pos = 0u;
    const std::size_t n = (size + 1u) & ~static_cast< std::size_t >(1u);
    for (std::size_t m = n & ~static_cast< std::size_t >(15u); pos < m; pos += 16u)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
        __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 2u));
        __m128i mm3 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 4u));
        __m128i mm4 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 6u));
        __m128i mm5 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 8u));
        __m128i mm6 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 10u));
        __m128i mm7 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 12u));
        __m128i mm8 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 14u));

        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);
        mm2 = _mm_cmpeq_epi32(mm2, mm_addr);
        mm3 = _mm_cmpeq_epi32(mm3, mm_addr);
        mm4 = _mm_cmpeq_epi32(mm4, mm_addr);
        mm5 = _mm_cmpeq_epi32(mm5, mm_addr);
        mm6 = _mm_cmpeq_epi32(mm6, mm_addr);
        mm7 = _mm_cmpeq_epi32(mm7, mm_addr);
        mm8 = _mm_cmpeq_epi32(mm8, mm_addr);

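        // De-interleave the comparison results: shufps with _MM_SHUFFLE(2, 0, 2, 0) gathers the
        // low 32-bit halves of the four 64-bit elements in each register pair, and
        // _MM_SHUFFLE(3, 1, 3, 1) gathers the high halves. ANDing the two leaves all-ones
        // only for pointers whose both halves matched.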
        __m128i mm_mask1_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask1_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(3, 1, 3, 1)));

        __m128i mm_mask2_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask2_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(3, 1, 3, 1)));

        __m128i mm_mask3_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm5), _mm_castsi128_ps(mm6), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask3_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm5), _mm_castsi128_ps(mm6), _MM_SHUFFLE(3, 1, 3, 1)));

        __m128i mm_mask4_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm7), _mm_castsi128_ps(mm8), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask4_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm7), _mm_castsi128_ps(mm8), _MM_SHUFFLE(3, 1, 3, 1)));

        mm_mask1_lo = mm_pand_si128(mm_mask1_lo, mm_mask1_hi);
        mm_mask2_lo = mm_pand_si128(mm_mask2_lo, mm_mask2_hi);
        mm_mask3_lo = mm_pand_si128(mm_mask3_lo, mm_mask3_hi);
        mm_mask4_lo = mm_pand_si128(mm_mask4_lo, mm_mask4_hi);

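        // Two saturating packs compress each 32-bit per-pointer mask to a single byte, so
        // _mm_movemask_epi8 yields one bit per pointer and count_trailing_zeros gives the
        // index of the first match directly.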
        mm_mask1_lo = _mm_packs_epi32(mm_mask1_lo, mm_mask2_lo);
        mm_mask3_lo = _mm_packs_epi32(mm_mask3_lo, mm_mask4_lo);

        mm_mask1_lo = _mm_packs_epi16(mm_mask1_lo, mm_mask3_lo);

        uint32_t mask = _mm_movemask_epi8(mm_mask1_lo);
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask);
            goto done;
        }
    }

    if ((n - pos) >= 8u)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
        __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 2u));
        __m128i mm3 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 4u));
        __m128i mm4 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 6u));

        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);
        mm2 = _mm_cmpeq_epi32(mm2, mm_addr);
        mm3 = _mm_cmpeq_epi32(mm3, mm_addr);
        mm4 = _mm_cmpeq_epi32(mm4, mm_addr);

        __m128i mm_mask1_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask1_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(3, 1, 3, 1)));

        __m128i mm_mask2_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask2_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(3, 1, 3, 1)));

        mm_mask1_lo = mm_pand_si128(mm_mask1_lo, mm_mask1_hi);
        mm_mask2_lo = mm_pand_si128(mm_mask2_lo, mm_mask2_hi);

        mm_mask1_lo = _mm_packs_epi32(mm_mask1_lo, mm_mask2_lo);

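        // Only one pack has been applied here, so each pointer maps to a 16-bit lane, i.e.
        // two bits in the byte mask; hence the trailing zero count is divided by 2.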
        uint32_t mask = _mm_movemask_epi8(mm_mask1_lo);
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask) / 2u;
            goto done;
        }

        pos += 8u;
    }

    if ((n - pos) >= 4u)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
        __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 2u));

        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);
        mm2 = _mm_cmpeq_epi32(mm2, mm_addr);

        __m128i mm_mask1_lo = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
        __m128i mm_mask1_hi = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(3, 1, 3, 1)));

        mm_mask1_lo = mm_pand_si128(mm_mask1_lo, mm_mask1_hi);

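        // After the AND, each 32-bit lane corresponds to one pointer, so _mm_movemask_ps
        // produces one bit per pointer.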
        uint32_t mask = _mm_movemask_ps(_mm_castsi128_ps(mm_mask1_lo));
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask);
            goto done;
        }

        pos += 4u;
    }

    if (pos < n)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));

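        // Final pair of pointers: _mm_shuffle_epi32 with _MM_SHUFFLE(2, 3, 0, 1) swaps the
        // 32-bit halves within each 64-bit lane; ANDing with the original leaves all-ones in
        // a 64-bit lane only if both halves matched, and _mm_movemask_pd then extracts one
        // bit per pointer.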
        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);
        __m128i mm_mask = _mm_shuffle_epi32(mm1, _MM_SHUFFLE(2, 3, 0, 1));
        mm_mask = mm_pand_si128(mm_mask, mm1);

        uint32_t mask = _mm_movemask_pd(_mm_castsi128_pd(mm_mask));
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask);
            goto done;
        }

        pos += 2u;
    }

done:
    return pos;

#else // BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8

    if (size < 10u)
        return find_address_generic(addr, addrs, size);

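    // With 32-bit pointers each element occupies a single 32-bit lane, so one _mm_cmpeq_epi32
    // per vector suffices. The element count is rounded up to a multiple of 4; as in the 64-bit
    // path, the addrs array is assumed to be aligned and padded so the over-read is safe.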
    const __m128i mm_addr = _mm_set1_epi32((uintptr_t)addr);
    std::size_t pos = 0u;
    const std::size_t n = (size + 3u) & ~static_cast< std::size_t >(3u);
    for (std::size_t m = n & ~static_cast< std::size_t >(15u); pos < m; pos += 16u)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
        __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 4u));
        __m128i mm3 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 8u));
        __m128i mm4 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 12u));

        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);
        mm2 = _mm_cmpeq_epi32(mm2, mm_addr);
        mm3 = _mm_cmpeq_epi32(mm3, mm_addr);
        mm4 = _mm_cmpeq_epi32(mm4, mm_addr);

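        // Compress the four 32-bit comparison masks down to one byte per pointer, so that
        // _mm_movemask_epi8 yields one bit per pointer.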
        mm1 = _mm_packs_epi32(mm1, mm2);
        mm3 = _mm_packs_epi32(mm3, mm4);

        mm1 = _mm_packs_epi16(mm1, mm3);

        uint32_t mask = _mm_movemask_epi8(mm1);
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask);
            goto done;
        }
    }

    if ((n - pos) >= 8u)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
        __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 4u));

        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);
        mm2 = _mm_cmpeq_epi32(mm2, mm_addr);

        mm1 = _mm_packs_epi32(mm1, mm2);

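        // A single pack leaves 16-bit lanes, i.e. two mask bits per pointer, hence the
        // trailing zero count is divided by 2.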
        uint32_t mask = _mm_movemask_epi8(mm1);
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask) / 2u;
            goto done;
        }

        pos += 8u;
    }

    if (pos < n)
    {
        __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));

        mm1 = _mm_cmpeq_epi32(mm1, mm_addr);

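        // One comparison mask per 32-bit lane; _mm_movemask_ps produces one bit per pointer.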
        uint32_t mask = _mm_movemask_ps(_mm_castsi128_ps(mm1));
        if (mask)
        {
            pos += atomics::detail::count_trailing_zeros(mask);
            goto done;
        }

        pos += 4u;
    }

done:
    return pos;

#endif // BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8
}

} // namespace detail
} // namespace atomics
} // namespace boost

#include <boost/atomic/detail/footer.hpp>

#endif // BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8 || BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 4)