1 /*
2  * Distributed under the Boost Software License, Version 1.0.
3  * (See accompanying file LICENSE_1_0.txt or copy at
4  * http://www.boost.org/LICENSE_1_0.txt)
5  *
6  * Copyright (c) 2020 Andrey Semashev
7  */
8 /*!
9  * \file   find_address_sse41.cpp
10  *
11  * This file contains SSE4.1 implementation of the \c find_address algorithm
12  */
13 
14 #include <boost/predef/architecture/x86.h>
15 #include <boost/atomic/detail/int_sizes.hpp>
16 
17 #if BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8)
18 
19 #include <cstddef>
20 #include <smmintrin.h>
21 
22 #include <boost/cstdint.hpp>
23 #include <boost/atomic/detail/config.hpp>
24 #include <boost/atomic/detail/intptr.hpp>
25 #include "find_address.hpp"
26 #include "x86_vector_tools.hpp"
27 #include "bit_operation_tools.hpp"
28 
29 #include <boost/atomic/detail/header.hpp>
30 
31 namespace boost {
32 namespace atomics {
33 namespace detail {
34 
35 //! SSE4.1 implementation of the \c find_address algorithm
find_address_sse41(const volatile void * addr,const volatile void * const * addrs,std::size_t size)36 std::size_t find_address_sse41(const volatile void* addr, const volatile void* const* addrs, std::size_t size)
37 {
38     if (size < 12u)
39         return find_address_generic(addr, addrs, size);
40 
41     const __m128i mm_addr = mm_set1_epiptr((uintptr_t)addr);
42     std::size_t pos = 0u;
43     const std::size_t n = (size + 1u) & ~static_cast< std::size_t >(1u);
44     for (std::size_t m = n & ~static_cast< std::size_t >(15u); pos < m; pos += 16u)
45     {
46         __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
47         __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 2u));
48         __m128i mm3 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 4u));
49         __m128i mm4 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 6u));
50         __m128i mm5 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 8u));
51         __m128i mm6 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 10u));
52         __m128i mm7 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 12u));
53         __m128i mm8 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 14u));
54 
55         mm1 = _mm_cmpeq_epi64(mm1, mm_addr);
56         mm2 = _mm_cmpeq_epi64(mm2, mm_addr);
57         mm3 = _mm_cmpeq_epi64(mm3, mm_addr);
58         mm4 = _mm_cmpeq_epi64(mm4, mm_addr);
59         mm5 = _mm_cmpeq_epi64(mm5, mm_addr);
60         mm6 = _mm_cmpeq_epi64(mm6, mm_addr);
61         mm7 = _mm_cmpeq_epi64(mm7, mm_addr);
62         mm8 = _mm_cmpeq_epi64(mm8, mm_addr);
63 
64         mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
65         mm3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0)));
66         mm5 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm5), _mm_castsi128_ps(mm6), _MM_SHUFFLE(2, 0, 2, 0)));
67         mm7 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm7), _mm_castsi128_ps(mm8), _MM_SHUFFLE(2, 0, 2, 0)));
68 
69         mm1 = _mm_packs_epi32(mm1, mm3);
70         mm5 = _mm_packs_epi32(mm5, mm7);
71 
72         mm1 = _mm_packs_epi16(mm1, mm5);
73 
74         uint32_t mask = _mm_movemask_epi8(mm1);
75         if (mask)
76         {
77             pos += atomics::detail::count_trailing_zeros(mask);
78             goto done;
79         }
80     }
81 
82     if ((n - pos) >= 8u)
83     {
84         __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
85         __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 2u));
86         __m128i mm3 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 4u));
87         __m128i mm4 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 6u));
88 
89         mm1 = _mm_cmpeq_epi64(mm1, mm_addr);
90         mm2 = _mm_cmpeq_epi64(mm2, mm_addr);
91         mm3 = _mm_cmpeq_epi64(mm3, mm_addr);
92         mm4 = _mm_cmpeq_epi64(mm4, mm_addr);
93 
94         mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
95         mm3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0)));
96 
97         mm1 = _mm_packs_epi32(mm1, mm3);
98 
99         uint32_t mask = _mm_movemask_epi8(mm1);
100         if (mask)
101         {
102             pos += atomics::detail::count_trailing_zeros(mask) / 2u;
103             goto done;
104         }
105 
106         pos += 8u;
107     }
108 
109     if ((n - pos) >= 4u)
110     {
111         __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
112         __m128i mm2 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos + 2u));
113 
114         mm1 = _mm_cmpeq_epi64(mm1, mm_addr);
115         mm2 = _mm_cmpeq_epi64(mm2, mm_addr);
116 
117         mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
118 
119         uint32_t mask = _mm_movemask_ps(_mm_castsi128_ps(mm1));
120         if (mask)
121         {
122             pos += atomics::detail::count_trailing_zeros(mask);
123             goto done;
124         }
125 
126         pos += 4u;
127     }
128 
129     if (pos < n)
130     {
131         __m128i mm1 = _mm_load_si128(reinterpret_cast< const __m128i* >(addrs + pos));
132 
133         mm1 = _mm_cmpeq_epi64(mm1, mm_addr);
134         uint32_t mask = _mm_movemask_pd(_mm_castsi128_pd(mm1));
135         if (mask)
136         {
137             pos += atomics::detail::count_trailing_zeros(mask);
138             goto done;
139         }
140 
141         pos += 2u;
142     }
143 
144 done:
145     return pos;
146 }
147 
148 } // namespace detail
149 } // namespace atomics
150 } // namespace boost
151 
152 #include <boost/atomic/detail/footer.hpp>
153 
154 #endif // BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8)
155