xref: /qemu/util/bufferiszero.c (revision d1da8af8)
1 /*
2  * Simple C functions to supplement the C library
3  *
4  * Copyright (c) 2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu/osdep.h"
25 #include "qemu/cutils.h"
26 #include "qemu/bswap.h"
27 #include "host/cpuinfo.h"
28 
29 typedef bool (*biz_accel_fn)(const void *, size_t);
30 
31 static bool buffer_is_zero_int_lt256(const void *buf, size_t len)
32 {
33     uint64_t t;
34     const uint64_t *p, *e;
35 
36     /*
37      * Use unaligned memory access functions to handle
38      * the beginning and end of the buffer.
39      */
40     if (unlikely(len <= 8)) {
41         return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0;
42     }
43 
44     t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
45     p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
46     e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);
47 
48     /* Read 0 to 31 aligned words from the middle. */
49     while (p < e) {
50         t |= *p++;
51     }
52     return t == 0;
53 }
54 
55 static bool buffer_is_zero_int_ge256(const void *buf, size_t len)
56 {
57     /*
58      * Use unaligned memory access functions to handle
59      * the beginning and end of the buffer.
60      */
61     uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8);
62     const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8);
63     const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8);
64 
65     /* Collect a partial block at the tail end. */
66     t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1];
67 
68     /*
69      * Loop over 64 byte blocks.
70      * With the head and tail removed, e - p >= 30,
71      * so the loop must iterate at least 3 times.
72      */
73     do {
74         if (t) {
75             return false;
76         }
77         t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
78         p += 8;
79     } while (p < e - 7);
80 
81     return t == 0;
82 }
83 
84 #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
85 #include <immintrin.h>
86 
87 /* Helper for preventing the compiler from reassociating
88    chains of binary vector operations.  */
89 #define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
90 
91 /* Note that these vectorized functions may assume len >= 256.  */
92 
93 static bool __attribute__((target("sse2")))
94 buffer_zero_sse2(const void *buf, size_t len)
95 {
96     /* Unaligned loads at head/tail.  */
97     __m128i v = *(__m128i_u *)(buf);
98     __m128i w = *(__m128i_u *)(buf + len - 16);
99     /* Align head/tail to 16-byte boundaries.  */
100     const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
101     const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
102     __m128i zero = { 0 };
103 
104     /* Collect a partial block at tail end.  */
105     v |= e[-1]; w |= e[-2];
106     SSE_REASSOC_BARRIER(v, w);
107     v |= e[-3]; w |= e[-4];
108     SSE_REASSOC_BARRIER(v, w);
109     v |= e[-5]; w |= e[-6];
110     SSE_REASSOC_BARRIER(v, w);
111     v |= e[-7]; v |= w;
112 
113     /*
114      * Loop over complete 128-byte blocks.
115      * With the head and tail removed, e - p >= 14, so the loop
116      * must iterate at least once.
117      */
118     do {
119         v = _mm_cmpeq_epi8(v, zero);
120         if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
121             return false;
122         }
123         v = p[0]; w = p[1];
124         SSE_REASSOC_BARRIER(v, w);
125         v |= p[2]; w |= p[3];
126         SSE_REASSOC_BARRIER(v, w);
127         v |= p[4]; w |= p[5];
128         SSE_REASSOC_BARRIER(v, w);
129         v |= p[6]; w |= p[7];
130         SSE_REASSOC_BARRIER(v, w);
131         v |= w;
132         p += 8;
133     } while (p < e - 7);
134 
135     return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
136 }
137 
138 #ifdef CONFIG_AVX2_OPT
139 static bool __attribute__((target("avx2")))
140 buffer_zero_avx2(const void *buf, size_t len)
141 {
142     /* Unaligned loads at head/tail.  */
143     __m256i v = *(__m256i_u *)(buf);
144     __m256i w = *(__m256i_u *)(buf + len - 32);
145     /* Align head/tail to 32-byte boundaries.  */
146     const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
147     const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
148     __m256i zero = { 0 };
149 
150     /* Collect a partial block at tail end.  */
151     v |= e[-1]; w |= e[-2];
152     SSE_REASSOC_BARRIER(v, w);
153     v |= e[-3]; w |= e[-4];
154     SSE_REASSOC_BARRIER(v, w);
155     v |= e[-5]; w |= e[-6];
156     SSE_REASSOC_BARRIER(v, w);
157     v |= e[-7]; v |= w;
158 
159     /* Loop over complete 256-byte blocks.  */
160     for (; p < e - 7; p += 8) {
161         /* PTEST is not profitable here.  */
162         v = _mm256_cmpeq_epi8(v, zero);
163         if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
164             return false;
165         }
166         v = p[0]; w = p[1];
167         SSE_REASSOC_BARRIER(v, w);
168         v |= p[2]; w |= p[3];
169         SSE_REASSOC_BARRIER(v, w);
170         v |= p[4]; w |= p[5];
171         SSE_REASSOC_BARRIER(v, w);
172         v |= p[6]; w |= p[7];
173         SSE_REASSOC_BARRIER(v, w);
174         v |= w;
175     }
176 
177     return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
178 }
179 #endif /* CONFIG_AVX2_OPT */
180 
181 static biz_accel_fn const accel_table[] = {
182     buffer_is_zero_int_ge256,
183     buffer_zero_sse2,
184 #ifdef CONFIG_AVX2_OPT
185     buffer_zero_avx2,
186 #endif
187 };
188 
189 static unsigned best_accel(void)
190 {
191     unsigned info = cpuinfo_init();
192 
193 #ifdef CONFIG_AVX2_OPT
194     if (info & CPUINFO_AVX2) {
195         return 2;
196     }
197 #endif
198     return info & CPUINFO_SSE2 ? 1 : 0;
199 }
200 
201 #elif defined(__aarch64__) && defined(__ARM_NEON)
202 #include <arm_neon.h>
203 
204 /*
205  * Helper for preventing the compiler from reassociating
206  * chains of binary vector operations.
207  */
208 #define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1))
209 
210 static bool buffer_is_zero_simd(const void *buf, size_t len)
211 {
212     uint32x4_t t0, t1, t2, t3;
213 
214     /* Align head/tail to 16-byte boundaries.  */
215     const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
216     const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
217 
218     /* Unaligned loads at head/tail.  */
219     t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16);
220 
221     /* Collect a partial block at tail end.  */
222     t1 = e[-7] | e[-6];
223     t2 = e[-5] | e[-4];
224     t3 = e[-3] | e[-2];
225     t0 |= e[-1];
226     REASSOC_BARRIER(t0, t1);
227     REASSOC_BARRIER(t2, t3);
228     t0 |= t1;
229     t2 |= t3;
230     REASSOC_BARRIER(t0, t2);
231     t0 |= t2;
232 
233     /*
234      * Loop over complete 128-byte blocks.
235      * With the head and tail removed, e - p >= 14, so the loop
236      * must iterate at least once.
237      */
238     do {
239         /*
240          * Reduce via UMAXV.  Whatever the actual result,
241          * it will only be zero if all input bytes are zero.
242          */
243         if (unlikely(vmaxvq_u32(t0) != 0)) {
244             return false;
245         }
246 
247         t0 = p[0] | p[1];
248         t1 = p[2] | p[3];
249         t2 = p[4] | p[5];
250         t3 = p[6] | p[7];
251         REASSOC_BARRIER(t0, t1);
252         REASSOC_BARRIER(t2, t3);
253         t0 |= t1;
254         t2 |= t3;
255         REASSOC_BARRIER(t0, t2);
256         t0 |= t2;
257         p += 8;
258     } while (p < e - 7);
259 
260     return vmaxvq_u32(t0) == 0;
261 }
262 
263 #define best_accel() 1
264 static biz_accel_fn const accel_table[] = {
265     buffer_is_zero_int_ge256,
266     buffer_is_zero_simd,
267 };
268 #else
269 #define best_accel() 0
270 static biz_accel_fn const accel_table[1] = {
271     buffer_is_zero_int_ge256
272 };
273 #endif
274 
275 static biz_accel_fn buffer_is_zero_accel;
276 static unsigned accel_index;
277 
278 bool buffer_is_zero_ool(const void *buf, size_t len)
279 {
280     if (unlikely(len == 0)) {
281         return true;
282     }
283     if (!buffer_is_zero_sample3(buf, len)) {
284         return false;
285     }
286     /* All bytes are covered for any len <= 3.  */
287     if (unlikely(len <= 3)) {
288         return true;
289     }
290 
291     if (likely(len >= 256)) {
292         return buffer_is_zero_accel(buf, len);
293     }
294     return buffer_is_zero_int_lt256(buf, len);
295 }
296 
297 bool buffer_is_zero_ge256(const void *buf, size_t len)
298 {
299     return buffer_is_zero_accel(buf, len);
300 }
301 
302 bool test_buffer_is_zero_next_accel(void)
303 {
304     if (accel_index != 0) {
305         buffer_is_zero_accel = accel_table[--accel_index];
306         return true;
307     }
308     return false;
309 }
310 
311 static void __attribute__((constructor)) init_accel(void)
312 {
313     accel_index = best_accel();
314     buffer_is_zero_accel = accel_table[accel_index];
315 }
316