/*
 * SPDX-License-Identifier: GPL-2.0-or-later
 * buffer_is_zero acceleration, x86 version.
 */

#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
#include <immintrin.h>

/* Helper for preventing the compiler from reassociating
   chains of binary vector operations.  */
#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1))
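/*
 * As used below, the barrier keeps the two accumulators (v and w) as
 * separate dependency chains, rather than letting the compiler fold
 * them into a single serial chain of ORs.
 */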

/* Note that these vectorized functions may assume len >= 256.  */
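/*
 * The head/tail collection below unconditionally reads the first and
 * last vectors plus the final seven aligned vectors, so the common
 * buffer_is_zero dispatch code is expected to route shorter buffers
 * to the scalar path.
 */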

static bool __attribute__((target("sse2")))
buffer_zero_sse2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail.  */
    __m128i v = *(__m128i_u *)(buf);
    __m128i w = *(__m128i_u *)(buf + len - 16);
    /* Align head/tail to 16-byte boundaries.  */
    const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16);
    const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16);
    __m128i zero = { 0 };

    /* Collect a partial block at tail end.  */
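    /*
     * Together with the unaligned head/tail loads above, e[-7..-1]
     * covers every byte that the aligned loop below might not visit.
     */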
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /*
     * Loop over complete 128-byte blocks.
     * With the head and tail removed, e - p >= 14, so the loop
     * must iterate at least once.
     */
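    /*
     * The test at the top of each iteration examines the bytes
     * accumulated by the previous iteration (initially the head/tail
     * collected above); the final block is tested after the loop.
     */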
    do {
        v = _mm_cmpeq_epi8(v, zero);
        if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
        p += 8;
    } while (p < e - 7);

    return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF;
}

#ifdef CONFIG_AVX2_OPT
static bool __attribute__((target("avx2")))
buffer_zero_avx2(const void *buf, size_t len)
{
    /* Unaligned loads at head/tail.  */
    __m256i v = *(__m256i_u *)(buf);
    __m256i w = *(__m256i_u *)(buf + len - 32);
    /* Align head/tail to 32-byte boundaries.  */
    const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32);
    const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32);
    __m256i zero = { 0 };

    /* Collect a partial block at tail end.  */
    v |= e[-1]; w |= e[-2];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-3]; w |= e[-4];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-5]; w |= e[-6];
    SSE_REASSOC_BARRIER(v, w);
    v |= e[-7]; v |= w;

    /* Loop over complete 256-byte blocks.  */
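    /*
     * Unlike the SSE2 variant, this loop may run zero times: for
     * lengths near 256 bytes the head/tail collection above already
     * covers the whole buffer, hence a for loop rather than do/while.
     */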
    for (; p < e - 7; p += 8) {
        /* PTEST is not profitable here.  */
        v = _mm256_cmpeq_epi8(v, zero);
        if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) {
            return false;
        }
        v = p[0]; w = p[1];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[2]; w |= p[3];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[4]; w |= p[5];
        SSE_REASSOC_BARRIER(v, w);
        v |= p[6]; w |= p[7];
        SSE_REASSOC_BARRIER(v, w);
        v |= w;
    }

    return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF;
}
#endif /* CONFIG_AVX2_OPT */

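/*
 * Index 0 is the portable integer fallback; best_accel() below
 * returns an index into this table.
 */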
static biz_accel_fn const accel_table[] = {
    buffer_is_zero_int_ge256,
    buffer_zero_sse2,
#ifdef CONFIG_AVX2_OPT
    buffer_zero_avx2,
#endif
};

static unsigned best_accel(void)
{
    unsigned info = cpuinfo_init();

#ifdef CONFIG_AVX2_OPT
    if (info & CPUINFO_AVX2) {
        return 2;
    }
#endif
    return info & CPUINFO_SSE2 ? 1 : 0;
}

#else
# include "host/include/generic/host/bufferiszero.c.inc"
#endif