1/* 2 * SPDX-License-Identifier: GPL-2.0-or-later 3 * buffer_is_zero acceleration, x86 version. 4 */ 5 6#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) 7#include <immintrin.h> 8 9/* Helper for preventing the compiler from reassociating 10 chains of binary vector operations. */ 11#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1)) 12 13/* Note that these vectorized functions may assume len >= 256. */ 14 15static bool __attribute__((target("sse2"))) 16buffer_zero_sse2(const void *buf, size_t len) 17{ 18 /* Unaligned loads at head/tail. */ 19 __m128i v = *(__m128i_u *)(buf); 20 __m128i w = *(__m128i_u *)(buf + len - 16); 21 /* Align head/tail to 16-byte boundaries. */ 22 const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16); 23 const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16); 24 __m128i zero = { 0 }; 25 26 /* Collect a partial block at tail end. */ 27 v |= e[-1]; w |= e[-2]; 28 SSE_REASSOC_BARRIER(v, w); 29 v |= e[-3]; w |= e[-4]; 30 SSE_REASSOC_BARRIER(v, w); 31 v |= e[-5]; w |= e[-6]; 32 SSE_REASSOC_BARRIER(v, w); 33 v |= e[-7]; v |= w; 34 35 /* 36 * Loop over complete 128-byte blocks. 37 * With the head and tail removed, e - p >= 14, so the loop 38 * must iterate at least once. 39 */ 40 do { 41 v = _mm_cmpeq_epi8(v, zero); 42 if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) { 43 return false; 44 } 45 v = p[0]; w = p[1]; 46 SSE_REASSOC_BARRIER(v, w); 47 v |= p[2]; w |= p[3]; 48 SSE_REASSOC_BARRIER(v, w); 49 v |= p[4]; w |= p[5]; 50 SSE_REASSOC_BARRIER(v, w); 51 v |= p[6]; w |= p[7]; 52 SSE_REASSOC_BARRIER(v, w); 53 v |= w; 54 p += 8; 55 } while (p < e - 7); 56 57 return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF; 58} 59 60#ifdef CONFIG_AVX2_OPT 61static bool __attribute__((target("avx2"))) 62buffer_zero_avx2(const void *buf, size_t len) 63{ 64 /* Unaligned loads at head/tail. */ 65 __m256i v = *(__m256i_u *)(buf); 66 __m256i w = *(__m256i_u *)(buf + len - 32); 67 /* Align head/tail to 32-byte boundaries. */ 68 const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32); 69 const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32); 70 __m256i zero = { 0 }; 71 72 /* Collect a partial block at tail end. */ 73 v |= e[-1]; w |= e[-2]; 74 SSE_REASSOC_BARRIER(v, w); 75 v |= e[-3]; w |= e[-4]; 76 SSE_REASSOC_BARRIER(v, w); 77 v |= e[-5]; w |= e[-6]; 78 SSE_REASSOC_BARRIER(v, w); 79 v |= e[-7]; v |= w; 80 81 /* Loop over complete 256-byte blocks. */ 82 for (; p < e - 7; p += 8) { 83 /* PTEST is not profitable here. */ 84 v = _mm256_cmpeq_epi8(v, zero); 85 if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) { 86 return false; 87 } 88 v = p[0]; w = p[1]; 89 SSE_REASSOC_BARRIER(v, w); 90 v |= p[2]; w |= p[3]; 91 SSE_REASSOC_BARRIER(v, w); 92 v |= p[4]; w |= p[5]; 93 SSE_REASSOC_BARRIER(v, w); 94 v |= p[6]; w |= p[7]; 95 SSE_REASSOC_BARRIER(v, w); 96 v |= w; 97 } 98 99 return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF; 100} 101#endif /* CONFIG_AVX2_OPT */ 102 103static biz_accel_fn const accel_table[] = { 104 buffer_is_zero_int_ge256, 105 buffer_zero_sse2, 106#ifdef CONFIG_AVX2_OPT 107 buffer_zero_avx2, 108#endif 109}; 110 111static unsigned best_accel(void) 112{ 113 unsigned info = cpuinfo_init(); 114 115#ifdef CONFIG_AVX2_OPT 116 if (info & CPUINFO_AVX2) { 117 return 2; 118 } 119#endif 120 return info & CPUINFO_SSE2 ? 1 : 0; 121} 122 123#else 124# include "host/include/generic/host/bufferiszero.c.inc" 125#endif 126