1 #ifdef ZIMG_X86_AVX512
2 
3 #include <cstdint>
4 #include <immintrin.h>
5 #include "common/align.h"
6 #include "common/ccdep.h"
7 #include "dither_x86.h"
8 
9 #include "common/x86/avx512_util.h"
10 
11 namespace zimg {
12 namespace depth {
13 
14 namespace {
15 
16 struct LoadU8 {
17 	typedef uint8_t type;
18 
load16zimg::depth::__anon0a030b2a0111::LoadU819 	static inline FORCE_INLINE __m512 load16(const uint8_t *ptr)
20 	{
21 		return _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i *)ptr)));
22 	}
23 };
24 
25 struct LoadU16 {
26 	typedef uint16_t type;
27 
load16zimg::depth::__anon0a030b2a0111::LoadU1628 	static inline FORCE_INLINE __m512 load16(const uint16_t *ptr)
29 	{
30 		return _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i *)ptr)));
31 
32 	}
33 };
34 
35 struct LoadF16 {
36 	typedef uint16_t type;
37 
load16zimg::depth::__anon0a030b2a0111::LoadF1638 	static inline FORCE_INLINE __m512 load16(const uint16_t *ptr)
39 	{
40 		return _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)ptr));
41 	}
42 };
43 
44 struct LoadF32 {
45 	typedef float type;
46 
load16zimg::depth::__anon0a030b2a0111::LoadF3247 	static inline FORCE_INLINE __m512 load16(const float *ptr)
48 	{
49 		return _mm512_load_ps(ptr);
50 	}
51 };
52 
53 struct StoreU8 {
54 	typedef uint8_t type;
55 
mask_store16zimg::depth::__anon0a030b2a0111::StoreU856 	static inline FORCE_INLINE void mask_store16(uint8_t *ptr, __mmask16 mask, __m512i x)
57 	{
58 		_mm_mask_storeu_epi8(ptr, mask, _mm512_cvtusepi32_epi8(x));
59 	}
60 };
61 
62 struct StoreU16 {
63 	typedef uint16_t type;
64 
mask_store16zimg::depth::__anon0a030b2a0111::StoreU1665 	static inline FORCE_INLINE void mask_store16(uint16_t *ptr, __mmask16 mask, __m512i x)
66 	{
67 		_mm256_mask_storeu_epi16(ptr, mask, _mm512_cvtusepi32_epi16(x));
68 	}
69 };
70 
71 
ordered_dither_avx512_xiter(__m512 x,unsigned j,const float * dither,unsigned dither_offset,unsigned dither_mask,const __m512 & scale,const __m512 & offset,const __m512i & out_max)72 inline FORCE_INLINE __m512i ordered_dither_avx512_xiter(__m512 x, unsigned j, const float *dither, unsigned dither_offset, unsigned dither_mask,
73                                                         const __m512 &scale, const __m512 &offset, const __m512i &out_max)
74 {
75 	__m512 dith = _mm512_load_ps(dither + ((dither_offset + j) & dither_mask));
76 	__m512i out;
77 
78 	x = _mm512_fmadd_ps(scale, x, offset);
79 	x = _mm512_add_ps(x, dith);
80 	out = _mm512_cvtps_epi32(x);
81 	out = _mm512_min_epi32(out, out_max);
82 	out = _mm512_max_epi32(out, _mm512_setzero_si512());
83 
84 	return out;
85 }
86 
87 template <class Load, class Store>
ordered_dither_avx512_impl(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)88 inline FORCE_INLINE void ordered_dither_avx512_impl(const float *dither, unsigned dither_offset, unsigned dither_mask,
89                                                     const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
90 {
91 	const typename Load::type *src_p = static_cast<const typename Load::type *>(src);
92 	typename Store::type *dst_p = static_cast<typename Store::type *>(dst);
93 
94 	unsigned vec_left = ceil_n(left, 16);
95 	unsigned vec_right = floor_n(right, 16);
96 
97 	const __m512 scale_ps = _mm512_set1_ps(scale);
98 	const __m512 offset_ps = _mm512_set1_ps(offset);
99 	const __m512i out_max = _mm512_set1_epi32((1 << bits) - 1);
100 
101 #define XARGS dither, dither_offset, dither_mask, scale_ps, offset_ps, out_max
102 	if (left != vec_left) {
103 		__m512 x = Load::load16(src_p + vec_left - 16);
104 		__m512i out = ordered_dither_avx512_xiter(x, vec_left - 16, XARGS);
105 
106 		Store::mask_store16(dst_p + vec_left - 16, mmask16_set_hi(vec_left - left), out);
107 	}
108 
109 	for (unsigned j = vec_left; j < vec_right; j += 16) {
110 		__m512 x = Load::load16(src_p + j);
111 		__m512i out = ordered_dither_avx512_xiter(x, j, XARGS);
112 
113 		Store::mask_store16(dst_p + j, 0xFFFFU, out);
114 	}
115 
116 	if (right != vec_right) {
117 		__m512 x = Load::load16(src_p + vec_right);
118 		__m512i out = ordered_dither_avx512_xiter(x, vec_right, XARGS);
119 
120 		Store::mask_store16(dst_p + vec_right, mmask16_set_lo(right - vec_right), out);
121 	}
122 #undef XARGS
123 }
124 
125 } // namespace
126 
127 
ordered_dither_b2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)128 void ordered_dither_b2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
129                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
130 {
131 	ordered_dither_avx512_impl<LoadU8, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
132 }
133 
ordered_dither_b2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)134 void ordered_dither_b2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
135                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
136 {
137 	ordered_dither_avx512_impl<LoadU8, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
138 }
139 
ordered_dither_w2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)140 void ordered_dither_w2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
141                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
142 {
143 	ordered_dither_avx512_impl<LoadU16, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
144 }
145 
ordered_dither_w2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)146 void ordered_dither_w2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
147                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
148 {
149 	ordered_dither_avx512_impl<LoadU16, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
150 }
151 
ordered_dither_h2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)152 void ordered_dither_h2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
153                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
154 {
155 	ordered_dither_avx512_impl<LoadF16, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
156 }
157 
ordered_dither_h2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)158 void ordered_dither_h2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
159                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
160 {
161 	ordered_dither_avx512_impl<LoadF16, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
162 }
163 
ordered_dither_f2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)164 void ordered_dither_f2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
165                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
166 {
167 	ordered_dither_avx512_impl<LoadF32, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
168 }
169 
ordered_dither_f2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)170 void ordered_dither_f2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
171                                const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
172 {
173 	ordered_dither_avx512_impl<LoadF32, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
174 }
175 
176 } // namespace depth
177 } // namespace zimg
178 
179 #endif // ZIMG_X86_AVX512
180