1 #ifdef ZIMG_X86_AVX512
2
3 #include <cstdint>
4 #include <immintrin.h>
5 #include "common/align.h"
6 #include "common/ccdep.h"
7 #include "dither_x86.h"
8
9 #include "common/x86/avx512_util.h"
10
11 namespace zimg {
12 namespace depth {
13
14 namespace {
15
16 struct LoadU8 {
17 typedef uint8_t type;
18
load16zimg::depth::__anon0a030b2a0111::LoadU819 static inline FORCE_INLINE __m512 load16(const uint8_t *ptr)
20 {
21 return _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_load_si128((const __m128i *)ptr)));
22 }
23 };
24
25 struct LoadU16 {
26 typedef uint16_t type;
27
load16zimg::depth::__anon0a030b2a0111::LoadU1628 static inline FORCE_INLINE __m512 load16(const uint16_t *ptr)
29 {
30 return _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm256_load_si256((const __m256i *)ptr)));
31
32 }
33 };
34
35 struct LoadF16 {
36 typedef uint16_t type;
37
load16zimg::depth::__anon0a030b2a0111::LoadF1638 static inline FORCE_INLINE __m512 load16(const uint16_t *ptr)
39 {
40 return _mm512_cvtph_ps(_mm256_load_si256((const __m256i *)ptr));
41 }
42 };
43
44 struct LoadF32 {
45 typedef float type;
46
load16zimg::depth::__anon0a030b2a0111::LoadF3247 static inline FORCE_INLINE __m512 load16(const float *ptr)
48 {
49 return _mm512_load_ps(ptr);
50 }
51 };
52
53 struct StoreU8 {
54 typedef uint8_t type;
55
mask_store16zimg::depth::__anon0a030b2a0111::StoreU856 static inline FORCE_INLINE void mask_store16(uint8_t *ptr, __mmask16 mask, __m512i x)
57 {
58 _mm_mask_storeu_epi8(ptr, mask, _mm512_cvtusepi32_epi8(x));
59 }
60 };
61
62 struct StoreU16 {
63 typedef uint16_t type;
64
mask_store16zimg::depth::__anon0a030b2a0111::StoreU1665 static inline FORCE_INLINE void mask_store16(uint16_t *ptr, __mmask16 mask, __m512i x)
66 {
67 _mm256_mask_storeu_epi16(ptr, mask, _mm512_cvtusepi32_epi16(x));
68 }
69 };
70
71
ordered_dither_avx512_xiter(__m512 x,unsigned j,const float * dither,unsigned dither_offset,unsigned dither_mask,const __m512 & scale,const __m512 & offset,const __m512i & out_max)72 inline FORCE_INLINE __m512i ordered_dither_avx512_xiter(__m512 x, unsigned j, const float *dither, unsigned dither_offset, unsigned dither_mask,
73 const __m512 &scale, const __m512 &offset, const __m512i &out_max)
74 {
75 __m512 dith = _mm512_load_ps(dither + ((dither_offset + j) & dither_mask));
76 __m512i out;
77
78 x = _mm512_fmadd_ps(scale, x, offset);
79 x = _mm512_add_ps(x, dith);
80 out = _mm512_cvtps_epi32(x);
81 out = _mm512_min_epi32(out, out_max);
82 out = _mm512_max_epi32(out, _mm512_setzero_si512());
83
84 return out;
85 }
86
87 template <class Load, class Store>
ordered_dither_avx512_impl(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)88 inline FORCE_INLINE void ordered_dither_avx512_impl(const float *dither, unsigned dither_offset, unsigned dither_mask,
89 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
90 {
91 const typename Load::type *src_p = static_cast<const typename Load::type *>(src);
92 typename Store::type *dst_p = static_cast<typename Store::type *>(dst);
93
94 unsigned vec_left = ceil_n(left, 16);
95 unsigned vec_right = floor_n(right, 16);
96
97 const __m512 scale_ps = _mm512_set1_ps(scale);
98 const __m512 offset_ps = _mm512_set1_ps(offset);
99 const __m512i out_max = _mm512_set1_epi32((1 << bits) - 1);
100
101 #define XARGS dither, dither_offset, dither_mask, scale_ps, offset_ps, out_max
102 if (left != vec_left) {
103 __m512 x = Load::load16(src_p + vec_left - 16);
104 __m512i out = ordered_dither_avx512_xiter(x, vec_left - 16, XARGS);
105
106 Store::mask_store16(dst_p + vec_left - 16, mmask16_set_hi(vec_left - left), out);
107 }
108
109 for (unsigned j = vec_left; j < vec_right; j += 16) {
110 __m512 x = Load::load16(src_p + j);
111 __m512i out = ordered_dither_avx512_xiter(x, j, XARGS);
112
113 Store::mask_store16(dst_p + j, 0xFFFFU, out);
114 }
115
116 if (right != vec_right) {
117 __m512 x = Load::load16(src_p + vec_right);
118 __m512i out = ordered_dither_avx512_xiter(x, vec_right, XARGS);
119
120 Store::mask_store16(dst_p + vec_right, mmask16_set_lo(right - vec_right), out);
121 }
122 #undef XARGS
123 }
124
125 } // namespace
126
127
ordered_dither_b2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)128 void ordered_dither_b2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
129 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
130 {
131 ordered_dither_avx512_impl<LoadU8, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
132 }
133
ordered_dither_b2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)134 void ordered_dither_b2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
135 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
136 {
137 ordered_dither_avx512_impl<LoadU8, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
138 }
139
ordered_dither_w2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)140 void ordered_dither_w2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
141 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
142 {
143 ordered_dither_avx512_impl<LoadU16, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
144 }
145
ordered_dither_w2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)146 void ordered_dither_w2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
147 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
148 {
149 ordered_dither_avx512_impl<LoadU16, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
150 }
151
ordered_dither_h2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)152 void ordered_dither_h2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
153 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
154 {
155 ordered_dither_avx512_impl<LoadF16, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
156 }
157
ordered_dither_h2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)158 void ordered_dither_h2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
159 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
160 {
161 ordered_dither_avx512_impl<LoadF16, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
162 }
163
ordered_dither_f2b_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)164 void ordered_dither_f2b_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
165 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
166 {
167 ordered_dither_avx512_impl<LoadF32, StoreU8>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
168 }
169
ordered_dither_f2w_avx512(const float * dither,unsigned dither_offset,unsigned dither_mask,const void * src,void * dst,float scale,float offset,unsigned bits,unsigned left,unsigned right)170 void ordered_dither_f2w_avx512(const float *dither, unsigned dither_offset, unsigned dither_mask,
171 const void *src, void *dst, float scale, float offset, unsigned bits, unsigned left, unsigned right)
172 {
173 ordered_dither_avx512_impl<LoadF32, StoreU16>(dither, dither_offset, dither_mask, src, dst, scale, offset, bits, left, right);
174 }
175
176 } // namespace depth
177 } // namespace zimg
178
179 #endif // ZIMG_X86_AVX512
180