1 #ifdef ZIMG_X86
2
3 #include <algorithm>
4 #include <climits>
5 #include <stdexcept>
6 #include <tuple>
7 #include <type_traits>
8 #include <immintrin.h>
9 #include "common/align.h"
10 #include "common/ccdep.h"
11 #include "common/checked_int.h"
12 #include "common/except.h"
13 #include "common/make_unique.h"
14 #include "common/pixel.h"
15 #include "common/zassert.h"
16 #include "depth/quantize.h"
17 #include "graph/image_buffer.h"
18 #include "graph/image_filter.h"
19 #include "dither_x86.h"
20
21 #include "common/x86/avx_util.h"
22
23 namespace zimg {
24 namespace depth {
25
26 namespace {
27
// Per-lane Floyd-Steinberg error terms carried between calls to the
// vectorized wavefront kernel. Each array holds one float per SIMD lane,
// i.e. one per image row of the 8-row wavefront. Instances are accessed
// with aligned AVX loads/stores, so they must be 32-byte aligned at the
// point of declaration (see the alignas(32) in error_diffusion_avx2).
struct error_state {
	float err_left[8];
	float err_top_right[8];
	float err_top[8];
	float err_top_left[8];
};
34
35
// Maps a PixelType to its in-memory sample type and to scalar (load1/store1)
// and 8-wide (load8/store8) conversion helpers used by the dither kernels.
// Specialized below for BYTE, WORD, HALF and FLOAT.
template <PixelType SrcType>
struct error_diffusion_traits;
38
template <>
struct error_diffusion_traits<PixelType::BYTE> {
	typedef uint8_t type;

	// Widen a single byte sample to float.
	static float load1(const uint8_t *ptr) { return *ptr; }

	// Store one quantized value; the caller guarantees x fits in 8 bits.
	static void store1(uint8_t *ptr, uint32_t x) { *ptr = static_cast<uint8_t>(x); }

	// Load 8 byte samples and convert to 8 packed floats.
	static __m256 load8(const uint8_t *ptr)
	{
		return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)ptr)));
	}

	// Store 8 int32 values as 8 bytes. Values were clamped to the output
	// range before quantization, so the saturating packs cannot clip valid
	// data here.
	static void store8(uint8_t *ptr, __m256i x)
	{
		// int32 -> int16 with signed saturation.
		x = _mm256_packs_epi32(x, x);
		// The 256-bit pack interleaves per 128-bit half; gather the two
		// halves' results into the low lane.
		x = _mm256_permute4x64_epi64(x, _MM_SHUFFLE(0, 0, 2, 0));
		// int16 -> uint8 with unsigned saturation.
		x = _mm256_packus_epi16(x, x);
		_mm_storel_epi64((__m128i *)ptr, _mm256_castsi256_si128(x));
	}
};
59
60 template <>
61 struct error_diffusion_traits<PixelType::WORD> {
62 typedef uint16_t type;
63
load1zimg::depth::__anon1daf47140111::error_diffusion_traits64 static float load1(const uint16_t *ptr) { return *ptr; }
store1zimg::depth::__anon1daf47140111::error_diffusion_traits65 static void store1(uint16_t *ptr, uint32_t x) { *ptr = static_cast<uint32_t>(x); }
66
load8zimg::depth::__anon1daf47140111::error_diffusion_traits67 static __m256 load8(const uint16_t *ptr)
68 {
69 return _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)ptr)));
70 }
71
store8zimg::depth::__anon1daf47140111::error_diffusion_traits72 static void store8(uint16_t *ptr, __m256i x)
73 {
74 x = _mm256_packus_epi32(x, x);
75 x = _mm256_permute4x64_epi64(x, _MM_SHUFFLE(0, 0, 2, 0));
76 _mm_storeu_si128((__m128i *)ptr, _mm256_castsi256_si128(x));
77 }
78 };
79
template <>
struct error_diffusion_traits<PixelType::HALF> {
	typedef uint16_t type;

	// Convert one binary16 sample to float via the F16C conversion unit.
	static float load1(const uint16_t *ptr)
	{
		return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(*ptr)));
	}

	// Convert 8 binary16 samples to 8 packed floats.
	static __m256 load8(const uint16_t *ptr) {
		return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)ptr));
	}

	// No store helpers: HALF is only used as a source format by the
	// selector functions below (dithering always targets integer formats).
};
93
template <>
struct error_diffusion_traits<PixelType::FLOAT> {
	typedef float type;

	static float load1(const float *ptr) { return *ptr; }
	static __m256 load8(const float *ptr) { return _mm256_loadu_ps(ptr); }

	// No store helpers: FLOAT is only used as a source format by the
	// selector functions below (dithering always targets integer formats).
};
101
102
// Scalar fused multiply-add (a * b + c) forced through the FMA instruction
// so the scalar prologue/epilogue rounds identically to the vectorized
// kernel, which uses _mm256_fmadd_ps.
inline FORCE_INLINE float fma(float a, float b, float c)
{
	return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a), _mm_set_ss(b), _mm_set_ss(c)));
}
107
// Scalar max via MAXSS so the result (including NaN/operand-order behavior)
// matches _mm256_max_ps in the vectorized kernel.
inline FORCE_INLINE float max(float x, float y)
{
	return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
112
// Scalar min via MINSS so the result (including NaN/operand-order behavior)
// matches _mm256_min_ps in the vectorized kernel.
inline FORCE_INLINE float min(float x, float y)
{
	return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
117
118
// Serial Floyd-Steinberg error diffusion over (part of) one row.
//
// src/dst point at the first pixel to process. error_top holds the diffused
// error of the row above and error_cur receives this row's error; both are
// padded by one element on each side so the j-1/j+1 accesses never run out
// of bounds. scale/offset map source samples to the output range, and bits
// is the output bit depth (clamp bound 2^bits - 1).
template <PixelType SrcType, PixelType DstType>
void error_diffusion_scalar(const void *src, void *dst, const float * RESTRICT error_top, float * RESTRICT error_cur,
                            float scale, float offset, unsigned bits, unsigned width)
{
	typedef error_diffusion_traits<SrcType> src_traits;
	typedef error_diffusion_traits<DstType> dst_traits;

	const typename src_traits::type *src_p = static_cast<const typename src_traits::type *>(src);
	typename dst_traits::type *dst_p = static_cast<typename dst_traits::type *>(dst);

	// Seed the neighborhood from the padded error rows. err_left comes from
	// the current row's buffer, where a previous partial pass (or zero
	// initialization) left the error of the pixel to the left.
	float err_left = error_cur[0];
	float err_top_right;
	float err_top = error_top[0 + 1];
	float err_top_left = error_top[0];

	for (unsigned j = 0; j < width; ++j) {
		// Error array is padded by one on each side.
		unsigned j_err = j + 1;
		err_top_right = error_top[j_err + 1];

		// Rescale the source sample to the output range.
		float x = fma(src_traits::load1(src_p + j), scale, offset);
		float err, err0, err1;

		// Standard Floyd-Steinberg weights: 7/16 left, 3/16 top-right,
		// 5/16 top, 1/16 top-left. Split into two accumulators matching
		// the vector kernel's evaluation order.
		err0 = err_left * (7.0f / 16.0f);
		err0 = fma(err_top_right, 3.0f / 16.0f, err0);
		err1 = err_top * (5.0f / 16.0f);
		err1 = fma(err_top_left, 1.0f / 16.0f, err1);
		err = err0 + err1;

		// Add the diffused error and clamp to [0, 2^bits - 1].
		x += err;
		x = min(max(x, 0.0f), static_cast<float>(1L << bits) - 1);

		// CVTSS2SI rounds to nearest (even), matching _mm256_cvtps_epi32
		// in the vectorized kernel; err is the residual quantization error.
		uint32_t q = _mm_cvt_ss2si(_mm_set_ss(x));
		err = x - static_cast<float>(q);

		dst_traits::store1(dst_p + j, q);
		error_cur[j_err] = err;

		// Slide the neighborhood one pixel to the right.
		err_left = err;
		err_top_left = err_top;
		err_top = err_top_right;
	}
}
162
select_error_diffusion_scalar_func(PixelType pixel_in,PixelType pixel_out)163 decltype(&error_diffusion_scalar<PixelType::BYTE, PixelType::BYTE>) select_error_diffusion_scalar_func(PixelType pixel_in, PixelType pixel_out)
164 {
165 if (pixel_in == PixelType::BYTE && pixel_out == PixelType::BYTE)
166 return error_diffusion_scalar<PixelType::BYTE, PixelType::BYTE>;
167 else if (pixel_in == PixelType::BYTE && pixel_out == PixelType::WORD)
168 return error_diffusion_scalar<PixelType::BYTE, PixelType::WORD>;
169 else if (pixel_in == PixelType::WORD && pixel_out == PixelType::BYTE)
170 return error_diffusion_scalar<PixelType::WORD, PixelType::BYTE>;
171 else if (pixel_in == PixelType::WORD && pixel_out == PixelType::WORD)
172 return error_diffusion_scalar<PixelType::WORD, PixelType::WORD>;
173 else if (pixel_in == PixelType::HALF && pixel_out == PixelType::BYTE)
174 return error_diffusion_scalar<PixelType::HALF, PixelType::BYTE>;
175 else if (pixel_in == PixelType::HALF && pixel_out == PixelType::WORD)
176 return error_diffusion_scalar<PixelType::HALF, PixelType::WORD>;
177 else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::BYTE)
178 return error_diffusion_scalar<PixelType::FLOAT, PixelType::BYTE>;
179 else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::WORD)
180 return error_diffusion_scalar<PixelType::FLOAT, PixelType::WORD>;
181 else
182 error::throw_<error::InternalError>("no conversion between pixel types");
183 }
184
185
// One column step of the 8-row wavefront.
//
// On entry, v holds one scaled source sample per lane: lane r carries row
// i + r at column j + 14 - 2 * r (the rows are skewed two columns apart).
// On return, v holds the quantized int32 pixel values bit-cast to float.
// Because lane r - 1 is the row directly above lane r and sits two columns
// ahead, its freshly computed error is exactly lane r's top-right error two
// steps later; the cross-lane rotate below implements that hand-off.
inline FORCE_INLINE void error_diffusion_wf_avx2_xiter(__m256 &v, unsigned j, const float *error_top, float *error_cur, const __m256 &max_val,
                                                       const __m256 &err_left_w, const __m256 &err_top_right_w, const __m256 &err_top_w, const __m256 &err_top_left_w,
                                                       __m256 &err_left, __m256 &err_top_right, __m256 &err_top, __m256 &err_top_left)
{
	const __m256i rot_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7);

	// Error array is padded by one on each side.
	unsigned j_err = j + 1;

	__m256 x, y, err0, err1, err_rot;
	__m256i q;

	// Weighted sum of the four neighbor errors (Floyd-Steinberg weights).
	err0 = _mm256_mul_ps(err_left_w, err_left);
	err0 = _mm256_fmadd_ps(err_top_right_w, err_top_right, err0);
	err1 = _mm256_mul_ps(err_top_w, err_top);
	err1 = _mm256_fmadd_ps(err_top_left_w, err_top_left, err1);
	err0 = _mm256_add_ps(err0, err1);

	// Add the diffused error, clamp to [0, max_val], and quantize.
	x = _mm256_add_ps(v, err0);
	x = _mm256_max_ps(x, _mm256_setzero_ps());
	x = _mm256_min_ps(x, max_val);
	q = _mm256_cvtps_epi32(x);
	v = _mm256_castsi256_ps(q);

	// Residual quantization error per lane.
	y = _mm256_cvtepi32_ps(q);
	err0 = _mm256_sub_ps(x, y);

	// Left-rotate err0 by 32 bits.
	err_rot = _mm256_permutevar8x32_ps(err0, rot_mask);

	// Extract the previous high error: lane 7 (bottom row, at column j) has
	// no lane below it, so its error goes to the cross-call error buffer.
	error_cur[j_err + 0] = _mm_cvtss_f32(_mm256_castps256_ps128(err_rot));

	// Insert the next error into the low position: lane 0 (top row) takes
	// its future top-right error from the previous row's buffer. Index
	// j_err + 14 + 2 == ((j + 1) + 14) + 1 + 1, i.e. the top-right neighbor
	// of lane 0's pixel at the next column.
	err_rot = _mm256_blend_ps(err_rot, _mm256_castps128_ps256(_mm_set_ss(error_top[j_err + 14 + 2])), 1);

	// Slide the neighborhood one column to the right.
	err_left = err0;
	err_top_left = err_top;
	err_top = err_top_right;
	err_top_right = err_rot;
}
226
// Vectorized 8-row Floyd-Steinberg wavefront.
//
// Rows i..i+7 are processed simultaneously, one per SIMD lane, with row
// i + r skewed two columns behind row i + r - 1 so that every pixel's four
// error-producing neighbors are already quantized when it is visited.
// width is the number of columns advanced per row; the loop steps by 8, so
// the caller passes a multiple of 8 (floor_n) and finishes the remainder
// with the scalar kernel. state carries the per-lane error neighborhood
// across calls and is accessed with aligned loads/stores.
template <PixelType SrcType, PixelType DstType, class T, class U>
void error_diffusion_wf_avx2(const graph::ImageBuffer<const T> &src, const graph::ImageBuffer<U> &dst, unsigned i,
                             const float *error_top, float *error_cur, error_state *state, float scale, float offset, unsigned bits, unsigned width)
{
	typedef error_diffusion_traits<SrcType> src_traits;
	typedef error_diffusion_traits<DstType> dst_traits;

	typedef typename src_traits::type src_type;
	typedef typename dst_traits::type dst_type;

	static_assert(std::is_same<T, src_type>::value, "wrong type");
	static_assert(std::is_same<U, dst_type>::value, "wrong type");

	// Floyd-Steinberg weights, broadcast to all lanes.
	const __m256 err_left_w = _mm256_set1_ps(7.0f / 16.0f);
	const __m256 err_top_right_w = _mm256_set1_ps(3.0f / 16.0f);
	const __m256 err_top_w = _mm256_set1_ps(5.0f / 16.0f);
	const __m256 err_top_left_w = _mm256_set1_ps(1.0f / 16.0f);

	const __m256 scale_ps = _mm256_set1_ps(scale);
	const __m256 offset_ps = _mm256_set1_ps(offset);

	// Upper clamp bound: 2^bits - 1.
	const __m256 max_val = _mm256_set1_ps(static_cast<float>((1UL << bits) - 1));

	// Error neighborhood carried over from the previous call (or seeded by
	// the caller from the scalar prologue).
	__m256 err_left = _mm256_load_ps(state->err_left);
	__m256 err_top_right = _mm256_load_ps(state->err_top_right);
	__m256 err_top = _mm256_load_ps(state->err_top);
	__m256 err_top_left = _mm256_load_ps(state->err_top_left);

#define XITER error_diffusion_wf_avx2_xiter
#define XARGS error_top, error_cur, max_val, err_left_w, err_top_right_w, err_top_w, err_top_left_w, err_left, err_top_right, err_top, err_top_left
	for (unsigned j = 0; j < width; j += 8) {
		// Load 8 consecutive pixels from each row; row i + r is offset by
		// 14 - 2 * r columns, forming the wavefront diagonal.
		__m256 v0 = src_traits::load8(src[i + 0] + j + 14);
		__m256 v1 = src_traits::load8(src[i + 1] + j + 12);
		__m256 v2 = src_traits::load8(src[i + 2] + j + 10);
		__m256 v3 = src_traits::load8(src[i + 3] + j + 8);
		__m256 v4 = src_traits::load8(src[i + 4] + j + 6);
		__m256 v5 = src_traits::load8(src[i + 5] + j + 4);
		__m256 v6 = src_traits::load8(src[i + 6] + j + 2);
		__m256 v7 = src_traits::load8(src[i + 7] + j + 0);

		// Rescale the samples to the output range.
		v0 = _mm256_fmadd_ps(v0, scale_ps, offset_ps);
		v1 = _mm256_fmadd_ps(v1, scale_ps, offset_ps);
		v2 = _mm256_fmadd_ps(v2, scale_ps, offset_ps);
		v3 = _mm256_fmadd_ps(v3, scale_ps, offset_ps);
		v4 = _mm256_fmadd_ps(v4, scale_ps, offset_ps);
		v5 = _mm256_fmadd_ps(v5, scale_ps, offset_ps);
		v6 = _mm256_fmadd_ps(v6, scale_ps, offset_ps);
		v7 = _mm256_fmadd_ps(v7, scale_ps, offset_ps);

		// Transpose from row-major to one-pixel-per-lane: after this, vc
		// holds column j + c (plus per-row skew) for all 8 rows.
		mm256_transpose8_ps(v0, v1, v2, v3, v4, v5, v6, v7);

		// Eight serially dependent column steps.
		XITER(v0, j + 0, XARGS);
		XITER(v1, j + 1, XARGS);
		XITER(v2, j + 2, XARGS);
		XITER(v3, j + 3, XARGS);
		XITER(v4, j + 4, XARGS);
		XITER(v5, j + 5, XARGS);
		XITER(v6, j + 6, XARGS);
		XITER(v7, j + 7, XARGS);

		// Transpose back to row-major order for the stores.
		mm256_transpose8_ps(v0, v1, v2, v3, v4, v5, v6, v7);

		dst_traits::store8(dst[i + 0] + j + 14, _mm256_castps_si256(v0));
		dst_traits::store8(dst[i + 1] + j + 12, _mm256_castps_si256(v1));
		dst_traits::store8(dst[i + 2] + j + 10, _mm256_castps_si256(v2));
		dst_traits::store8(dst[i + 3] + j + 8, _mm256_castps_si256(v3));
		dst_traits::store8(dst[i + 4] + j + 6, _mm256_castps_si256(v4));
		dst_traits::store8(dst[i + 5] + j + 4, _mm256_castps_si256(v5));
		dst_traits::store8(dst[i + 6] + j + 2, _mm256_castps_si256(v6));
		dst_traits::store8(dst[i + 7] + j + 0, _mm256_castps_si256(v7));
	}
#undef XITER
#undef XARGS

	// Save the lane errors for the caller (epilogue seeding / next call).
	_mm256_store_ps(state->err_left, err_left);
	_mm256_store_ps(state->err_top_right, err_top_right);
	_mm256_store_ps(state->err_top, err_top);
	_mm256_store_ps(state->err_top_left, err_top_left);
}
306
// Dither 8 rows (i..i+7) using the AVX2 wavefront kernel.
//
// The wavefront skews row i + r two columns behind row i + r - 1, which
// leaves triangular regions at the start and end of the 8-row group that the
// vector kernel cannot cover. Those are handled with the scalar kernel: the
// prologue advances row i + r by 14 - 2 * r pixels before the wavefront
// starts, and the epilogue finishes each row after it stops. error_tmp holds
// the scalar kernel's padded per-row error buffers (only 7 rows are needed;
// row i + 7 reads error_tmp[6] and writes straight into error_cur).
template <PixelType SrcType, PixelType DstType>
void error_diffusion_avx2(const graph::ImageBuffer<const void> &src, const graph::ImageBuffer<void> &dst, unsigned i,
                          const float *error_top, float *error_cur, float scale, float offset, unsigned bits, unsigned width)
{
	typedef error_diffusion_traits<SrcType> src_traits;
	typedef error_diffusion_traits<DstType> dst_traits;

	typedef typename src_traits::type src_type;
	typedef typename dst_traits::type dst_type;

	const graph::ImageBuffer<const src_type> &src_buf = graph::static_buffer_cast<const src_type>(src);
	const graph::ImageBuffer<dst_type> &dst_buf = graph::static_buffer_cast<dst_type>(dst);

	// Must be 32-byte aligned for the aligned loads/stores in the wavefront.
	error_state state alignas(32) = {};
	float error_tmp[7][24] = {};

	// Prologue. Row i + r serially processes its first 14 - 2 * r pixels.
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 0], dst_buf[i + 0], error_top, error_tmp[0], scale, offset, bits, 14);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 1], dst_buf[i + 1], error_tmp[0], error_tmp[1], scale, offset, bits, 12);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 2], dst_buf[i + 2], error_tmp[1], error_tmp[2], scale, offset, bits, 10);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 3], dst_buf[i + 3], error_tmp[2], error_tmp[3], scale, offset, bits, 8);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 4], dst_buf[i + 4], error_tmp[3], error_tmp[4], scale, offset, bits, 6);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 5], dst_buf[i + 5], error_tmp[4], error_tmp[5], scale, offset, bits, 4);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 6], dst_buf[i + 6], error_tmp[5], error_tmp[6], scale, offset, bits, 2);

	// Wavefront. Seed each lane's neighborhood from the prologue's error
	// buffers: lane r starts at column 14 - 2 * r of row i + r (note the
	// +1 in each index is the error-buffer padding).
	state.err_left[0] = error_tmp[0][13 + 1];
	state.err_left[1] = error_tmp[1][11 + 1];
	state.err_left[2] = error_tmp[2][9 + 1];
	state.err_left[3] = error_tmp[3][7 + 1];
	state.err_left[4] = error_tmp[4][5 + 1];
	state.err_left[5] = error_tmp[5][3 + 1];
	state.err_left[6] = error_tmp[6][1 + 1];
	state.err_left[7] = 0.0f;

	state.err_top_right[0] = error_top[15 + 1];
	state.err_top_right[1] = error_tmp[0][13 + 1];
	state.err_top_right[2] = error_tmp[1][11 + 1];
	state.err_top_right[3] = error_tmp[2][9 + 1];
	state.err_top_right[4] = error_tmp[3][7 + 1];
	state.err_top_right[5] = error_tmp[4][5 + 1];
	state.err_top_right[6] = error_tmp[5][3 + 1];
	state.err_top_right[7] = error_tmp[6][1 + 1];

	state.err_top[0] = error_top[14 + 1];
	state.err_top[1] = error_tmp[0][12 + 1];
	state.err_top[2] = error_tmp[1][10 + 1];
	state.err_top[3] = error_tmp[2][8 + 1];
	state.err_top[4] = error_tmp[3][6 + 1];
	state.err_top[5] = error_tmp[4][4 + 1];
	state.err_top[6] = error_tmp[5][2 + 1];
	state.err_top[7] = error_tmp[6][0 + 1];

	state.err_top_left[0] = error_top[13 + 1];
	state.err_top_left[1] = error_tmp[0][11 + 1];
	state.err_top_left[2] = error_tmp[1][9 + 1];
	state.err_top_left[3] = error_tmp[2][7 + 1];
	state.err_top_left[4] = error_tmp[3][5 + 1];
	state.err_top_left[5] = error_tmp[4][3 + 1];
	state.err_top_left[6] = error_tmp[5][1 + 1];
	state.err_top_left[7] = 0.0f;

	// Advance every row by the largest multiple of 8 columns that keeps the
	// top row (offset +14) in bounds.
	unsigned vec_count = floor_n(width - 14, 8);
	error_diffusion_wf_avx2<SrcType, DstType>(src_buf, dst_buf, i, error_top, error_cur, &state, scale, offset, bits, vec_count);

	// Write the final lane errors back into the per-row buffers so the
	// scalar epilogue resumes with a consistent neighborhood.
	error_tmp[0][13 + 1] = state.err_top_right[1];
	error_tmp[0][12 + 1] = state.err_top[1];
	error_tmp[0][11 + 1] = state.err_top_left[1];

	error_tmp[1][11 + 1] = state.err_top_right[2];
	error_tmp[1][10 + 1] = state.err_top[2];
	error_tmp[1][9 + 1] = state.err_top_left[2];

	error_tmp[2][9 + 1] = state.err_top_right[3];
	error_tmp[2][8 + 1] = state.err_top[3];
	error_tmp[2][7 + 1] = state.err_top_left[3];

	error_tmp[3][7 + 1] = state.err_top_right[4];
	error_tmp[3][6 + 1] = state.err_top[4];
	error_tmp[3][5 + 1] = state.err_top_left[4];

	error_tmp[4][5 + 1] = state.err_top_right[5];
	error_tmp[4][4 + 1] = state.err_top[5];
	error_tmp[4][3 + 1] = state.err_top_left[5];

	error_tmp[5][3 + 1] = state.err_top_right[6];
	error_tmp[5][2 + 1] = state.err_top[6];
	error_tmp[5][1 + 1] = state.err_top_left[6];

	error_tmp[6][1 + 1] = state.err_top_right[7];
	error_tmp[6][0 + 1] = state.err_top[7];
	error_tmp[6][0] = state.err_top_left[7];

	// Epilogue. Serially finish each row from where the wavefront stopped.
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 0] + vec_count + 14, dst_buf[i + 0] + vec_count + 14, error_top + vec_count + 14, error_tmp[0] + 14,
	                                         scale, offset, bits, width - vec_count - 14);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 1] + vec_count + 12, dst_buf[i + 1] + vec_count + 12, error_tmp[0] + 12, error_tmp[1] + 12,
	                                         scale, offset, bits, width - vec_count - 12);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 2] + vec_count + 10, dst_buf[i + 2] + vec_count + 10, error_tmp[1] + 10, error_tmp[2] + 10,
	                                         scale, offset, bits, width - vec_count - 10);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 3] + vec_count + 8, dst_buf[i + 3] + vec_count + 8, error_tmp[2] + 8, error_tmp[3] + 8,
	                                         scale, offset, bits, width - vec_count - 8);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 4] + vec_count + 6, dst_buf[i + 4] + vec_count + 6, error_tmp[3] + 6, error_tmp[4] + 6,
	                                         scale, offset, bits, width - vec_count - 6);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 5] + vec_count + 4, dst_buf[i + 5] + vec_count + 4, error_tmp[4] + 4, error_tmp[5] + 4,
	                                         scale, offset, bits, width - vec_count - 4);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 6] + vec_count + 2, dst_buf[i + 6] + vec_count + 2, error_tmp[5] + 2, error_tmp[6] + 2,
	                                         scale, offset, bits, width - vec_count - 2);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 7] + vec_count + 0, dst_buf[i + 7] + vec_count + 0, error_tmp[6] + 0, error_cur + vec_count + 0,
	                                         scale, offset, bits, width - vec_count - 0);
}
418
select_error_diffusion_avx2_func(PixelType pixel_in,PixelType pixel_out)419 decltype(&error_diffusion_avx2<PixelType::BYTE, PixelType::BYTE>) select_error_diffusion_avx2_func(PixelType pixel_in, PixelType pixel_out)
420 {
421 if (pixel_in == PixelType::BYTE && pixel_out == PixelType::BYTE)
422 return error_diffusion_avx2<PixelType::BYTE, PixelType::BYTE>;
423 else if (pixel_in == PixelType::BYTE && pixel_out == PixelType::WORD)
424 return error_diffusion_avx2<PixelType::BYTE, PixelType::WORD>;
425 else if (pixel_in == PixelType::WORD && pixel_out == PixelType::BYTE)
426 return error_diffusion_avx2<PixelType::WORD, PixelType::BYTE>;
427 else if (pixel_in == PixelType::WORD && pixel_out == PixelType::WORD)
428 return error_diffusion_avx2<PixelType::WORD, PixelType::WORD>;
429 else if (pixel_in == PixelType::HALF && pixel_out == PixelType::BYTE)
430 return error_diffusion_avx2<PixelType::HALF, PixelType::BYTE>;
431 else if (pixel_in == PixelType::HALF && pixel_out == PixelType::WORD)
432 return error_diffusion_avx2<PixelType::HALF, PixelType::WORD>;
433 else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::BYTE)
434 return error_diffusion_avx2<PixelType::FLOAT, PixelType::BYTE>;
435 else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::WORD)
436 return error_diffusion_avx2<PixelType::FLOAT, PixelType::WORD>;
437 else
438 error::throw_<error::InternalError>("no conversion between pixel types");
439 }
440
441
// AVX2 Floyd-Steinberg dithering filter.
//
// Rows are consumed in groups of 8 by the vectorized wavefront kernel; a
// trailing group of fewer than 8 rows falls back to the scalar kernel one
// row at a time. The filter context holds two padded error rows that
// ping-pong between the "previous row" and "current row" roles.
class ErrorDiffusionAVX2 final : public graph::ImageFilter {
	decltype(&error_diffusion_scalar<PixelType::BYTE, PixelType::BYTE>) m_scalar_func;
	decltype(&error_diffusion_avx2<PixelType::BYTE, PixelType::BYTE>) m_avx2_func;

	PixelType m_pixel_in;
	PixelType m_pixel_out;

	float m_scale;    // scale applied to source samples
	float m_offset;   // offset applied after scaling
	unsigned m_depth; // output bit depth

	unsigned m_width;
	unsigned m_height;

	// Run the scalar kernel over one row. parity selects which context half
	// holds the previous row's error and which receives this row's.
	void process_scalar(void *ctx, const void *src, void *dst, bool parity) const
	{
		float *ctx_a = reinterpret_cast<float *>(ctx);
		float *ctx_b = reinterpret_cast<float *>(static_cast<unsigned char *>(ctx) + get_context_size() / 2);

		float *error_top = parity ? ctx_a : ctx_b;
		float *error_cur = parity ? ctx_b : ctx_a;

		m_scalar_func(src, dst, error_top, error_cur, m_scale, m_offset, m_depth, m_width);
	}

	// Run the AVX2 kernel over rows [i, i + 8). Each call consumes 8 rows,
	// so the context halves alternate per 8-row group ((i / 8) % 2).
	void process_vector(void *ctx, const graph::ImageBuffer<const void> &src, const graph::ImageBuffer<void> &dst, unsigned i) const
	{
		float *ctx_a = reinterpret_cast<float *>(ctx);
		float *ctx_b = reinterpret_cast<float *>(static_cast<unsigned char *>(ctx) + get_context_size() / 2);

		float *error_top = (i / 8) % 2 ? ctx_a : ctx_b;
		float *error_cur = (i / 8) % 2 ? ctx_b : ctx_a;

		m_avx2_func(src, dst, i, error_top, error_cur, m_scale, m_offset, m_depth, m_width);
	}
public:
	// Throws InternalError if the output is not an integer format.
	ErrorDiffusionAVX2(unsigned width, unsigned height, const PixelFormat &format_in, const PixelFormat &format_out) :
		m_scalar_func{ select_error_diffusion_scalar_func(format_in.type, format_out.type) },
		m_avx2_func{ select_error_diffusion_avx2_func(format_in.type, format_out.type) },
		m_pixel_in{ format_in.type },
		m_pixel_out{ format_out.type },
		m_scale{},
		m_offset{},
		m_depth{ format_out.depth },
		m_width{ width },
		m_height{ height }
	{
		zassert_d(width <= pixel_max_width(format_in.type), "overflow");
		zassert_d(width <= pixel_max_width(format_out.type), "overflow");

		if (!pixel_is_integer(format_out.type))
			error::throw_<error::InternalError>("cannot dither to non-integer format");

		std::tie(m_scale, m_offset) = get_scale_offset(format_in, format_out);
	}

	filter_flags get_flags() const override
	{
		filter_flags flags{};

		// Stateful (serial error propagation), whole rows at a time.
		flags.has_state = true;
		flags.same_row = true;
		flags.in_place = pixel_size(m_pixel_in) == pixel_size(m_pixel_out);
		flags.entire_row = true;

		return flags;
	}

	image_attributes get_image_attributes() const override
	{
		return{ m_width, m_height, m_pixel_out };
	}

	pair_unsigned get_required_row_range(unsigned i) const override
	{
		// 8 rows per step, clamped to the image height (and to UINT_MAX to
		// avoid overflow in i + 8).
		unsigned last = std::min(i, UINT_MAX - 8) + 8;
		return{ i, std::min(last, m_height) };
	}

	pair_unsigned get_required_col_range(unsigned, unsigned) const override
	{
		return{ 0, get_image_attributes().width };
	}

	unsigned get_simultaneous_lines() const override { return 8; }

	unsigned get_max_buffering() const override { return 8; }

	// Two error rows of (width + 2) floats each, overflow-checked.
	size_t get_context_size() const override
	{
		try {
			checked_size_t size = (static_cast<checked_size_t>(m_width) + 2) * sizeof(float) * 2;
			return size.get();
		} catch (const std::overflow_error &) {
			error::throw_<error::OutOfMemory>();
		}
	}

	size_t get_tmp_size(unsigned, unsigned) const override { return 0; }

	// Zero the error rows: the first image row diffuses no incoming error.
	void init_context(void *ctx, unsigned seq) const override
	{
		std::fill_n(static_cast<unsigned char *>(ctx), get_context_size(), 0);
	}

	void process(void *ctx, const graph::ImageBuffer<const void> *src, const graph::ImageBuffer<void> *dst, void *, unsigned i, unsigned, unsigned) const override
	{
		if (m_height - i < 8) {
			// Trailing group of fewer than 8 rows: scalar, one row at a
			// time, starting from the parity this group would have used
			// and flipping per row.
			bool parity = !!((i / 8) % 2);

			for (unsigned ii = i; ii < m_height; ++ii) {
				process_scalar(ctx, (*src)[ii], (*dst)[ii], parity);
				parity = !parity;
			}
		} else {
			process_vector(ctx, *src, *dst, i);
		}
	}
};
561
562 } // namespace
563
564
create_error_diffusion_avx2(unsigned width,unsigned height,const PixelFormat & pixel_in,const PixelFormat & pixel_out)565 std::unique_ptr<graph::ImageFilter> create_error_diffusion_avx2(unsigned width, unsigned height, const PixelFormat &pixel_in, const PixelFormat &pixel_out)
566 {
567 if (width < 14)
568 return nullptr;
569
570 return ztd::make_unique<ErrorDiffusionAVX2>(width, height, pixel_in, pixel_out);
571 }
572
573 } // namespace depth
574 } // namespace zimg
575
576 #endif // ZIMG_X86
577