1 #ifdef ZIMG_X86
2 
3 #include <algorithm>
4 #include <climits>
5 #include <stdexcept>
6 #include <tuple>
7 #include <type_traits>
8 #include <immintrin.h>
9 #include "common/align.h"
10 #include "common/ccdep.h"
11 #include "common/checked_int.h"
12 #include "common/except.h"
13 #include "common/make_unique.h"
14 #include "common/pixel.h"
15 #include "common/zassert.h"
16 #include "depth/quantize.h"
17 #include "graph/image_buffer.h"
18 #include "graph/image_filter.h"
19 #include "dither_x86.h"
20 
21 #include "common/x86/avx_util.h"
22 
23 namespace zimg {
24 namespace depth {
25 
26 namespace {
27 
// Per-lane error terms carried across calls of the 8-row AVX2 wavefront.
// One float per lane (= per row of the wavefront). Loaded/stored with
// aligned 256-bit moves, so instances must be 32-byte aligned (see the
// "error_state state alignas(32)" declaration at the use site).
struct error_state {
	float err_left[8];
	float err_top_right[8];
	float err_top[8];
	float err_top_left[8];
};
34 
35 
// Pixel-format-specific load/store helpers shared by the scalar and AVX2
// error diffusion kernels. Each specialization provides:
//   type   - the in-memory pixel type
//   load1  - read one pixel, widened to float
//   store1 - write one quantized pixel (integer destinations only)
//   load8  - read 8 pixels into a __m256 of floats
//   store8 - write 8 quantized pixels from a __m256i of int32 (integer
//            destinations only)
template <PixelType SrcType>
struct error_diffusion_traits;

template <>
struct error_diffusion_traits<PixelType::BYTE> {
	typedef uint8_t type;

	static float load1(const uint8_t *ptr) { return *ptr; }
	static void store1(uint8_t *ptr, uint32_t x) { *ptr = static_cast<uint8_t>(x); }

	static __m256 load8(const uint8_t *ptr)
	{
		// Zero-extend 8 bytes to int32, then convert to float.
		return _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)ptr)));
	}

	static void store8(uint8_t *ptr, __m256i x)
	{
		// int32 -> int16 with signed saturation (safe: values are in [0, 255]).
		x = _mm256_packs_epi32(x, x);
		// Gather the two meaningful 64-bit chunks into the low 128-bit lane.
		x = _mm256_permute4x64_epi64(x, _MM_SHUFFLE(0, 0, 2, 0));
		// int16 -> uint8 with unsigned saturation.
		x = _mm256_packus_epi16(x, x);
		_mm_storel_epi64((__m128i *)ptr, _mm256_castsi256_si128(x));
	}
};
59 
60 template <>
61 struct error_diffusion_traits<PixelType::WORD> {
62 	typedef uint16_t type;
63 
load1zimg::depth::__anon1daf47140111::error_diffusion_traits64 	static float load1(const uint16_t *ptr) { return *ptr; }
store1zimg::depth::__anon1daf47140111::error_diffusion_traits65 	static void store1(uint16_t *ptr, uint32_t x) { *ptr = static_cast<uint32_t>(x); }
66 
load8zimg::depth::__anon1daf47140111::error_diffusion_traits67 	static __m256 load8(const uint16_t *ptr)
68 	{
69 		return _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)ptr)));
70 	}
71 
store8zimg::depth::__anon1daf47140111::error_diffusion_traits72 	static void store8(uint16_t *ptr, __m256i x)
73 	{
74 		x = _mm256_packus_epi32(x, x);
75 		x = _mm256_permute4x64_epi64(x, _MM_SHUFFLE(0, 0, 2, 0));
76 		_mm_storeu_si128((__m128i *)ptr, _mm256_castsi256_si128(x));
77 	}
78 };
79 
template <>
struct error_diffusion_traits<PixelType::HALF> {
	typedef uint16_t type;

	// Half-float pixels are widened through the F16C conversion instructions.
	// No store1/store8: HALF is only ever a dither source, never a target.
	static float load1(const uint16_t *ptr)
	{
		return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(*ptr)));
	}

	static __m256 load8(const uint16_t *ptr) {
		return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)ptr));
	}
};
93 
template <>
struct error_diffusion_traits<PixelType::FLOAT> {
	typedef float type;

	// No store1/store8: FLOAT is only ever a dither source, never a target.
	static float load1(const float *ptr) { return *ptr; }
	static __m256 load8(const float *ptr) { return _mm256_loadu_ps(ptr); }
};
101 
102 
// Scalar fused multiply-add via the FMA3 instruction, so the scalar
// prologue/epilogue rounds identically to the vectorized _mm256_fmadd_ps
// path (a plain a * b + c would round twice).
inline FORCE_INLINE float fma(float a, float b, float c)
{
	return _mm_cvtss_f32(_mm_fmadd_ss(_mm_set_ss(a), _mm_set_ss(b), _mm_set_ss(c)));
}
107 
// Scalar max via MAXSS so that the scalar path has the same operand-order
// (and thus NaN) semantics as the vector _mm256_max_ps path.
inline FORCE_INLINE float max(float x, float y)
{
	return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
112 
// Scalar min via MINSS; see max() above for rationale.
inline FORCE_INLINE float min(float x, float y)
{
	return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
117 
118 
// Serial Floyd-Steinberg error diffusion over one scanline (or a segment of
// one). Used for full rows in the scalar fallback and for the triangular
// prologue/epilogue regions of the AVX2 wavefront.
//
// error_top holds the previous row's quantization errors and error_cur
// receives this row's; both arrays are padded by one element at each end,
// so column j maps to index j + 1. scale/offset map source pixel values to
// the destination range, and bits is the destination depth used to clamp.
template <PixelType SrcType, PixelType DstType>
void error_diffusion_scalar(const void *src, void *dst, const float * RESTRICT error_top, float * RESTRICT error_cur,
                            float scale, float offset, unsigned bits, unsigned width)
{
	typedef error_diffusion_traits<SrcType> src_traits;
	typedef error_diffusion_traits<DstType> dst_traits;

	const typename src_traits::type *src_p = static_cast<const typename src_traits::type *>(src);
	typename dst_traits::type *dst_p = static_cast<typename dst_traits::type *>(dst);

	// Seed the sliding error window from the padded arrays.
	float err_left = error_cur[0];
	float err_top_right;
	float err_top = error_top[0 + 1];
	float err_top_left = error_top[0];

	for (unsigned j = 0; j < width; ++j) {
		// Error array is padded by one on each side.
		unsigned j_err = j + 1;
		err_top_right = error_top[j_err + 1];

		float x = fma(src_traits::load1(src_p + j), scale, offset);
		float err, err0, err1;

		// Floyd-Steinberg weights: 7/16 left, 3/16 top-right, 5/16 top,
		// 1/16 top-left. Summed in the same tree order as the vector path.
		err0 = err_left * (7.0f / 16.0f);
		err0 = fma(err_top_right, 3.0f / 16.0f, err0);
		err1 = err_top * (5.0f / 16.0f);
		err1 = fma(err_top_left, 1.0f / 16.0f, err1);
		err = err0 + err1;

		// Clamp to the representable output range [0, 2^bits - 1].
		x += err;
		x = min(max(x, 0.0f), static_cast<float>(1L << bits) - 1);

		// Round to nearest (current rounding mode) and keep the residual.
		uint32_t q = _mm_cvt_ss2si(_mm_set_ss(x));
		err = x - static_cast<float>(q);

		dst_traits::store1(dst_p + j, q);
		error_cur[j_err] = err;

		// Slide the error window one column to the right.
		err_left = err;
		err_top_left = err_top;
		err_top = err_top_right;
	}
}
162 
select_error_diffusion_scalar_func(PixelType pixel_in,PixelType pixel_out)163 decltype(&error_diffusion_scalar<PixelType::BYTE, PixelType::BYTE>) select_error_diffusion_scalar_func(PixelType pixel_in, PixelType pixel_out)
164 {
165 	if (pixel_in == PixelType::BYTE && pixel_out == PixelType::BYTE)
166 		return error_diffusion_scalar<PixelType::BYTE, PixelType::BYTE>;
167 	else if (pixel_in == PixelType::BYTE && pixel_out == PixelType::WORD)
168 		return error_diffusion_scalar<PixelType::BYTE, PixelType::WORD>;
169 	else if (pixel_in == PixelType::WORD && pixel_out == PixelType::BYTE)
170 		return error_diffusion_scalar<PixelType::WORD, PixelType::BYTE>;
171 	else if (pixel_in == PixelType::WORD && pixel_out == PixelType::WORD)
172 		return error_diffusion_scalar<PixelType::WORD, PixelType::WORD>;
173 	else if (pixel_in == PixelType::HALF && pixel_out == PixelType::BYTE)
174 		return error_diffusion_scalar<PixelType::HALF, PixelType::BYTE>;
175 	else if (pixel_in == PixelType::HALF && pixel_out == PixelType::WORD)
176 		return error_diffusion_scalar<PixelType::HALF, PixelType::WORD>;
177 	else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::BYTE)
178 		return error_diffusion_scalar<PixelType::FLOAT, PixelType::BYTE>;
179 	else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::WORD)
180 		return error_diffusion_scalar<PixelType::FLOAT, PixelType::WORD>;
181 	else
182 		error::throw_<error::InternalError>("no conversion between pixel types");
183 }
184 
185 
// One diagonal step of the 8-row wavefront. On entry v holds one (already
// scaled) pixel from each of the 8 rows; on return it holds the quantized
// int32 values, bit-cast to float. The four error vectors carry the
// Floyd-Steinberg neighbour errors per lane; after each step the fresh
// errors are rotated one lane upward so that lane k+1 (one row below, two
// columns behind) sees lane k's result as its top-row error.
inline FORCE_INLINE void error_diffusion_wf_avx2_xiter(__m256 &v, unsigned j, const float *error_top, float *error_cur, const __m256 &max_val,
                                                       const __m256 &err_left_w, const __m256 &err_top_right_w, const __m256 &err_top_w, const __m256 &err_top_left_w,
                                                       __m256 &err_left, __m256 &err_top_right, __m256 &err_top, __m256 &err_top_left)
{
	// Permutation indices implementing a one-lane left rotation.
	const __m256i rot_mask = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7);

	// Error arrays are padded by one element on each side.
	unsigned j_err = j + 1;

	__m256 x, y, err0, err1, err_rot;
	__m256i q;

	// Weighted sum of the four neighbouring errors (7/16, 3/16, 5/16, 1/16).
	err0 = _mm256_mul_ps(err_left_w, err_left);
	err0 = _mm256_fmadd_ps(err_top_right_w, err_top_right, err0);
	err1 = _mm256_mul_ps(err_top_w, err_top);
	err1 = _mm256_fmadd_ps(err_top_left_w, err_top_left, err1);
	err0 = _mm256_add_ps(err0, err1);

	// Diffuse, clamp to [0, max_val], and quantize to int32.
	x = _mm256_add_ps(v, err0);
	x = _mm256_max_ps(x, _mm256_setzero_ps());
	x = _mm256_min_ps(x, max_val);
	q = _mm256_cvtps_epi32(x);
	v = _mm256_castsi256_ps(q); // return quantized values through v

	// New quantization error per lane.
	y = _mm256_cvtepi32_ps(q);
	err0 = _mm256_sub_ps(x, y);

	// Left-rotate err0 by 32 bits.
	err_rot = _mm256_permutevar8x32_ps(err0, rot_mask);

	// Extract the previous high error. Lane 7 is the bottom row of the
	// wavefront; its error rotates out and is committed to the row buffer.
	error_cur[j_err + 0] = _mm_cvtss_f32(_mm256_castps256_ps128(err_rot));

	// Insert the next error into the low position. Lane 0 (top row, skewed
	// 14 columns ahead) takes its next top-right error from the previous
	// row's buffer, one column further than this step's position.
	err_rot = _mm256_blend_ps(err_rot, _mm256_castps128_ps256(_mm_set_ss(error_top[j_err + 14 + 2])), 1);

	err_left = err0;
	err_top_left = err_top;
	err_top = err_top_right;
	err_top_right = err_rot;
}
226 
// Vectorized wavefront over 8 rows. Row i+r is processed at a horizontal
// offset of 14 - 2*r columns, so the 8 active pixels form a diagonal that
// satisfies the serial dependency of Floyd-Steinberg. The caller handles
// the leading/trailing triangles with the scalar kernel and seeds/drains
// the per-lane error vectors through *state. width must be a multiple of 8.
template <PixelType SrcType, PixelType DstType, class T, class U>
void error_diffusion_wf_avx2(const graph::ImageBuffer<const T> &src, const graph::ImageBuffer<U> &dst, unsigned i,
                             const float *error_top, float *error_cur, error_state *state, float scale, float offset, unsigned bits, unsigned width)
{
	typedef error_diffusion_traits<SrcType> src_traits;
	typedef error_diffusion_traits<DstType> dst_traits;

	typedef typename src_traits::type src_type;
	typedef typename dst_traits::type dst_type;

	static_assert(std::is_same<T, src_type>::value, "wrong type");
	static_assert(std::is_same<U, dst_type>::value, "wrong type");

	// Floyd-Steinberg weights, broadcast per lane.
	const __m256 err_left_w = _mm256_set1_ps(7.0f / 16.0f);
	const __m256 err_top_right_w = _mm256_set1_ps(3.0f / 16.0f);
	const __m256 err_top_w = _mm256_set1_ps(5.0f / 16.0f);
	const __m256 err_top_left_w = _mm256_set1_ps(1.0f / 16.0f);

	const __m256 scale_ps = _mm256_set1_ps(scale);
	const __m256 offset_ps = _mm256_set1_ps(offset);

	// Upper clamp: 2^bits - 1.
	const __m256 max_val = _mm256_set1_ps(static_cast<float>((1UL << bits) - 1));

	// Resume from the error state left by the prologue / previous call.
	__m256 err_left = _mm256_load_ps(state->err_left);
	__m256 err_top_right = _mm256_load_ps(state->err_top_right);
	__m256 err_top = _mm256_load_ps(state->err_top);
	__m256 err_top_left = _mm256_load_ps(state->err_top_left);

#define XITER error_diffusion_wf_avx2_xiter
#define XARGS error_top, error_cur, max_val, err_left_w, err_top_right_w, err_top_w, err_top_left_w, err_left, err_top_right, err_top, err_top_left
	for (unsigned j = 0; j < width; j += 8) {
		// Load 8 pixels from each row at its skewed offset (row r at +14-2r).
		__m256 v0 = src_traits::load8(src[i + 0] + j + 14);
		__m256 v1 = src_traits::load8(src[i + 1] + j + 12);
		__m256 v2 = src_traits::load8(src[i + 2] + j + 10);
		__m256 v3 = src_traits::load8(src[i + 3] + j + 8);
		__m256 v4 = src_traits::load8(src[i + 4] + j + 6);
		__m256 v5 = src_traits::load8(src[i + 5] + j + 4);
		__m256 v6 = src_traits::load8(src[i + 6] + j + 2);
		__m256 v7 = src_traits::load8(src[i + 7] + j + 0);

		// Apply the depth conversion up front, outside the serial chain.
		v0 = _mm256_fmadd_ps(v0, scale_ps, offset_ps);
		v1 = _mm256_fmadd_ps(v1, scale_ps, offset_ps);
		v2 = _mm256_fmadd_ps(v2, scale_ps, offset_ps);
		v3 = _mm256_fmadd_ps(v3, scale_ps, offset_ps);
		v4 = _mm256_fmadd_ps(v4, scale_ps, offset_ps);
		v5 = _mm256_fmadd_ps(v5, scale_ps, offset_ps);
		v6 = _mm256_fmadd_ps(v6, scale_ps, offset_ps);
		v7 = _mm256_fmadd_ps(v7, scale_ps, offset_ps);

		// Transpose so each vk holds one diagonal: lane r = row i+r.
		mm256_transpose8_ps(v0, v1, v2, v3, v4, v5, v6, v7);

		XITER(v0, j + 0, XARGS);
		XITER(v1, j + 1, XARGS);
		XITER(v2, j + 2, XARGS);
		XITER(v3, j + 3, XARGS);
		XITER(v4, j + 4, XARGS);
		XITER(v5, j + 5, XARGS);
		XITER(v6, j + 6, XARGS);
		XITER(v7, j + 7, XARGS);

		// Transpose back to row order and store at the same skewed offsets.
		mm256_transpose8_ps(v0, v1, v2, v3, v4, v5, v6, v7);

		dst_traits::store8(dst[i + 0] + j + 14, _mm256_castps_si256(v0));
		dst_traits::store8(dst[i + 1] + j + 12, _mm256_castps_si256(v1));
		dst_traits::store8(dst[i + 2] + j + 10, _mm256_castps_si256(v2));
		dst_traits::store8(dst[i + 3] + j + 8, _mm256_castps_si256(v3));
		dst_traits::store8(dst[i + 4] + j + 6, _mm256_castps_si256(v4));
		dst_traits::store8(dst[i + 5] + j + 4, _mm256_castps_si256(v5));
		dst_traits::store8(dst[i + 6] + j + 2, _mm256_castps_si256(v6));
		dst_traits::store8(dst[i + 7] + j + 0, _mm256_castps_si256(v7));
	}
#undef XITER
#undef XARGS

	// Save the error state for the epilogue.
	_mm256_store_ps(state->err_left, err_left);
	_mm256_store_ps(state->err_top_right, err_top_right);
	_mm256_store_ps(state->err_top, err_top);
	_mm256_store_ps(state->err_top_left, err_top_left);
}
306 
// Full error diffusion over an 8-row group: a scalar prologue covering the
// leading triangle (row i+r processes its first 14-2r columns), the AVX2
// wavefront over the bulk, and a scalar epilogue for the trailing triangle
// plus any columns left over from rounding down to a multiple of 8.
// error_tmp holds the short per-row error segments exchanged between the
// scalar regions and the wavefront state.
template <PixelType SrcType, PixelType DstType>
void error_diffusion_avx2(const graph::ImageBuffer<const void> &src, const graph::ImageBuffer<void> &dst, unsigned i,
                          const float *error_top, float *error_cur, float scale, float offset, unsigned bits, unsigned width)
{
	typedef error_diffusion_traits<SrcType> src_traits;
	typedef error_diffusion_traits<DstType> dst_traits;

	typedef typename src_traits::type src_type;
	typedef typename dst_traits::type dst_type;

	const graph::ImageBuffer<const src_type> &src_buf = graph::static_buffer_cast<const src_type>(src);
	const graph::ImageBuffer<dst_type> &dst_buf = graph::static_buffer_cast<dst_type>(dst);

	// 32-byte alignment required by the aligned loads in the wavefront.
	error_state state alignas(32) = {};
	float error_tmp[7][24] = {};

	// Prologue. Each row r of the first 7 handles 14-2r columns serially,
	// chaining each row's error buffer into the next.
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 0], dst_buf[i + 0], error_top, error_tmp[0], scale, offset, bits, 14);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 1], dst_buf[i + 1], error_tmp[0], error_tmp[1], scale, offset, bits, 12);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 2], dst_buf[i + 2], error_tmp[1], error_tmp[2], scale, offset, bits, 10);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 3], dst_buf[i + 3], error_tmp[2], error_tmp[3], scale, offset, bits, 8);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 4], dst_buf[i + 4], error_tmp[3], error_tmp[4], scale, offset, bits, 6);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 5], dst_buf[i + 5], error_tmp[4], error_tmp[5], scale, offset, bits, 4);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 6], dst_buf[i + 6], error_tmp[5], error_tmp[6], scale, offset, bits, 2);

	// Wavefront. Seed the per-lane error vectors from the last column each
	// prologue row produced (lane r = row i+r; indices are +1 for padding).
	state.err_left[0] = error_tmp[0][13 + 1];
	state.err_left[1] = error_tmp[1][11 + 1];
	state.err_left[2] = error_tmp[2][9 + 1];
	state.err_left[3] = error_tmp[3][7 + 1];
	state.err_left[4] = error_tmp[4][5 + 1];
	state.err_left[5] = error_tmp[5][3 + 1];
	state.err_left[6] = error_tmp[6][1 + 1];
	state.err_left[7] = 0.0f;

	state.err_top_right[0] = error_top[15 + 1];
	state.err_top_right[1] = error_tmp[0][13 + 1];
	state.err_top_right[2] = error_tmp[1][11 + 1];
	state.err_top_right[3] = error_tmp[2][9 + 1];
	state.err_top_right[4] = error_tmp[3][7 + 1];
	state.err_top_right[5] = error_tmp[4][5 + 1];
	state.err_top_right[6] = error_tmp[5][3 + 1];
	state.err_top_right[7] = error_tmp[6][1 + 1];

	state.err_top[0] = error_top[14 + 1];
	state.err_top[1] = error_tmp[0][12 + 1];
	state.err_top[2] = error_tmp[1][10 + 1];
	state.err_top[3] = error_tmp[2][8 + 1];
	state.err_top[4] = error_tmp[3][6 + 1];
	state.err_top[5] = error_tmp[4][4 + 1];
	state.err_top[6] = error_tmp[5][2 + 1];
	state.err_top[7] = error_tmp[6][0 + 1];

	state.err_top_left[0] = error_top[13 + 1];
	state.err_top_left[1] = error_tmp[0][11 + 1];
	state.err_top_left[2] = error_tmp[1][9 + 1];
	state.err_top_left[3] = error_tmp[2][7 + 1];
	state.err_top_left[4] = error_tmp[3][5 + 1];
	state.err_top_left[5] = error_tmp[4][3 + 1];
	state.err_top_left[6] = error_tmp[5][1 + 1];
	state.err_top_left[7] = 0.0f;

	// Vectorized bulk: the widest span (row i+7) that is a multiple of 8.
	unsigned vec_count = floor_n(width - 14, 8);
	error_diffusion_wf_avx2<SrcType, DstType>(src_buf, dst_buf, i, error_top, error_cur, &state, scale, offset, bits, vec_count);

	// Drain the wavefront's final error state back into the per-row buffers
	// so the epilogue can resume each row where the wavefront stopped.
	error_tmp[0][13 + 1] = state.err_top_right[1];
	error_tmp[0][12 + 1] = state.err_top[1];
	error_tmp[0][11 + 1] = state.err_top_left[1];

	error_tmp[1][11 + 1] = state.err_top_right[2];
	error_tmp[1][10 + 1] = state.err_top[2];
	error_tmp[1][9 + 1] = state.err_top_left[2];

	error_tmp[2][9 + 1] = state.err_top_right[3];
	error_tmp[2][8 + 1] = state.err_top[3];
	error_tmp[2][7 + 1] = state.err_top_left[3];

	error_tmp[3][7 + 1] = state.err_top_right[4];
	error_tmp[3][6 + 1] = state.err_top[4];
	error_tmp[3][5 + 1] = state.err_top_left[4];

	error_tmp[4][5 + 1] = state.err_top_right[5];
	error_tmp[4][4 + 1] = state.err_top[5];
	error_tmp[4][3 + 1] = state.err_top_left[5];

	error_tmp[5][3 + 1] = state.err_top_right[6];
	error_tmp[5][2 + 1] = state.err_top[6];
	error_tmp[5][1 + 1] = state.err_top_left[6];

	error_tmp[6][1 + 1] = state.err_top_right[7];
	error_tmp[6][0 + 1] = state.err_top[7];
	error_tmp[6][0] = state.err_top_left[7];

	// Epilogue. Finish each row serially from where the wavefront stopped;
	// the last row writes into the caller's error_cur for the next group.
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 0] + vec_count + 14, dst_buf[i + 0] + vec_count + 14, error_top + vec_count + 14, error_tmp[0] + 14,
	                                         scale, offset, bits, width - vec_count - 14);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 1] + vec_count + 12, dst_buf[i + 1] + vec_count + 12, error_tmp[0] + 12, error_tmp[1] + 12,
	                                         scale, offset, bits, width - vec_count - 12);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 2] + vec_count + 10, dst_buf[i + 2] + vec_count + 10, error_tmp[1] + 10, error_tmp[2] + 10,
	                                         scale, offset, bits, width - vec_count - 10);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 3] + vec_count + 8, dst_buf[i + 3] + vec_count + 8, error_tmp[2] + 8, error_tmp[3] + 8,
	                                         scale, offset, bits, width - vec_count - 8);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 4] + vec_count + 6, dst_buf[i + 4] + vec_count + 6, error_tmp[3] + 6, error_tmp[4] + 6,
	                                         scale, offset, bits, width - vec_count - 6);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 5] + vec_count + 4, dst_buf[i + 5] + vec_count + 4, error_tmp[4] + 4, error_tmp[5] + 4,
	                                         scale, offset, bits, width - vec_count - 4);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 6] + vec_count + 2, dst_buf[i + 6] + vec_count + 2, error_tmp[5] + 2, error_tmp[6] + 2,
	                                         scale, offset, bits, width - vec_count - 2);
	error_diffusion_scalar<SrcType, DstType>(src_buf[i + 7] + vec_count + 0, dst_buf[i + 7] + vec_count + 0, error_tmp[6] + 0, error_cur + vec_count + 0,
	                                         scale, offset, bits, width - vec_count - 0);
}
418 
select_error_diffusion_avx2_func(PixelType pixel_in,PixelType pixel_out)419 decltype(&error_diffusion_avx2<PixelType::BYTE, PixelType::BYTE>) select_error_diffusion_avx2_func(PixelType pixel_in, PixelType pixel_out)
420 {
421 	if (pixel_in == PixelType::BYTE && pixel_out == PixelType::BYTE)
422 		return error_diffusion_avx2<PixelType::BYTE, PixelType::BYTE>;
423 	else if (pixel_in == PixelType::BYTE && pixel_out == PixelType::WORD)
424 		return error_diffusion_avx2<PixelType::BYTE, PixelType::WORD>;
425 	else if (pixel_in == PixelType::WORD && pixel_out == PixelType::BYTE)
426 		return error_diffusion_avx2<PixelType::WORD, PixelType::BYTE>;
427 	else if (pixel_in == PixelType::WORD && pixel_out == PixelType::WORD)
428 		return error_diffusion_avx2<PixelType::WORD, PixelType::WORD>;
429 	else if (pixel_in == PixelType::HALF && pixel_out == PixelType::BYTE)
430 		return error_diffusion_avx2<PixelType::HALF, PixelType::BYTE>;
431 	else if (pixel_in == PixelType::HALF && pixel_out == PixelType::WORD)
432 		return error_diffusion_avx2<PixelType::HALF, PixelType::WORD>;
433 	else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::BYTE)
434 		return error_diffusion_avx2<PixelType::FLOAT, PixelType::BYTE>;
435 	else if (pixel_in == PixelType::FLOAT && pixel_out == PixelType::WORD)
436 		return error_diffusion_avx2<PixelType::FLOAT, PixelType::WORD>;
437 	else
438 		error::throw_<error::InternalError>("no conversion between pixel types");
439 }
440 
441 
// Floyd-Steinberg dithering filter using the AVX2 8-row wavefront. Rows are
// consumed in groups of 8; any final group shorter than 8 rows falls back
// to the scalar kernel row by row. The filter context holds two padded
// row-error buffers that are ping-ponged between consecutive groups.
class ErrorDiffusionAVX2 final : public graph::ImageFilter {
	decltype(&error_diffusion_scalar<PixelType::BYTE, PixelType::BYTE>) m_scalar_func;
	decltype(&error_diffusion_avx2<PixelType::BYTE, PixelType::BYTE>) m_avx2_func;

	PixelType m_pixel_in;
	PixelType m_pixel_out;

	// Depth conversion applied before quantization.
	float m_scale;
	float m_offset;
	unsigned m_depth;

	unsigned m_width;
	unsigned m_height;

	// Process one row with the scalar kernel. parity selects which half of
	// the context is the previous row's errors vs. this row's output.
	void process_scalar(void *ctx, const void *src, void *dst, bool parity) const
	{
		float *ctx_a = reinterpret_cast<float *>(ctx);
		float *ctx_b = reinterpret_cast<float *>(static_cast<unsigned char *>(ctx) + get_context_size() / 2);

		float *error_top = parity ? ctx_a : ctx_b;
		float *error_cur = parity ? ctx_b : ctx_a;

		m_scalar_func(src, dst, error_top, error_cur, m_scale, m_offset, m_depth, m_width);
	}

	// Process a full group of 8 rows starting at row i with the AVX2 kernel.
	// The error buffers alternate per group, hence the (i / 8) parity.
	void process_vector(void *ctx, const graph::ImageBuffer<const void> &src, const graph::ImageBuffer<void> &dst, unsigned i) const
	{
		float *ctx_a = reinterpret_cast<float *>(ctx);
		float *ctx_b = reinterpret_cast<float *>(static_cast<unsigned char *>(ctx) + get_context_size() / 2);

		float *error_top = (i / 8) % 2 ? ctx_a : ctx_b;
		float *error_cur = (i / 8) % 2 ? ctx_b : ctx_a;

		m_avx2_func(src, dst, i, error_top, error_cur, m_scale, m_offset, m_depth, m_width);
	}
public:
	// Throws InternalError if the output format is not an integer type or
	// no kernel exists for the format pair.
	ErrorDiffusionAVX2(unsigned width, unsigned height, const PixelFormat &format_in, const PixelFormat &format_out) :
		m_scalar_func{ select_error_diffusion_scalar_func(format_in.type, format_out.type) },
		m_avx2_func{ select_error_diffusion_avx2_func(format_in.type, format_out.type) },
		m_pixel_in{ format_in.type },
		m_pixel_out{ format_out.type },
		m_scale{},
		m_offset{},
		m_depth{ format_out.depth },
		m_width{ width },
		m_height{ height }
	{
		zassert_d(width <= pixel_max_width(format_in.type), "overflow");
		zassert_d(width <= pixel_max_width(format_out.type), "overflow");

		if (!pixel_is_integer(format_out.type))
			error::throw_<error::InternalError>("cannot dither to non-integer format");

		std::tie(m_scale, m_offset) = get_scale_offset(format_in, format_out);
	}

	filter_flags get_flags() const override
	{
		filter_flags flags{};

		// Stateful: the diffused error propagates between row groups.
		flags.has_state = true;
		flags.same_row = true;
		flags.in_place = pixel_size(m_pixel_in) == pixel_size(m_pixel_out);
		flags.entire_row = true;

		return flags;
	}

	image_attributes get_image_attributes() const override
	{
		return{ m_width, m_height, m_pixel_out };
	}

	pair_unsigned get_required_row_range(unsigned i) const override
	{
		// 8 rows per step, clamped against unsigned overflow and the image.
		unsigned last = std::min(i, UINT_MAX - 8) + 8;
		return{ i, std::min(last, m_height) };
	}

	pair_unsigned get_required_col_range(unsigned, unsigned) const override
	{
		return{ 0, get_image_attributes().width };
	}

	unsigned get_simultaneous_lines() const override { return 8; }

	unsigned get_max_buffering() const override { return 8; }

	size_t get_context_size() const override
	{
		try {
			// Two row-error buffers of (width + 2) floats (1-element padding
			// on each side), with overflow-checked arithmetic.
			checked_size_t size = (static_cast<checked_size_t>(m_width) + 2) * sizeof(float) * 2;
			return size.get();
		} catch (const std::overflow_error &) {
			error::throw_<error::OutOfMemory>();
		}
	}

	size_t get_tmp_size(unsigned, unsigned) const override { return 0; }

	void init_context(void *ctx, unsigned seq) const override
	{
		// Start each sequence with zero accumulated error.
		std::fill_n(static_cast<unsigned char *>(ctx), get_context_size(), 0);
	}

	void process(void *ctx, const graph::ImageBuffer<const void> *src, const graph::ImageBuffer<void> *dst, void *, unsigned i, unsigned, unsigned) const override
	{
		if (m_height - i < 8) {
			// Short final group: scalar fallback, toggling the error
			// buffers per row, starting from the group parity.
			bool parity = !!((i / 8) % 2);

			for (unsigned ii = i; ii < m_height; ++ii) {
				process_scalar(ctx, (*src)[ii], (*dst)[ii], parity);
				parity = !parity;
			}
		} else {
			process_vector(ctx, *src, *dst, i);
		}
	}
};
561 
562 } // namespace
563 
564 
create_error_diffusion_avx2(unsigned width,unsigned height,const PixelFormat & pixel_in,const PixelFormat & pixel_out)565 std::unique_ptr<graph::ImageFilter> create_error_diffusion_avx2(unsigned width, unsigned height, const PixelFormat &pixel_in, const PixelFormat &pixel_out)
566 {
567 	if (width < 14)
568 		return nullptr;
569 
570 	return ztd::make_unique<ErrorDiffusionAVX2>(width, height, pixel_in, pixel_out);
571 }
572 
573 } // namespace depth
574 } // namespace zimg
575 
576 #endif // ZIMG_X86
577