1 //------------------------------------------------------------------------------
2 // emPainter_ScTlIntImg_AVX2.cpp
3 //
4 // Copyright (C) 2020 Oliver Hamann.
5 //
6 // Homepage: http://eaglemode.sourceforge.net/
7 //
8 // This program is free software: you can redistribute it and/or modify it under
9 // the terms of the GNU General Public License version 3 as published by the
10 // Free Software Foundation.
11 //
12 // This program is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 // FOR A PARTICULAR PURPOSE. See the GNU General Public License version 3 for
15 // more details.
16 //
17 // You should have received a copy of the GNU General Public License version 3
18 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19 //------------------------------------------------------------------------------
20 
21 //------------------------------------------------------------------------------
22 // This cpp file includes itself multiple times in order to expand the
23 // algorithms for emPainter::ScanlineTool::InterpolateImageAvx2..(..) with
24 // different settings. The preprocessor defines for these settings are:
25 //   EXTENSION:  0, 1, 2       - One of EXTEND_TILED, EXTEND_EDGE, EXTEND_ZERO
26 //   CHANNELS:   1, 2, 3, or 4 - Number of channels in the input map.
27 //------------------------------------------------------------------------------
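
// For example, compiling this file with EXTENSION==EXTEND_EDGE and CHANNELS==4
// makes the CONCAT-based names below expand to methods such as
// InterpolateImageAvx2NearestEeCs4, InterpolateImageAvx2BilinearEeCs4,
// InterpolateImageAvx2BicubicEeCs4 and InterpolateImageAvx2LanczosEeCs4.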
28 
29 #if !defined(EXTENSION)
30 //==============================================================================
31 //===================== Top level include / Set EXTENSION ======================
32 //==============================================================================
33 
34 #include "emPainter_ScTl.h"
35 
36 #if EM_HAVE_X86_INTRINSICS
37 #	if defined(_MSC_VER)
38 #		include <immintrin.h>
39 #	else
40 #		include <x86intrin.h>
41 #	endif
42 #	define CONCATIMPL(a,b) a##b
43 #	define CONCAT(a,b) CONCATIMPL(a,b)
44 #	define EXTEND_TILED  0
45 #	define EXTEND_EDGE   1
46 #	define EXTEND_ZERO   2
47 #	define METHOD_NAME_EXTENSION_0 Et
48 #	define METHOD_NAME_EXTENSION_1 Ee
49 #	define METHOD_NAME_EXTENSION_2 Ez
50 #	define METHOD_NAME_CHANNELS_1 Cs1
51 #	define METHOD_NAME_CHANNELS_2 Cs2
52 #	define METHOD_NAME_CHANNELS_3 Cs3
53 #	define METHOD_NAME_CHANNELS_4 Cs4
54 
55 #	define EXTENSION EXTEND_TILED
56 #	include "emPainter_ScTlIntImg_AVX2.cpp"
57 #	undef EXTENSION
58 
59 #	define EXTENSION EXTEND_EDGE
60 #	include "emPainter_ScTlIntImg_AVX2.cpp"
61 #	undef EXTENSION
62 
63 #	define EXTENSION EXTEND_ZERO
64 #	include "emPainter_ScTlIntImg_AVX2.cpp"
65 #	undef EXTENSION
66 
67 #endif
68 
69 #elif !defined(CHANNELS)
70 //==============================================================================
71 //================================ Set CHANNELS ================================
72 //==============================================================================
73 
74 #define CHANNELS 1
75 #include "emPainter_ScTlIntImg_AVX2.cpp"
76 #undef CHANNELS
77 
78 #define CHANNELS 2
79 #include "emPainter_ScTlIntImg_AVX2.cpp"
80 #undef CHANNELS
81 
82 #define CHANNELS 3
83 #include "emPainter_ScTlIntImg_AVX2.cpp"
84 #undef CHANNELS
85 
86 #define CHANNELS 4
87 #include "emPainter_ScTlIntImg_AVX2.cpp"
88 #undef CHANNELS
89 
90 
91 #else
92 //==============================================================================
93 //======================== Define General Helper Macros ========================
94 //==============================================================================
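
// How out-of-range coordinates are handled depends on EXTENSION: EXTEND_TILED
// wraps them around the image size, EXTEND_EDGE clamps them to the nearest
// edge pixel (the *Clipped variables), and EXTEND_ZERO redirects out-of-range
// accesses to the all-zero ZeroPixels array.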
95 
96 // DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY)
97 #if EXTENSION==EXTEND_TILED
98 #	define DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY) \
99 		ssize_t Y=((Y_IN)*DY)%SY; \
100 		if (Y<0) Y+=SY;
101 #elif EXTENSION==EXTEND_EDGE
102 #	define DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY) \
103 		ssize_t Y=(Y_IN)*DY; \
104 		ssize_t Y##Clipped=Y; \
105 		if ((size_t)Y##Clipped>=(size_t)SY) { \
106 			if (Y##Clipped<0) Y##Clipped=0; \
107 			else Y##Clipped=SY-DY; \
108 		}
109 #else
110 #	define DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY) \
111 		ssize_t Y=(Y_IN)*DY;
112 #endif
113 
114 
115 // DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC)
116 #if EXTENSION==EXTEND_TILED
117 #	define DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC) \
118 		ssize_t Y=Y_SRC;
119 #elif EXTENSION==EXTEND_EDGE
120 #	define DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC) \
121 		ssize_t Y=Y_SRC; \
122 		ssize_t Y##Clipped=Y_SRC##Clipped;
123 #else
124 #	define DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC) \
125 		ssize_t Y=Y_SRC;
126 #endif
127 
128 
129 // INCREMENT_IMAGE_Y(Y,DY,SY)
130 #if EXTENSION==EXTEND_TILED
131 #	define INCREMENT_IMAGE_Y(Y,DY,SY) \
132 		Y+=DY; \
133 		if (Y>=SY) Y=0;
134 #elif EXTENSION==EXTEND_EDGE
135 #	define INCREMENT_IMAGE_Y(Y,DY,SY) \
136 		Y+=DY; \
137 		Y##Clipped=Y; \
138 		if ((size_t)Y##Clipped>=(size_t)SY) { \
139 			if (Y##Clipped<0) Y##Clipped=0; \
140 			else Y##Clipped=SY-DY; \
141 		}
142 #else
143 #	define INCREMENT_IMAGE_Y(Y,DY,SY) \
144 		Y+=DY;
145 #endif
146 
147 
148 // DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP)
149 #if EXTENSION==EXTEND_TILED
150 #	define DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP) \
151 		const emByte * ROW_PTR=MAP+Y;
152 #elif EXTENSION==EXTEND_EDGE
153 #	define DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP) \
154 		const emByte * ROW_PTR=MAP+Y##Clipped;
155 #else
156 #	define DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP) \
157 		const emByte * ROW_PTR=MAP+Y; \
158 		int ROW_PTR##UsedSX=SX; \
159 		if ((size_t)Y>=(size_t)SY) ROW_PTR##UsedSX=0;
160 #endif
161 
162 
163 // DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX)
164 #if EXTENSION==EXTEND_TILED
165 #	define DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX) \
166 		ssize_t X=((X_IN)*DX)%SX; \
167 		if (X<0) X+=SX;
168 #elif EXTENSION==EXTEND_EDGE
169 #	define DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX) \
170 		ssize_t X=(X_IN)*DX; \
171 		ssize_t X##Clipped=X; \
172 		if ((size_t)X##Clipped>=(size_t)SX) { \
173 			if (X##Clipped<0) X##Clipped=0; \
174 			else X##Clipped=SX-DX; \
175 		}
176 #else
177 #	define DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX) \
178 		ssize_t X=(X_IN)*DX;
179 #endif
180 
181 
182 // INCREMENT_IMAGE_X(X,DX,SX)
183 #if EXTENSION==EXTEND_TILED
184 #	define INCREMENT_IMAGE_X(X,DX,SX) \
185 		X+=DX; \
186 		if (X>=SX) X=0;
187 #elif EXTENSION==EXTEND_EDGE
188 #	define INCREMENT_IMAGE_X(X,DX,SX) \
189 		X+=DX; \
190 		X##Clipped=X; \
191 		if ((size_t)X##Clipped>=(size_t)SX) { \
192 			if (X##Clipped<0) X##Clipped=0; \
193 			else X##Clipped=SX-DX; \
194 		}
195 #else
196 #	define INCREMENT_IMAGE_X(X,DX,SX) \
197 		X+=DX;
198 #endif
199 
200 
201 // ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX)
202 #if EXTENSION==EXTEND_TILED
203 #	define ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX) \
204 		(X+16<=SX)
205 #elif EXTENSION==EXTEND_EDGE
206 #	define ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX) \
207 		(X>=0 && X+16<=SX)
208 #else
209 #	define ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX) \
210 		(X>=0 && X+16<=SX)
211 		// No need to test ROW_PTR##UsedSX, because the ZeroPixels
212 		// array has been made large enough.
213 #endif
214 
215 
216 // DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X)
217 #ifndef ZERO_PIXELS_DEFINED
218 #	define ZERO_PIXELS_DEFINED
219 	struct alignas(32) ZeroPixelsStruct {
220 		emByte data[32];
221 	};
222 	static const ZeroPixelsStruct ZeroPixels = {0};
223 #endif
224 #if EXTENSION==EXTEND_TILED
225 #	define DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X) \
226 		const emByte * PIX_PTR=ROW_PTR+X;
227 #elif EXTENSION==EXTEND_EDGE
228 #	define DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X) \
229 		const emByte * PIX_PTR=ROW_PTR+X##Clipped;
230 #else
231 #	define DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X) \
232 		const emByte * PIX_PTR=ROW_PTR+X; \
233 		if ((size_t)X>=(size_t)ROW_PTR##UsedSX) PIX_PTR=ZeroPixels.data;
234 #endif
235 
236 
237 // PREMULFIN_COLOR_VEC8(C)
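// Finishes the alpha premultiplication of the 8-bit pixels in an SSE register:
// each color channel is multiplied by its pixel's alpha and divided by 255
// with rounding, using (v+0x80+((v+0x80)>>8))>>8 == round(v/255) for v=c*a.
// The alpha lane itself is OR-ed to 255 beforehand so that it passes through
// unchanged. For formats without an alpha channel (1 or 3 channels) this is a
// no-op.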
238 #if CHANNELS==1 || CHANNELS==3
239 #	define PREMULFIN_COLOR_VEC8(C)
240 #elif CHANNELS==2
241 #	define PREMULFIN_COLOR_VEC8(C) { \
242 		__m256i c=_mm256_cvtepu8_epi16(C); \
243 		__m256i a=_mm256_shuffle_epi8(c,_mm256_set_epi8( \
244 			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2, \
245 			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2 \
246 		)); \
247 		c=_mm256_or_si256(c,_mm256_set1_epi32(0x00ff0000)); \
248 		c=_mm256_mullo_epi16(c,a); \
249 		c=_mm256_add_epi16(c,_mm256_set1_epi16(0x80)); \
250 		c=_mm256_add_epi16(c,_mm256_srli_epi16(c,8)); \
251 		c=_mm256_srli_epi16(c,8); \
252 		C=_mm_packus_epi16(_mm256_castsi256_si128(c),_mm256_extracti128_si256(c,1)); \
253 	}
254 #else
255 #	define PREMULFIN_COLOR_VEC8(C) { \
256 		__m256i c=_mm256_cvtepu8_epi16(C); \
257 		__m256i a=_mm256_shuffle_epi8(c,_mm256_set_epi8( \
258 			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6, \
259 			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6 \
260 		)); \
261 		c=_mm256_or_si256(c,_mm256_set_epi16(255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0)); \
262 		c=_mm256_mullo_epi16(c,a); \
263 		c=_mm256_add_epi16(c,_mm256_set1_epi16(0x80)); \
264 		c=_mm256_add_epi16(c,_mm256_srli_epi16(c,8)); \
265 		c=_mm256_srli_epi16(c,8); \
266 		C=_mm_packus_epi16(_mm256_castsi256_si128(c),_mm256_extracti128_si256(c,1)); \
267 	}
268 #endif
269 
270 
271 // PREMULFIN_SHL_COLOR_VEC16(C,S)
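// Like PREMULFIN_COLOR_VEC8, but for pixels already widened to 16 bits per
// channel, and the result is additionally shifted left by S bits to gain
// headroom for the fixed-point filtering that follows (the callers use S=7,
// which keeps 255<<7 = 32640 inside the signed 16-bit range required by
// _mm256_mulhrs_epi16).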
272 #if CHANNELS==1 || CHANNELS==3
273 #	define PREMULFIN_SHL_COLOR_VEC16(C,S) { \
274 		C=_mm256_slli_epi16(C,S); \
275 	}
276 #elif CHANNELS==2
277 #	define PREMULFIN_SHL_COLOR_VEC16(C,S) { \
278 		__m256i a=_mm256_shuffle_epi8(C,_mm256_set_epi8( \
279 			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2, \
280 			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2 \
281 		)); \
282 		C=_mm256_or_si256(C,_mm256_set1_epi32(0x00ff0000)); \
283 		C=_mm256_mullo_epi16(C,a); \
284 		C=_mm256_add_epi16(C,_mm256_set1_epi16(0x80)); \
285 		C=_mm256_add_epi16(C,_mm256_srli_epi16(C,8)); \
286 		C=_mm256_srli_epi16(C,8); \
287 		C=_mm256_slli_epi16(C,S); \
288 	}
289 #else
290 #	define PREMULFIN_SHL_COLOR_VEC16(C,S) { \
291 		__m256i a=_mm256_shuffle_epi8(C,_mm256_set_epi8( \
292 			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6, \
293 			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6 \
294 		)); \
295 		C=_mm256_or_si256(C,_mm256_set_epi16(255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0)); \
296 		C=_mm256_mullo_epi16(C,a); \
297 		C=_mm256_add_epi16(C,_mm256_set1_epi16(0x80)); \
298 		C=_mm256_add_epi16(C,_mm256_srli_epi16(C,8)); \
299 		C=_mm256_srli_epi16(C,8); \
300 		C=_mm256_slli_epi16(C,S); \
301 	}
302 #endif
303 
304 
305 //==============================================================================
306 //========== emPainter::ScanlineTool::InterpolateImageAvx2Nearest... ===========
307 //==============================================================================
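
// Writes w nearest-neighbor samples of the source image for one scanline
// (output position x,y) to sct.InterpolationBuffer, 16/CHANNELS pixels per
// iteration, finishing the alpha premultiplication on the way (a no-op for
// formats without alpha).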
308 
309 #if defined(__GNUC__)
310 	__attribute__((target("avx2")))
311 #endif
312 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Nearest,CONCAT(
313 	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
314 	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
315 )) (const ScanlineTool & sct, int x, int y, int w)
316 {
317 	emInt64 ty=y*sct.TDY-sct.TY;
318 	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
319 	ssize_t imgSX=sct.ImgSX;
320 	DEFINE_AND_SET_IMAGE_ROW_PTR(row,imgY,imgSX,sct.ImgSY,sct.ImgMap)
321 
322 	emByte * buf=(emByte*)sct.InterpolationBuffer;
323 	emByte * bufEnd=buf+w*CHANNELS;
324 	emInt64 tdx=sct.TDX;
325 	emInt64 tx=x*tdx-sct.TX;
326 
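	// sw is the number of source pixels covered by the w output pixels. If the
	// two are equal, consecutive output pixels map to consecutive source pixels
	// and whole 16-byte blocks can be loaded; otherwise each source position is
	// recomputed from tx.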
327 	int sw=((tx&0xffffff)+0x1000000+(w-1)*tdx)>>24;
328 	if (w==sw) {
329 		DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
330 		do {
331 			__m128i v;
332 			if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
333 				DEFINE_AND_SET_IMAGE_PIX_PTR(p,row,imgX)
334 				v=_mm_loadu_si128((__m128i*)p);
335 				INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
336 			}
337 			else {
338 				for (int i=0, j=bufEnd-buf; i<=16-CHANNELS; i+=CHANNELS) {
339 					v=_mm_srli_si128(v,CHANNELS);
340 					if (i<j) {
341 						DEFINE_AND_SET_IMAGE_PIX_PTR(p,row,imgX)
342 #						if CHANNELS==1
343 							v=_mm_insert_epi8(v,p[0],15);
344 #						elif CHANNELS==2
345 							v=_mm_insert_epi16(v,((emUInt16*)p)[0],7);
346 #						elif CHANNELS==3
347 							v=_mm_insert_epi16(v,p[0]|(p[1]<<8),6);
348 							v=_mm_insert_epi8(v,p[2],14);
349 #						else
350 							v=_mm_insert_epi32(v,((emUInt32*)p)[0],3);
351 #						endif
352 						INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
353 					}
354 				}
355 			}
356 			PREMULFIN_COLOR_VEC8(v);
357 			_mm_storeu_si128((__m128i*)buf,v);
358 			buf+=(16/CHANNELS)*CHANNELS;
359 		} while (buf<bufEnd);
360 	}
361 	else {
362 		do {
363 			__m128i v;
364 			for (int i=0, j=bufEnd-buf; i<=16-CHANNELS; i+=CHANNELS) {
365 				v=_mm_srli_si128(v,CHANNELS);
366 				if (i<j) {
367 					DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
368 					DEFINE_AND_SET_IMAGE_PIX_PTR(p,row,imgX)
369 #					if CHANNELS==1
370 						v=_mm_insert_epi8(v,p[0],15);
371 #					elif CHANNELS==2
372 						v=_mm_insert_epi16(v,((emUInt16*)p)[0],7);
373 #					elif CHANNELS==3
374 						v=_mm_insert_epi16(v,p[0]|(p[1]<<8),6);
375 						v=_mm_insert_epi8(v,p[2],14);
376 #					else
377 						v=_mm_insert_epi32(v,((emUInt32*)p)[0],3);
378 #					endif
379 					tx+=tdx;
380 				}
381 			}
382 			PREMULFIN_COLOR_VEC8(v);
383 			_mm_storeu_si128((__m128i*)buf,v);
384 			buf+=(16/CHANNELS)*CHANNELS;
385 		} while (buf<bufEnd);
386 	}
387 }
388 
389 
390 //==============================================================================
391 //========== emPainter::ScanlineTool::InterpolateImageAvx2Bilinear... ==========
392 //==============================================================================
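
// Bilinear interpolation in two passes: the first loop blends the two source
// rows vertically into a 16-bit intermediate buffer placed (32-byte aligned)
// at the end of sct.InterpolationBuffer; the second loop blends horizontally
// adjacent pairs of those values and packs the result into the start of
// sct.InterpolationBuffer. The 16-bit weights sum to 16384 and are applied
// with _mm256_mulhrs_epi16.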
393 
394 #if defined(__GNUC__)
395 	__attribute__((target("avx2")))
396 #endif
397 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Bilinear,CONCAT(
398 	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
399 	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
400 )) (const ScanlineTool & sct, int x, int y, int w)
401 {
402 	emInt64 ty=y*sct.TDY-sct.TY-0x800000;
403 	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
404 
405 	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
406 	ssize_t imgSX=sct.ImgSX;
407 	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
408 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
409 	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
410 
411 	emInt64 tdx=sct.TDX;
412 	emInt64 tx=x*tdx-sct.TX-0x800000;
413 
414 	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
415 
416 	tx=(tx&0xffffff)-0x1000000-tdx;
417 	int tc=((tx+0x3000000+w*tdx)>>24)*CHANNELS;
418 
419 	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
420 	p-=(p-(emByte*)NULL)&31;
421 	const emInt16 * pvyBeg=(emInt16*)p;
422 	const emInt16 * pvy=pvyBeg;
423 	const emInt16 * pvyEnd=pvyBeg+tc;
424 
425 	__m256i fy1=_mm256_set1_epi16(oy<<6);
426 	__m256i fy0=_mm256_sub_epi16(_mm256_set1_epi16(16384),fy1);
427 
428 	do {
429 		__m128i svy0,svy1;
430 		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
431 			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
432 			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
433 			svy0=_mm_loadu_si128((__m128i*)p0);
434 			svy1=_mm_loadu_si128((__m128i*)p1);
435 			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
436 		}
437 		else {
438 			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
439 				svy0=_mm_srli_si128(svy0,CHANNELS);
440 				svy1=_mm_srli_si128(svy1,CHANNELS);
441 				if (i<j) {
442 					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
443 					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
444 #					if CHANNELS==1
445 						svy0=_mm_insert_epi8(svy0,p0[0],15);
446 						svy1=_mm_insert_epi8(svy1,p1[0],15);
447 #					elif CHANNELS==2
448 						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
449 						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
450 #					elif CHANNELS==3
451 						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
452 						svy0=_mm_insert_epi8(svy0,p0[2],14);
453 						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
454 						svy1=_mm_insert_epi8(svy1,p1[2],14);
455 #					else
456 						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
457 						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
458 #					endif
459 					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
460 				}
461 			}
462 		}
463 
464 		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
465 		__m256i vy1=_mm256_cvtepu8_epi16(svy1);
466 
467 		PREMULFIN_SHL_COLOR_VEC16(vy0,7)
468 		PREMULFIN_SHL_COLOR_VEC16(vy1,7)
469 
470 		__m256i vy=_mm256_add_epi16(
471 			_mm256_mulhrs_epi16(vy0,fy0),
472 			_mm256_mulhrs_epi16(vy1,fy1)
473 		);
474 
475 		_mm256_storeu_si256((__m256i*)pvy,vy);
476 		pvy+=(16/CHANNELS)*CHANNELS;
477 	} while (pvy<pvyEnd);
478 
479 	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());
480 
481 	pvy=pvyBeg;
482 	emByte * buf=(emByte*)sct.InterpolationBuffer;
483 	emByte * bufEnd=buf+w*CHANNELS;
484 
485 	// Order of pixels in v01 and vff with 3-4 / 1-2 channels:
486 	//   v01:   2 1 1 0   /    4 3 2 1  3 2 1 0
487 	//   vff:   4 3 5 2   /    8 7 6 5  b a 9 4
488 #	if CHANNELS<=2
489 #		if CHANNELS==1
490 			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
491 #		else
492 			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
493 #		endif
494 		__m256i v01=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(4,3,2,1,3,2,1,0));
495 		__m256i vff=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,7,6,5,0,0,0,4));
496 		pvy+=CHANNELS*8;
497 		int vn=7;
498 #	elif CHANNELS==3
499 		__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
500 		__m256i vta=_mm256_shuffle_epi8(vt,_mm256_set_epi8(
501 			-1,-1, 1, 0, -1,-1,-1,-1, -1,-1, 7, 6,  5, 4, 3, 2,
502 			-1,-1,11,10,  9, 8, 7, 6, -1,-1,-1,-1, 15,14,13,12
503 		));
504 		__m256i vtb=_mm256_permutevar8x32_epi32(vta,_mm256_set_epi32(7,0,3,2,3,2,7,0));
505 		vta=_mm256_blend_epi32(vt,vta,0x3f);
506 		__m256i v01=_mm256_blend_epi32(vt,vtb,0xfc);
507 		__m256i vff=_mm256_blend_epi32(vta,vtb,0x0f);
508 		pvy+=CHANNELS*5;
509 		int vn=4;
510 #	else
511 		__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
512 		__m256i v01=_mm256_permute4x64_epi64(vt,0x94);
513 		__m256i vff=_mm256_permute4x64_epi64(vt,0x36);
514 		pvy+=CHANNELS*4;
515 		int vn=3;
516 #	endif
517 
518 	do {
519 		__m256i v01l,v01h,f1l,f1h;
520 		int cx=16/CHANNELS-1;
521 
522 		do {
523 			tx+=tdx;
524 			if (tx>=0) {
525 				tx-=0x1000000;
526 
527 #				if CHANNELS<=2
528 					v01=_mm256_alignr_epi8(vff,v01,4);
529 					vff=_mm256_permutevar8x32_epi32(vff,_mm256_set_epi32(1,7,6,5,0,3,2,4));
530 #				else
531 					v01=_mm256_alignr_epi8(vff,v01,8);
532 					vff=_mm256_permute4x64_epi64(vff,0x72);
533 #				endif
534 
535 				vn--;
536 				if (vn<=0) {
537 #					if CHANNELS<=2
538 #						if CHANNELS==1
539 							vff=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
540 #						else
541 							vff=_mm256_loadu_si256((__m256i*)pvy);
542 #						endif
543 						__m256i t=_mm256_permutevar8x32_epi32(vff,_mm256_set_epi32(3,2,1,0,2,1,0,3));
544 						v01=_mm256_blend_epi32(v01,t,0xfe);
545 						vff=_mm256_blend_epi32(vff,t,0x0f);
546 						pvy+=CHANNELS*8;
547 						vn+=8;
548 #					elif CHANNELS==3
549 						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
550 						__m256i t1=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,4,3,7,6,1,0));
551 						__m256i t2=_mm256_shuffle_epi8(t,_mm256_set_epi8(
552 							-1,-1, 7, 6,  5, 4, 3, 2, -1,-1, 1, 0, -1,-1,-1,-1,
553 							-1,-1, 5, 4,  3, 2, 1, 0, -1,-1,11,10,  9, 8, 7, 6
554 						));
555 						vff=_mm256_blend_epi32(t1,t2,0xc3);
556 						__m256i t3=_mm256_permute4x64_epi64(t2,0x14);
557 						v01=_mm256_blend_epi32(v01,t3,0xfc);
558 						pvy+=CHANNELS*5;
559 						vn+=5;
560 #					else
561 						vff=_mm256_loadu_si256((__m256i*)pvy);
562 						__m256i t=_mm256_permute4x64_epi64(vff,0x41);
563 						v01=_mm256_blend_epi32(v01,t,0xfc);
564 						vff=_mm256_blend_epi32(vff,t,0x0f);
565 						pvy+=CHANNELS*4;
566 						vn+=4;
567 #					endif
568 				}
569 			}
570 
571 			emUInt32 ox=(tx+0x1007fff)>>16;
572 
573 			__m256i f1=_mm256_castsi128_si256(_mm_set1_epi16(ox));
574 
575 			if (cx==7/CHANNELS) {
576 #				if CHANNELS==3
577 					v01l=_mm256_alignr_epi8(v01,v01h,4);
578 					f1l=_mm256_alignr_epi8(f1,f1h,4);
579 #				else
580 					v01l=v01h;
581 					f1l=f1h;
582 #				endif
583 			}
584 
585 			v01h=_mm256_alignr_epi8(v01,v01h,CHANNELS*2);
586 			f1h=_mm256_alignr_epi8(f1,f1h,CHANNELS*2);
587 			cx--;
588 		} while (cx>=0);
589 
590 #		if CHANNELS==3
591 			v01h=_mm256_srli_si256(v01h,2);
592 			f1h=_mm256_srli_si256(f1h,2);
593 #		endif
594 
595 		__m256i vx0=_mm256_permute2x128_si256(v01l,v01h,0x20);
596 		__m256i vx1=_mm256_permute2x128_si256(v01l,v01h,0x31);
597 		__m256i fx1=_mm256_permute2x128_si256(f1l,f1h,0x20);
598 
599 		fx1=_mm256_slli_epi16(fx1,6);
600 		__m256i fx0=_mm256_sub_epi16(_mm256_set1_epi16(16384),fx1);
601 
602 		__m256i vx=_mm256_add_epi16(
603 			_mm256_mulhrs_epi16(vx0,fx0),
604 			_mm256_mulhrs_epi16(vx1,fx1)
605 		);
606 
607 		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
608 		vx=_mm256_srai_epi16(vx,5);
609 		__m128i svx=_mm_packus_epi16(
610 			_mm256_castsi256_si128(vx),
611 			_mm256_extracti128_si256(vx,1)
612 		);
613 
614 		_mm_storeu_si128((__m128i*)buf,svx);
615 
616 		buf+=(16/CHANNELS)*CHANNELS;
617 	} while (buf<bufEnd);
618 }
619 
620 
621 //==============================================================================
622 //========== emPainter::ScanlineTool::InterpolateImageAvx2Bicubic... ===========
623 //==============================================================================
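
// Bicubic interpolation with the same two-pass structure as the bilinear
// variant above, but blending four source rows and then four source columns.
// BicubicFactorsTable[i] holds the four Catmull-Rom weights (a = -0.5) for a
// fractional offset of i/256, scaled so that each entry sums to 16384; the
// program that generated the table is kept below as a comment.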
624 
625 #ifndef BICUBIC_FACTORS_TABLE_DEFINED
626 #	define BICUBIC_FACTORS_TABLE_DEFINED
627 	struct alignas(8) BicubicFactors {
628 		emInt16 f0;
629 		emInt16 f1;
630 		emInt16 f2;
631 		emInt16 f3;
632 	};
633 	static const BicubicFactors BicubicFactorsTable[257] = {
634 		// #include <stdio.h>
635 		// #include <math.h>
636 		// int main(int argc, char * argv[])
637 		// {
638 		//   for (int i=0; i<=256; i++) {
639 		//     double o=i/256.0;
640 		//     double s=1.0-o;
641 		//     double f=16384;
642 		//     int f0=(int)round((-0.5*s*o)*s*f);
643 		//     int f1=(int)round(((1 - 1.5*o)*o + 1)*s*f);
644 		//     int f2=(int)round(((1 - 1.5*s)*s + 1)*o*f);
645 		//     int f3=(int)round((-0.5*s*o)*o*f);
646 		//     printf("%s{%d,%d,%d,%d},",i%4?"":"\n",f0,f1,f2,f3);
647 		//   }
648 		//   return 0;
649 		// }
650 		{0,16384,0,0},{-32,16383,32,0},{-63,16382,66,0},{-94,16378,100,-1},
651 		{-124,16374,136,-2},{-154,16369,172,-3},{-183,16362,210,-4},{-212,16354,248,-6},
652 		{-240,16345,287,-8},{-268,16334,327,-10},{-295,16323,369,-12},{-322,16310,411,-14},
653 		{-349,16297,453,-17},{-375,16282,497,-20},{-400,16266,542,-23},{-425,16248,588,-26},
654 		{-450,16230,634,-30},{-474,16211,681,-34},{-498,16190,729,-38},{-521,16168,778,-42},
655 		{-544,16146,828,-46},{-566,16122,879,-51},{-588,16097,930,-55},{-610,16071,983,-60},
656 		{-631,16044,1036,-65},{-651,16016,1090,-70},{-672,15987,1144,-76},{-691,15957,1200,-82},
657 		{-711,15926,1256,-87},{-730,15894,1313,-93},{-748,15861,1370,-99},{-766,15827,1429,-106},
658 		{-784,15792,1488,-112},{-801,15756,1548,-119},{-818,15719,1608,-125},{-835,15681,1670,-132},
659 		{-851,15642,1732,-139},{-866,15603,1794,-146},{-882,15562,1858,-154},{-897,15520,1922,-161},
660 		{-911,15478,1986,-169},{-925,15434,2052,-176},{-939,15390,2117,-184},{-953,15345,2184,-192},
661 		{-966,15299,2251,-200},{-978,15252,2319,-209},{-991,15204,2387,-217},{-1002,15155,2456,-225},
662 		{-1014,15106,2526,-234},{-1025,15056,2596,-243},{-1036,15005,2667,-251},{-1047,14953,2738,-260},
663 		{-1057,14900,2810,-269},{-1066,14846,2882,-278},{-1076,14792,2955,-288},{-1085,14737,3029,-297},
664 		{-1094,14681,3103,-306},{-1102,14625,3177,-316},{-1110,14567,3252,-325},{-1118,14509,3328,-335},
665 		{-1125,14450,3404,-345},{-1133,14391,3480,-354},{-1139,14331,3557,-364},{-1146,14270,3634,-374},
666 		{-1152,14208,3712,-384},{-1158,14146,3790,-394},{-1163,14083,3869,-404},{-1169,14019,3948,-414},
667 		{-1174,13955,4027,-424},{-1178,13890,4107,-435},{-1182,13824,4188,-445},{-1187,13758,4268,-455},
668 		{-1190,13691,4349,-466},{-1194,13623,4431,-476},{-1197,13555,4512,-487},{-1200,13486,4595,-497},
669 		{-1202,13417,4677,-508},{-1205,13347,4760,-518},{-1207,13277,4843,-529},{-1208,13206,4926,-539},
670 		{-1210,13134,5010,-550},{-1211,13062,5094,-561},{-1212,12989,5178,-571},{-1213,12916,5263,-582},
671 		{-1213,12842,5348,-593},{-1214,12768,5433,-603},{-1214,12693,5518,-614},{-1213,12618,5604,-625},
672 		{-1213,12542,5690,-635},{-1212,12466,5776,-646},{-1211,12389,5862,-657},{-1210,12312,5949,-667},
673 		{-1208,12235,6035,-678},{-1207,12157,6122,-688},{-1205,12078,6209,-699},{-1202,11999,6297,-709},
674 		{-1200,11920,6384,-720},{-1197,11840,6472,-730},{-1195,11760,6559,-741},{-1192,11680,6647,-751},
675 		{-1188,11599,6735,-762},{-1185,11518,6823,-772},{-1181,11436,6911,-782},{-1177,11354,7000,-793},
676 		{-1173,11272,7088,-803},{-1169,11189,7177,-813},{-1165,11106,7265,-823},{-1160,11023,7354,-833},
677 		{-1155,10939,7443,-843},{-1150,10855,7531,-853},{-1145,10771,7620,-863},{-1140,10687,7709,-872},
678 		{-1134,10602,7798,-882},{-1128,10517,7887,-892},{-1122,10432,7976,-901},{-1116,10346,8065,-911},
679 		{-1110,10260,8154,-920},{-1104,10174,8242,-929},{-1097,10088,8331,-938},{-1091,10002,8420,-947},
680 		{-1084,9915,8509,-956},{-1077,9828,8597,-965},{-1070,9741,8686,-974},{-1062,9654,8775,-982},
681 		{-1055,9567,8863,-991},{-1047,9479,8951,-999},{-1040,9392,9040,-1008},{-1032,9304,9128,-1016},
682 		{-1024,9216,9216,-1024},{-1016,9128,9304,-1032},{-1008,9040,9392,-1040},{-999,8951,9479,-1047},
683 		{-991,8863,9567,-1055},{-982,8775,9654,-1062},{-974,8686,9741,-1070},{-965,8597,9828,-1077},
684 		{-956,8509,9915,-1084},{-947,8420,10002,-1091},{-938,8331,10088,-1097},{-929,8242,10174,-1104},
685 		{-920,8154,10260,-1110},{-911,8065,10346,-1116},{-901,7976,10432,-1122},{-892,7887,10517,-1128},
686 		{-882,7798,10602,-1134},{-872,7709,10687,-1140},{-863,7620,10771,-1145},{-853,7531,10855,-1150},
687 		{-843,7443,10939,-1155},{-833,7354,11023,-1160},{-823,7265,11106,-1165},{-813,7177,11189,-1169},
688 		{-803,7088,11272,-1173},{-793,7000,11354,-1177},{-782,6911,11436,-1181},{-772,6823,11518,-1185},
689 		{-762,6735,11599,-1188},{-751,6647,11680,-1192},{-741,6559,11760,-1195},{-730,6472,11840,-1197},
690 		{-720,6384,11920,-1200},{-709,6297,11999,-1202},{-699,6209,12078,-1205},{-688,6122,12157,-1207},
691 		{-678,6035,12235,-1208},{-667,5949,12312,-1210},{-657,5862,12389,-1211},{-646,5776,12466,-1212},
692 		{-635,5690,12542,-1213},{-625,5604,12618,-1213},{-614,5518,12693,-1214},{-603,5433,12768,-1214},
693 		{-593,5348,12842,-1213},{-582,5263,12916,-1213},{-571,5178,12989,-1212},{-561,5094,13062,-1211},
694 		{-550,5010,13134,-1210},{-539,4926,13206,-1208},{-529,4843,13277,-1207},{-518,4760,13347,-1205},
695 		{-508,4677,13417,-1202},{-497,4595,13486,-1200},{-487,4512,13555,-1197},{-476,4431,13623,-1194},
696 		{-466,4349,13691,-1190},{-455,4268,13758,-1187},{-445,4188,13824,-1182},{-435,4107,13890,-1178},
697 		{-424,4027,13955,-1174},{-414,3948,14019,-1169},{-404,3869,14083,-1163},{-394,3790,14146,-1158},
698 		{-384,3712,14208,-1152},{-374,3634,14270,-1146},{-364,3557,14331,-1139},{-354,3480,14391,-1133},
699 		{-345,3404,14450,-1125},{-335,3328,14509,-1118},{-325,3252,14567,-1110},{-316,3177,14625,-1102},
700 		{-306,3103,14681,-1094},{-297,3029,14737,-1085},{-288,2955,14792,-1076},{-278,2882,14846,-1066},
701 		{-269,2810,14900,-1057},{-260,2738,14953,-1047},{-251,2667,15005,-1036},{-243,2596,15056,-1025},
702 		{-234,2526,15106,-1014},{-225,2456,15155,-1002},{-217,2387,15204,-991},{-209,2319,15252,-978},
703 		{-200,2251,15299,-966},{-192,2184,15345,-953},{-184,2117,15390,-939},{-176,2052,15434,-925},
704 		{-169,1986,15478,-911},{-161,1922,15520,-897},{-154,1858,15562,-882},{-146,1794,15603,-866},
705 		{-139,1732,15642,-851},{-132,1670,15681,-835},{-125,1608,15719,-818},{-119,1548,15756,-801},
706 		{-112,1488,15792,-784},{-106,1429,15827,-766},{-99,1370,15861,-748},{-93,1313,15894,-730},
707 		{-87,1256,15926,-711},{-82,1200,15957,-691},{-76,1144,15987,-672},{-70,1090,16016,-651},
708 		{-65,1036,16044,-631},{-60,983,16071,-610},{-55,930,16097,-588},{-51,879,16122,-566},
709 		{-46,828,16146,-544},{-42,778,16168,-521},{-38,729,16190,-498},{-34,681,16211,-474},
710 		{-30,634,16230,-450},{-26,588,16248,-425},{-23,542,16266,-400},{-20,497,16282,-375},
711 		{-17,453,16297,-349},{-14,411,16310,-322},{-12,369,16323,-295},{-10,327,16334,-268},
712 		{-8,287,16345,-240},{-6,248,16354,-212},{-4,210,16362,-183},{-3,172,16369,-154},
713 		{-2,136,16374,-124},{-1,100,16378,-94},{0,66,16382,-63},{0,32,16383,-32},
714 		{0,0,16384,0}
715 	};
716 #endif
717 
718 
719 #if defined(__GNUC__)
720 	__attribute__((target("avx2")))
721 #endif
722 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Bicubic,CONCAT(
723 	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
724 	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
725 )) (const ScanlineTool & sct, int x, int y, int w)
726 {
727 	emInt64 ty=y*sct.TDY-sct.TY-0x1800000;
728 	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
729 	const BicubicFactors & fy=BicubicFactorsTable[oy];
730 
731 	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
732 	ssize_t imgSX=sct.ImgSX;
733 	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
734 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
735 	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
736 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
737 	DEFINE_AND_SET_IMAGE_ROW_PTR(row2,imgY,imgSX,sct.ImgSY,sct.ImgMap)
738 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
739 	DEFINE_AND_SET_IMAGE_ROW_PTR(row3,imgY,imgSX,sct.ImgSY,sct.ImgMap)
740 
741 	emInt64 tdx=sct.TDX;
742 	emInt64 tx=x*tdx-sct.TX-0x1800000;
743 
744 	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
745 
746 	tx=(tx&0xffffff)-0x1000000-tdx;
747 	int tc=((tx+0x5000000+w*tdx)>>24)*CHANNELS;
748 
749 	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
750 	p-=(p-(emByte*)NULL)&31;
751 	const emInt16 * pvyBeg=(emInt16*)p;
752 	const emInt16 * pvy=pvyBeg;
753 	const emInt16 * pvyEnd=pvyBeg+tc;
754 
755 	__m128i sfy=_mm_loadl_epi64((__m128i*)&fy);
756 	sfy=_mm_unpacklo_epi16(sfy,sfy);
757 	__m256i afy=_mm256_broadcastsi128_si256(sfy);
758 	__m256i fy0=_mm256_shuffle_epi32(afy,0x00);
759 	__m256i fy1=_mm256_shuffle_epi32(afy,0x55);
760 	__m256i fy2=_mm256_shuffle_epi32(afy,0xaa);
761 	__m256i fy3=_mm256_shuffle_epi32(afy,0xff);
762 
763 	do {
764 		__m128i svy0,svy1,svy2,svy3;
765 		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
766 			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
767 			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
768 			DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
769 			DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
770 			svy0=_mm_loadu_si128((__m128i*)p0);
771 			svy1=_mm_loadu_si128((__m128i*)p1);
772 			svy2=_mm_loadu_si128((__m128i*)p2);
773 			svy3=_mm_loadu_si128((__m128i*)p3);
774 			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
775 		}
776 		else {
777 			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
778 				svy0=_mm_srli_si128(svy0,CHANNELS);
779 				svy1=_mm_srli_si128(svy1,CHANNELS);
780 				svy2=_mm_srli_si128(svy2,CHANNELS);
781 				svy3=_mm_srli_si128(svy3,CHANNELS);
782 				if (i<j) {
783 					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
784 					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
785 					DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
786 					DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
787 #					if CHANNELS==1
788 						svy0=_mm_insert_epi8(svy0,p0[0],15);
789 						svy1=_mm_insert_epi8(svy1,p1[0],15);
790 						svy2=_mm_insert_epi8(svy2,p2[0],15);
791 						svy3=_mm_insert_epi8(svy3,p3[0],15);
792 #					elif CHANNELS==2
793 						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
794 						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
795 						svy2=_mm_insert_epi16(svy2,((emUInt16*)p2)[0],7);
796 						svy3=_mm_insert_epi16(svy3,((emUInt16*)p3)[0],7);
797 #					elif CHANNELS==3
798 						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
799 						svy0=_mm_insert_epi8(svy0,p0[2],14);
800 						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
801 						svy1=_mm_insert_epi8(svy1,p1[2],14);
802 						svy2=_mm_insert_epi16(svy2,p2[0]|(p2[1]<<8),6);
803 						svy2=_mm_insert_epi8(svy2,p2[2],14);
804 						svy3=_mm_insert_epi16(svy3,p3[0]|(p3[1]<<8),6);
805 						svy3=_mm_insert_epi8(svy3,p3[2],14);
806 #					else
807 						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
808 						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
809 						svy2=_mm_insert_epi32(svy2,((emUInt32*)p2)[0],3);
810 						svy3=_mm_insert_epi32(svy3,((emUInt32*)p3)[0],3);
811 #					endif
812 					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
813 				}
814 			}
815 		}
816 
817 		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
818 		__m256i vy1=_mm256_cvtepu8_epi16(svy1);
819 		__m256i vy2=_mm256_cvtepu8_epi16(svy2);
820 		__m256i vy3=_mm256_cvtepu8_epi16(svy3);
821 
822 		PREMULFIN_SHL_COLOR_VEC16(vy0,7)
823 		PREMULFIN_SHL_COLOR_VEC16(vy1,7)
824 		PREMULFIN_SHL_COLOR_VEC16(vy2,7)
825 		PREMULFIN_SHL_COLOR_VEC16(vy3,7)
826 
827 		__m256i vy=_mm256_add_epi16(
828 			_mm256_add_epi16(
829 				_mm256_mulhrs_epi16(vy0,fy0),
830 				_mm256_mulhrs_epi16(vy1,fy1)
831 			),
832 			_mm256_add_epi16(
833 				_mm256_mulhrs_epi16(vy2,fy2),
834 				_mm256_mulhrs_epi16(vy3,fy3)
835 			)
836 		);
837 
838 		_mm256_storeu_si256((__m256i*)pvy,vy);
839 		pvy+=(16/CHANNELS)*CHANNELS;
840 	} while (pvy<pvyEnd);
841 
842 	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());
843 
844 	pvy=pvyBeg;
845 	emByte * buf=(emByte*)sct.InterpolationBuffer;
846 	emByte * bufEnd=buf+w*CHANNELS;
847 
848 	// Order of pixels in v02 and v13 with 3-4 / 1-2 channels:
849 	//   v02:   6 2 4 0   /   14 12 10  2   8  6  4  0
850 	//   v13:   7 3 5 1   /   15 13 11  3   9  7  5  1
851 #	if CHANNELS<=2
852 #		if CHANNELS==1
853 			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
854 #		else
855 			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
856 #		endif
857 		__m256i v02=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,2,0,6,4,0));
858 		__m256i v13=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,3,0,7,5,1));
859 		pvy+=CHANNELS*8;
860 		int vn=5;
861 #	elif CHANNELS==3
862 		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
863 		__m256i v13=_mm256_permutevar8x32_epi32(v02,_mm256_set_epi32(5,4,4,3,7,6,2,1));
864 		v02=_mm256_blend_epi32(v02,v13,0xfc);
865 		v13=_mm256_blend_epi32(_mm256_srli_si256(v13,2),_mm256_srli_si256(v13,10),0xf0);
866 		pvy+=CHANNELS*5;
867 		int vn=2;
868 #	else
869 		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
870 		__m256i v13=_mm256_srli_si256(v02,8);
871 		pvy+=CHANNELS*4;
872 		int vn=1;
873 #	endif
874 
875 	do {
876 		__m256i v02l,v02h,v13l,v13h,f02l,f02h,f13l,f13h;
877 		int cx=16/CHANNELS-1;
878 
879 		do {
880 			tx+=tdx;
881 			if (tx>=0) {
882 				tx-=0x1000000;
883 
884 				__m256i oldV02=v02;
885 				v02=v13;
886 #				if CHANNELS<=2
887 					v13=_mm256_permutevar8x32_epi32(oldV02,_mm256_set_epi32(0,7,6,1,5,3,2,4));
888 #				else
889 					v13=_mm256_permute4x64_epi64(oldV02,0x1e);
890 #				endif
891 
892 				vn--;
893 				if (vn<=0) {
894 #					if CHANNELS<=2
895 #						if CHANNELS==1
896 							__m256i t=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
897 #						else
898 							__m256i t=_mm256_loadu_si256((__m256i*)pvy);
899 #						endif
900 						__m256i ta=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,7,0,5,3,1,0));
901 						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,0,0,6,4,2,0));
902 						v02=_mm256_blend_epi32(v02,ta,0xee);
903 						v13=_mm256_blend_epi32(v13,tb,0xfe);
904 						pvy+=CHANNELS*8;
905 						vn+=8;
906 #					elif CHANNELS==3
907 						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
908 						__m256i ta=_mm256_shuffle_epi8(t,_mm256_set_epi8(
909 							-1,-1, 7, 6,  5, 4, 3, 2, -1,-1,-1,-1, -1,-1,-1,-1,
910 							-1,-1,11,10,  9, 8, 7, 6, -1,-1,-1,-1, -1,-1,-1,-1
911 						));
912 						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,1,0,4,3,2,1));
913 						v02=_mm256_blend_epi32(v02,ta,0xcc);
914 						v13=_mm256_blend_epi32(v13,tb,0xfc);
915 						pvy+=CHANNELS*5;
916 						vn+=5;
917 #					else
918 						__m256i ta=_mm256_loadu_si256((__m256i*)pvy);
919 						__m256i tb=_mm256_permute4x64_epi64(ta,0x0a);
920 						v02=_mm256_blend_epi32(v02,ta,0xcc);
921 						v13=_mm256_blend_epi32(v13,tb,0xfc);
922 						pvy+=CHANNELS*4;
923 						vn+=4;
924 #					endif
925 				}
926 			}
927 
928 			emUInt32 ox=(tx+0x1007fff)>>16;
929 			const BicubicFactors & fx=BicubicFactorsTable[ox];
930 
931 			__m256i f02=_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)&fx));
932 #			if CHANNELS>=2
933 				f02=_mm256_shuffle_epi8(f02,_mm256_set_epi8(
934 					9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0,
935 					9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0
936 				));
937 #			endif
938 
939 			__m256i f13=_mm256_srli_si256(f02,8);
940 
941 			if (cx==7/CHANNELS) {
942 #				if CHANNELS==3
943 					v02l=_mm256_alignr_epi8(v02,v02h,4);
944 					v13l=_mm256_alignr_epi8(v13,v13h,4);
945 					f02l=_mm256_alignr_epi8(f02,f02h,4);
946 					f13l=_mm256_alignr_epi8(f13,f13h,4);
947 #				else
948 					v02l=v02h;
949 					v13l=v13h;
950 					f02l=f02h;
951 					f13l=f13h;
952 #				endif
953 			}
954 
955 			v02h=_mm256_alignr_epi8(v02,v02h,CHANNELS*2);
956 			v13h=_mm256_alignr_epi8(v13,v13h,CHANNELS*2);
957 			f02h=_mm256_alignr_epi8(f02,f02h,CHANNELS*2);
958 			f13h=_mm256_alignr_epi8(f13,f13h,CHANNELS*2);
959 			cx--;
960 		} while (cx>=0);
961 
962 #		if CHANNELS==3
963 			v02h=_mm256_srli_si256(v02h,2);
964 			v13h=_mm256_srli_si256(v13h,2);
965 			f02h=_mm256_srli_si256(f02h,2);
966 			f13h=_mm256_srli_si256(f13h,2);
967 #		endif
968 
969 		__m256i vx0=_mm256_permute2x128_si256(v02l,v02h,0x20);
970 		__m256i vx1=_mm256_permute2x128_si256(v13l,v13h,0x20);
971 		__m256i vx2=_mm256_permute2x128_si256(v02l,v02h,0x31);
972 		__m256i vx3=_mm256_permute2x128_si256(v13l,v13h,0x31);
973 		__m256i fx0=_mm256_permute2x128_si256(f02l,f02h,0x20);
974 		__m256i fx1=_mm256_permute2x128_si256(f13l,f13h,0x20);
975 		__m256i fx2=_mm256_permute2x128_si256(f02l,f02h,0x31);
976 		__m256i fx3=_mm256_permute2x128_si256(f13l,f13h,0x31);
977 
978 		__m256i vx=_mm256_add_epi16(
979 			_mm256_add_epi16(
980 				_mm256_mulhrs_epi16(vx0,fx0),
981 				_mm256_mulhrs_epi16(vx1,fx1)
982 			),
983 			_mm256_add_epi16(
984 				_mm256_mulhrs_epi16(vx2,fx2),
985 				_mm256_mulhrs_epi16(vx3,fx3)
986 			)
987 		);
988 
989 		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
990 		vx=_mm256_srai_epi16(vx,5);
991 		vx=_mm256_max_epi16(vx,_mm256_setzero_si256());
992 		__m128i svx=_mm_packus_epi16(
993 			_mm256_castsi256_si128(vx),
994 			_mm256_extracti128_si256(vx,1)
995 		);
996 #		if CHANNELS==2 || CHANNELS==4
997 			svx=_mm_min_epu8(svx,_mm_shuffle_epi8(svx,_mm_set_epi8(
998 #				if CHANNELS==2
999 					15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1
1000 #				else
1001 					15,15,15,15,11,11,11,11,7,7,7,7,3,3,3,3
1002 #				endif
1003 			)));
1004 #		endif
1005 
1006 		_mm_storeu_si128((__m128i*)buf,svx);
1007 
1008 		buf+=(16/CHANNELS)*CHANNELS;
1009 	} while (buf<bufEnd);
1010 }
1011 
1012 
1013 //==============================================================================
1014 //========== emPainter::ScanlineTool::InterpolateImageAvx2Lanczos... ===========
1015 //==============================================================================
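
// Lanczos-style interpolation, structurally identical to the bicubic variant
// above. LanczosFactorsTable[i] holds four windowed-sinc weights (radius 2.5)
// for a fractional offset of i/256, normalized to a total of 16384; the
// program that generated the table is kept below as a comment.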
1016 
1017 #ifndef LANCZOS_FACTORS_TABLE_DEFINED
1018 #	define LANCZOS_FACTORS_TABLE_DEFINED
1019 	struct alignas(8) LanczosFactors {
1020 		emInt16 f0;
1021 		emInt16 f1;
1022 		emInt16 f2;
1023 		emInt16 f3;
1024 	};
1025 	static const LanczosFactors LanczosFactorsTable[257] = {
1026 		// #include <stdio.h>
1027 		// #include <math.h>
1028 		// int main(int argc, char * argv[])
1029 		// {
1030 		//   for (int i=0; i<=256; i++) {
1031 		//     double f=16384;
1032 		//     double radius=2.5;
1033 		//     double v[4];
1034 		//     for (int j=0; j<4; j++) {
1035 		//       double d=fabs((j-1-i/256.0)*M_PI);
1036 		//       if (d<1E-10) v[j]=1.0/radius;
1037 		//       else v[j]=sin(d)*sin(d/radius)/(d*d);
1038 		//     }
1039 		//     int f0=(int)round(f*v[0]/(v[0]+v[1]+v[2]+v[3]));
1040 		//     int f1=(int)round(f*v[1]/(v[0]+v[1]+v[2]+v[3]));
1041 		//     int f2=(int)round(f*v[2]/(v[0]+v[1]+v[2]+v[3]));
1042 		//     int f3=(int)round(f*v[3]/(v[0]+v[1]+v[2]+v[3]));
1043 		//     printf("%s{%d,%d,%d,%d},",i%4?"":"\n",f0,f1,f2,f3);
1044 		//   }
1045 		//   return 0;
1046 		// }
1047 		{0,16384,0,0},{-48,16391,49,-8},{-96,16397,98,-15},{-143,16402,148,-23},
1048 		{-189,16406,199,-31},{-235,16408,250,-40},{-280,16410,302,-48},{-325,16411,355,-56},
1049 		{-369,16411,408,-65},{-413,16409,462,-74},{-456,16407,516,-83},{-499,16403,572,-92},
1050 		{-540,16399,627,-102},{-582,16393,684,-111},{-622,16387,741,-121},{-662,16379,798,-131},
1051 		{-702,16370,857,-141},{-741,16361,916,-151},{-779,16350,975,-162},{-817,16338,1035,-172},
1052 		{-854,16325,1096,-183},{-891,16311,1157,-194},{-927,16296,1219,-205},{-962,16280,1282,-216},
1053 		{-997,16263,1345,-227},{-1031,16246,1408,-239},{-1065,16226,1473,-250},{-1098,16206,1538,-262},
1054 		{-1130,16185,1603,-274},{-1162,16163,1669,-286},{-1194,16140,1736,-298},{-1224,16116,1803,-311},
1055 		{-1254,16091,1870,-323},{-1284,16065,1939,-336},{-1313,16038,2007,-349},{-1341,16010,2077,-362},
1056 		{-1369,15981,2147,-375},{-1396,15951,2217,-388},{-1422,15920,2288,-401},{-1448,15888,2360,-415},
1057 		{-1474,15855,2432,-429},{-1499,15821,2504,-442},{-1523,15786,2577,-456},{-1547,15750,2651,-470},
1058 		{-1570,15713,2725,-484},{-1592,15676,2799,-499},{-1614,15637,2874,-513},{-1635,15597,2950,-528},
1059 		{-1656,15557,3026,-542},{-1676,15515,3102,-557},{-1696,15473,3179,-572},{-1715,15429,3257,-587},
1060 		{-1734,15385,3334,-602},{-1752,15340,3413,-617},{-1769,15294,3491,-632},{-1786,15247,3571,-648},
1061 		{-1803,15200,3650,-663},{-1818,15151,3730,-679},{-1834,15101,3811,-694},{-1848,15051,3891,-710},
1062 		{-1863,15000,3973,-726},{-1876,14948,4054,-742},{-1889,14895,4136,-758},{-1902,14841,4219,-774},
1063 		{-1914,14787,4301,-790},{-1926,14731,4384,-806},{-1937,14675,4468,-823},{-1947,14618,4552,-839},
1064 		{-1957,14561,4636,-855},{-1967,14502,4720,-872},{-1976,14443,4805,-888},{-1984,14383,4890,-905},
1065 		{-1992,14322,4976,-921},{-2000,14260,5061,-938},{-2007,14198,5147,-955},{-2014,14135,5234,-971},
1066 		{-2020,14071,5320,-988},{-2025,14007,5407,-1005},{-2030,13942,5494,-1022},{-2035,13876,5582,-1038},
1067 		{-2039,13809,5669,-1055},{-2043,13742,5757,-1072},{-2046,13674,5845,-1089},{-2049,13605,5934,-1106},
1068 		{-2052,13536,6022,-1122},{-2054,13466,6111,-1139},{-2055,13396,6200,-1156},{-2056,13325,6289,-1173},
1069 		{-2057,13253,6378,-1190},{-2058,13181,6467,-1207},{-2057,13108,6557,-1223},{-2057,13034,6647,-1240},
1070 		{-2056,12960,6737,-1257},{-2055,12885,6827,-1273},{-2053,12810,6917,-1290},{-2051,12734,7007,-1306},
1071 		{-2048,12658,7097,-1323},{-2045,12581,7188,-1339},{-2042,12504,7278,-1356},{-2039,12426,7369,-1372},
1072 		{-2035,12347,7460,-1388},{-2030,12268,7550,-1405},{-2025,12189,7641,-1421},{-2020,12109,7732,-1437},
1073 		{-2015,12029,7823,-1453},{-2009,11948,7914,-1468},{-2003,11867,8005,-1484},{-1997,11785,8096,-1500},
1074 		{-1990,11703,8186,-1515},{-1983,11620,8277,-1531},{-1976,11538,8368,-1546},{-1968,11454,8459,-1561},
1075 		{-1960,11371,8550,-1576},{-1952,11287,8640,-1591},{-1943,11202,8731,-1606},{-1934,11117,8822,-1621},
1076 		{-1925,11032,8912,-1635},{-1916,10947,9002,-1649},{-1906,10861,9093,-1664},{-1896,10775,9183,-1678},
1077 		{-1886,10688,9273,-1691},{-1876,10602,9363,-1705},{-1865,10515,9453,-1719},{-1854,10428,9542,-1732},
1078 		{-1843,10340,9632,-1745},{-1831,10252,9721,-1758},{-1820,10164,9810,-1771},{-1808,10076,9899,-1783},
1079 		{-1796,9988,9988,-1796},{-1783,9899,10076,-1808},{-1771,9810,10164,-1820},{-1758,9721,10252,-1831},
1080 		{-1745,9632,10340,-1843},{-1732,9542,10428,-1854},{-1719,9453,10515,-1865},{-1705,9363,10602,-1876},
1081 		{-1691,9273,10688,-1886},{-1678,9183,10775,-1896},{-1664,9093,10861,-1906},{-1649,9002,10947,-1916},
1082 		{-1635,8912,11032,-1925},{-1621,8822,11117,-1934},{-1606,8731,11202,-1943},{-1591,8640,11287,-1952},
1083 		{-1576,8550,11371,-1960},{-1561,8459,11454,-1968},{-1546,8368,11538,-1976},{-1531,8277,11620,-1983},
1084 		{-1515,8186,11703,-1990},{-1500,8096,11785,-1997},{-1484,8005,11867,-2003},{-1468,7914,11948,-2009},
1085 		{-1453,7823,12029,-2015},{-1437,7732,12109,-2020},{-1421,7641,12189,-2025},{-1405,7550,12268,-2030},
1086 		{-1388,7460,12347,-2035},{-1372,7369,12426,-2039},{-1356,7278,12504,-2042},{-1339,7188,12581,-2045},
1087 		{-1323,7097,12658,-2048},{-1306,7007,12734,-2051},{-1290,6917,12810,-2053},{-1273,6827,12885,-2055},
1088 		{-1257,6737,12960,-2056},{-1240,6647,13034,-2057},{-1223,6557,13108,-2057},{-1207,6467,13181,-2058},
1089 		{-1190,6378,13253,-2057},{-1173,6289,13325,-2056},{-1156,6200,13396,-2055},{-1139,6111,13466,-2054},
1090 		{-1122,6022,13536,-2052},{-1106,5934,13605,-2049},{-1089,5845,13674,-2046},{-1072,5757,13742,-2043},
1091 		{-1055,5669,13809,-2039},{-1038,5582,13876,-2035},{-1022,5494,13942,-2030},{-1005,5407,14007,-2025},
1092 		{-988,5320,14071,-2020},{-971,5234,14135,-2014},{-955,5147,14198,-2007},{-938,5061,14260,-2000},
1093 		{-921,4976,14322,-1992},{-905,4890,14383,-1984},{-888,4805,14443,-1976},{-872,4720,14502,-1967},
1094 		{-855,4636,14561,-1957},{-839,4552,14618,-1947},{-823,4468,14675,-1937},{-806,4384,14731,-1926},
1095 		{-790,4301,14787,-1914},{-774,4219,14841,-1902},{-758,4136,14895,-1889},{-742,4054,14948,-1876},
1096 		{-726,3973,15000,-1863},{-710,3891,15051,-1848},{-694,3811,15101,-1834},{-679,3730,15151,-1818},
1097 		{-663,3650,15200,-1803},{-648,3571,15247,-1786},{-632,3491,15294,-1769},{-617,3413,15340,-1752},
1098 		{-602,3334,15385,-1734},{-587,3257,15429,-1715},{-572,3179,15473,-1696},{-557,3102,15515,-1676},
1099 		{-542,3026,15557,-1656},{-528,2950,15597,-1635},{-513,2874,15637,-1614},{-499,2799,15676,-1592},
1100 		{-484,2725,15713,-1570},{-470,2651,15750,-1547},{-456,2577,15786,-1523},{-442,2504,15821,-1499},
1101 		{-429,2432,15855,-1474},{-415,2360,15888,-1448},{-401,2288,15920,-1422},{-388,2217,15951,-1396},
1102 		{-375,2147,15981,-1369},{-362,2077,16010,-1341},{-349,2007,16038,-1313},{-336,1939,16065,-1284},
1103 		{-323,1870,16091,-1254},{-311,1803,16116,-1224},{-298,1736,16140,-1194},{-286,1669,16163,-1162},
1104 		{-274,1603,16185,-1130},{-262,1538,16206,-1098},{-250,1473,16226,-1065},{-239,1408,16246,-1031},
1105 		{-227,1345,16263,-997},{-216,1282,16280,-962},{-205,1219,16296,-927},{-194,1157,16311,-891},
1106 		{-183,1096,16325,-854},{-172,1035,16338,-817},{-162,975,16350,-779},{-151,916,16361,-741},
1107 		{-141,857,16370,-702},{-131,798,16379,-662},{-121,741,16387,-622},{-111,684,16393,-582},
1108 		{-102,627,16399,-540},{-92,572,16403,-499},{-83,516,16407,-456},{-74,462,16409,-413},
1109 		{-65,408,16411,-369},{-56,355,16411,-325},{-48,302,16410,-280},{-40,250,16408,-235},
1110 		{-31,199,16406,-189},{-23,148,16402,-143},{-15,98,16397,-96},{-8,49,16391,-48},
1111 		{0,0,16384,0}
1112 	};
1113 #endif
1114 
1115 
1116 #if defined(__GNUC__)
1117 	__attribute__((target("avx2")))
1118 #endif
1119 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Lanczos,CONCAT(
1120 	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
1121 	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
1122 )) (const ScanlineTool & sct, int x, int y, int w)
1123 {
1124 	emInt64 ty=y*sct.TDY-sct.TY-0x1800000;
1125 	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
1126 	const LanczosFactors & fy=LanczosFactorsTable[oy];
1127 
1128 	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
1129 	ssize_t imgSX=sct.ImgSX;
1130 	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1131 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
1132 	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1133 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
1134 	DEFINE_AND_SET_IMAGE_ROW_PTR(row2,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1135 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
1136 	DEFINE_AND_SET_IMAGE_ROW_PTR(row3,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1137 
1138 	emInt64 tdx=sct.TDX;
1139 	emInt64 tx=x*tdx-sct.TX-0x1800000;
1140 
1141 	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
1142 
1143 	tx=(tx&0xffffff)-0x1000000-tdx;
1144 	int tc=((tx+0x5000000+w*tdx)>>24)*CHANNELS;
1145 
1146 	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
1147 	p-=(p-(emByte*)NULL)&31;
1148 	const emInt16 * pvyBeg=(emInt16*)p;
1149 	const emInt16 * pvy=pvyBeg;
1150 	const emInt16 * pvyEnd=pvyBeg+tc;
1151 
1152 	__m128i sfy=_mm_loadl_epi64((__m128i*)&fy);
1153 	sfy=_mm_unpacklo_epi16(sfy,sfy);
1154 	__m256i afy=_mm256_broadcastsi128_si256(sfy);
1155 	__m256i fy0=_mm256_shuffle_epi32(afy,0x00);
1156 	__m256i fy1=_mm256_shuffle_epi32(afy,0x55);
1157 	__m256i fy2=_mm256_shuffle_epi32(afy,0xaa);
1158 	__m256i fy3=_mm256_shuffle_epi32(afy,0xff);
1159 
1160 	do {
1161 		__m128i svy0,svy1,svy2,svy3;
1162 		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
1163 			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
1164 			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
1165 			DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
1166 			DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
1167 			svy0=_mm_loadu_si128((__m128i*)p0);
1168 			svy1=_mm_loadu_si128((__m128i*)p1);
1169 			svy2=_mm_loadu_si128((__m128i*)p2);
1170 			svy3=_mm_loadu_si128((__m128i*)p3);
1171 			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
1172 		}
1173 		else {
1174 			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
1175 				svy0=_mm_srli_si128(svy0,CHANNELS);
1176 				svy1=_mm_srli_si128(svy1,CHANNELS);
1177 				svy2=_mm_srli_si128(svy2,CHANNELS);
1178 				svy3=_mm_srli_si128(svy3,CHANNELS);
1179 				if (i<j) {
1180 					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
1181 					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
1182 					DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
1183 					DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
1184 #					if CHANNELS==1
1185 						svy0=_mm_insert_epi8(svy0,p0[0],15);
1186 						svy1=_mm_insert_epi8(svy1,p1[0],15);
1187 						svy2=_mm_insert_epi8(svy2,p2[0],15);
1188 						svy3=_mm_insert_epi8(svy3,p3[0],15);
1189 #					elif CHANNELS==2
1190 						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
1191 						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
1192 						svy2=_mm_insert_epi16(svy2,((emUInt16*)p2)[0],7);
1193 						svy3=_mm_insert_epi16(svy3,((emUInt16*)p3)[0],7);
1194 #					elif CHANNELS==3
1195 						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
1196 						svy0=_mm_insert_epi8(svy0,p0[2],14);
1197 						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
1198 						svy1=_mm_insert_epi8(svy1,p1[2],14);
1199 						svy2=_mm_insert_epi16(svy2,p2[0]|(p2[1]<<8),6);
1200 						svy2=_mm_insert_epi8(svy2,p2[2],14);
1201 						svy3=_mm_insert_epi16(svy3,p3[0]|(p3[1]<<8),6);
1202 						svy3=_mm_insert_epi8(svy3,p3[2],14);
1203 #					else
1204 						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
1205 						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
1206 						svy2=_mm_insert_epi32(svy2,((emUInt32*)p2)[0],3);
1207 						svy3=_mm_insert_epi32(svy3,((emUInt32*)p3)[0],3);
1208 #					endif
1209 					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
1210 				}
1211 			}
1212 		}
1213 
1214 		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
1215 		__m256i vy1=_mm256_cvtepu8_epi16(svy1);
1216 		__m256i vy2=_mm256_cvtepu8_epi16(svy2);
1217 		__m256i vy3=_mm256_cvtepu8_epi16(svy3);
1218 
1219 		PREMULFIN_SHL_COLOR_VEC16(vy0,7)
1220 		PREMULFIN_SHL_COLOR_VEC16(vy1,7)
1221 		PREMULFIN_SHL_COLOR_VEC16(vy2,7)
1222 		PREMULFIN_SHL_COLOR_VEC16(vy3,7)
1223 
1224 		__m256i vy=_mm256_add_epi16(
1225 			_mm256_add_epi16(
1226 				_mm256_mulhrs_epi16(vy0,fy0),
1227 				_mm256_mulhrs_epi16(vy1,fy1)
1228 			),
1229 			_mm256_add_epi16(
1230 				_mm256_mulhrs_epi16(vy2,fy2),
1231 				_mm256_mulhrs_epi16(vy3,fy3)
1232 			)
1233 		);
1234 
1235 		_mm256_storeu_si256((__m256i*)pvy,vy);
1236 		pvy+=(16/CHANNELS)*CHANNELS;
1237 	} while (pvy<pvyEnd);
1238 
1239 	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());
1240 
1241 	pvy=pvyBeg;
1242 	emByte * buf=(emByte*)sct.InterpolationBuffer;
1243 	emByte * bufEnd=buf+w*CHANNELS;
1244 
1245 	// Order of pixels in v02 and v13 with 3-4 / 1-2 channels:
1246 	//   v02:   6 2 4 0   /   14 12 10  2   8  6  4  0
1247 	//   v13:   7 3 5 1   /   15 13 11  3   9  7  5  1
1248 #	if CHANNELS<=2
1249 #		if CHANNELS==1
1250 			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
1251 #		else
1252 			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
1253 #		endif
1254 		__m256i v02=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,2,0,6,4,0));
1255 		__m256i v13=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,3,0,7,5,1));
1256 		pvy+=CHANNELS*8;
1257 		int vn=5;
1258 #	elif CHANNELS==3
1259 		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
1260 		__m256i v13=_mm256_permutevar8x32_epi32(v02,_mm256_set_epi32(5,4,4,3,7,6,2,1));
1261 		v02=_mm256_blend_epi32(v02,v13,0xfc);
1262 		v13=_mm256_blend_epi32(_mm256_srli_si256(v13,2),_mm256_srli_si256(v13,10),0xf0);
1263 		pvy+=CHANNELS*5;
1264 		int vn=2;
1265 #	else
1266 		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
1267 		__m256i v13=_mm256_srli_si256(v02,8);
1268 		pvy+=CHANNELS*4;
1269 		int vn=1;
1270 #	endif
1271 
1272 	do {
1273 		__m256i v02l,v02h,v13l,v13h,f02l,f02h,f13l,f13h;
1274 		int cx=16/CHANNELS-1;
1275 
1276 		do {
1277 			tx+=tdx;
1278 			if (tx>=0) {
1279 				tx-=0x1000000;
1280 
1281 				__m256i oldV02=v02;
1282 				v02=v13;
1283 #				if CHANNELS<=2
1284 					v13=_mm256_permutevar8x32_epi32(oldV02,_mm256_set_epi32(0,7,6,1,5,3,2,4));
1285 #				else
1286 					v13=_mm256_permute4x64_epi64(oldV02,0x1e);
1287 #				endif
1288 
1289 				vn--;
1290 				if (vn<=0) {
1291 #					if CHANNELS<=2
1292 #						if CHANNELS==1
1293 							__m256i t=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
1294 #						else
1295 							__m256i t=_mm256_loadu_si256((__m256i*)pvy);
1296 #						endif
1297 						__m256i ta=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,7,0,5,3,1,0));
1298 						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,0,0,6,4,2,0));
1299 						v02=_mm256_blend_epi32(v02,ta,0xee);
1300 						v13=_mm256_blend_epi32(v13,tb,0xfe);
1301 						pvy+=CHANNELS*8;
1302 						vn+=8;
1303 #					elif CHANNELS==3
1304 						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
1305 						__m256i ta=_mm256_shuffle_epi8(t,_mm256_set_epi8(
1306 							-1,-1, 7, 6,  5, 4, 3, 2, -1,-1,-1,-1, -1,-1,-1,-1,
1307 							-1,-1,11,10,  9, 8, 7, 6, -1,-1,-1,-1, -1,-1,-1,-1
1308 						));
1309 						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,1,0,4,3,2,1));
1310 						v02=_mm256_blend_epi32(v02,ta,0xcc);
1311 						v13=_mm256_blend_epi32(v13,tb,0xfc);
1312 						pvy+=CHANNELS*5;
1313 						vn+=5;
1314 #					else
1315 						__m256i ta=_mm256_loadu_si256((__m256i*)pvy);
1316 						__m256i tb=_mm256_permute4x64_epi64(ta,0x0a);
1317 						v02=_mm256_blend_epi32(v02,ta,0xcc);
1318 						v13=_mm256_blend_epi32(v13,tb,0xfc);
1319 						pvy+=CHANNELS*4;
1320 						vn+=4;
1321 #					endif
1322 				}
1323 			}
1324 
1325 			emUInt32 ox=(tx+0x1007fff)>>16;
1326 			const LanczosFactors & fx=LanczosFactorsTable[ox];
1327 
1328 			__m256i f02=_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)&fx));
1329 #			if CHANNELS>=2
1330 				f02=_mm256_shuffle_epi8(f02,_mm256_set_epi8(
1331 					9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0,
1332 					9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0
1333 				));
1334 #			endif
1335 
1336 			__m256i f13=_mm256_srli_si256(f02,8);
1337 
1338 			if (cx==7/CHANNELS) {
1339 #				if CHANNELS==3
1340 					v02l=_mm256_alignr_epi8(v02,v02h,4);
1341 					v13l=_mm256_alignr_epi8(v13,v13h,4);
1342 					f02l=_mm256_alignr_epi8(f02,f02h,4);
1343 					f13l=_mm256_alignr_epi8(f13,f13h,4);
1344 #				else
1345 					v02l=v02h;
1346 					v13l=v13h;
1347 					f02l=f02h;
1348 					f13l=f13h;
1349 #				endif
1350 			}
1351 
1352 			v02h=_mm256_alignr_epi8(v02,v02h,CHANNELS*2);
1353 			v13h=_mm256_alignr_epi8(v13,v13h,CHANNELS*2);
1354 			f02h=_mm256_alignr_epi8(f02,f02h,CHANNELS*2);
1355 			f13h=_mm256_alignr_epi8(f13,f13h,CHANNELS*2);
1356 			cx--;
1357 		} while (cx>=0);
1358 
1359 #		if CHANNELS==3
1360 			v02h=_mm256_srli_si256(v02h,2);
1361 			v13h=_mm256_srli_si256(v13h,2);
1362 			f02h=_mm256_srli_si256(f02h,2);
1363 			f13h=_mm256_srli_si256(f13h,2);
1364 #		endif
1365 
1366 		__m256i vx0=_mm256_permute2x128_si256(v02l,v02h,0x20);
1367 		__m256i vx1=_mm256_permute2x128_si256(v13l,v13h,0x20);
1368 		__m256i vx2=_mm256_permute2x128_si256(v02l,v02h,0x31);
1369 		__m256i vx3=_mm256_permute2x128_si256(v13l,v13h,0x31);
1370 		__m256i fx0=_mm256_permute2x128_si256(f02l,f02h,0x20);
1371 		__m256i fx1=_mm256_permute2x128_si256(f13l,f13h,0x20);
1372 		__m256i fx2=_mm256_permute2x128_si256(f02l,f02h,0x31);
1373 		__m256i fx3=_mm256_permute2x128_si256(f13l,f13h,0x31);
1374 
1375 		__m256i vx=_mm256_add_epi16(
1376 			_mm256_add_epi16(
1377 				_mm256_mulhrs_epi16(vx0,fx0),
1378 				_mm256_mulhrs_epi16(vx1,fx1)
1379 			),
1380 			_mm256_add_epi16(
1381 				_mm256_mulhrs_epi16(vx2,fx2),
1382 				_mm256_mulhrs_epi16(vx3,fx3)
1383 			)
1384 		);
1385 
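		// Round away the 5 bits of headroom added in the vertical pass
		// (presumably by the PREMULFIN_SHL_COLOR_VEC16(...,5) shifts),
		// clamp the negative Lanczos lobes to zero and pack down to
		// unsigned bytes. For 2 and 4 channels the color channels are
		// additionally limited to the alpha channel so that the
		// premultiplied output stays consistent.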
1386 		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
1387 		vx=_mm256_srai_epi16(vx,5);
1388 		vx=_mm256_max_epi16(vx,_mm256_setzero_si256());
1389 		__m128i svx=_mm_packus_epi16(
1390 			_mm256_castsi256_si128(vx),
1391 			_mm256_extracti128_si256(vx,1)
1392 		);
1393 #		if CHANNELS==2 || CHANNELS==4
1394 			svx=_mm_min_epu8(svx,_mm_shuffle_epi8(svx,_mm_set_epi8(
1395 #				if CHANNELS==2
1396 					15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1
1397 #				else
1398 					15,15,15,15,11,11,11,11,7,7,7,7,3,3,3,3
1399 #				endif
1400 			)));
1401 #		endif
1402 
1403 		_mm_storeu_si128((__m128i*)buf,svx);
1404 
1405 		buf+=(16/CHANNELS)*CHANNELS;
1406 	} while (buf<bufEnd);
1407 }
1408 
1409 
1410 //==============================================================================
1411 //========== emPainter::ScanlineTool::InterpolateImageAvx2Adaptive... ==========
1412 //==============================================================================
1413 
1414 //---------------------------- AdaptiveFactorsTable ----------------------------
1415 
1416 #ifndef ADAPTIVE_FACTORS_TABLE_DEFINED
1417 #	define ADAPTIVE_FACTORS_TABLE_DEFINED
1418 	struct alignas(8) AdaptiveFactors {
1419 		emInt16 fv1;
1420 		emInt16 fv2;
1421 		emInt16 fs1;
1422 		emInt16 fs2;
1423 	};
1424 	static const AdaptiveFactors AdaptiveFactorsTable[257] = {
1425 		// #include <stdio.h>
1426 		// #include <math.h>
1427 		// int main(int argc, char * argv[])
1428 		// {
1429 		//   for (int i=0; i<=256; i++) {
1430 		//     double f=-32768.0; // Negative because +32768 is beyond the signed 16-bit range
1431 		//     double o=i/256.0;
1432 		//     int fv1=(int)round((2*o*o*o-3*o*o+1)*f);
1433 		//     int fv2=(int)round((-2*o*o*o+3*o*o)*f);
1434 		//     int fs1=(int)round((o*o*o-2*o*o+o)*f);
1435 		//     int fs2=(int)round((o*o*o-o*o)*f);
1436 		//     printf("%s{%d,%d,%d,%d},",i%4?"":"\n",fv1,fv2,fs1,fs2);
1437 		//   }
1438 		//   return 0;
1439 		// }
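		// (The four factors are the cubic Hermite basis functions
		//   h00(o)=2o^3-3o^2+1, h01(o)=-2o^3+3o^2,
		//   h10(o)=o^3-2o^2+o,  h11(o)=o^3-o^2,
		// sampled at o=i/256 and scaled by -32768 so that they fit the
		// signed 16-bit operands of _mm256_mulhrs_epi16, which computes
		// round(a*b/32768). The negation apparently cancels out again
		// because the interpolation is applied twice, once per axis.)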
1440 		{-32768,0,0,0},{-32767,-1,-127,0},{-32762,-6,-252,2},{-32755,-13,-375,4},
1441 		{-32744,-24,-496,8},{-32731,-37,-615,12},{-32715,-53,-732,18},{-32696,-72,-848,24},
1442 		{-32674,-94,-961,31},{-32649,-119,-1072,39},{-32622,-146,-1182,48},{-32592,-176,-1290,58},
1443 		{-32559,-209,-1395,69},{-32523,-245,-1499,80},{-32485,-283,-1601,93},{-32444,-324,-1702,106},
1444 		{-32400,-368,-1800,120},{-32354,-414,-1897,135},{-32305,-463,-1991,151},{-32253,-515,-2084,167},
1445 		{-32199,-569,-2176,184},{-32143,-625,-2265,202},{-32084,-684,-2353,221},{-32022,-746,-2439,241},
1446 		{-31958,-810,-2523,261},{-31892,-876,-2606,282},{-31823,-945,-2686,304},{-31751,-1017,-2765,326},
1447 		{-31678,-1090,-2843,349},{-31602,-1166,-2919,373},{-31523,-1245,-2993,397},{-31443,-1325,-3065,422},
1448 		{-31360,-1408,-3136,448},{-31275,-1493,-3205,474},{-31188,-1580,-3273,501},{-31098,-1670,-3339,529},
1449 		{-31006,-1762,-3403,557},{-30912,-1856,-3466,586},{-30816,-1952,-3527,615},{-30718,-2050,-3587,645},
1450 		{-30618,-2150,-3645,675},{-30516,-2252,-3702,706},{-30411,-2357,-3757,737},{-30305,-2463,-3810,769},
1451 		{-30197,-2571,-3862,802},{-30086,-2682,-3913,835},{-29974,-2794,-3962,868},{-29860,-2908,-4010,902},
1452 		{-29744,-3024,-4056,936},{-29626,-3142,-4101,971},{-29506,-3262,-4144,1006},{-29385,-3383,-4186,1041},
1453 		{-29261,-3507,-4227,1077},{-29136,-3632,-4266,1114},{-29009,-3759,-4304,1150},{-28880,-3888,-4340,1188},
1454 		{-28750,-4018,-4375,1225},{-28618,-4150,-4409,1263},{-28484,-4284,-4441,1301},{-28349,-4419,-4472,1339},
1455 		{-28212,-4556,-4502,1378},{-28073,-4695,-4530,1417},{-27933,-4835,-4557,1457},{-27791,-4977,-4583,1496},
1456 		{-27648,-5120,-4608,1536},{-27503,-5265,-4631,1576},{-27357,-5411,-4654,1616},{-27209,-5559,-4674,1657},
1457 		{-27060,-5708,-4694,1698},{-26910,-5858,-4713,1739},{-26758,-6010,-4730,1780},{-26605,-6163,-4746,1821},
1458 		{-26450,-6318,-4761,1863},{-26294,-6474,-4775,1905},{-26137,-6631,-4787,1947},{-25978,-6790,-4799,1989},
1459 		{-25819,-6949,-4809,2031},{-25658,-7110,-4819,2073},{-25496,-7272,-4827,2115},{-25332,-7436,-4834,2158},
1460 		{-25168,-7600,-4840,2200},{-25002,-7766,-4845,2243},{-24836,-7932,-4849,2285},{-24668,-8100,-4852,2328},
1461 		{-24499,-8269,-4854,2370},{-24329,-8439,-4854,2413},{-24159,-8609,-4854,2456},{-23987,-8781,-4853,2498},
1462 		{-23814,-8954,-4851,2541},{-23640,-9128,-4848,2584},{-23466,-9302,-4844,2626},{-23290,-9478,-4839,2669},
1463 		{-23114,-9654,-4833,2711},{-22937,-9831,-4826,2753},{-22758,-10010,-4818,2796},{-22580,-10188,-4810,2838},
1464 		{-22400,-10368,-4800,2880},{-22220,-10548,-4790,2922},{-22039,-10729,-4778,2964},{-21857,-10911,-4766,3005},
1465 		{-21674,-11094,-4753,3047},{-21491,-11277,-4739,3088},{-21307,-11461,-4725,3129},{-21123,-11645,-4709,3170},
1466 		{-20938,-11830,-4693,3211},{-20752,-12016,-4676,3252},{-20566,-12202,-4658,3292},{-20380,-12388,-4640,3332},
1467 		{-20193,-12575,-4620,3372},{-20005,-12763,-4600,3411},{-19817,-12951,-4580,3450},{-19629,-13139,-4558,3489},
1468 		{-19440,-13328,-4536,3528},{-19251,-13517,-4513,3566},{-19061,-13707,-4490,3604},{-18871,-13897,-4465,3642},
1469 		{-18681,-14087,-4441,3679},{-18491,-14277,-4415,3716},{-18300,-14468,-4389,3753},{-18109,-14659,-4362,3789},
1470 		{-17918,-14850,-4335,3825},{-17727,-15041,-4307,3860},{-17535,-15233,-4279,3895},{-17344,-15424,-4250,3930},
1471 		{-17152,-15616,-4220,3964},{-16960,-15808,-4190,3998},{-16768,-16000,-4159,4031},{-16576,-16192,-4128,4064},
1472 		{-16384,-16384,-4096,4096},{-16192,-16576,-4064,4128},{-16000,-16768,-4031,4159},{-15808,-16960,-3998,4190},
1473 		{-15616,-17152,-3964,4220},{-15424,-17344,-3930,4250},{-15233,-17535,-3895,4279},{-15041,-17727,-3860,4307},
1474 		{-14850,-17918,-3825,4335},{-14659,-18109,-3789,4362},{-14468,-18300,-3753,4389},{-14277,-18491,-3716,4415},
1475 		{-14087,-18681,-3679,4441},{-13897,-18871,-3642,4465},{-13707,-19061,-3604,4490},{-13517,-19251,-3566,4513},
1476 		{-13328,-19440,-3528,4536},{-13139,-19629,-3489,4558},{-12951,-19817,-3450,4580},{-12763,-20005,-3411,4600},
1477 		{-12575,-20193,-3372,4620},{-12388,-20380,-3332,4640},{-12202,-20566,-3292,4658},{-12016,-20752,-3252,4676},
1478 		{-11830,-20938,-3211,4693},{-11645,-21123,-3170,4709},{-11461,-21307,-3129,4725},{-11277,-21491,-3088,4739},
1479 		{-11094,-21674,-3047,4753},{-10911,-21857,-3005,4766},{-10729,-22039,-2964,4778},{-10548,-22220,-2922,4790},
1480 		{-10368,-22400,-2880,4800},{-10188,-22580,-2838,4810},{-10010,-22758,-2796,4818},{-9831,-22937,-2753,4826},
1481 		{-9654,-23114,-2711,4833},{-9478,-23290,-2669,4839},{-9302,-23466,-2626,4844},{-9128,-23640,-2584,4848},
1482 		{-8954,-23814,-2541,4851},{-8781,-23987,-2498,4853},{-8609,-24159,-2456,4854},{-8439,-24329,-2413,4854},
1483 		{-8269,-24499,-2370,4854},{-8100,-24668,-2328,4852},{-7932,-24836,-2285,4849},{-7766,-25002,-2243,4845},
1484 		{-7600,-25168,-2200,4840},{-7436,-25332,-2158,4834},{-7272,-25496,-2115,4827},{-7110,-25658,-2073,4819},
1485 		{-6949,-25819,-2031,4809},{-6790,-25978,-1989,4799},{-6631,-26137,-1947,4787},{-6474,-26294,-1905,4775},
1486 		{-6318,-26450,-1863,4761},{-6163,-26605,-1821,4746},{-6010,-26758,-1780,4730},{-5858,-26910,-1739,4713},
1487 		{-5708,-27060,-1698,4694},{-5559,-27209,-1657,4674},{-5411,-27357,-1616,4654},{-5265,-27503,-1576,4631},
1488 		{-5120,-27648,-1536,4608},{-4977,-27791,-1496,4583},{-4835,-27933,-1457,4557},{-4695,-28073,-1417,4530},
1489 		{-4556,-28212,-1378,4502},{-4419,-28349,-1339,4472},{-4284,-28484,-1301,4441},{-4150,-28618,-1263,4409},
1490 		{-4018,-28750,-1225,4375},{-3888,-28880,-1188,4340},{-3759,-29009,-1150,4304},{-3632,-29136,-1114,4266},
1491 		{-3507,-29261,-1077,4227},{-3383,-29385,-1041,4186},{-3262,-29506,-1006,4144},{-3142,-29626,-971,4101},
1492 		{-3024,-29744,-936,4056},{-2908,-29860,-902,4010},{-2794,-29974,-868,3962},{-2682,-30086,-835,3913},
1493 		{-2571,-30197,-802,3862},{-2463,-30305,-769,3810},{-2357,-30411,-737,3757},{-2252,-30516,-706,3702},
1494 		{-2150,-30618,-675,3645},{-2050,-30718,-645,3587},{-1952,-30816,-615,3527},{-1856,-30912,-586,3466},
1495 		{-1762,-31006,-557,3403},{-1670,-31098,-529,3339},{-1580,-31188,-501,3273},{-1493,-31275,-474,3205},
1496 		{-1408,-31360,-448,3136},{-1325,-31443,-422,3065},{-1245,-31523,-397,2993},{-1166,-31602,-373,2919},
1497 		{-1090,-31678,-349,2843},{-1017,-31751,-326,2765},{-945,-31823,-304,2686},{-876,-31892,-282,2606},
1498 		{-810,-31958,-261,2523},{-746,-32022,-241,2439},{-684,-32084,-221,2353},{-625,-32143,-202,2265},
1499 		{-569,-32199,-184,2176},{-515,-32253,-167,2084},{-463,-32305,-151,1991},{-414,-32354,-135,1897},
1500 		{-368,-32400,-120,1800},{-324,-32444,-106,1702},{-283,-32485,-93,1601},{-245,-32523,-80,1499},
1501 		{-209,-32559,-69,1395},{-176,-32592,-58,1290},{-146,-32622,-48,1182},{-119,-32649,-39,1072},
1502 		{-94,-32674,-31,961},{-72,-32696,-24,848},{-53,-32715,-18,732},{-37,-32731,-12,615},
1503 		{-24,-32744,-8,496},{-13,-32755,-4,375},{-6,-32762,-2,252},{-1,-32767,0,127},
1504 		{0,-32768,0,0}
1505 	};
1506 #endif
1507 
1508 
1509 //----------------- Subroutine: InterpolateFourVectorsAdaptive -----------------
1510 
1511 #ifndef INTERPOLATE_FOUR_VECTORS_ADAPTIVE_DEFINED
1512 #	define INTERPOLATE_FOUR_VECTORS_ADAPTIVE_DEFINED
1513 
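	// Interpolates between the middle two of four neighboring pixel
	// vectors (16-bit lanes). fv1/fv2/fs1/fs2 hold one AdaptiveFactorsTable
	// entry broadcast to all lanes. Per lane this appears to be a
	// slope-limited cubic Hermite interpolation, roughly:
	//
	//   s1     = limited slope around v1 (from v1-v0 and v2-v1)
	//   s2     = limited slope around v2 (from v2-v1 and v3-v2)
	//   v1, v2 = slightly adjusted at steep edges
	//   result = -(v1*h00(o) + v2*h01(o) + s1*h10(o) + s2*h11(o))
	//
	// The "neg" masking only mirrors each lane so that the slope limiting
	// always sees a non-increasing step from v1 to v2; the overall
	// negation stems from the negated table factors and cancels over the
	// vertical and horizontal pass.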
1514 #	if defined(__GNUC__)
1515 		__attribute__((target("avx2")))
1516 #	endif
1517 	static inline __m256i InterpolateFourVectorsAdaptive(
1518 		__m256i v0, __m256i v1, __m256i v2, __m256i v3,
1519 		__m256i fv1, __m256i fv2, __m256i fs1, __m256i fs2
1520 	)
1521 	{
1522 		__m256i neg=_mm256_or_si256(
1523 			_mm256_cmpgt_epi16(v2,v1),
1524 			_mm256_set1_epi16(1)
1525 		);
1526 
1527 		v0=_mm256_sign_epi16(v0,neg);
1528 		v1=_mm256_sign_epi16(v1,neg);
1529 		v2=_mm256_sign_epi16(v2,neg);
1530 		v3=_mm256_sign_epi16(v3,neg);
1531 
1532 		__m256i s01=_mm256_sub_epi16(v1,v0);
1533 		__m256i s12=_mm256_sub_epi16(v2,v1);
1534 		__m256i s21=_mm256_sub_epi16(v1,v2);
1535 		__m256i s23=_mm256_sub_epi16(v3,v2);
1536 
1537 		__m256i s01x2=_mm256_add_epi16(s01,s01);
1538 		__m256i s12x2=_mm256_add_epi16(s12,s12);
1539 		__m256i s23x2=_mm256_add_epi16(s23,s23);
1540 
1541 		__m256i s1=_mm256_min_epi16(
1542 			_mm256_max_epi16(s01x2,s12),
1543 			_mm256_max_epi16(s12x2,s01)
1544 		);
1545 		__m256i s2=_mm256_min_epi16(
1546 			_mm256_max_epi16(s23x2,s12),
1547 			_mm256_max_epi16(s12x2,s23)
1548 		);
1549 
1550 		__m256i q1=_mm256_sub_epi16(s1,s23x2);
1551 		__m256i q2=_mm256_sub_epi16(s2,s01x2);
1552 
1553 		s1=_mm256_add_epi16(
1554 			s1,
1555 			_mm256_min_epi16(
1556 				_mm256_setzero_si256(),
1557 				_mm256_max_epi16(q1,s1)
1558 			)
1559 		);
1560 		s2=_mm256_add_epi16(
1561 			s2,
1562 			_mm256_min_epi16(
1563 				_mm256_setzero_si256(),
1564 				_mm256_max_epi16(q2,s2)
1565 			)
1566 		);
1567 
1568 		s1=_mm256_min_epi16(_mm256_setzero_si256(),s1);
1569 		s2=_mm256_min_epi16(_mm256_setzero_si256(),s2);
1570 
1571 		__m256i s21p7=_mm256_add_epi16(s21,_mm256_set1_epi16(7));
1572 
1573 		v1=_mm256_add_epi16(
1574 			v1,
1575 			_mm256_max_epi16(
1576 				_mm256_setzero_si256(),
1577 				_mm256_min_epi16(
1578 					_mm256_min_epi16(s21,s01),
1579 					_mm256_srai_epi16(_mm256_add_epi16(s01,s21p7),4)
1580 				)
1581 			)
1582 		);
1583 		v2=_mm256_sub_epi16(
1584 			v2,
1585 			_mm256_max_epi16(
1586 				_mm256_setzero_si256(),
1587 				_mm256_min_epi16(
1588 					_mm256_min_epi16(s21,s23),
1589 					_mm256_srai_epi16(_mm256_add_epi16(s23,s21p7),4)
1590 				)
1591 			)
1592 		);
1593 
1594 		__m256i v=_mm256_add_epi16(
1595 			_mm256_add_epi16(
1596 				_mm256_mulhrs_epi16(v1,fv1),
1597 				_mm256_mulhrs_epi16(v2,fv2)
1598 			),
1599 			_mm256_add_epi16(
1600 				_mm256_mulhrs_epi16(s1,fs1),
1601 				_mm256_mulhrs_epi16(s2,fs2)
1602 			)
1603 		);
1604 
1605 		v=_mm256_sign_epi16(v,neg);
1606 		return v;
1607 	}
1608 #endif
1609 
1610 
1611 //---------- emPainter::ScanlineTool::InterpolateImageAvx2Adaptive... ----------
1612 
1613 #if defined(__GNUC__)
1614 	__attribute__((target("avx2")))
1615 #endif
1616 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Adaptive,CONCAT(
1617 	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
1618 	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
1619 )) (const ScanlineTool & sct, int x, int y, int w)
1620 {
1621 	emInt64 ty=y*sct.TDY-sct.TY-0x1800000;
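	// oy: vertical sub-pixel offset of this scanline, rounded to 1/256
	// steps and used as index into AdaptiveFactorsTable (0..256).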
1622 	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
1623 	const AdaptiveFactors & fy=AdaptiveFactorsTable[oy];
1624 
1625 	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
1626 	ssize_t imgSX=sct.ImgSX;
1627 	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1628 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
1629 	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1630 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
1631 	DEFINE_AND_SET_IMAGE_ROW_PTR(row2,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1632 	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
1633 	DEFINE_AND_SET_IMAGE_ROW_PTR(row3,imgY,imgSX,sct.ImgSY,sct.ImgMap)
1634 
1635 	emInt64 tdx=sct.TDX;
1636 	emInt64 tx=x*tdx-sct.TX-0x1800000;
1637 
1638 	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
1639 
1640 	tx=(tx&0xffffff)-0x1000000-tdx;
1641 	int tc=((tx+0x5000000+w*tdx)>>24)*CHANNELS;
1642 
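	// The 16-bit results of the vertical pass go to the (32-byte aligned)
	// end of the shared interpolation buffer; the horizontal pass further
	// below writes its final 8-bit pixels to the start of the same buffer,
	// which is presumably sized so that the two regions never collide.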
1643 	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
1644 	p-=(p-(emByte*)NULL)&31;
1645 	const emInt16 * pvyBeg=(emInt16*)p;
1646 	const emInt16 * pvy=pvyBeg;
1647 	const emInt16 * pvyEnd=pvyBeg+tc;
1648 
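	// Broadcast the four vertical weights fv1, fv2, fs1 and fs2 of the
	// selected table entry to all 16-bit lanes; they stay constant for the
	// whole scanline.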
1649 	__m128i sfy=_mm_loadl_epi64((__m128i*)&fy);
1650 	sfy=_mm_unpacklo_epi16(sfy,sfy);
1651 	__m256i afy=_mm256_broadcastsi128_si256(sfy);
1652 	__m256i fy0=_mm256_shuffle_epi32(afy,0x00);
1653 	__m256i fy1=_mm256_shuffle_epi32(afy,0x55);
1654 	__m256i fy2=_mm256_shuffle_epi32(afy,0xaa);
1655 	__m256i fy3=_mm256_shuffle_epi32(afy,0xff);
1656 
1657 	do {
1658 		__m128i svy0,svy1,svy2,svy3;
1659 		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
1660 			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
1661 			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
1662 			DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
1663 			DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
1664 			svy0=_mm_loadu_si128((__m128i*)p0);
1665 			svy1=_mm_loadu_si128((__m128i*)p1);
1666 			svy2=_mm_loadu_si128((__m128i*)p2);
1667 			svy3=_mm_loadu_si128((__m128i*)p3);
1668 			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
1669 		}
1670 		else {
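			// Slow path, e.g. near the image border or at a tile seam:
			// fetch the pixels one at a time, letting the IMAGE_X macros
			// apply the configured edge extension, and shift them into the
			// vectors from the top.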
1671 			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
1672 				svy0=_mm_srli_si128(svy0,CHANNELS);
1673 				svy1=_mm_srli_si128(svy1,CHANNELS);
1674 				svy2=_mm_srli_si128(svy2,CHANNELS);
1675 				svy3=_mm_srli_si128(svy3,CHANNELS);
1676 				if (i<j) {
1677 					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
1678 					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
1679 					DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
1680 					DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
1681 #					if CHANNELS==1
1682 						svy0=_mm_insert_epi8(svy0,p0[0],15);
1683 						svy1=_mm_insert_epi8(svy1,p1[0],15);
1684 						svy2=_mm_insert_epi8(svy2,p2[0],15);
1685 						svy3=_mm_insert_epi8(svy3,p3[0],15);
1686 #					elif CHANNELS==2
1687 						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
1688 						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
1689 						svy2=_mm_insert_epi16(svy2,((emUInt16*)p2)[0],7);
1690 						svy3=_mm_insert_epi16(svy3,((emUInt16*)p3)[0],7);
1691 #					elif CHANNELS==3
1692 						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
1693 						svy0=_mm_insert_epi8(svy0,p0[2],14);
1694 						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
1695 						svy1=_mm_insert_epi8(svy1,p1[2],14);
1696 						svy2=_mm_insert_epi16(svy2,p2[0]|(p2[1]<<8),6);
1697 						svy2=_mm_insert_epi8(svy2,p2[2],14);
1698 						svy3=_mm_insert_epi16(svy3,p3[0]|(p3[1]<<8),6);
1699 						svy3=_mm_insert_epi8(svy3,p3[2],14);
1700 #					else
1701 						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
1702 						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
1703 						svy2=_mm_insert_epi32(svy2,((emUInt32*)p2)[0],3);
1704 						svy3=_mm_insert_epi32(svy3,((emUInt32*)p3)[0],3);
1705 #					endif
1706 					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
1707 				}
1708 			}
1709 		}
1710 
1711 		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
1712 		__m256i vy1=_mm256_cvtepu8_epi16(svy1);
1713 		__m256i vy2=_mm256_cvtepu8_epi16(svy2);
1714 		__m256i vy3=_mm256_cvtepu8_epi16(svy3);
1715 
1716 		PREMULFIN_SHL_COLOR_VEC16(vy0,5)
1717 		PREMULFIN_SHL_COLOR_VEC16(vy1,5)
1718 		PREMULFIN_SHL_COLOR_VEC16(vy2,5)
1719 		PREMULFIN_SHL_COLOR_VEC16(vy3,5)
1720 
1721 		__m256i vy=InterpolateFourVectorsAdaptive(vy0,vy1,vy2,vy3,fy0,fy1,fy2,fy3);
1722 
1723 		_mm256_storeu_si256((__m256i*)pvy,vy);
1724 		pvy+=(16/CHANNELS)*CHANNELS;
1725 	} while (pvy<pvyEnd);
1726 
1727 	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());
1728 
1729 	pvy=pvyBeg;
1730 	emByte * buf=(emByte*)sct.InterpolationBuffer;
1731 	emByte * bufEnd=buf+w*CHANNELS;
1732 
1733 	// Order of pixels in v02 and v13 with 3-4 / 1-2 channels:
1734 	//   v02:   6 2 4 0   /   14 12 10  2   8  6  4  0
1735 	//   v13:   7 3 5 1   /   15 13 11  3   9  7  5  1
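	// v02 supplies interpolation taps 0 and 2 (low and high 128-bit half),
	// v13 supplies taps 1 and 3; the shifts and blends below keep this
	// sliding window filled while walking along the buffer.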
1736 #	if CHANNELS<=2
1737 #		if CHANNELS==1
1738 			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
1739 #		else
1740 			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
1741 #		endif
1742 		__m256i v02=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,2,0,6,4,0));
1743 		__m256i v13=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,3,0,7,5,1));
1744 		pvy+=CHANNELS*8;
1745 		int vn=5;
1746 #	elif CHANNELS==3
1747 		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
1748 		__m256i v13=_mm256_permutevar8x32_epi32(v02,_mm256_set_epi32(5,4,4,3,7,6,2,1));
1749 		v02=_mm256_blend_epi32(v02,v13,0xfc);
1750 		v13=_mm256_blend_epi32(_mm256_srli_si256(v13,2),_mm256_srli_si256(v13,10),0xf0);
1751 		pvy+=CHANNELS*5;
1752 		int vn=2;
1753 #	else
1754 		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
1755 		__m256i v13=_mm256_srli_si256(v02,8);
1756 		pvy+=CHANNELS*4;
1757 		int vn=1;
1758 #	endif
1759 
1760 	do {
1761 		__m256i v02l,v02h,v13l,v13h,f02l,f02h,f13l,f13h;
1762 		int cx=16/CHANNELS-1;
1763 
1764 		do {
1765 			tx+=tdx;
1766 			if (tx>=0) {
1767 				tx-=0x1000000;
1768 
1769 				__m256i oldV02=v02;
1770 				v02=v13;
1771 #				if CHANNELS<=2
1772 					v13=_mm256_permutevar8x32_epi32(oldV02,_mm256_set_epi32(0,7,6,1,5,3,2,4));
1773 #				else
1774 					v13=_mm256_permute4x64_epi64(oldV02,0x1e);
1775 #				endif
1776 
1777 				vn--;
1778 				if (vn<=0) {
1779 #					if CHANNELS<=2
1780 #						if CHANNELS==1
1781 							__m256i t=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
1782 #						else
1783 							__m256i t=_mm256_loadu_si256((__m256i*)pvy);
1784 #						endif
1785 						__m256i ta=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,7,0,5,3,1,0));
1786 						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,0,0,6,4,2,0));
1787 						v02=_mm256_blend_epi32(v02,ta,0xee);
1788 						v13=_mm256_blend_epi32(v13,tb,0xfe);
1789 						pvy+=CHANNELS*8;
1790 						vn+=8;
1791 #					elif CHANNELS==3
1792 						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
1793 						__m256i ta=_mm256_shuffle_epi8(t,_mm256_set_epi8(
1794 							-1,-1, 7, 6,  5, 4, 3, 2, -1,-1,-1,-1, -1,-1,-1,-1,
1795 							-1,-1,11,10,  9, 8, 7, 6, -1,-1,-1,-1, -1,-1,-1,-1
1796 						));
1797 						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,1,0,4,3,2,1));
1798 						v02=_mm256_blend_epi32(v02,ta,0xcc);
1799 						v13=_mm256_blend_epi32(v13,tb,0xfc);
1800 						pvy+=CHANNELS*5;
1801 						vn+=5;
1802 #					else
1803 						__m256i ta=_mm256_loadu_si256((__m256i*)pvy);
1804 						__m256i tb=_mm256_permute4x64_epi64(ta,0x0a);
1805 						v02=_mm256_blend_epi32(v02,ta,0xcc);
1806 						v13=_mm256_blend_epi32(v13,tb,0xfc);
1807 						pvy+=CHANNELS*4;
1808 						vn+=4;
1809 #					endif
1810 				}
1811 			}
1812 
1813 			emUInt32 ox=(tx+0x1007fff)>>16;
1814 			const AdaptiveFactors & fx=AdaptiveFactorsTable[ox];
1815 
1816 			__m256i f02=_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)&fx));
1817 #			if CHANNELS>=2
1818 				f02=_mm256_shuffle_epi8(f02,_mm256_set_epi8(
1819 					9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0,
1820 					9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0
1821 				));
1822 #			endif
1823 
1824 			__m256i f13=_mm256_srli_si256(f02,8);
1825 
1826 			if (cx==7/CHANNELS) {
1827 #				if CHANNELS==3
1828 					v02l=_mm256_alignr_epi8(v02,v02h,4);
1829 					v13l=_mm256_alignr_epi8(v13,v13h,4);
1830 					f02l=_mm256_alignr_epi8(f02,f02h,4);
1831 					f13l=_mm256_alignr_epi8(f13,f13h,4);
1832 #				else
1833 					v02l=v02h;
1834 					v13l=v13h;
1835 					f02l=f02h;
1836 					f13l=f13h;
1837 #				endif
1838 			}
1839 
1840 			v02h=_mm256_alignr_epi8(v02,v02h,CHANNELS*2);
1841 			v13h=_mm256_alignr_epi8(v13,v13h,CHANNELS*2);
1842 			f02h=_mm256_alignr_epi8(f02,f02h,CHANNELS*2);
1843 			f13h=_mm256_alignr_epi8(f13,f13h,CHANNELS*2);
1844 			cx--;
1845 		} while (cx>=0);
1846 
1847 #		if CHANNELS==3
1848 			v02h=_mm256_srli_si256(v02h,2);
1849 			v13h=_mm256_srli_si256(v13h,2);
1850 			f02h=_mm256_srli_si256(f02h,2);
1851 			f13h=_mm256_srli_si256(f13h,2);
1852 #		endif
1853 
1854 		__m256i vx0=_mm256_permute2x128_si256(v02l,v02h,0x20);
1855 		__m256i vx1=_mm256_permute2x128_si256(v13l,v13h,0x20);
1856 		__m256i vx2=_mm256_permute2x128_si256(v02l,v02h,0x31);
1857 		__m256i vx3=_mm256_permute2x128_si256(v13l,v13h,0x31);
1858 		__m256i fx0=_mm256_permute2x128_si256(f02l,f02h,0x20);
1859 		__m256i fx1=_mm256_permute2x128_si256(f13l,f13h,0x20);
1860 		__m256i fx2=_mm256_permute2x128_si256(f02l,f02h,0x31);
1861 		__m256i fx3=_mm256_permute2x128_si256(f13l,f13h,0x31);
1862 
1863 		__m256i vx=InterpolateFourVectorsAdaptive(vx0,vx1,vx2,vx3,fx0,fx1,fx2,fx3);
1864 
1865 		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
1866 		vx=_mm256_srai_epi16(vx,5);
1867 		vx=_mm256_max_epi16(vx,_mm256_setzero_si256());
1868 		__m128i svx=_mm_packus_epi16(
1869 			_mm256_castsi256_si128(vx),
1870 			_mm256_extracti128_si256(vx,1)
1871 		);
1872 #		if CHANNELS==2 || CHANNELS==4
1873 			svx=_mm_min_epu8(svx,_mm_shuffle_epi8(svx,_mm_set_epi8(
1874 #				if CHANNELS==2
1875 					15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1
1876 #				else
1877 					15,15,15,15,11,11,11,11,7,7,7,7,3,3,3,3
1878 #				endif
1879 			)));
1880 #		endif
1881 
1882 		_mm_storeu_si128((__m128i*)buf,svx);
1883 
1884 		buf+=(16/CHANNELS)*CHANNELS;
1885 	} while (buf<bufEnd);
1886 }
1887 
1888 
1889 //==============================================================================
1890 //======================= Undefine General Helper Macros =======================
1891 //==============================================================================
1892 
1893 #undef DEFINE_AND_SET_IMAGE_Y
1894 #undef DEFINE_AND_COPY_IMAGE_Y
1895 #undef INCREMENT_IMAGE_Y
1896 #undef DEFINE_AND_SET_IMAGE_ROW_PTR
1897 #undef DEFINE_AND_SET_IMAGE_X
1898 #undef INCREMENT_IMAGE_X
1899 #undef ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X
1900 #undef DEFINE_AND_SET_IMAGE_PIX_PTR
1901 #undef PREMULFIN_COLOR_VEC8
1902 #undef PREMULFIN_SHL_COLOR_VEC16
1903 
1904 
1905 #endif
1906