1 //------------------------------------------------------------------------------
2 // emPainter_ScTlIntImg_AVX2.cpp
3 //
4 // Copyright (C) 2020 Oliver Hamann.
5 //
6 // Homepage: http://eaglemode.sourceforge.net/
7 //
8 // This program is free software: you can redistribute it and/or modify it under
9 // the terms of the GNU General Public License version 3 as published by the
10 // Free Software Foundation.
11 //
12 // This program is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
14 // FOR A PARTICULAR PURPOSE. See the GNU General Public License version 3 for
15 // more details.
16 //
17 // You should have received a copy of the GNU General Public License version 3
18 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19 //------------------------------------------------------------------------------
20
21 //------------------------------------------------------------------------------
22 // This cpp file includes itself multiple times in order to expand the
23 // algorithms for emPainter::ScanlineTool::InterpolateImageAvx2..(..) with
24 // different settings. The preprocessor defines for these settings are:
25 // EXTENSION: 0, 1, 2 - One of EXTEND_TILED, EXTEND_EDGE, EXTEND_ZERO
26 // CHANNELS: 1, 2, 3, or 4 - Number of channels in the input map.
27 //------------------------------------------------------------------------------
28
#if !defined(EXTENSION)
//==============================================================================
//===================== Top level include / Set EXTENSION ======================
//==============================================================================

// First self-inclusion level: define the CONCAT machinery and the name
// fragments, then re-include this file once per extension mode. Together
// with the CHANNELS level below, the implementation section is expanded
// 3 x 4 = 12 times, producing distinct methods named like
// InterpolateImageAvx2NearestEtCs3 (Et/Ee/Ez = tiled/edge/zero extension,
// Cs1..Cs4 = channel count).

#include "emPainter_ScTl.h"

#if EM_HAVE_X86_INTRINSICS
#	if defined(_MSC_VER)
#		include <immintrin.h>
#	else
#		include <x86intrin.h>
#	endif
#	define CONCATIMPL(a,b) a##b
#	define CONCAT(a,b) CONCATIMPL(a,b)
#	define EXTEND_TILED 0
#	define EXTEND_EDGE 1
#	define EXTEND_ZERO 2
#	define METHOD_NAME_EXTENSION_0 Et
#	define METHOD_NAME_EXTENSION_1 Ee
#	define METHOD_NAME_EXTENSION_2 Ez
#	define METHOD_NAME_CHANNELS_1 Cs1
#	define METHOD_NAME_CHANNELS_2 Cs2
#	define METHOD_NAME_CHANNELS_3 Cs3
#	define METHOD_NAME_CHANNELS_4 Cs4

#	define EXTENSION EXTEND_TILED
#	include "emPainter_ScTlIntImg_AVX2.cpp"
#	undef EXTENSION

#	define EXTENSION EXTEND_EDGE
#	include "emPainter_ScTlIntImg_AVX2.cpp"
#	undef EXTENSION

#	define EXTENSION EXTEND_ZERO
#	include "emPainter_ScTlIntImg_AVX2.cpp"
#	undef EXTENSION

#endif

#elif !defined(CHANNELS)
//==============================================================================
//================================ Set CHANNELS ================================
//==============================================================================

// Second self-inclusion level: expand the implementation section once for
// each supported number of channels per pixel.

#define CHANNELS 1
#include "emPainter_ScTlIntImg_AVX2.cpp"
#undef CHANNELS

#define CHANNELS 2
#include "emPainter_ScTlIntImg_AVX2.cpp"
#undef CHANNELS

#define CHANNELS 3
#include "emPainter_ScTlIntImg_AVX2.cpp"
#undef CHANNELS

#define CHANNELS 4
#include "emPainter_ScTlIntImg_AVX2.cpp"
#undef CHANNELS


#else
92 //==============================================================================
93 //======================== Define General Helper Macros ========================
94 //==============================================================================
95
// DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY)
// Define Y as the byte offset of source row Y_IN (DY = bytes per row,
// SY = total map size in bytes), applying the extension mode:
//   tiled - wrap the offset into [0,SY)
//   edge  - keep the raw offset in Y and a clamped copy in Y##Clipped
//   zero  - keep the raw (possibly out-of-range) offset; handled later by
//           DEFINE_AND_SET_IMAGE_ROW_PTR / DEFINE_AND_SET_IMAGE_PIX_PTR.
// (Comments are kept outside the macro bodies: a // comment on a
// backslash-continued line would swallow the continuation.)
#if EXTENSION==EXTEND_TILED
#	define DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY) \
		ssize_t Y=((Y_IN)*DY)%SY; \
		if (Y<0) Y+=SY;
#elif EXTENSION==EXTEND_EDGE
#	define DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY) \
		ssize_t Y=(Y_IN)*DY; \
		ssize_t Y##Clipped=Y; \
		if ((size_t)Y##Clipped>=(size_t)SY) { \
			if (Y##Clipped<0) Y##Clipped=0; \
			else Y##Clipped=SY-DY; \
		}
#else
#	define DEFINE_AND_SET_IMAGE_Y(Y,Y_IN,DY,SY) \
		ssize_t Y=(Y_IN)*DY;
#endif


// DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC)
// Define Y as a copy of a row offset created by DEFINE_AND_SET_IMAGE_Y,
// including the clamped companion variable in edge mode.
#if EXTENSION==EXTEND_TILED
#	define DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC) \
		ssize_t Y=Y_SRC;
#elif EXTENSION==EXTEND_EDGE
#	define DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC) \
		ssize_t Y=Y_SRC; \
		ssize_t Y##Clipped=Y_SRC##Clipped;
#else
#	define DEFINE_AND_COPY_IMAGE_Y(Y,Y_SRC) \
		ssize_t Y=Y_SRC;
#endif


// INCREMENT_IMAGE_Y(Y,DY,SY)
// Advance Y by one row (DY bytes), re-applying the extension mode:
// tiled wraps back to 0 at the end of the map (Y never becomes negative
// here because it starts wrapped), edge re-clamps Y##Clipped, zero just
// adds.
#if EXTENSION==EXTEND_TILED
#	define INCREMENT_IMAGE_Y(Y,DY,SY) \
		Y+=DY; \
		if (Y>=SY) Y=0;
#elif EXTENSION==EXTEND_EDGE
#	define INCREMENT_IMAGE_Y(Y,DY,SY) \
		Y+=DY; \
		Y##Clipped=Y; \
		if ((size_t)Y##Clipped>=(size_t)SY) { \
			if (Y##Clipped<0) Y##Clipped=0; \
			else Y##Clipped=SY-DY; \
		}
#else
#	define INCREMENT_IMAGE_Y(Y,DY,SY) \
		Y+=DY;
#endif


// DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP)
// Define ROW_PTR as a pointer to the source row at byte offset Y in MAP.
// Edge mode uses the clamped offset; zero mode additionally records the
// usable row extent in ROW_PTR##UsedSX (0 when the row lies outside the
// image), which DEFINE_AND_SET_IMAGE_PIX_PTR tests per pixel.
#if EXTENSION==EXTEND_TILED
#	define DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP) \
		const emByte * ROW_PTR=MAP+Y;
#elif EXTENSION==EXTEND_EDGE
#	define DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP) \
		const emByte * ROW_PTR=MAP+Y##Clipped;
#else
#	define DEFINE_AND_SET_IMAGE_ROW_PTR(ROW_PTR,Y,SX,SY,MAP) \
		const emByte * ROW_PTR=MAP+Y; \
		int ROW_PTR##UsedSX=SX; \
		if ((size_t)Y>=(size_t)SY) ROW_PTR##UsedSX=0;
#endif
161
162
// DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX)
// Define X as the byte offset of source pixel X_IN within a row
// (DX = bytes per pixel, i.e. CHANNELS at the call sites; SX = row extent
// in bytes), applying the extension mode exactly like the Y macros above.
#if EXTENSION==EXTEND_TILED
#	define DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX) \
		ssize_t X=((X_IN)*DX)%SX; \
		if (X<0) X+=SX;
#elif EXTENSION==EXTEND_EDGE
#	define DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX) \
		ssize_t X=(X_IN)*DX; \
		ssize_t X##Clipped=X; \
		if ((size_t)X##Clipped>=(size_t)SX) { \
			if (X##Clipped<0) X##Clipped=0; \
			else X##Clipped=SX-DX; \
		}
#else
#	define DEFINE_AND_SET_IMAGE_X(X,X_IN,DX,SX) \
		ssize_t X=(X_IN)*DX;
#endif


// INCREMENT_IMAGE_X(X,DX,SX)
// Advance X by DX bytes, re-applying the extension mode. Note DX may be a
// whole vector's worth of pixels ((16/CHANNELS)*CHANNELS bytes) on the
// fast paths below, not just one pixel.
#if EXTENSION==EXTEND_TILED
#	define INCREMENT_IMAGE_X(X,DX,SX) \
		X+=DX; \
		if (X>=SX) X=0;
#elif EXTENSION==EXTEND_EDGE
#	define INCREMENT_IMAGE_X(X,DX,SX) \
		X+=DX; \
		X##Clipped=X; \
		if ((size_t)X##Clipped>=(size_t)SX) { \
			if (X##Clipped<0) X##Clipped=0; \
			else X##Clipped=SX-DX; \
		}
#else
#	define INCREMENT_IMAGE_X(X,DX,SX) \
		X+=DX;
#endif


// ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX)
// Fast-path test: true when a full 16-byte unaligned load at row offset X
// stays inside the row, so no per-pixel clipping/wrapping is needed.
// (In tiled mode X is always >=0, so only the upper bound is checked.)
#if EXTENSION==EXTEND_TILED
#	define ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX) \
		(X+16<=SX)
#elif EXTENSION==EXTEND_EDGE
#	define ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX) \
		(X>=0 && X+16<=SX)
#else
#	define ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(X,SX) \
		(X>=0 && X+16<=SX)
		// No need to test ROW_PTR##UsedSX, because the ZeroPixels
		// array has been made large enough.
#endif


// DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X)
// Define PIX_PTR as a pointer to the pixel at row offset X. Edge mode
// dereferences the clamped offset; zero mode redirects out-of-range
// accesses to the static 32-byte block of zeros below, so reads of up to
// 4 bytes (and the 16-byte loads guarded above) are always safe.
#ifndef ZERO_PIXELS_DEFINED
#	define ZERO_PIXELS_DEFINED
	struct alignas(32) ZeroPixelsStruct {
		emByte data[32];
	};
	static const ZeroPixelsStruct ZeroPixels = {0};
#endif
#if EXTENSION==EXTEND_TILED
#	define DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X) \
		const emByte * PIX_PTR=ROW_PTR+X;
#elif EXTENSION==EXTEND_EDGE
#	define DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X) \
		const emByte * PIX_PTR=ROW_PTR+X##Clipped;
#else
#	define DEFINE_AND_SET_IMAGE_PIX_PTR(PIX_PTR,ROW_PTR,X) \
		const emByte * PIX_PTR=ROW_PTR+X; \
		if ((size_t)X>=(size_t)ROW_PTR##UsedSX) PIX_PTR=ZeroPixels.data;
#endif
235
236
// PREMULFIN_COLOR_VEC8(C)
// Finish alpha premultiplication on a 128-bit vector of 8-bit pixels:
// each color channel is multiplied by the pixel's alpha (its last channel)
// and divided by 255 with rounding (the "+0x80, add high byte, >>8" idiom).
// The alpha lane itself is OR-ed with 255 before the multiply, so after
// the divide it comes out unchanged. For 1 and 3 channels there is no
// alpha and the macro expands to nothing. The work is done in 16-bit lanes
// (cvtepu8_epi16 widens, packus narrows back at the end).
#if CHANNELS==1 || CHANNELS==3
#	define PREMULFIN_COLOR_VEC8(C)
#elif CHANNELS==2
#	define PREMULFIN_COLOR_VEC8(C) { \
		__m256i c=_mm256_cvtepu8_epi16(C); \
		__m256i a=_mm256_shuffle_epi8(c,_mm256_set_epi8( \
			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2, \
			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2 \
		)); \
		c=_mm256_or_si256(c,_mm256_set1_epi32(0x00ff0000)); \
		c=_mm256_mullo_epi16(c,a); \
		c=_mm256_add_epi16(c,_mm256_set1_epi16(0x80)); \
		c=_mm256_add_epi16(c,_mm256_srli_epi16(c,8)); \
		c=_mm256_srli_epi16(c,8); \
		C=_mm_packus_epi16(_mm256_castsi256_si128(c),_mm256_extracti128_si256(c,1)); \
	}
#else
#	define PREMULFIN_COLOR_VEC8(C) { \
		__m256i c=_mm256_cvtepu8_epi16(C); \
		__m256i a=_mm256_shuffle_epi8(c,_mm256_set_epi8( \
			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6, \
			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6 \
		)); \
		c=_mm256_or_si256(c,_mm256_set_epi16(255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0)); \
		c=_mm256_mullo_epi16(c,a); \
		c=_mm256_add_epi16(c,_mm256_set1_epi16(0x80)); \
		c=_mm256_add_epi16(c,_mm256_srli_epi16(c,8)); \
		c=_mm256_srli_epi16(c,8); \
		C=_mm_packus_epi16(_mm256_castsi256_si128(c),_mm256_extracti128_si256(c,1)); \
	}
#endif


// PREMULFIN_SHL_COLOR_VEC16(C,S)
// Same premultiplication as PREMULFIN_COLOR_VEC8, but operating directly
// on a 256-bit vector of 16-bit components, followed by a left shift by S
// (used to scale pixel values up for the fixed-point mulhrs blending in
// the interpolation functions). For 1 and 3 channels only the shift
// remains.
#if CHANNELS==1 || CHANNELS==3
#	define PREMULFIN_SHL_COLOR_VEC16(C,S) { \
		C=_mm256_slli_epi16(C,S); \
	}
#elif CHANNELS==2
#	define PREMULFIN_SHL_COLOR_VEC16(C,S) { \
		__m256i a=_mm256_shuffle_epi8(C,_mm256_set_epi8( \
			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2, \
			15,14,15,14, 11,10,11,10,  7, 6, 7, 6,  3, 2, 3, 2 \
		)); \
		C=_mm256_or_si256(C,_mm256_set1_epi32(0x00ff0000)); \
		C=_mm256_mullo_epi16(C,a); \
		C=_mm256_add_epi16(C,_mm256_set1_epi16(0x80)); \
		C=_mm256_add_epi16(C,_mm256_srli_epi16(C,8)); \
		C=_mm256_srli_epi16(C,8); \
		C=_mm256_slli_epi16(C,S); \
	}
#else
#	define PREMULFIN_SHL_COLOR_VEC16(C,S) { \
		__m256i a=_mm256_shuffle_epi8(C,_mm256_set_epi8( \
			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6, \
			15,14,15,14, 15,14,15,14,  7, 6, 7, 6,  7, 6, 7, 6 \
		)); \
		C=_mm256_or_si256(C,_mm256_set_epi16(255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0)); \
		C=_mm256_mullo_epi16(C,a); \
		C=_mm256_add_epi16(C,_mm256_set1_epi16(0x80)); \
		C=_mm256_add_epi16(C,_mm256_srli_epi16(C,8)); \
		C=_mm256_srli_epi16(C,8); \
		C=_mm256_slli_epi16(C,S); \
	}
#endif
303
304
305 //==============================================================================
306 //========== emPainter::ScanlineTool::InterpolateImageAvx2Nearest... ===========
307 //==============================================================================
308
309 #if defined(__GNUC__)
310 __attribute__((target("avx2")))
311 #endif
CONCAT(InterpolateImageAvx2Nearest,CONCAT (CONCAT (METHOD_NAME_EXTENSION_,EXTENSION),CONCAT (METHOD_NAME_CHANNELS_,CHANNELS)))312 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Nearest,CONCAT(
313 CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
314 CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
315 )) (const ScanlineTool & sct, int x, int y, int w)
316 {
317 emInt64 ty=y*sct.TDY-sct.TY;
318 DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
319 ssize_t imgSX=sct.ImgSX;
320 DEFINE_AND_SET_IMAGE_ROW_PTR(row,imgY,imgSX,sct.ImgSY,sct.ImgMap)
321
322 emByte * buf=(emByte*)sct.InterpolationBuffer;
323 emByte * bufEnd=buf+w*CHANNELS;
324 emInt64 tdx=sct.TDX;
325 emInt64 tx=x*tdx-sct.TX;
326
327 int sw=((tx&0xffffff)+0x1000000+(w-1)*tdx)>>24;
328 if (w==sw) {
329 DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
330 do {
331 __m128i v;
332 if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
333 DEFINE_AND_SET_IMAGE_PIX_PTR(p,row,imgX)
334 v=_mm_loadu_si128((__m128i*)p);
335 INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
336 }
337 else {
338 for (int i=0, j=bufEnd-buf; i<=16-CHANNELS; i+=CHANNELS) {
339 v=_mm_srli_si128(v,CHANNELS);
340 if (i<j) {
341 DEFINE_AND_SET_IMAGE_PIX_PTR(p,row,imgX)
342 # if CHANNELS==1
343 v=_mm_insert_epi8(v,p[0],15);
344 # elif CHANNELS==2
345 v=_mm_insert_epi16(v,((emUInt16*)p)[0],7);
346 # elif CHANNELS==3
347 v=_mm_insert_epi16(v,p[0]|(p[1]<<8),6);
348 v=_mm_insert_epi8(v,p[2],14);
349 # else
350 v=_mm_insert_epi32(v,((emUInt32*)p)[0],3);
351 # endif
352 INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
353 }
354 }
355 }
356 PREMULFIN_COLOR_VEC8(v);
357 _mm_storeu_si128((__m128i*)buf,v);
358 buf+=(16/CHANNELS)*CHANNELS;
359 } while (buf<bufEnd);
360 }
361 else {
362 do {
363 __m128i v;
364 for (int i=0, j=bufEnd-buf; i<=16-CHANNELS; i+=CHANNELS) {
365 v=_mm_srli_si128(v,CHANNELS);
366 if (i<j) {
367 DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
368 DEFINE_AND_SET_IMAGE_PIX_PTR(p,row,imgX)
369 # if CHANNELS==1
370 v=_mm_insert_epi8(v,p[0],15);
371 # elif CHANNELS==2
372 v=_mm_insert_epi16(v,((emUInt16*)p)[0],7);
373 # elif CHANNELS==3
374 v=_mm_insert_epi16(v,p[0]|(p[1]<<8),6);
375 v=_mm_insert_epi8(v,p[2],14);
376 # else
377 v=_mm_insert_epi32(v,((emUInt32*)p)[0],3);
378 # endif
379 tx+=tdx;
380 }
381 }
382 PREMULFIN_COLOR_VEC8(v);
383 _mm_storeu_si128((__m128i*)buf,v);
384 buf+=(16/CHANNELS)*CHANNELS;
385 } while (buf<bufEnd);
386 }
387 }
388
389
390 //==============================================================================
391 //========== emPainter::ScanlineTool::InterpolateImageAvx2Bilinear... ==========
392 //==============================================================================
393
#if defined(__GNUC__)
__attribute__((target("avx2")))
#endif
// Bilinear sampling of one scanline: writes w premultiplied output pixels
// to sct.InterpolationBuffer. Texture coordinates are fixed point with a
// 24-bit fraction; the -0x800000 bias centers each sample between its two
// neighboring source rows/columns. Works in two passes: (1) blend the two
// source rows vertically into a 16-bit intermediate buffer, (2) blend
// horizontally per output pixel and pack back to 8 bit.
void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Bilinear,CONCAT(
	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
)) (const ScanlineTool & sct, int x, int y, int w)
{
	emInt64 ty=y*sct.TDY-sct.TY-0x800000;
	// Vertical blend factor in 0..256, rounded from the 24-bit fraction.
	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;

	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
	ssize_t imgSX=sct.ImgSX;
	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)

	emInt64 tdx=sct.TDX;
	emInt64 tx=x*tdx-sct.TX-0x800000;

	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)

	// tc: number of 16-bit components the vertical pass must produce
	// (source pixels covered plus margin, times CHANNELS).
	tx=(tx&0xffffff)-0x1000000-tdx;
	int tc=((tx+0x3000000+w*tdx)>>24)*CHANNELS;

	// Place the intermediate row at the tail of the interpolation buffer,
	// 32-byte aligned, so pass 2 can write 8-bit output at the buffer's
	// start without overtaking its own input.
	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
	p-=(p-(emByte*)NULL)&31;
	const emInt16 * pvyBeg=(emInt16*)p;
	const emInt16 * pvy=pvyBeg;
	const emInt16 * pvyEnd=pvyBeg+tc;

	// Vertical weights as 2.14 fixed point (fy0+fy1 == 16384) for mulhrs.
	__m256i fy1=_mm256_set1_epi16(oy<<6);
	__m256i fy0=_mm256_sub_epi16(_mm256_set1_epi16(16384),fy1);

	// ---- Pass 1: vertical blend of row0/row1 into the 16-bit buffer ----
	do {
		// NOTE(review): svy0/svy1 are shifted before their first assignment
		// on the border path below; the stale lanes are masked out of the
		// meaningful output, but formally this is an uninitialized read.
		__m128i svy0,svy1;
		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
			// Fast path: 16 consecutive bytes from both rows at once.
			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
			svy0=_mm_loadu_si128((__m128i*)p0);
			svy1=_mm_loadu_si128((__m128i*)p1);
			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
		}
		else {
			// Border path: gather pixel by pixel, shifting new pixels in
			// from the top of the vectors.
			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
				svy0=_mm_srli_si128(svy0,CHANNELS);
				svy1=_mm_srli_si128(svy1,CHANNELS);
				if (i<j) {
					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
#					if CHANNELS==1
						svy0=_mm_insert_epi8(svy0,p0[0],15);
						svy1=_mm_insert_epi8(svy1,p1[0],15);
#					elif CHANNELS==2
						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
#					elif CHANNELS==3
						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
						svy0=_mm_insert_epi8(svy0,p0[2],14);
						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
						svy1=_mm_insert_epi8(svy1,p1[2],14);
#					else
						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
#					endif
					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
				}
			}
		}

		// Widen to 16 bit, premultiply alpha and scale up by 2^7 so the
		// mulhrs blends below work in a high-precision range.
		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
		__m256i vy1=_mm256_cvtepu8_epi16(svy1);

		PREMULFIN_SHL_COLOR_VEC16(vy0,7)
		PREMULFIN_SHL_COLOR_VEC16(vy1,7)

		// vy = vy0*fy0 + vy1*fy1 in 2.14 fixed point; since the weights
		// sum to 16384, the result's scale is half the inputs'.
		__m256i vy=_mm256_add_epi16(
			_mm256_mulhrs_epi16(vy0,fy0),
			_mm256_mulhrs_epi16(vy1,fy1)
		);

		_mm256_storeu_si256((__m256i*)pvy,vy);
		pvy+=(16/CHANNELS)*CHANNELS;
	} while (pvy<pvyEnd);

	// Zero padding so pass 2 may read one vector past pvyEnd.
	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());

	pvy=pvyBeg;
	emByte * buf=(emByte*)sct.InterpolationBuffer;
	emByte * bufEnd=buf+w*CHANNELS;

	// ---- Pass 2: horizontal blend ----
	// v01 holds the current left/right source-pixel pairs, vff the
	// look-ahead pixels; vn counts pixels remaining in vff before a refill
	// from the 16-bit buffer is needed.
	// Order of pixels in v01 and vff with 3-4 / 1-2 channels:
	//   v01:  2 1  1 0  /  4 3 2 1  3 2 1 0
	//   vff:  4 3  5 2  /  8 7 6 5  b a 9 4
#	if CHANNELS<=2
#		if CHANNELS==1
			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
#		else
			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
#		endif
		__m256i v01=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(4,3,2,1,3,2,1,0));
		__m256i vff=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,7,6,5,0,0,0,4));
		pvy+=CHANNELS*8;
		int vn=7;
#	elif CHANNELS==3
		__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
		__m256i vta=_mm256_shuffle_epi8(vt,_mm256_set_epi8(
			-1,-1, 1, 0, -1,-1,-1,-1, -1,-1, 7, 6, 5, 4, 3, 2,
			-1,-1,11,10, 9, 8, 7, 6, -1,-1,-1,-1, 15,14,13,12
		));
		__m256i vtb=_mm256_permutevar8x32_epi32(vta,_mm256_set_epi32(7,0,3,2,3,2,7,0));
		vta=_mm256_blend_epi32(vt,vta,0x3f);
		__m256i v01=_mm256_blend_epi32(vt,vtb,0xfc);
		__m256i vff=_mm256_blend_epi32(vta,vtb,0x0f);
		pvy+=CHANNELS*5;
		int vn=4;
#	else
		__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
		__m256i v01=_mm256_permute4x64_epi64(vt,0x94);
		__m256i vff=_mm256_permute4x64_epi64(vt,0x36);
		pvy+=CHANNELS*4;
		int vn=3;
#	endif

	do {
		__m256i v01l,v01h,f1l,f1h;
		int cx=16/CHANNELS-1;

		// Collect 16/CHANNELS output pixels (and their per-pixel blend
		// factors) into v01l/v01h and f1l/f1h before doing the arithmetic
		// for all of them at once.
		do {
			tx+=tdx;
			if (tx>=0) {
				// Crossed a source-pixel boundary: advance v01/vff by one
				// source pixel.
				tx-=0x1000000;

#				if CHANNELS<=2
					v01=_mm256_alignr_epi8(vff,v01,4);
					vff=_mm256_permutevar8x32_epi32(vff,_mm256_set_epi32(1,7,6,5,0,3,2,4));
#				else
					v01=_mm256_alignr_epi8(vff,v01,8);
					vff=_mm256_permute4x64_epi64(vff,0x72);
#				endif

				vn--;
				if (vn<=0) {
					// Look-ahead exhausted: refill vff (and the newest v01
					// slot) from the 16-bit intermediate buffer.
#					if CHANNELS<=2
#						if CHANNELS==1
							vff=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
#						else
							vff=_mm256_loadu_si256((__m256i*)pvy);
#						endif
						__m256i t=_mm256_permutevar8x32_epi32(vff,_mm256_set_epi32(3,2,1,0,2,1,0,3));
						v01=_mm256_blend_epi32(v01,t,0xfe);
						vff=_mm256_blend_epi32(vff,t,0x0f);
						pvy+=CHANNELS*8;
						vn+=8;
#					elif CHANNELS==3
						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
						__m256i t1=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,4,3,7,6,1,0));
						__m256i t2=_mm256_shuffle_epi8(t,_mm256_set_epi8(
							-1,-1, 7, 6, 5, 4, 3, 2, -1,-1, 1, 0, -1,-1,-1,-1,
							-1,-1, 5, 4, 3, 2, 1, 0, -1,-1,11,10, 9, 8, 7, 6
						));
						vff=_mm256_blend_epi32(t1,t2,0xc3);
						__m256i t3=_mm256_permute4x64_epi64(t2,0x14);
						v01=_mm256_blend_epi32(v01,t3,0xfc);
						pvy+=CHANNELS*5;
						vn+=5;
#					else
						vff=_mm256_loadu_si256((__m256i*)pvy);
						__m256i t=_mm256_permute4x64_epi64(vff,0x41);
						v01=_mm256_blend_epi32(v01,t,0xfc);
						vff=_mm256_blend_epi32(vff,t,0x0f);
						pvy+=CHANNELS*4;
						vn+=4;
#					endif
				}
			}

			// Horizontal blend factor in 0..256 for this output pixel.
			emUInt32 ox=(tx+0x1007fff)>>16;

			__m256i f1=_mm256_castsi128_si256(_mm_set1_epi16(ox));

			// Halfway through, save the pixels accumulated so far into the
			// "low" registers to make room in the "high" ones.
			if (cx==7/CHANNELS) {
#				if CHANNELS==3
					v01l=_mm256_alignr_epi8(v01,v01h,4);
					f1l=_mm256_alignr_epi8(f1,f1h,4);
#				else
					v01l=v01h;
					f1l=f1h;
#				endif
			}

			// Shift the current pixel pair and factor into the accumulators.
			v01h=_mm256_alignr_epi8(v01,v01h,CHANNELS*2);
			f1h=_mm256_alignr_epi8(f1,f1h,CHANNELS*2);
			cx--;
		} while (cx>=0);

#		if CHANNELS==3
			v01h=_mm256_srli_si256(v01h,2);
			f1h=_mm256_srli_si256(f1h,2);
#		endif

		// Split the pairs into left (vx0) and right (vx1) neighbors and
		// blend: vx = vx0*(16384-f) + vx1*f in 2.14 fixed point.
		__m256i vx0=_mm256_permute2x128_si256(v01l,v01h,0x20);
		__m256i vx1=_mm256_permute2x128_si256(v01l,v01h,0x31);
		__m256i fx1=_mm256_permute2x128_si256(f1l,f1h,0x20);

		fx1=_mm256_slli_epi16(fx1,6);
		__m256i fx0=_mm256_sub_epi16(_mm256_set1_epi16(16384),fx1);

		__m256i vx=_mm256_add_epi16(
			_mm256_mulhrs_epi16(vx0,fx0),
			_mm256_mulhrs_epi16(vx1,fx1)
		);

		// Round and scale back to 8 bit: the input was scaled by 2^7 and
		// each of the two mulhrs blends halved the scale, leaving 2^5.
		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
		vx=_mm256_srai_epi16(vx,5);
		__m128i svx=_mm_packus_epi16(
			_mm256_castsi256_si128(vx),
			_mm256_extracti128_si256(vx,1)
		);

		_mm_storeu_si128((__m128i*)buf,svx);

		buf+=(16/CHANNELS)*CHANNELS;
	} while (buf<bufEnd);
}
619
620
621 //==============================================================================
622 //========== emPainter::ScanlineTool::InterpolateImageAvx2Bicubic... ===========
623 //==============================================================================
624
#ifndef BICUBIC_FACTORS_TABLE_DEFINED
#	define BICUBIC_FACTORS_TABLE_DEFINED
	// Four bicubic filter weights (Catmull-Rom spline, a = -0.5, per the
	// generator formulas below) for each fractional position o=i/256,
	// scaled to 2.14 fixed point (the weights sum to ~16384) for use with
	// _mm256_mulhrs_epi16. 257 entries so that o==1.0 is addressable.
	struct alignas(8) BicubicFactors {
		emInt16 f0;
		emInt16 f1;
		emInt16 f2;
		emInt16 f3;
	};
	static const BicubicFactors BicubicFactorsTable[257] = {
		// Generated by this program:
		// #include <stdio.h>
		// #include <math.h>
		// int main(int argc, char * argv[])
		// {
		//   for (int i=0; i<=256; i++) {
		//     double o=i/256.0;
		//     double s=1.0-o;
		//     double f=16384;
		//     int f0=(int)round((-0.5*s*o)*s*f);
		//     int f1=(int)round(((1 - 1.5*o)*o + 1)*s*f);
		//     int f2=(int)round(((1 - 1.5*s)*s + 1)*o*f);
		//     int f3=(int)round((-0.5*s*o)*o*f);
		//     printf("%s{%d,%d,%d,%d},",i%4?"":"\n",f0,f1,f2,f3);
		//   }
		//   return 0;
		// }
		{0,16384,0,0},{-32,16383,32,0},{-63,16382,66,0},{-94,16378,100,-1},
		{-124,16374,136,-2},{-154,16369,172,-3},{-183,16362,210,-4},{-212,16354,248,-6},
		{-240,16345,287,-8},{-268,16334,327,-10},{-295,16323,369,-12},{-322,16310,411,-14},
		{-349,16297,453,-17},{-375,16282,497,-20},{-400,16266,542,-23},{-425,16248,588,-26},
		{-450,16230,634,-30},{-474,16211,681,-34},{-498,16190,729,-38},{-521,16168,778,-42},
		{-544,16146,828,-46},{-566,16122,879,-51},{-588,16097,930,-55},{-610,16071,983,-60},
		{-631,16044,1036,-65},{-651,16016,1090,-70},{-672,15987,1144,-76},{-691,15957,1200,-82},
		{-711,15926,1256,-87},{-730,15894,1313,-93},{-748,15861,1370,-99},{-766,15827,1429,-106},
		{-784,15792,1488,-112},{-801,15756,1548,-119},{-818,15719,1608,-125},{-835,15681,1670,-132},
		{-851,15642,1732,-139},{-866,15603,1794,-146},{-882,15562,1858,-154},{-897,15520,1922,-161},
		{-911,15478,1986,-169},{-925,15434,2052,-176},{-939,15390,2117,-184},{-953,15345,2184,-192},
		{-966,15299,2251,-200},{-978,15252,2319,-209},{-991,15204,2387,-217},{-1002,15155,2456,-225},
		{-1014,15106,2526,-234},{-1025,15056,2596,-243},{-1036,15005,2667,-251},{-1047,14953,2738,-260},
		{-1057,14900,2810,-269},{-1066,14846,2882,-278},{-1076,14792,2955,-288},{-1085,14737,3029,-297},
		{-1094,14681,3103,-306},{-1102,14625,3177,-316},{-1110,14567,3252,-325},{-1118,14509,3328,-335},
		{-1125,14450,3404,-345},{-1133,14391,3480,-354},{-1139,14331,3557,-364},{-1146,14270,3634,-374},
		{-1152,14208,3712,-384},{-1158,14146,3790,-394},{-1163,14083,3869,-404},{-1169,14019,3948,-414},
		{-1174,13955,4027,-424},{-1178,13890,4107,-435},{-1182,13824,4188,-445},{-1187,13758,4268,-455},
		{-1190,13691,4349,-466},{-1194,13623,4431,-476},{-1197,13555,4512,-487},{-1200,13486,4595,-497},
		{-1202,13417,4677,-508},{-1205,13347,4760,-518},{-1207,13277,4843,-529},{-1208,13206,4926,-539},
		{-1210,13134,5010,-550},{-1211,13062,5094,-561},{-1212,12989,5178,-571},{-1213,12916,5263,-582},
		{-1213,12842,5348,-593},{-1214,12768,5433,-603},{-1214,12693,5518,-614},{-1213,12618,5604,-625},
		{-1213,12542,5690,-635},{-1212,12466,5776,-646},{-1211,12389,5862,-657},{-1210,12312,5949,-667},
		{-1208,12235,6035,-678},{-1207,12157,6122,-688},{-1205,12078,6209,-699},{-1202,11999,6297,-709},
		{-1200,11920,6384,-720},{-1197,11840,6472,-730},{-1195,11760,6559,-741},{-1192,11680,6647,-751},
		{-1188,11599,6735,-762},{-1185,11518,6823,-772},{-1181,11436,6911,-782},{-1177,11354,7000,-793},
		{-1173,11272,7088,-803},{-1169,11189,7177,-813},{-1165,11106,7265,-823},{-1160,11023,7354,-833},
		{-1155,10939,7443,-843},{-1150,10855,7531,-853},{-1145,10771,7620,-863},{-1140,10687,7709,-872},
		{-1134,10602,7798,-882},{-1128,10517,7887,-892},{-1122,10432,7976,-901},{-1116,10346,8065,-911},
		{-1110,10260,8154,-920},{-1104,10174,8242,-929},{-1097,10088,8331,-938},{-1091,10002,8420,-947},
		{-1084,9915,8509,-956},{-1077,9828,8597,-965},{-1070,9741,8686,-974},{-1062,9654,8775,-982},
		{-1055,9567,8863,-991},{-1047,9479,8951,-999},{-1040,9392,9040,-1008},{-1032,9304,9128,-1016},
		{-1024,9216,9216,-1024},{-1016,9128,9304,-1032},{-1008,9040,9392,-1040},{-999,8951,9479,-1047},
		{-991,8863,9567,-1055},{-982,8775,9654,-1062},{-974,8686,9741,-1070},{-965,8597,9828,-1077},
		{-956,8509,9915,-1084},{-947,8420,10002,-1091},{-938,8331,10088,-1097},{-929,8242,10174,-1104},
		{-920,8154,10260,-1110},{-911,8065,10346,-1116},{-901,7976,10432,-1122},{-892,7887,10517,-1128},
		{-882,7798,10602,-1134},{-872,7709,10687,-1140},{-863,7620,10771,-1145},{-853,7531,10855,-1150},
		{-843,7443,10939,-1155},{-833,7354,11023,-1160},{-823,7265,11106,-1165},{-813,7177,11189,-1169},
		{-803,7088,11272,-1173},{-793,7000,11354,-1177},{-782,6911,11436,-1181},{-772,6823,11518,-1185},
		{-762,6735,11599,-1188},{-751,6647,11680,-1192},{-741,6559,11760,-1195},{-730,6472,11840,-1197},
		{-720,6384,11920,-1200},{-709,6297,11999,-1202},{-699,6209,12078,-1205},{-688,6122,12157,-1207},
		{-678,6035,12235,-1208},{-667,5949,12312,-1210},{-657,5862,12389,-1211},{-646,5776,12466,-1212},
		{-635,5690,12542,-1213},{-625,5604,12618,-1213},{-614,5518,12693,-1214},{-603,5433,12768,-1214},
		{-593,5348,12842,-1213},{-582,5263,12916,-1213},{-571,5178,12989,-1212},{-561,5094,13062,-1211},
		{-550,5010,13134,-1210},{-539,4926,13206,-1208},{-529,4843,13277,-1207},{-518,4760,13347,-1205},
		{-508,4677,13417,-1202},{-497,4595,13486,-1200},{-487,4512,13555,-1197},{-476,4431,13623,-1194},
		{-466,4349,13691,-1190},{-455,4268,13758,-1187},{-445,4188,13824,-1182},{-435,4107,13890,-1178},
		{-424,4027,13955,-1174},{-414,3948,14019,-1169},{-404,3869,14083,-1163},{-394,3790,14146,-1158},
		{-384,3712,14208,-1152},{-374,3634,14270,-1146},{-364,3557,14331,-1139},{-354,3480,14391,-1133},
		{-345,3404,14450,-1125},{-335,3328,14509,-1118},{-325,3252,14567,-1110},{-316,3177,14625,-1102},
		{-306,3103,14681,-1094},{-297,3029,14737,-1085},{-288,2955,14792,-1076},{-278,2882,14846,-1066},
		{-269,2810,14900,-1057},{-260,2738,14953,-1047},{-251,2667,15005,-1036},{-243,2596,15056,-1025},
		{-234,2526,15106,-1014},{-225,2456,15155,-1002},{-217,2387,15204,-991},{-209,2319,15252,-978},
		{-200,2251,15299,-966},{-192,2184,15345,-953},{-184,2117,15390,-939},{-176,2052,15434,-925},
		{-169,1986,15478,-911},{-161,1922,15520,-897},{-154,1858,15562,-882},{-146,1794,15603,-866},
		{-139,1732,15642,-851},{-132,1670,15681,-835},{-125,1608,15719,-818},{-119,1548,15756,-801},
		{-112,1488,15792,-784},{-106,1429,15827,-766},{-99,1370,15861,-748},{-93,1313,15894,-730},
		{-87,1256,15926,-711},{-82,1200,15957,-691},{-76,1144,15987,-672},{-70,1090,16016,-651},
		{-65,1036,16044,-631},{-60,983,16071,-610},{-55,930,16097,-588},{-51,879,16122,-566},
		{-46,828,16146,-544},{-42,778,16168,-521},{-38,729,16190,-498},{-34,681,16211,-474},
		{-30,634,16230,-450},{-26,588,16248,-425},{-23,542,16266,-400},{-20,497,16282,-375},
		{-17,453,16297,-349},{-14,411,16310,-322},{-12,369,16323,-295},{-10,327,16334,-268},
		{-8,287,16345,-240},{-6,248,16354,-212},{-4,210,16362,-183},{-3,172,16369,-154},
		{-2,136,16374,-124},{-1,100,16378,-94},{0,66,16382,-63},{0,32,16383,-32},
		{0,0,16384,0}
	};
#endif
717
718
719 #if defined(__GNUC__)
720 __attribute__((target("avx2")))
721 #endif
CONCAT(InterpolateImageAvx2Bicubic,CONCAT (CONCAT (METHOD_NAME_EXTENSION_,EXTENSION),CONCAT (METHOD_NAME_CHANNELS_,CHANNELS)))722 void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Bicubic,CONCAT(
723 CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
724 CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
725 )) (const ScanlineTool & sct, int x, int y, int w)
726 {
727 emInt64 ty=y*sct.TDY-sct.TY-0x1800000;
728 emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
729 const BicubicFactors & fy=BicubicFactorsTable[oy];
730
731 DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
732 ssize_t imgSX=sct.ImgSX;
733 DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
734 INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
735 DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
736 INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
737 DEFINE_AND_SET_IMAGE_ROW_PTR(row2,imgY,imgSX,sct.ImgSY,sct.ImgMap)
738 INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
739 DEFINE_AND_SET_IMAGE_ROW_PTR(row3,imgY,imgSX,sct.ImgSY,sct.ImgMap)
740
741 emInt64 tdx=sct.TDX;
742 emInt64 tx=x*tdx-sct.TX-0x1800000;
743
744 DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)
745
746 tx=(tx&0xffffff)-0x1000000-tdx;
747 int tc=((tx+0x5000000+w*tdx)>>24)*CHANNELS;
748
749 const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
750 p-=(p-(emByte*)NULL)&31;
751 const emInt16 * pvyBeg=(emInt16*)p;
752 const emInt16 * pvy=pvyBeg;
753 const emInt16 * pvyEnd=pvyBeg+tc;
754
755 __m128i sfy=_mm_loadl_epi64((__m128i*)&fy);
756 sfy=_mm_unpacklo_epi16(sfy,sfy);
757 __m256i afy=_mm256_broadcastsi128_si256(sfy);
758 __m256i fy0=_mm256_shuffle_epi32(afy,0x00);
759 __m256i fy1=_mm256_shuffle_epi32(afy,0x55);
760 __m256i fy2=_mm256_shuffle_epi32(afy,0xaa);
761 __m256i fy3=_mm256_shuffle_epi32(afy,0xff);
762
763 do {
764 __m128i svy0,svy1,svy2,svy3;
765 if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
766 DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
767 DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
768 DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
769 DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
770 svy0=_mm_loadu_si128((__m128i*)p0);
771 svy1=_mm_loadu_si128((__m128i*)p1);
772 svy2=_mm_loadu_si128((__m128i*)p2);
773 svy3=_mm_loadu_si128((__m128i*)p3);
774 INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
775 }
776 else {
777 for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
778 svy0=_mm_srli_si128(svy0,CHANNELS);
779 svy1=_mm_srli_si128(svy1,CHANNELS);
780 svy2=_mm_srli_si128(svy2,CHANNELS);
781 svy3=_mm_srli_si128(svy3,CHANNELS);
782 if (i<j) {
783 DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
784 DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
785 DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
786 DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
787 # if CHANNELS==1
788 svy0=_mm_insert_epi8(svy0,p0[0],15);
789 svy1=_mm_insert_epi8(svy1,p1[0],15);
790 svy2=_mm_insert_epi8(svy2,p2[0],15);
791 svy3=_mm_insert_epi8(svy3,p3[0],15);
792 # elif CHANNELS==2
793 svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
794 svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
795 svy2=_mm_insert_epi16(svy2,((emUInt16*)p2)[0],7);
796 svy3=_mm_insert_epi16(svy3,((emUInt16*)p3)[0],7);
797 # elif CHANNELS==3
798 svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
799 svy0=_mm_insert_epi8(svy0,p0[2],14);
800 svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
801 svy1=_mm_insert_epi8(svy1,p1[2],14);
802 svy2=_mm_insert_epi16(svy2,p2[0]|(p2[1]<<8),6);
803 svy2=_mm_insert_epi8(svy2,p2[2],14);
804 svy3=_mm_insert_epi16(svy3,p3[0]|(p3[1]<<8),6);
805 svy3=_mm_insert_epi8(svy3,p3[2],14);
806 # else
807 svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
808 svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
809 svy2=_mm_insert_epi32(svy2,((emUInt32*)p2)[0],3);
810 svy3=_mm_insert_epi32(svy3,((emUInt32*)p3)[0],3);
811 # endif
812 INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
813 }
814 }
815 }
816
817 __m256i vy0=_mm256_cvtepu8_epi16(svy0);
818 __m256i vy1=_mm256_cvtepu8_epi16(svy1);
819 __m256i vy2=_mm256_cvtepu8_epi16(svy2);
820 __m256i vy3=_mm256_cvtepu8_epi16(svy3);
821
822 PREMULFIN_SHL_COLOR_VEC16(vy0,7)
823 PREMULFIN_SHL_COLOR_VEC16(vy1,7)
824 PREMULFIN_SHL_COLOR_VEC16(vy2,7)
825 PREMULFIN_SHL_COLOR_VEC16(vy3,7)
826
827 __m256i vy=_mm256_add_epi16(
828 _mm256_add_epi16(
829 _mm256_mulhrs_epi16(vy0,fy0),
830 _mm256_mulhrs_epi16(vy1,fy1)
831 ),
832 _mm256_add_epi16(
833 _mm256_mulhrs_epi16(vy2,fy2),
834 _mm256_mulhrs_epi16(vy3,fy3)
835 )
836 );
837
838 _mm256_storeu_si256((__m256i*)pvy,vy);
839 pvy+=(16/CHANNELS)*CHANNELS;
840 } while (pvy<pvyEnd);
841
842 _mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());
843
844 pvy=pvyBeg;
845 emByte * buf=(emByte*)sct.InterpolationBuffer;
846 emByte * bufEnd=buf+w*CHANNELS;
847
848 // Order of pixels in v02 and v13 with 3-4 / 1-2 channels:
849 // v02: 6 2 4 0 / 14 12 10 2 8 6 4 0
850 // v13: 7 3 5 1 / 15 13 11 3 9 7 5 1
851 # if CHANNELS<=2
852 # if CHANNELS==1
853 __m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
854 # else
855 __m256i vt=_mm256_loadu_si256((__m256i*)pvy);
856 # endif
857 __m256i v02=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,2,0,6,4,0));
858 __m256i v13=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,3,0,7,5,1));
859 pvy+=CHANNELS*8;
860 int vn=5;
861 # elif CHANNELS==3
862 __m256i v02=_mm256_loadu_si256((__m256i*)pvy);
863 __m256i v13=_mm256_permutevar8x32_epi32(v02,_mm256_set_epi32(5,4,4,3,7,6,2,1));
864 v02=_mm256_blend_epi32(v02,v13,0xfc);
865 v13=_mm256_blend_epi32(_mm256_srli_si256(v13,2),_mm256_srli_si256(v13,10),0xf0);
866 pvy+=CHANNELS*5;
867 int vn=2;
868 # else
869 __m256i v02=_mm256_loadu_si256((__m256i*)pvy);
870 __m256i v13=_mm256_srli_si256(v02,8);
871 pvy+=CHANNELS*4;
872 int vn=1;
873 # endif
874
875 do {
876 __m256i v02l,v02h,v13l,v13h,f02l,f02h,f13l,f13h;
877 int cx=16/CHANNELS-1;
878
879 do {
880 tx+=tdx;
881 if (tx>=0) {
882 tx-=0x1000000;
883
884 __m256i oldV02=v02;
885 v02=v13;
886 # if CHANNELS<=2
887 v13=_mm256_permutevar8x32_epi32(oldV02,_mm256_set_epi32(0,7,6,1,5,3,2,4));
888 # else
889 v13=_mm256_permute4x64_epi64(oldV02,0x1e);
890 # endif
891
892 vn--;
893 if (vn<=0) {
894 # if CHANNELS<=2
895 # if CHANNELS==1
896 __m256i t=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
897 # else
898 __m256i t=_mm256_loadu_si256((__m256i*)pvy);
899 # endif
900 __m256i ta=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,7,0,5,3,1,0));
901 __m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,0,0,6,4,2,0));
902 v02=_mm256_blend_epi32(v02,ta,0xee);
903 v13=_mm256_blend_epi32(v13,tb,0xfe);
904 pvy+=CHANNELS*8;
905 vn+=8;
906 # elif CHANNELS==3
907 __m256i t=_mm256_loadu_si256((__m256i*)pvy);
908 __m256i ta=_mm256_shuffle_epi8(t,_mm256_set_epi8(
909 -1,-1, 7, 6, 5, 4, 3, 2, -1,-1,-1,-1, -1,-1,-1,-1,
910 -1,-1,11,10, 9, 8, 7, 6, -1,-1,-1,-1, -1,-1,-1,-1
911 ));
912 __m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,1,0,4,3,2,1));
913 v02=_mm256_blend_epi32(v02,ta,0xcc);
914 v13=_mm256_blend_epi32(v13,tb,0xfc);
915 pvy+=CHANNELS*5;
916 vn+=5;
917 # else
918 __m256i ta=_mm256_loadu_si256((__m256i*)pvy);
919 __m256i tb=_mm256_permute4x64_epi64(ta,0x0a);
920 v02=_mm256_blend_epi32(v02,ta,0xcc);
921 v13=_mm256_blend_epi32(v13,tb,0xfc);
922 pvy+=CHANNELS*4;
923 vn+=4;
924 # endif
925 }
926 }
927
928 emUInt32 ox=(tx+0x1007fff)>>16;
929 const BicubicFactors & fx=BicubicFactorsTable[ox];
930
931 __m256i f02=_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)&fx));
932 # if CHANNELS>=2
933 f02=_mm256_shuffle_epi8(f02,_mm256_set_epi8(
934 9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0,
935 9, 8, 9, 8, 9, 8, 9, 8, 1, 0, 1, 0, 1, 0, 1, 0
936 ));
937 # endif
938
939 __m256i f13=_mm256_srli_si256(f02,8);
940
941 if (cx==7/CHANNELS) {
942 # if CHANNELS==3
943 v02l=_mm256_alignr_epi8(v02,v02h,4);
944 v13l=_mm256_alignr_epi8(v13,v13h,4);
945 f02l=_mm256_alignr_epi8(f02,f02h,4);
946 f13l=_mm256_alignr_epi8(f13,f13h,4);
947 # else
948 v02l=v02h;
949 v13l=v13h;
950 f02l=f02h;
951 f13l=f13h;
952 # endif
953 }
954
955 v02h=_mm256_alignr_epi8(v02,v02h,CHANNELS*2);
956 v13h=_mm256_alignr_epi8(v13,v13h,CHANNELS*2);
957 f02h=_mm256_alignr_epi8(f02,f02h,CHANNELS*2);
958 f13h=_mm256_alignr_epi8(f13,f13h,CHANNELS*2);
959 cx--;
960 } while (cx>=0);
961
962 # if CHANNELS==3
963 v02h=_mm256_srli_si256(v02h,2);
964 v13h=_mm256_srli_si256(v13h,2);
965 f02h=_mm256_srli_si256(f02h,2);
966 f13h=_mm256_srli_si256(f13h,2);
967 # endif
968
969 __m256i vx0=_mm256_permute2x128_si256(v02l,v02h,0x20);
970 __m256i vx1=_mm256_permute2x128_si256(v13l,v13h,0x20);
971 __m256i vx2=_mm256_permute2x128_si256(v02l,v02h,0x31);
972 __m256i vx3=_mm256_permute2x128_si256(v13l,v13h,0x31);
973 __m256i fx0=_mm256_permute2x128_si256(f02l,f02h,0x20);
974 __m256i fx1=_mm256_permute2x128_si256(f13l,f13h,0x20);
975 __m256i fx2=_mm256_permute2x128_si256(f02l,f02h,0x31);
976 __m256i fx3=_mm256_permute2x128_si256(f13l,f13h,0x31);
977
978 __m256i vx=_mm256_add_epi16(
979 _mm256_add_epi16(
980 _mm256_mulhrs_epi16(vx0,fx0),
981 _mm256_mulhrs_epi16(vx1,fx1)
982 ),
983 _mm256_add_epi16(
984 _mm256_mulhrs_epi16(vx2,fx2),
985 _mm256_mulhrs_epi16(vx3,fx3)
986 )
987 );
988
989 vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
990 vx=_mm256_srai_epi16(vx,5);
991 vx=_mm256_max_epi16(vx,_mm256_setzero_si256());
992 __m128i svx=_mm_packus_epi16(
993 _mm256_castsi256_si128(vx),
994 _mm256_extracti128_si256(vx,1)
995 );
996 # if CHANNELS==2 || CHANNELS==4
997 svx=_mm_min_epu8(svx,_mm_shuffle_epi8(svx,_mm_set_epi8(
998 # if CHANNELS==2
999 15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1
1000 # else
1001 15,15,15,15,11,11,11,11,7,7,7,7,3,3,3,3
1002 # endif
1003 )));
1004 # endif
1005
1006 _mm_storeu_si128((__m128i*)buf,svx);
1007
1008 buf+=(16/CHANNELS)*CHANNELS;
1009 } while (buf<bufEnd);
1010 }
1011
1012
1013 //==============================================================================
1014 //========== emPainter::ScanlineTool::InterpolateImageAvx2Lanczos... ===========
1015 //==============================================================================
1016
#ifndef LANCZOS_FACTORS_TABLE_DEFINED
#	define LANCZOS_FACTORS_TABLE_DEFINED
// One quadruple of Lanczos filter weights for four consecutive source samples.
// Fixed-point: the weights of an entry sum to 16384 (== 1.0); f0/f3 can be
// negative (Lanczos lobes). Indexed by the sub-sample fraction in 1/256 steps,
// hence 257 entries for fractions 0..256 inclusive.
struct alignas(8) LanczosFactors {
	emInt16 f0; // weight for sample at offset -1
	emInt16 f1; // weight for sample at offset  0
	emInt16 f2; // weight for sample at offset +1
	emInt16 f3; // weight for sample at offset +2
};
// Table generated by the program below (windowed sinc, radius 2.5,
// normalized so that each row sums to f = 16384).
static const LanczosFactors LanczosFactorsTable[257] = {
	// #include <stdio.h>
	// #include <math.h>
	// int main(int argc, char * argv[])
	// {
	//   for (int i=0; i<=256; i++) {
	//     double f=16384;
	//     double radius=2.5;
	//     double v[4];
	//     for (int j=0; j<4; j++) {
	//       double d=fabs((j-1-i/256.0)*M_PI);
	//       if (d<1E-10) v[j]=1.0/radius;
	//       else v[j]=sin(d)*sin(d/radius)/(d*d);
	//     }
	//     int f0=(int)round(f*v[0]/(v[0]+v[1]+v[2]+v[3]));
	//     int f1=(int)round(f*v[1]/(v[0]+v[1]+v[2]+v[3]));
	//     int f2=(int)round(f*v[2]/(v[0]+v[1]+v[2]+v[3]));
	//     int f3=(int)round(f*v[3]/(v[0]+v[1]+v[2]+v[3]));
	//     printf("%s{%d,%d,%d,%d},",i%4?"":"\n",f0,f1,f2,f3);
	//   }
	//   return 0;
	// }
	{0,16384,0,0},{-48,16391,49,-8},{-96,16397,98,-15},{-143,16402,148,-23},
	{-189,16406,199,-31},{-235,16408,250,-40},{-280,16410,302,-48},{-325,16411,355,-56},
	{-369,16411,408,-65},{-413,16409,462,-74},{-456,16407,516,-83},{-499,16403,572,-92},
	{-540,16399,627,-102},{-582,16393,684,-111},{-622,16387,741,-121},{-662,16379,798,-131},
	{-702,16370,857,-141},{-741,16361,916,-151},{-779,16350,975,-162},{-817,16338,1035,-172},
	{-854,16325,1096,-183},{-891,16311,1157,-194},{-927,16296,1219,-205},{-962,16280,1282,-216},
	{-997,16263,1345,-227},{-1031,16246,1408,-239},{-1065,16226,1473,-250},{-1098,16206,1538,-262},
	{-1130,16185,1603,-274},{-1162,16163,1669,-286},{-1194,16140,1736,-298},{-1224,16116,1803,-311},
	{-1254,16091,1870,-323},{-1284,16065,1939,-336},{-1313,16038,2007,-349},{-1341,16010,2077,-362},
	{-1369,15981,2147,-375},{-1396,15951,2217,-388},{-1422,15920,2288,-401},{-1448,15888,2360,-415},
	{-1474,15855,2432,-429},{-1499,15821,2504,-442},{-1523,15786,2577,-456},{-1547,15750,2651,-470},
	{-1570,15713,2725,-484},{-1592,15676,2799,-499},{-1614,15637,2874,-513},{-1635,15597,2950,-528},
	{-1656,15557,3026,-542},{-1676,15515,3102,-557},{-1696,15473,3179,-572},{-1715,15429,3257,-587},
	{-1734,15385,3334,-602},{-1752,15340,3413,-617},{-1769,15294,3491,-632},{-1786,15247,3571,-648},
	{-1803,15200,3650,-663},{-1818,15151,3730,-679},{-1834,15101,3811,-694},{-1848,15051,3891,-710},
	{-1863,15000,3973,-726},{-1876,14948,4054,-742},{-1889,14895,4136,-758},{-1902,14841,4219,-774},
	{-1914,14787,4301,-790},{-1926,14731,4384,-806},{-1937,14675,4468,-823},{-1947,14618,4552,-839},
	{-1957,14561,4636,-855},{-1967,14502,4720,-872},{-1976,14443,4805,-888},{-1984,14383,4890,-905},
	{-1992,14322,4976,-921},{-2000,14260,5061,-938},{-2007,14198,5147,-955},{-2014,14135,5234,-971},
	{-2020,14071,5320,-988},{-2025,14007,5407,-1005},{-2030,13942,5494,-1022},{-2035,13876,5582,-1038},
	{-2039,13809,5669,-1055},{-2043,13742,5757,-1072},{-2046,13674,5845,-1089},{-2049,13605,5934,-1106},
	{-2052,13536,6022,-1122},{-2054,13466,6111,-1139},{-2055,13396,6200,-1156},{-2056,13325,6289,-1173},
	{-2057,13253,6378,-1190},{-2058,13181,6467,-1207},{-2057,13108,6557,-1223},{-2057,13034,6647,-1240},
	{-2056,12960,6737,-1257},{-2055,12885,6827,-1273},{-2053,12810,6917,-1290},{-2051,12734,7007,-1306},
	{-2048,12658,7097,-1323},{-2045,12581,7188,-1339},{-2042,12504,7278,-1356},{-2039,12426,7369,-1372},
	{-2035,12347,7460,-1388},{-2030,12268,7550,-1405},{-2025,12189,7641,-1421},{-2020,12109,7732,-1437},
	{-2015,12029,7823,-1453},{-2009,11948,7914,-1468},{-2003,11867,8005,-1484},{-1997,11785,8096,-1500},
	{-1990,11703,8186,-1515},{-1983,11620,8277,-1531},{-1976,11538,8368,-1546},{-1968,11454,8459,-1561},
	{-1960,11371,8550,-1576},{-1952,11287,8640,-1591},{-1943,11202,8731,-1606},{-1934,11117,8822,-1621},
	{-1925,11032,8912,-1635},{-1916,10947,9002,-1649},{-1906,10861,9093,-1664},{-1896,10775,9183,-1678},
	{-1886,10688,9273,-1691},{-1876,10602,9363,-1705},{-1865,10515,9453,-1719},{-1854,10428,9542,-1732},
	{-1843,10340,9632,-1745},{-1831,10252,9721,-1758},{-1820,10164,9810,-1771},{-1808,10076,9899,-1783},
	{-1796,9988,9988,-1796},{-1783,9899,10076,-1808},{-1771,9810,10164,-1820},{-1758,9721,10252,-1831},
	{-1745,9632,10340,-1843},{-1732,9542,10428,-1854},{-1719,9453,10515,-1865},{-1705,9363,10602,-1876},
	{-1691,9273,10688,-1886},{-1678,9183,10775,-1896},{-1664,9093,10861,-1906},{-1649,9002,10947,-1916},
	{-1635,8912,11032,-1925},{-1621,8822,11117,-1934},{-1606,8731,11202,-1943},{-1591,8640,11287,-1952},
	{-1576,8550,11371,-1960},{-1561,8459,11454,-1968},{-1546,8368,11538,-1976},{-1531,8277,11620,-1983},
	{-1515,8186,11703,-1990},{-1500,8096,11785,-1997},{-1484,8005,11867,-2003},{-1468,7914,11948,-2009},
	{-1453,7823,12029,-2015},{-1437,7732,12109,-2020},{-1421,7641,12189,-2025},{-1405,7550,12268,-2030},
	{-1388,7460,12347,-2035},{-1372,7369,12426,-2039},{-1356,7278,12504,-2042},{-1339,7188,12581,-2045},
	{-1323,7097,12658,-2048},{-1306,7007,12734,-2051},{-1290,6917,12810,-2053},{-1273,6827,12885,-2055},
	{-1257,6737,12960,-2056},{-1240,6647,13034,-2057},{-1223,6557,13108,-2057},{-1207,6467,13181,-2058},
	{-1190,6378,13253,-2057},{-1173,6289,13325,-2056},{-1156,6200,13396,-2055},{-1139,6111,13466,-2054},
	{-1122,6022,13536,-2052},{-1106,5934,13605,-2049},{-1089,5845,13674,-2046},{-1072,5757,13742,-2043},
	{-1055,5669,13809,-2039},{-1038,5582,13876,-2035},{-1022,5494,13942,-2030},{-1005,5407,14007,-2025},
	{-988,5320,14071,-2020},{-971,5234,14135,-2014},{-955,5147,14198,-2007},{-938,5061,14260,-2000},
	{-921,4976,14322,-1992},{-905,4890,14383,-1984},{-888,4805,14443,-1976},{-872,4720,14502,-1967},
	{-855,4636,14561,-1957},{-839,4552,14618,-1947},{-823,4468,14675,-1937},{-806,4384,14731,-1926},
	{-790,4301,14787,-1914},{-774,4219,14841,-1902},{-758,4136,14895,-1889},{-742,4054,14948,-1876},
	{-726,3973,15000,-1863},{-710,3891,15051,-1848},{-694,3811,15101,-1834},{-679,3730,15151,-1818},
	{-663,3650,15200,-1803},{-648,3571,15247,-1786},{-632,3491,15294,-1769},{-617,3413,15340,-1752},
	{-602,3334,15385,-1734},{-587,3257,15429,-1715},{-572,3179,15473,-1696},{-557,3102,15515,-1676},
	{-542,3026,15557,-1656},{-528,2950,15597,-1635},{-513,2874,15637,-1614},{-499,2799,15676,-1592},
	{-484,2725,15713,-1570},{-470,2651,15750,-1547},{-456,2577,15786,-1523},{-442,2504,15821,-1499},
	{-429,2432,15855,-1474},{-415,2360,15888,-1448},{-401,2288,15920,-1422},{-388,2217,15951,-1396},
	{-375,2147,15981,-1369},{-362,2077,16010,-1341},{-349,2007,16038,-1313},{-336,1939,16065,-1284},
	{-323,1870,16091,-1254},{-311,1803,16116,-1224},{-298,1736,16140,-1194},{-286,1669,16163,-1162},
	{-274,1603,16185,-1130},{-262,1538,16206,-1098},{-250,1473,16226,-1065},{-239,1408,16246,-1031},
	{-227,1345,16263,-997},{-216,1282,16280,-962},{-205,1219,16296,-927},{-194,1157,16311,-891},
	{-183,1096,16325,-854},{-172,1035,16338,-817},{-162,975,16350,-779},{-151,916,16361,-741},
	{-141,857,16370,-702},{-131,798,16379,-662},{-121,741,16387,-622},{-111,684,16393,-582},
	{-102,627,16399,-540},{-92,572,16403,-499},{-83,516,16407,-456},{-74,462,16409,-413},
	{-65,408,16411,-369},{-56,355,16411,-325},{-48,302,16410,-280},{-40,250,16408,-235},
	{-31,199,16406,-189},{-23,148,16402,-143},{-15,98,16397,-96},{-8,49,16391,-48},
	{0,0,16384,0}
};
#endif
1114
1115
#if defined(__GNUC__)
	__attribute__((target("avx2")))
#endif
// Lanczos interpolation of one output scanline, expanded per EXTENSION and
// CHANNELS (1,2,3 or 4 bytes per pixel). Two passes:
//   1) Vertical: four source rows are blended with the per-scanline weights
//      fy0..fy3 into a 16-bit intermediate buffer (pvy, placed in the upper,
//      32-byte-aligned part of sct.InterpolationBuffer).
//   2) Horizontal: for each output pixel, four neighboring intermediate
//      columns are blended with per-pixel weights from LanczosFactorsTable,
//      then rounded, clamped and packed as bytes into the start of
//      sct.InterpolationBuffer.
// x,y,w: first output pixel, output scanline, and output width. Source
// coordinates use 24-bit fixed-point fractions (sct.TX/TY/TDX/TDY).
void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Lanczos,CONCAT(
	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
)) (const ScanlineTool & sct, int x, int y, int w)
{
	// Vertical position: ty>>24 selects the topmost of the four source rows,
	// the rounded 24-bit fraction (scaled to 0..256) selects the weight row.
	emInt64 ty=y*sct.TDY-sct.TY-0x1800000;
	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
	const LanczosFactors & fy=LanczosFactorsTable[oy];

	// Set up pointers to the four consecutive source rows (the row-advance
	// macros implement the EXTENSION-specific edge behavior).
	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
	ssize_t imgSX=sct.ImgSX;
	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row2,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row3,imgY,imgSX,sct.ImgSY,sct.ImgMap)

	emInt64 tdx=sct.TDX;
	emInt64 tx=x*tdx-sct.TX-0x1800000;

	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)

	// tx becomes the running fractional x position for the horizontal pass;
	// tc is the number of 16-bit intermediate values needed for w outputs.
	tx=(tx&0xffffff)-0x1000000-tdx;
	int tc=((tx+0x5000000+w*tdx)>>24)*CHANNELS;

	// Place the intermediate buffer at the end of InterpolationBuffer (with
	// 64 bytes slack for the over-reads/over-writes of the vector loops) and
	// round the pointer down to a 32-byte boundary.
	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
	p-=(p-(emByte*)NULL)&31;
	const emInt16 * pvyBeg=(emInt16*)p;
	const emInt16 * pvy=pvyBeg;
	const emInt16 * pvyEnd=pvyBeg+tc;

	// Broadcast the four 16-bit row weights to all lanes of four vectors.
	__m128i sfy=_mm_loadl_epi64((__m128i*)&fy);
	sfy=_mm_unpacklo_epi16(sfy,sfy);
	__m256i afy=_mm256_broadcastsi128_si256(sfy);
	__m256i fy0=_mm256_shuffle_epi32(afy,0x00);
	__m256i fy1=_mm256_shuffle_epi32(afy,0x55);
	__m256i fy2=_mm256_shuffle_epi32(afy,0xaa);
	__m256i fy3=_mm256_shuffle_epi32(afy,0xff);

	// ---- Vertical pass: 16 source bytes per row per iteration. ----
	do {
		__m128i svy0,svy1,svy2,svy3;
		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
			// Fast path: 16 bytes can be loaded straight from each row.
			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
			svy0=_mm_loadu_si128((__m128i*)p0);
			svy1=_mm_loadu_si128((__m128i*)p1);
			svy2=_mm_loadu_si128((__m128i*)p2);
			svy3=_mm_loadu_si128((__m128i*)p3);
			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
		}
		else {
			// Slow path (image edge / tiling wrap): gather one pixel at a
			// time by shifting the vector right and inserting at the top.
			// NOTE(review): svy0..svy3 start uninitialized here; lanes that
			// never receive an insert (i>=j) appear to be buffer slack that
			// is never consumed downstream — confirm against the buffer
			// sizing above before changing this.
			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
				svy0=_mm_srli_si128(svy0,CHANNELS);
				svy1=_mm_srli_si128(svy1,CHANNELS);
				svy2=_mm_srli_si128(svy2,CHANNELS);
				svy3=_mm_srli_si128(svy3,CHANNELS);
				if (i<j) {
					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
#					if CHANNELS==1
						svy0=_mm_insert_epi8(svy0,p0[0],15);
						svy1=_mm_insert_epi8(svy1,p1[0],15);
						svy2=_mm_insert_epi8(svy2,p2[0],15);
						svy3=_mm_insert_epi8(svy3,p3[0],15);
#					elif CHANNELS==2
						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
						svy2=_mm_insert_epi16(svy2,((emUInt16*)p2)[0],7);
						svy3=_mm_insert_epi16(svy3,((emUInt16*)p3)[0],7);
#					elif CHANNELS==3
						// 3 bytes per pixel: insert as a 16-bit pair plus one byte.
						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
						svy0=_mm_insert_epi8(svy0,p0[2],14);
						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
						svy1=_mm_insert_epi8(svy1,p1[2],14);
						svy2=_mm_insert_epi16(svy2,p2[0]|(p2[1]<<8),6);
						svy2=_mm_insert_epi8(svy2,p2[2],14);
						svy3=_mm_insert_epi16(svy3,p3[0]|(p3[1]<<8),6);
						svy3=_mm_insert_epi8(svy3,p3[2],14);
#					else
						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
						svy2=_mm_insert_epi32(svy2,((emUInt32*)p2)[0],3);
						svy3=_mm_insert_epi32(svy3,((emUInt32*)p3)[0],3);
#					endif
					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
				}
			}
		}

		// Widen bytes to 16 bit.
		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
		__m256i vy1=_mm256_cvtepu8_epi16(svy1);
		__m256i vy2=_mm256_cvtepu8_epi16(svy2);
		__m256i vy3=_mm256_cvtepu8_epi16(svy3);

		// Presumably "premultiply-finish" then shift left by 7 to fill the
		// 16-bit headroom (macro defined elsewhere) — TODO confirm.
		PREMULFIN_SHL_COLOR_VEC16(vy0,7)
		PREMULFIN_SHL_COLOR_VEC16(vy1,7)
		PREMULFIN_SHL_COLOR_VEC16(vy2,7)
		PREMULFIN_SHL_COLOR_VEC16(vy3,7)

		// Weighted sum of the four rows; mulhrs = (a*b+0x4000)>>15.
		__m256i vy=_mm256_add_epi16(
			_mm256_add_epi16(
				_mm256_mulhrs_epi16(vy0,fy0),
				_mm256_mulhrs_epi16(vy1,fy1)
			),
			_mm256_add_epi16(
				_mm256_mulhrs_epi16(vy2,fy2),
				_mm256_mulhrs_epi16(vy3,fy3)
			)
		);

		_mm256_storeu_si256((__m256i*)pvy,vy);
		pvy+=(16/CHANNELS)*CHANNELS;
	} while (pvy<pvyEnd);

	// Zero the slack after the intermediate values so the horizontal pass
	// can safely read past pvyEnd.
	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());

	pvy=pvyBeg;
	emByte * buf=(emByte*)sct.InterpolationBuffer;
	emByte * bufEnd=buf+w*CHANNELS;

	// ---- Horizontal pass. v02/v13 hold the intermediate pixels spread in
	// an interleaved order so that four consecutive pixels can be produced
	// per output position via 128-bit-lane tricks. ----
	// Order of pixels in v02 and v13 with 3-4 / 1-2 channels:
	//  v02:  6 2 4 0  /  14 12 10 2  8 6 4 0
	//  v13:  7 3 5 1  /  15 13 11 3  9 7 5 1
#	if CHANNELS<=2
#		if CHANNELS==1
			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
#		else
			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
#		endif
		__m256i v02=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,2,0,6,4,0));
		__m256i v13=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,3,0,7,5,1));
		pvy+=CHANNELS*8;
		int vn=5;
#	elif CHANNELS==3
		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
		__m256i v13=_mm256_permutevar8x32_epi32(v02,_mm256_set_epi32(5,4,4,3,7,6,2,1));
		v02=_mm256_blend_epi32(v02,v13,0xfc);
		v13=_mm256_blend_epi32(_mm256_srli_si256(v13,2),_mm256_srli_si256(v13,10),0xf0);
		pvy+=CHANNELS*5;
		int vn=2;
#	else
		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
		__m256i v13=_mm256_srli_si256(v02,8);
		pvy+=CHANNELS*4;
		int vn=1;
#	endif

	do {
		// Per 16-byte output chunk: accumulate pixel values (v..) and their
		// weights (f..) into "low"/"high" staging registers via alignr.
		__m256i v02l,v02h,v13l,v13h,f02l,f02h,f13l,f13h;
		int cx=16/CHANNELS-1;

		do {
			// Advance the source position; on integer-pixel overflow shift
			// the pixel queue (v13 -> v02) and refill when it runs empty.
			tx+=tdx;
			if (tx>=0) {
				tx-=0x1000000;

				__m256i oldV02=v02;
				v02=v13;
#				if CHANNELS<=2
					v13=_mm256_permutevar8x32_epi32(oldV02,_mm256_set_epi32(0,7,6,1,5,3,2,4));
#				else
					v13=_mm256_permute4x64_epi64(oldV02,0x1e);
#				endif

				vn--;
				if (vn<=0) {
					// Queue exhausted: load the next batch from pvy and
					// merge it into the interleaved v02/v13 layout.
#					if CHANNELS<=2
#						if CHANNELS==1
							__m256i t=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
#						else
							__m256i t=_mm256_loadu_si256((__m256i*)pvy);
#						endif
						__m256i ta=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,7,0,5,3,1,0));
						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,0,0,6,4,2,0));
						v02=_mm256_blend_epi32(v02,ta,0xee);
						v13=_mm256_blend_epi32(v13,tb,0xfe);
						pvy+=CHANNELS*8;
						vn+=8;
#					elif CHANNELS==3
						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
						__m256i ta=_mm256_shuffle_epi8(t,_mm256_set_epi8(
							-1,-1, 7, 6,  5, 4, 3, 2, -1,-1,-1,-1, -1,-1,-1,-1,
							-1,-1,11,10,  9, 8, 7, 6, -1,-1,-1,-1, -1,-1,-1,-1
						));
						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,1,0,4,3,2,1));
						v02=_mm256_blend_epi32(v02,ta,0xcc);
						v13=_mm256_blend_epi32(v13,tb,0xfc);
						pvy+=CHANNELS*5;
						vn+=5;
#					else
						__m256i ta=_mm256_loadu_si256((__m256i*)pvy);
						__m256i tb=_mm256_permute4x64_epi64(ta,0x0a);
						v02=_mm256_blend_epi32(v02,ta,0xcc);
						v13=_mm256_blend_epi32(v13,tb,0xfc);
						pvy+=CHANNELS*4;
						vn+=4;
#					endif
				}
			}

			// Horizontal weights for this output pixel from the rounded
			// 24-bit fraction (table index 0..256).
			emUInt32 ox=(tx+0x1007fff)>>16;
			const LanczosFactors & fx=LanczosFactorsTable[ox];

			__m256i f02=_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)&fx));
#			if CHANNELS>=2
				// Replicate each weight across the pixel's channels.
				f02=_mm256_shuffle_epi8(f02,_mm256_set_epi8(
					9, 8, 9, 8,  9, 8, 9, 8,  1, 0, 1, 0,  1, 0, 1, 0,
					9, 8, 9, 8,  9, 8, 9, 8,  1, 0, 1, 0,  1, 0, 1, 0
				));
#			endif

			__m256i f13=_mm256_srli_si256(f02,8);

			// At the half-way point, snapshot the staging registers as the
			// "low" halves before they get shifted further.
			if (cx==7/CHANNELS) {
#				if CHANNELS==3
					v02l=_mm256_alignr_epi8(v02,v02h,4);
					v13l=_mm256_alignr_epi8(v13,v13h,4);
					f02l=_mm256_alignr_epi8(f02,f02h,4);
					f13l=_mm256_alignr_epi8(f13,f13h,4);
#				else
					v02l=v02h;
					v13l=v13h;
					f02l=f02h;
					f13l=f13h;
#				endif
			}

			// Shift the new pixel/weight into the "high" staging registers.
			v02h=_mm256_alignr_epi8(v02,v02h,CHANNELS*2);
			v13h=_mm256_alignr_epi8(v13,v13h,CHANNELS*2);
			f02h=_mm256_alignr_epi8(f02,f02h,CHANNELS*2);
			f13h=_mm256_alignr_epi8(f13,f13h,CHANNELS*2);
			cx--;
		} while (cx>=0);

#		if CHANNELS==3
			// 5 pixels of 3 channels fill only 15 of 16 bytes; drop the gap.
			v02h=_mm256_srli_si256(v02h,2);
			v13h=_mm256_srli_si256(v13h,2);
			f02h=_mm256_srli_si256(f02h,2);
			f13h=_mm256_srli_si256(f13h,2);
#		endif

		// Regroup the staged halves into the four neighbor vectors vx0..vx3
		// with their matching weights fx0..fx3.
		__m256i vx0=_mm256_permute2x128_si256(v02l,v02h,0x20);
		__m256i vx1=_mm256_permute2x128_si256(v13l,v13h,0x20);
		__m256i vx2=_mm256_permute2x128_si256(v02l,v02h,0x31);
		__m256i vx3=_mm256_permute2x128_si256(v13l,v13h,0x31);
		__m256i fx0=_mm256_permute2x128_si256(f02l,f02h,0x20);
		__m256i fx1=_mm256_permute2x128_si256(f13l,f13h,0x20);
		__m256i fx2=_mm256_permute2x128_si256(f02l,f02h,0x31);
		__m256i fx3=_mm256_permute2x128_si256(f13l,f13h,0x31);

		__m256i vx=_mm256_add_epi16(
			_mm256_add_epi16(
				_mm256_mulhrs_epi16(vx0,fx0),
				_mm256_mulhrs_epi16(vx1,fx1)
			),
			_mm256_add_epi16(
				_mm256_mulhrs_epi16(vx2,fx2),
				_mm256_mulhrs_epi16(vx3,fx3)
			)
		);

		// Round (+16, >>5) back to byte range; clamp negative overshoot
		// (Lanczos weights can be negative), then saturate-pack to bytes.
		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
		vx=_mm256_srai_epi16(vx,5);
		vx=_mm256_max_epi16(vx,_mm256_setzero_si256());
		__m128i svx=_mm_packus_epi16(
			_mm256_castsi256_si128(vx),
			_mm256_extracti128_si256(vx,1)
		);
#		if CHANNELS==2 || CHANNELS==4
			// Clamp each channel to the pixel's last channel (byte 1 resp. 3)
			// — keeps color <= alpha, presumably for premultiplied alpha;
			// confirm against the PREMULFIN macros.
			svx=_mm_min_epu8(svx,_mm_shuffle_epi8(svx,_mm_set_epi8(
#				if CHANNELS==2
					15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1
#				else
					15,15,15,15,11,11,11,11,7,7,7,7,3,3,3,3
#				endif
			)));
#		endif

		_mm_storeu_si128((__m128i*)buf,svx);

		buf+=(16/CHANNELS)*CHANNELS;
	} while (buf<bufEnd);
}
1408
1409
1410 //==============================================================================
1411 //========== emPainter::ScanlineTool::InterpolateImageAvx2Adaptive... ==========
1412 //==============================================================================
1413
1414 //---------------------------- AdaptiveFactorsTable ----------------------------
1415
#ifndef ADAPTIVE_FACTORS_TABLE_DEFINED
#	define ADAPTIVE_FACTORS_TABLE_DEFINED
// Cubic-Hermite-spline basis factors for the adaptive interpolation: fv1/fv2
// weight the two neighboring values, fs1/fs2 weight the two slopes. All four
// are the Hermite basis functions scaled by -32768 (negated, because +32768
// does not fit into a signed 16-bit value — see generator below). Indexed by
// the sub-sample fraction in 1/256 steps (257 entries, 0..256 inclusive).
struct alignas(8) AdaptiveFactors {
	emInt16 fv1; // -32768 * (2o^3 - 3o^2 + 1)   (value at left neighbor)
	emInt16 fv2; // -32768 * (-2o^3 + 3o^2)      (value at right neighbor)
	emInt16 fs1; // -32768 * (o^3 - 2o^2 + o)    (slope at left neighbor)
	emInt16 fs2; // -32768 * (o^3 - o^2)         (slope at right neighbor)
};
static const AdaptiveFactors AdaptiveFactorsTable[257] = {
	// #include <stdio.h>
	// #include <math.h>
	// int main(int argc, char * argv[])
	// {
	//   for (int i=0; i<=256; i++) {
	//     double f=-32768.0; // Negative because +32768 is beyond signed 16-Bit
	//     double o=i/256.0;
	//     int fv1=(int)round((2*o*o*o-3*o*o+1)*f);
	//     int fv2=(int)round((-2*o*o*o+3*o*o)*f);
	//     int fs1=(int)round((o*o*o-2*o*o+o)*f);
	//     int fs2=(int)round((o*o*o-o*o)*f);
	//     printf("%s{%d,%d,%d,%d},",i%4?"":"\n",fv1,fv2,fs1,fs2);
	//   }
	//   return 0;
	// }
	{-32768,0,0,0},{-32767,-1,-127,0},{-32762,-6,-252,2},{-32755,-13,-375,4},
	{-32744,-24,-496,8},{-32731,-37,-615,12},{-32715,-53,-732,18},{-32696,-72,-848,24},
	{-32674,-94,-961,31},{-32649,-119,-1072,39},{-32622,-146,-1182,48},{-32592,-176,-1290,58},
	{-32559,-209,-1395,69},{-32523,-245,-1499,80},{-32485,-283,-1601,93},{-32444,-324,-1702,106},
	{-32400,-368,-1800,120},{-32354,-414,-1897,135},{-32305,-463,-1991,151},{-32253,-515,-2084,167},
	{-32199,-569,-2176,184},{-32143,-625,-2265,202},{-32084,-684,-2353,221},{-32022,-746,-2439,241},
	{-31958,-810,-2523,261},{-31892,-876,-2606,282},{-31823,-945,-2686,304},{-31751,-1017,-2765,326},
	{-31678,-1090,-2843,349},{-31602,-1166,-2919,373},{-31523,-1245,-2993,397},{-31443,-1325,-3065,422},
	{-31360,-1408,-3136,448},{-31275,-1493,-3205,474},{-31188,-1580,-3273,501},{-31098,-1670,-3339,529},
	{-31006,-1762,-3403,557},{-30912,-1856,-3466,586},{-30816,-1952,-3527,615},{-30718,-2050,-3587,645},
	{-30618,-2150,-3645,675},{-30516,-2252,-3702,706},{-30411,-2357,-3757,737},{-30305,-2463,-3810,769},
	{-30197,-2571,-3862,802},{-30086,-2682,-3913,835},{-29974,-2794,-3962,868},{-29860,-2908,-4010,902},
	{-29744,-3024,-4056,936},{-29626,-3142,-4101,971},{-29506,-3262,-4144,1006},{-29385,-3383,-4186,1041},
	{-29261,-3507,-4227,1077},{-29136,-3632,-4266,1114},{-29009,-3759,-4304,1150},{-28880,-3888,-4340,1188},
	{-28750,-4018,-4375,1225},{-28618,-4150,-4409,1263},{-28484,-4284,-4441,1301},{-28349,-4419,-4472,1339},
	{-28212,-4556,-4502,1378},{-28073,-4695,-4530,1417},{-27933,-4835,-4557,1457},{-27791,-4977,-4583,1496},
	{-27648,-5120,-4608,1536},{-27503,-5265,-4631,1576},{-27357,-5411,-4654,1616},{-27209,-5559,-4674,1657},
	{-27060,-5708,-4694,1698},{-26910,-5858,-4713,1739},{-26758,-6010,-4730,1780},{-26605,-6163,-4746,1821},
	{-26450,-6318,-4761,1863},{-26294,-6474,-4775,1905},{-26137,-6631,-4787,1947},{-25978,-6790,-4799,1989},
	{-25819,-6949,-4809,2031},{-25658,-7110,-4819,2073},{-25496,-7272,-4827,2115},{-25332,-7436,-4834,2158},
	{-25168,-7600,-4840,2200},{-25002,-7766,-4845,2243},{-24836,-7932,-4849,2285},{-24668,-8100,-4852,2328},
	{-24499,-8269,-4854,2370},{-24329,-8439,-4854,2413},{-24159,-8609,-4854,2456},{-23987,-8781,-4853,2498},
	{-23814,-8954,-4851,2541},{-23640,-9128,-4848,2584},{-23466,-9302,-4844,2626},{-23290,-9478,-4839,2669},
	{-23114,-9654,-4833,2711},{-22937,-9831,-4826,2753},{-22758,-10010,-4818,2796},{-22580,-10188,-4810,2838},
	{-22400,-10368,-4800,2880},{-22220,-10548,-4790,2922},{-22039,-10729,-4778,2964},{-21857,-10911,-4766,3005},
	{-21674,-11094,-4753,3047},{-21491,-11277,-4739,3088},{-21307,-11461,-4725,3129},{-21123,-11645,-4709,3170},
	{-20938,-11830,-4693,3211},{-20752,-12016,-4676,3252},{-20566,-12202,-4658,3292},{-20380,-12388,-4640,3332},
	{-20193,-12575,-4620,3372},{-20005,-12763,-4600,3411},{-19817,-12951,-4580,3450},{-19629,-13139,-4558,3489},
	{-19440,-13328,-4536,3528},{-19251,-13517,-4513,3566},{-19061,-13707,-4490,3604},{-18871,-13897,-4465,3642},
	{-18681,-14087,-4441,3679},{-18491,-14277,-4415,3716},{-18300,-14468,-4389,3753},{-18109,-14659,-4362,3789},
	{-17918,-14850,-4335,3825},{-17727,-15041,-4307,3860},{-17535,-15233,-4279,3895},{-17344,-15424,-4250,3930},
	{-17152,-15616,-4220,3964},{-16960,-15808,-4190,3998},{-16768,-16000,-4159,4031},{-16576,-16192,-4128,4064},
	{-16384,-16384,-4096,4096},{-16192,-16576,-4064,4128},{-16000,-16768,-4031,4159},{-15808,-16960,-3998,4190},
	{-15616,-17152,-3964,4220},{-15424,-17344,-3930,4250},{-15233,-17535,-3895,4279},{-15041,-17727,-3860,4307},
	{-14850,-17918,-3825,4335},{-14659,-18109,-3789,4362},{-14468,-18300,-3753,4389},{-14277,-18491,-3716,4415},
	{-14087,-18681,-3679,4441},{-13897,-18871,-3642,4465},{-13707,-19061,-3604,4490},{-13517,-19251,-3566,4513},
	{-13328,-19440,-3528,4536},{-13139,-19629,-3489,4558},{-12951,-19817,-3450,4580},{-12763,-20005,-3411,4600},
	{-12575,-20193,-3372,4620},{-12388,-20380,-3332,4640},{-12202,-20566,-3292,4658},{-12016,-20752,-3252,4676},
	{-11830,-20938,-3211,4693},{-11645,-21123,-3170,4709},{-11461,-21307,-3129,4725},{-11277,-21491,-3088,4739},
	{-11094,-21674,-3047,4753},{-10911,-21857,-3005,4766},{-10729,-22039,-2964,4778},{-10548,-22220,-2922,4790},
	{-10368,-22400,-2880,4800},{-10188,-22580,-2838,4810},{-10010,-22758,-2796,4818},{-9831,-22937,-2753,4826},
	{-9654,-23114,-2711,4833},{-9478,-23290,-2669,4839},{-9302,-23466,-2626,4844},{-9128,-23640,-2584,4848},
	{-8954,-23814,-2541,4851},{-8781,-23987,-2498,4853},{-8609,-24159,-2456,4854},{-8439,-24329,-2413,4854},
	{-8269,-24499,-2370,4854},{-8100,-24668,-2328,4852},{-7932,-24836,-2285,4849},{-7766,-25002,-2243,4845},
	{-7600,-25168,-2200,4840},{-7436,-25332,-2158,4834},{-7272,-25496,-2115,4827},{-7110,-25658,-2073,4819},
	{-6949,-25819,-2031,4809},{-6790,-25978,-1989,4799},{-6631,-26137,-1947,4787},{-6474,-26294,-1905,4775},
	{-6318,-26450,-1863,4761},{-6163,-26605,-1821,4746},{-6010,-26758,-1780,4730},{-5858,-26910,-1739,4713},
	{-5708,-27060,-1698,4694},{-5559,-27209,-1657,4674},{-5411,-27357,-1616,4654},{-5265,-27503,-1576,4631},
	{-5120,-27648,-1536,4608},{-4977,-27791,-1496,4583},{-4835,-27933,-1457,4557},{-4695,-28073,-1417,4530},
	{-4556,-28212,-1378,4502},{-4419,-28349,-1339,4472},{-4284,-28484,-1301,4441},{-4150,-28618,-1263,4409},
	{-4018,-28750,-1225,4375},{-3888,-28880,-1188,4340},{-3759,-29009,-1150,4304},{-3632,-29136,-1114,4266},
	{-3507,-29261,-1077,4227},{-3383,-29385,-1041,4186},{-3262,-29506,-1006,4144},{-3142,-29626,-971,4101},
	{-3024,-29744,-936,4056},{-2908,-29860,-902,4010},{-2794,-29974,-868,3962},{-2682,-30086,-835,3913},
	{-2571,-30197,-802,3862},{-2463,-30305,-769,3810},{-2357,-30411,-737,3757},{-2252,-30516,-706,3702},
	{-2150,-30618,-675,3645},{-2050,-30718,-645,3587},{-1952,-30816,-615,3527},{-1856,-30912,-586,3466},
	{-1762,-31006,-557,3403},{-1670,-31098,-529,3339},{-1580,-31188,-501,3273},{-1493,-31275,-474,3205},
	{-1408,-31360,-448,3136},{-1325,-31443,-422,3065},{-1245,-31523,-397,2993},{-1166,-31602,-373,2919},
	{-1090,-31678,-349,2843},{-1017,-31751,-326,2765},{-945,-31823,-304,2686},{-876,-31892,-282,2606},
	{-810,-31958,-261,2523},{-746,-32022,-241,2439},{-684,-32084,-221,2353},{-625,-32143,-202,2265},
	{-569,-32199,-184,2176},{-515,-32253,-167,2084},{-463,-32305,-151,1991},{-414,-32354,-135,1897},
	{-368,-32400,-120,1800},{-324,-32444,-106,1702},{-283,-32485,-93,1601},{-245,-32523,-80,1499},
	{-209,-32559,-69,1395},{-176,-32592,-58,1290},{-146,-32622,-48,1182},{-119,-32649,-39,1072},
	{-94,-32674,-31,961},{-72,-32696,-24,848},{-53,-32715,-18,732},{-37,-32731,-12,615},
	{-24,-32744,-8,496},{-13,-32755,-4,375},{-6,-32762,-2,252},{-1,-32767,0,127},
	{0,-32768,0,0}
};
#endif
1507
1508
1509 //----------------- Subroutine: InterpolateFourVectorsAdaptive -----------------
1510
1511 #ifndef INTERPOLATE_FOUR_VECTORS_ADAPTIVE_DEFINED
1512 # define INTERPOLATE_FOUR_VECTORS_ADAPTIVE_DEFINED
1513
1514 # if defined(__GNUC__)
1515 __attribute__((target("avx2")))
1516 # endif
static inline __m256i InterpolateFourVectorsAdaptive(
1518 __m256i v0, __m256i v1, __m256i v2, __m256i v3,
1519 __m256i fv1, __m256i fv2, __m256i fs1, __m256i fs2
1520 )
1521 {
1522 __m256i neg=_mm256_or_si256(
1523 _mm256_cmpgt_epi16(v2,v1),
1524 _mm256_set1_epi16(1)
1525 );
1526
1527 v0=_mm256_sign_epi16(v0,neg);
1528 v1=_mm256_sign_epi16(v1,neg);
1529 v2=_mm256_sign_epi16(v2,neg);
1530 v3=_mm256_sign_epi16(v3,neg);
1531
1532 __m256i s01=_mm256_sub_epi16(v1,v0);
1533 __m256i s12=_mm256_sub_epi16(v2,v1);
1534 __m256i s21=_mm256_sub_epi16(v1,v2);
1535 __m256i s23=_mm256_sub_epi16(v3,v2);
1536
1537 __m256i s01x2=_mm256_add_epi16(s01,s01);
1538 __m256i s12x2=_mm256_add_epi16(s12,s12);
1539 __m256i s23x2=_mm256_add_epi16(s23,s23);
1540
1541 __m256i s1=_mm256_min_epi16(
1542 _mm256_max_epi16(s01x2,s12),
1543 _mm256_max_epi16(s12x2,s01)
1544 );
1545 __m256i s2=_mm256_min_epi16(
1546 _mm256_max_epi16(s23x2,s12),
1547 _mm256_max_epi16(s12x2,s23)
1548 );
1549
1550 __m256i q1=_mm256_sub_epi16(s1,s23x2);
1551 __m256i q2=_mm256_sub_epi16(s2,s01x2);
1552
1553 s1=_mm256_add_epi16(
1554 s1,
1555 _mm256_min_epi16(
1556 _mm256_setzero_si256(),
1557 _mm256_max_epi16(q1,s1)
1558 )
1559 );
1560 s2=_mm256_add_epi16(
1561 s2,
1562 _mm256_min_epi16(
1563 _mm256_setzero_si256(),
1564 _mm256_max_epi16(q2,s2)
1565 )
1566 );
1567
1568 s1=_mm256_min_epi16(_mm256_setzero_si256(),s1);
1569 s2=_mm256_min_epi16(_mm256_setzero_si256(),s2);
1570
1571 __m256i s21p7=_mm256_add_epi16(s21,_mm256_set1_epi16(7));
1572
1573 v1=_mm256_add_epi16(
1574 v1,
1575 _mm256_max_epi16(
1576 _mm256_setzero_si256(),
1577 _mm256_min_epi16(
1578 _mm256_min_epi16(s21,s01),
1579 _mm256_srai_epi16(_mm256_add_epi16(s01,s21p7),4)
1580 )
1581 )
1582 );
1583 v2=_mm256_sub_epi16(
1584 v2,
1585 _mm256_max_epi16(
1586 _mm256_setzero_si256(),
1587 _mm256_min_epi16(
1588 _mm256_min_epi16(s21,s23),
1589 _mm256_srai_epi16(_mm256_add_epi16(s23,s21p7),4)
1590 )
1591 )
1592 );
1593
1594 __m256i v=_mm256_add_epi16(
1595 _mm256_add_epi16(
1596 _mm256_mulhrs_epi16(v1,fv1),
1597 _mm256_mulhrs_epi16(v2,fv2)
1598 ),
1599 _mm256_add_epi16(
1600 _mm256_mulhrs_epi16(s1,fs1),
1601 _mm256_mulhrs_epi16(s2,fs2)
1602 )
1603 );
1604
1605 v=_mm256_sign_epi16(v,neg);
1606 return v;
1607 }
1608 #endif
1609
1610
//---------- emPainter::ScanlineTool::InterpolateImageAvx2Adaptive... ----------

// Render one scanline of adaptively interpolated image pixels into
// sct.InterpolationBuffer, using a 4x4-tap slope-limited filter
// (InterpolateFourVectorsAdaptive) first vertically, then horizontally.
// This function is expanded multiple times with different EXTENSION and
// CHANNELS settings (see top of file); the DEFINE_AND_SET_IMAGE_* /
// INCREMENT_IMAGE_* macros implement the EXTENSION-specific edge handling
// (tiled/edge/zero) and are defined elsewhere in this file.
//   sct - scanline tool state (source image, transform, buffers).
//   x,y - target position of the first output pixel.
//   w   - number of output pixels to produce.
#if defined(__GNUC__)
__attribute__((target("avx2")))
#endif
void emPainter::ScanlineTool::CONCAT(InterpolateImageAvx2Adaptive,CONCAT(
	CONCAT(METHOD_NAME_EXTENSION_,EXTENSION),
	CONCAT(METHOD_NAME_CHANNELS_,CHANNELS)
)) (const ScanlineTool & sct, int x, int y, int w)
{
	// Source Y position in fixed point with 24 fractional bits. The
	// -0x1800000 bias (1.5 source pixels) makes ty>>24 address the topmost
	// of the four source rows needed by the 4-tap filter.
	emInt64 ty=y*sct.TDY-sct.TY-0x1800000;
	// Rounded fraction in 0..256, index into the 257-entry factor table.
	emUInt32 oy=((ty&0xffffff)+0x7fff)>>16;
	const AdaptiveFactors & fy=AdaptiveFactorsTable[oy];

	// Pointers to the four consecutive source rows (edge handling per
	// EXTENSION is inside these macros).
	DEFINE_AND_SET_IMAGE_Y(imgY,ty>>24,sct.ImgDY,sct.ImgSY)
	ssize_t imgSX=sct.ImgSX;
	DEFINE_AND_SET_IMAGE_ROW_PTR(row0,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row1,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row2,imgY,imgSX,sct.ImgSY,sct.ImgMap)
	INCREMENT_IMAGE_Y(imgY,sct.ImgDY,sct.ImgSY)
	DEFINE_AND_SET_IMAGE_ROW_PTR(row3,imgY,imgSX,sct.ImgSY,sct.ImgMap)

	// Source X position, same fixed-point layout as ty.
	emInt64 tdx=sct.TDX;
	emInt64 tx=x*tdx-sct.TX-0x1800000;

	DEFINE_AND_SET_IMAGE_X(imgX,tx>>24,CHANNELS,imgSX)

	// tc: number of 16-bit values (source pixels needed for the whole
	// scanline times CHANNELS) that the vertical pass must produce.
	tx=(tx&0xffffff)-0x1000000-tdx;
	int tc=((tx+0x5000000+w*tdx)>>24)*CHANNELS;

	// Place the vertically interpolated samples at the end of the
	// interpolation buffer, 32-byte aligned, so the horizontal pass can
	// write its byte output to the front of the same buffer.
	const emByte * p=(emByte*)sct.InterpolationBuffer+InterpolationBufferSize-tc*2-64;
	// Align down to 32 bytes. NOTE(review): pointer arithmetic involving a
	// null pointer is technically undefined behavior; the portable form
	// would be p-=((uintptr_t)p)&31 — confirm before changing upstream code.
	p-=(p-(emByte*)NULL)&31;
	const emInt16 * pvyBeg=(emInt16*)p;
	const emInt16 * pvy=pvyBeg;
	const emInt16 * pvyEnd=pvyBeg+tc;

	// Broadcast the four 16-bit Y factors so that fy0..fy3 each hold one
	// factor replicated across all 16 lanes.
	__m128i sfy=_mm_loadl_epi64((__m128i*)&fy);
	sfy=_mm_unpacklo_epi16(sfy,sfy);
	__m256i afy=_mm256_broadcastsi128_si256(sfy);
	__m256i fy0=_mm256_shuffle_epi32(afy,0x00);
	__m256i fy1=_mm256_shuffle_epi32(afy,0x55);
	__m256i fy2=_mm256_shuffle_epi32(afy,0xaa);
	__m256i fy3=_mm256_shuffle_epi32(afy,0xff);

	// Vertical pass: per iteration, fetch 16 bytes ((16/CHANNELS)*CHANNELS
	// payload bytes) from each of the four rows, interpolate vertically and
	// store 16 words of intermediate samples.
	do {
		__m128i svy0,svy1,svy2,svy3;
		if (ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X(imgX,imgSX)) {
			// Fast path: 16 consecutive source bytes available per row.
			DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
			DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
			svy0=_mm_loadu_si128((__m128i*)p0);
			svy1=_mm_loadu_si128((__m128i*)p1);
			svy2=_mm_loadu_si128((__m128i*)p2);
			svy3=_mm_loadu_si128((__m128i*)p3);
			INCREMENT_IMAGE_X(imgX,((16/CHANNELS)*CHANNELS),imgSX)
		}
		else {
			// Slow path near image edges: assemble the registers one pixel
			// at a time by shifting right and inserting at the top. The
			// initial indeterminate register contents are shifted out (or,
			// for CHANNELS==3, end up in the unused top byte whose result
			// is overwritten by the next overlapping store — TODO confirm).
			for (int i=0, j=pvyEnd-pvy; i<=16-CHANNELS; i+=CHANNELS) {
				svy0=_mm_srli_si128(svy0,CHANNELS);
				svy1=_mm_srli_si128(svy1,CHANNELS);
				svy2=_mm_srli_si128(svy2,CHANNELS);
				svy3=_mm_srli_si128(svy3,CHANNELS);
				// i<j: stop fetching once enough samples for the scanline
				// have been inserted; remaining lanes just shift through.
				if (i<j) {
					DEFINE_AND_SET_IMAGE_PIX_PTR(p0,row0,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p1,row1,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p2,row2,imgX)
					DEFINE_AND_SET_IMAGE_PIX_PTR(p3,row3,imgX)
#					if CHANNELS==1
						svy0=_mm_insert_epi8(svy0,p0[0],15);
						svy1=_mm_insert_epi8(svy1,p1[0],15);
						svy2=_mm_insert_epi8(svy2,p2[0],15);
						svy3=_mm_insert_epi8(svy3,p3[0],15);
#					elif CHANNELS==2
						svy0=_mm_insert_epi16(svy0,((emUInt16*)p0)[0],7);
						svy1=_mm_insert_epi16(svy1,((emUInt16*)p1)[0],7);
						svy2=_mm_insert_epi16(svy2,((emUInt16*)p2)[0],7);
						svy3=_mm_insert_epi16(svy3,((emUInt16*)p3)[0],7);
#					elif CHANNELS==3
						// 3 bytes per pixel: insert as a word plus a byte.
						svy0=_mm_insert_epi16(svy0,p0[0]|(p0[1]<<8),6);
						svy0=_mm_insert_epi8(svy0,p0[2],14);
						svy1=_mm_insert_epi16(svy1,p1[0]|(p1[1]<<8),6);
						svy1=_mm_insert_epi8(svy1,p1[2],14);
						svy2=_mm_insert_epi16(svy2,p2[0]|(p2[1]<<8),6);
						svy2=_mm_insert_epi8(svy2,p2[2],14);
						svy3=_mm_insert_epi16(svy3,p3[0]|(p3[1]<<8),6);
						svy3=_mm_insert_epi8(svy3,p3[2],14);
#					else
						svy0=_mm_insert_epi32(svy0,((emUInt32*)p0)[0],3);
						svy1=_mm_insert_epi32(svy1,((emUInt32*)p1)[0],3);
						svy2=_mm_insert_epi32(svy2,((emUInt32*)p2)[0],3);
						svy3=_mm_insert_epi32(svy3,((emUInt32*)p3)[0],3);
#					endif
					INCREMENT_IMAGE_X(imgX,CHANNELS,imgSX)
				}
			}
		}

		// Widen bytes to 16-bit lanes.
		__m256i vy0=_mm256_cvtepu8_epi16(svy0);
		__m256i vy1=_mm256_cvtepu8_epi16(svy1);
		__m256i vy2=_mm256_cvtepu8_epi16(svy2);
		__m256i vy3=_mm256_cvtepu8_epi16(svy3);

		// Premultiply-finalize (macro defined elsewhere, depends on the
		// pixel format) and scale by <<5 for intermediate precision; the
		// final >>5 with rounding happens at the end of the horizontal pass.
		PREMULFIN_SHL_COLOR_VEC16(vy0,5)
		PREMULFIN_SHL_COLOR_VEC16(vy1,5)
		PREMULFIN_SHL_COLOR_VEC16(vy2,5)
		PREMULFIN_SHL_COLOR_VEC16(vy3,5)

		__m256i vy=InterpolateFourVectorsAdaptive(vy0,vy1,vy2,vy3,fy0,fy1,fy2,fy3);

		// For CHANNELS==3 the pointer advances by 15 words only, so the
		// next store overlaps and overwrites the unused 16th lane.
		_mm256_storeu_si256((__m256i*)pvy,vy);
		pvy+=(16/CHANNELS)*CHANNELS;
	} while (pvy<pvyEnd);

	// Zero terminator, read by the look-ahead of the horizontal pass.
	_mm256_storeu_si256((__m256i*)pvy,_mm256_setzero_si256());

	pvy=pvyBeg;
	emByte * buf=(emByte*)sct.InterpolationBuffer;
	emByte * bufEnd=buf+w*CHANNELS;

	// Horizontal pass. v02/v13 hold the interleaved source pixels feeding
	// the 4-tap filter; vn counts the pixels still buffered in them.
	// Order of pixels in v02 and v13 with 3-4 / 1-2 channels:
	//   v02: 6 2 4 0 / 14 12 10 2 8 6 4 0
	//   v13: 7 3 5 1 / 15 13 11 3 9 7 5 1
#	if CHANNELS<=2
#		if CHANNELS==1
			__m256i vt=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
#		else
			__m256i vt=_mm256_loadu_si256((__m256i*)pvy);
#		endif
		__m256i v02=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,2,0,6,4,0));
		__m256i v13=_mm256_permutevar8x32_epi32(vt,_mm256_set_epi32(0,0,0,3,0,7,5,1));
		pvy+=CHANNELS*8;
		int vn=5;
#	elif CHANNELS==3
		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
		__m256i v13=_mm256_permutevar8x32_epi32(v02,_mm256_set_epi32(5,4,4,3,7,6,2,1));
		v02=_mm256_blend_epi32(v02,v13,0xfc);
		v13=_mm256_blend_epi32(_mm256_srli_si256(v13,2),_mm256_srli_si256(v13,10),0xf0);
		pvy+=CHANNELS*5;
		int vn=2;
#	else
		__m256i v02=_mm256_loadu_si256((__m256i*)pvy);
		__m256i v13=_mm256_srli_si256(v02,8);
		pvy+=CHANNELS*4;
		int vn=1;
#	endif

	do {
		// Shift-register pipeline: the *h registers collect CHANNELS*2
		// bytes of operands per inner iteration; halfway through, the *l
		// registers take a snapshot. Their initial indeterminate contents
		// are fully shifted out before the first 16 output bytes are built.
		__m256i v02l,v02h,v13l,v13h,f02l,f02h,f13l,f13h;
		int cx=16/CHANNELS-1;

		do {
			// Advance the source position by one output step; when the
			// integer part increases, rotate one source pixel through the
			// v02/v13 pair and refill them when exhausted.
			tx+=tdx;
			if (tx>=0) {
				tx-=0x1000000;

				__m256i oldV02=v02;
				v02=v13;
#				if CHANNELS<=2
					v13=_mm256_permutevar8x32_epi32(oldV02,_mm256_set_epi32(0,7,6,1,5,3,2,4));
#				else
					v13=_mm256_permute4x64_epi64(oldV02,0x1e);
#				endif

				vn--;
				if (vn<=0) {
					// Refill from the vertically interpolated samples.
#					if CHANNELS<=2
#						if CHANNELS==1
							__m256i t=_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)pvy));
#						else
							__m256i t=_mm256_loadu_si256((__m256i*)pvy);
#						endif
						__m256i ta=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,7,0,5,3,1,0));
						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(0,0,0,0,6,4,2,0));
						v02=_mm256_blend_epi32(v02,ta,0xee);
						v13=_mm256_blend_epi32(v13,tb,0xfe);
						pvy+=CHANNELS*8;
						vn+=8;
#					elif CHANNELS==3
						__m256i t=_mm256_loadu_si256((__m256i*)pvy);
						__m256i ta=_mm256_shuffle_epi8(t,_mm256_set_epi8(
							-1,-1, 7, 6, 5, 4, 3, 2, -1,-1,-1,-1, -1,-1,-1,-1,
							-1,-1,11,10, 9, 8, 7, 6, -1,-1,-1,-1, -1,-1,-1,-1
						));
						__m256i tb=_mm256_permutevar8x32_epi32(t,_mm256_set_epi32(7,6,1,0,4,3,2,1));
						v02=_mm256_blend_epi32(v02,ta,0xcc);
						v13=_mm256_blend_epi32(v13,tb,0xfc);
						pvy+=CHANNELS*5;
						vn+=5;
#					else
						__m256i ta=_mm256_loadu_si256((__m256i*)pvy);
						__m256i tb=_mm256_permute4x64_epi64(ta,0x0a);
						v02=_mm256_blend_epi32(v02,ta,0xcc);
						v13=_mm256_blend_epi32(v13,tb,0xfc);
						pvy+=CHANNELS*4;
						vn+=4;
#					endif
				}
			}

			// Rounded X fraction in 0..256 and the matching filter factors.
			emUInt32 ox=(tx+0x1007fff)>>16;
			const AdaptiveFactors & fx=AdaptiveFactorsTable[ox];

			// Zero-extend the four 16-bit factors to one per 64-bit group;
			// for multi-channel pixels replicate each factor across the
			// pixel's channels.
			__m256i f02=_mm256_cvtepu16_epi64(_mm_loadl_epi64((__m128i*)&fx));
#			if CHANNELS>=2
				f02=_mm256_shuffle_epi8(f02,_mm256_set_epi8(
					 9, 8, 9, 8, 9, 8, 9, 8,  1, 0, 1, 0, 1, 0, 1, 0,
					 9, 8, 9, 8, 9, 8, 9, 8,  1, 0, 1, 0, 1, 0, 1, 0
				));
#			endif

			__m256i f13=_mm256_srli_si256(f02,8);

			// Halfway point: snapshot the half-filled shift registers into
			// the *l set (for CHANNELS==3 via alignr, since 3 doesn't
			// divide 16 evenly).
			if (cx==7/CHANNELS) {
#				if CHANNELS==3
					v02l=_mm256_alignr_epi8(v02,v02h,4);
					v13l=_mm256_alignr_epi8(v13,v13h,4);
					f02l=_mm256_alignr_epi8(f02,f02h,4);
					f13l=_mm256_alignr_epi8(f13,f13h,4);
#				else
					v02l=v02h;
					v13l=v13h;
					f02l=f02h;
					f13l=f13h;
#				endif
			}

			// Shift the current pixel's samples and factors into the
			// high-collecting registers.
			v02h=_mm256_alignr_epi8(v02,v02h,CHANNELS*2);
			v13h=_mm256_alignr_epi8(v13,v13h,CHANNELS*2);
			f02h=_mm256_alignr_epi8(f02,f02h,CHANNELS*2);
			f13h=_mm256_alignr_epi8(f13,f13h,CHANNELS*2);
			cx--;
		} while (cx>=0);

#		if CHANNELS==3
			// 5*3 words leave one stale word at the bottom; drop it.
			v02h=_mm256_srli_si256(v02h,2);
			v13h=_mm256_srli_si256(v13h,2);
			f02h=_mm256_srli_si256(f02h,2);
			f13h=_mm256_srli_si256(f13h,2);
#		endif

		// Recombine the 128-bit halves so vx0..vx3 hold the four filter
		// taps for 16 output channel values, with matching factors.
		__m256i vx0=_mm256_permute2x128_si256(v02l,v02h,0x20);
		__m256i vx1=_mm256_permute2x128_si256(v13l,v13h,0x20);
		__m256i vx2=_mm256_permute2x128_si256(v02l,v02h,0x31);
		__m256i vx3=_mm256_permute2x128_si256(v13l,v13h,0x31);
		__m256i fx0=_mm256_permute2x128_si256(f02l,f02h,0x20);
		__m256i fx1=_mm256_permute2x128_si256(f13l,f13h,0x20);
		__m256i fx2=_mm256_permute2x128_si256(f02l,f02h,0x31);
		__m256i fx3=_mm256_permute2x128_si256(f13l,f13h,0x31);

		__m256i vx=InterpolateFourVectorsAdaptive(vx0,vx1,vx2,vx3,fx0,fx1,fx2,fx3);

		// Undo the <<5 intermediate scaling with rounding (+0x10, >>5),
		// clamp negatives, then pack to bytes with unsigned saturation.
		vx=_mm256_add_epi16(vx,_mm256_set1_epi16(0x10));
		vx=_mm256_srai_epi16(vx,5);
		vx=_mm256_max_epi16(vx,_mm256_setzero_si256());
		__m128i svx=_mm_packus_epi16(
			_mm256_castsi256_si128(vx),
			_mm256_extracti128_si256(vx,1)
		);
#		if CHANNELS==2 || CHANNELS==4
			// Formats with an alpha channel: clamp each color channel to
			// its pixel's alpha (broadcast via shuffle) to keep the
			// premultiplied invariant color<=alpha after sharpening.
			svx=_mm_min_epu8(svx,_mm_shuffle_epi8(svx,_mm_set_epi8(
#				if CHANNELS==2
					15,15,13,13,11,11,9,9,7,7,5,5,3,3,1,1
#				else
					15,15,15,15,11,11,11,11,7,7,7,7,3,3,3,3
#				endif
			)));
#		endif

		_mm_storeu_si128((__m128i*)buf,svx);

		buf+=(16/CHANNELS)*CHANNELS;
	} while (buf<bufEnd);
}
1887
1888
1889 //==============================================================================
1890 //======================= Undefine General Helper Macros =======================
1891 //==============================================================================
1892
1893 #undef DEFINE_AND_SET_IMAGE_Y
1894 #undef DEFINE_AND_COPY_IMAGE_Y
1895 #undef INCREMENT_IMAGE_Y
1896 #undef DEFINE_AND_SET_IMAGE_ROW_PTR
1897 #undef DEFINE_AND_SET_IMAGE_X
1898 #undef INCREMENT_IMAGE_X
1899 #undef ARE_THERE_16_CONSECUTIVE_BYTES_AT_IMAGE_X
1900 #undef DEFINE_AND_SET_IMAGE_PIX_PTR
1901 #undef PREMULFIN_COLOR_VEC8
1902 #undef PREMULFIN_SHL_COLOR_VEC16
1903
1904
1905 #endif
1906