1 /* babl - dynamically extendable universal pixel conversion library.
2 * Copyright (C) 2019 Ell
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 3 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General
15 * Public License along with this library; if not, see
16 * <https://www.gnu.org/licenses/>.
17 */
18
19 #include "config.h"
20
21 #if defined(USE_AVX2)
22
23 /* AVX 2 */
24 #include <immintrin.h>
25
26 #include <stdint.h>
27 #include <stdlib.h>
28
29 #include "babl.h"
30 #include "babl-cpuaccel.h"
31 #include "extensions/util.h"
32 #include "extensions/avx2-int8-tables.h"
33
/* TABLE_SIZE is the number of entries in the linear_to_gamma lookup table;
 * SCALE maps a linear value in [0, 1] onto the table's index range.
 */
#define TABLE_SIZE (sizeof (linear_to_gamma) / sizeof (*linear_to_gamma))
#define SCALE      ((float) (TABLE_SIZE - 1))

/* Scalar conversion of one linear float component to an 8-bit value looked
 * up in linear_to_gamma.  Reads *src, writes *dst and advances both
 * pointers by one element.  Values below 0 give 0; values above 1, and
 * NaN (for which every comparison is false), give 255.
 */
#define CVT1(src, dst)                                                  \
  do                                                                    \
    {                                                                   \
      const float v_ = *src++;                                          \
                                                                        \
      if (v_ >= 0.0f && v_ <= 1.0f)                                     \
        *dst = linear_to_gamma[(int) (SCALE * v_ + 0.5f)];              \
      else if (v_ < 0.0f)                                               \
        *dst = 0;                                                       \
      else /* v_ > 1.0f, or v_ is NaN */                                \
        *dst = 255;                                                     \
                                                                        \
      dst++;                                                            \
    }                                                                   \
  while (0)

/* Scalar conversion of one float alpha component to 8 bits.  Same clamping
 * and pointer handling as CVT1, but the in-range value is scaled to 0..255
 * and rounded directly instead of going through the lookup table.
 */
#define CVTA1(src, dst)                                                 \
  do                                                                    \
    {                                                                   \
      const float v_ = *src++;                                          \
                                                                        \
      if (v_ >= 0.0f && v_ <= 1.0f)                                     \
        *dst = 255.0f * v_ + 0.5f;                                      \
      else if (v_ < 0.0f)                                               \
        *dst = 0;                                                       \
      else /* v_ > 1.0f, or v_ is NaN */                                \
        *dst = 255;                                                     \
                                                                        \
      dst++;                                                            \
    }                                                                   \
  while (0)
70
71 static inline void
conv_yF_linear_y8_gamma(const Babl * conversion,const float * src,uint8_t * dst,long samples)72 conv_yF_linear_y8_gamma (const Babl *conversion,
73 const float *src,
74 uint8_t *dst,
75 long samples)
76 {
77 const __v8sf *src_vec;
78 __m256i *dst_vec;
79 const __v8sf scale = _mm256_set1_ps (SCALE);
80 const __v8sf zero = _mm256_setzero_ps ();
81 const __v8sf half = _mm256_set1_ps (0.5f);
82
83 while ((uintptr_t) src % 32 && samples > 0)
84 {
85 CVT1 (src, dst);
86
87 samples--;
88 }
89
90 src_vec = (const __v8sf *) src;
91 dst_vec = (__m256i *) dst;
92
93 while (samples >= 32)
94 {
95 __m256i i32_0, i32_1, i32_2, i32_3;
96 __m256i i16_01, i16_23;
97 __m256i i8_0123;
98
99 #define CVT8(i) \
100 do \
101 { \
102 __v8sf yyyyyyyy; \
103 \
104 yyyyyyyy = scale * src_vec[i] + half; \
105 yyyyyyyy = _mm256_max_ps (yyyyyyyy, zero); \
106 yyyyyyyy = _mm256_min_ps (yyyyyyyy, scale); \
107 i32_##i = _mm256_cvttps_epi32 (yyyyyyyy); \
108 i32_##i = _mm256_i32gather_epi32 (linear_to_gamma, i32_##i, 4); \
109 } \
110 while (0)
111
112 CVT8 (0);
113 CVT8 (1);
114
115 i16_01 = _mm256_packus_epi32 (i32_0, i32_1);
116
117 CVT8 (2);
118 CVT8 (3);
119
120 i16_23 = _mm256_packus_epi32 (i32_2, i32_3);
121
122 i8_0123 = _mm256_packus_epi16 (i16_01, i16_23);
123 i8_0123 = _mm256_permutevar8x32_epi32 (
124 i8_0123,
125 _mm256_setr_epi32 (0, 4, 1, 5,
126 2, 6, 3, 7));
127
128 _mm256_storeu_si256 (dst_vec, i8_0123);
129
130 #undef CVT8
131
132 src_vec += 4;
133 dst_vec++;
134
135 samples -= 32;
136 }
137
138 src = (const float *) src_vec;
139 dst = (uint8_t *) dst_vec;
140
141 while (samples > 0)
142 {
143 CVT1 (src, dst);
144
145 samples--;
146 }
147 }
148
149 static inline void
conv_yaF_linear_ya8_gamma(const Babl * conversion,const float * src,uint8_t * dst,long samples)150 conv_yaF_linear_ya8_gamma (const Babl *conversion,
151 const float *src,
152 uint8_t *dst,
153 long samples)
154 {
155 if ((uintptr_t) src % 8 == 0)
156 {
157 const __v8sf *src_vec;
158 __m256i *dst_vec;
159 const __v8sf scale = _mm256_setr_ps (SCALE, 255.0f, SCALE, 255.0f,
160 SCALE, 255.0f, SCALE, 255.0f);
161 const __v8sf zero = _mm256_setzero_ps ();
162 const __v8sf half = _mm256_set1_ps (0.5f);
163 const __m256i mask = _mm256_setr_epi32 (-1, 0, -1, 0,
164 -1, 0, -1, 0);
165
166 while ((uintptr_t) src % 32 && samples > 0)
167 {
168 CVT1 (src, dst);
169 CVTA1 (src, dst);
170
171 samples--;
172 }
173
174 src_vec = (const __v8sf *) src;
175 dst_vec = (__m256i *) dst;
176
177 while (samples >= 16)
178 {
179 __m256i i32_0, i32_1, i32_2, i32_3;
180 __m256i i16_01, i16_23;
181 __m256i i8_0123;
182
183 #define CVT8(i) \
184 do \
185 { \
186 __v8sf yayayaya; \
187 \
188 yayayaya = scale * src_vec[i] + half; \
189 yayayaya = _mm256_max_ps (yayayaya, zero); \
190 yayayaya = _mm256_min_ps (yayayaya, scale); \
191 i32_##i = _mm256_cvttps_epi32 (yayayaya); \
192 i32_##i = _mm256_mask_i32gather_epi32 (i32_##i, \
193 linear_to_gamma, \
194 i32_##i, mask, 4); \
195 } \
196 while (0)
197
198 CVT8 (0);
199 CVT8 (1);
200
201 i16_01 = _mm256_packus_epi32 (i32_0, i32_1);
202
203 CVT8 (2);
204 CVT8 (3);
205
206 i16_23 = _mm256_packus_epi32 (i32_2, i32_3);
207
208 i8_0123 = _mm256_packus_epi16 (i16_01, i16_23);
209 i8_0123 = _mm256_permutevar8x32_epi32 (
210 i8_0123,
211 _mm256_setr_epi32 (0, 4, 1, 5,
212 2, 6, 3, 7));
213
214 _mm256_storeu_si256 (dst_vec, i8_0123);
215
216 #undef CVT8
217
218 src_vec += 4;
219 dst_vec++;
220
221 samples -= 16;
222 }
223
224 src = (const float *) src_vec;
225 dst = (uint8_t *) dst_vec;
226 }
227
228 while (samples > 0)
229 {
230 CVT1 (src, dst);
231 CVTA1 (src, dst);
232
233 samples--;
234 }
235 }
236
237 static void
conv_rgbF_linear_rgb8_gamma(const Babl * conversion,const float * src,uint8_t * dst,long samples)238 conv_rgbF_linear_rgb8_gamma (const Babl *conversion,
239 const float *src,
240 uint8_t *dst,
241 long samples)
242 {
243 conv_yF_linear_y8_gamma (conversion, src, dst, 3 * samples);
244 }
245
246 static inline void
conv_rgbaF_linear_rgba8_gamma(const Babl * conversion,const float * src,uint8_t * dst,long samples)247 conv_rgbaF_linear_rgba8_gamma (const Babl *conversion,
248 const float *src,
249 uint8_t *dst,
250 long samples)
251 {
252 if ((uintptr_t) src % 16 == 0)
253 {
254 const __v8sf *src_vec;
255 __m256i *dst_vec;
256 const __v8sf scale = _mm256_setr_ps (SCALE, SCALE, SCALE, 255.0f,
257 SCALE, SCALE, SCALE, 255.0f);
258 const __v8sf zero = _mm256_setzero_ps ();
259 const __v8sf half = _mm256_set1_ps (0.5f);
260 const __m256i mask = _mm256_setr_epi32 (-1, -1, -1, 0,
261 -1, -1, -1, 0);
262
263 while ((uintptr_t) src % 32 && samples > 0)
264 {
265 CVT1 (src, dst);
266 CVT1 (src, dst);
267 CVT1 (src, dst);
268 CVTA1 (src, dst);
269
270 samples--;
271 }
272
273 src_vec = (const __v8sf *) src;
274 dst_vec = (__m256i *) dst;
275
276 while (samples >= 8)
277 {
278 __m256i i32_0, i32_1, i32_2, i32_3;
279 __m256i i16_01, i16_23;
280 __m256i i8_0123;
281
282 #define CVT8(i) \
283 do \
284 { \
285 __v8sf rgbargba; \
286 \
287 rgbargba = scale * src_vec[i] + half; \
288 rgbargba = _mm256_max_ps (rgbargba, zero); \
289 rgbargba = _mm256_min_ps (rgbargba, scale); \
290 i32_##i = _mm256_cvttps_epi32 (rgbargba); \
291 i32_##i = _mm256_mask_i32gather_epi32 (i32_##i, \
292 linear_to_gamma, \
293 i32_##i, mask, 4); \
294 } \
295 while (0)
296
297 CVT8 (0);
298 CVT8 (1);
299
300 i16_01 = _mm256_packus_epi32 (i32_0, i32_1);
301
302 CVT8 (2);
303 CVT8 (3);
304
305 i16_23 = _mm256_packus_epi32 (i32_2, i32_3);
306
307 i8_0123 = _mm256_packus_epi16 (i16_01, i16_23);
308 i8_0123 = _mm256_permutevar8x32_epi32 (
309 i8_0123,
310 _mm256_setr_epi32 (0, 4, 1, 5,
311 2, 6, 3, 7));
312
313 _mm256_storeu_si256 (dst_vec, i8_0123);
314
315 #undef CVT8
316
317 src_vec += 4;
318 dst_vec++;
319
320 samples -= 8;
321 }
322
323 src = (const float *) src_vec;
324 dst = (uint8_t *) dst_vec;
325 }
326
327 while (samples > 0)
328 {
329 CVT1 (src, dst);
330 CVT1 (src, dst);
331 CVT1 (src, dst);
332 CVTA1 (src, dst);
333
334 samples--;
335 }
336 }
337
#undef CVT1
#undef CVTA1

/* From here on CVT1/CVTA1 convert in the other direction: one 8-bit
 * component is read from *src, looked up in gamma_to_linear, stored as a
 * float to *dst, and both pointers advance by one element.  Color
 * components index the table from its start; alpha components index the
 * part of the table that begins at entry 256.
 */
#define CVT1(src, dst) \
  (*dst++ = *(gamma_to_linear + *src++))

#define CVTA1(src, dst) \
  (*dst++ = (&gamma_to_linear[256])[*src++])
346
347 static inline void
conv_y8_gamma_yF_linear(const Babl * conversion,const uint8_t * src,float * dst,long samples)348 conv_y8_gamma_yF_linear (const Babl *conversion,
349 const uint8_t *src,
350 float *dst,
351 long samples)
352 {
353 const __m128i *src_vec;
354 __v8sf *dst_vec;
355
356 while ((uintptr_t) dst % 32 && samples > 0)
357 {
358 CVT1 (src, dst);
359
360 samples--;
361 }
362
363 src_vec = (const __m128i *) src;
364 dst_vec = (__v8sf *) dst;
365
366 while (samples >= 16)
367 {
368 __m128i i8_01;
369 __m256i i32_0;
370
371 i8_01 = _mm_loadu_si128 (src_vec++);
372
373 i32_0 = _mm256_cvtepu8_epi32 (i8_01);
374 *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
375
376 i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
377
378 i32_0 = _mm256_cvtepu8_epi32 (i8_01);
379 *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
380
381 samples -= 16;
382 }
383
384 src = (const uint8_t *) src_vec;
385 dst = (float *) dst_vec;
386
387 while (samples > 0)
388 {
389 CVT1 (src, dst);
390
391 samples--;
392 }
393 }
394
395 static inline void
conv_ya8_gamma_yaF_linear(const Babl * conversion,const uint8_t * src,float * dst,long samples)396 conv_ya8_gamma_yaF_linear (const Babl *conversion,
397 const uint8_t *src,
398 float *dst,
399 long samples)
400 {
401 const __m128i *src_vec;
402 __v8sf *dst_vec;
403 const __m256i offset = _mm256_setr_epi32 (0, 256, 0, 256,
404 0, 256, 0, 256);
405
406 while ((uintptr_t) dst % 32 && samples > 0)
407 {
408 CVT1 (src, dst);
409 CVTA1 (src, dst);
410
411 samples--;
412 }
413
414 src_vec = (const __m128i *) src;
415 dst_vec = (__v8sf *) dst;
416
417 while (samples >= 8)
418 {
419 __m128i i8_01;
420 __m256i i32_0;
421
422 i8_01 = _mm_loadu_si128 (src_vec++);
423
424 i32_0 = _mm256_cvtepu8_epi32 (i8_01);
425 i32_0 += offset;
426 *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
427
428 i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
429
430 i32_0 = _mm256_cvtepu8_epi32 (i8_01);
431 i32_0 += offset;
432 *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
433
434 samples -= 8;
435 }
436
437 src = (const uint8_t *) src_vec;
438 dst = (float *) dst_vec;
439
440 while (samples > 0)
441 {
442 CVT1 (src, dst);
443 CVTA1 (src, dst);
444
445 samples--;
446 }
447 }
448
449 static inline void
conv_rgb8_gamma_rgbF_linear(const Babl * conversion,const uint8_t * src,float * dst,long samples)450 conv_rgb8_gamma_rgbF_linear (const Babl *conversion,
451 const uint8_t *src,
452 float *dst,
453 long samples)
454 {
455 conv_y8_gamma_yF_linear (conversion, src, dst, 3 * samples);
456 }
457
458 static inline void
conv_rgba8_gamma_rgbaF_linear(const Babl * conversion,const uint8_t * src,float * dst,long samples)459 conv_rgba8_gamma_rgbaF_linear (const Babl *conversion,
460 const uint8_t *src,
461 float *dst,
462 long samples)
463 {
464 const __m128i *src_vec;
465 __v8sf *dst_vec;
466 const __m256i offset = _mm256_setr_epi32 (0, 0, 0, 256,
467 0, 0, 0, 256);
468
469 while ((uintptr_t) dst % 32 && samples > 0)
470 {
471 CVT1 (src, dst);
472 CVT1 (src, dst);
473 CVT1 (src, dst);
474 CVTA1 (src, dst);
475
476 samples--;
477 }
478
479 src_vec = (const __m128i *) src;
480 dst_vec = (__v8sf *) dst;
481
482 while (samples >= 4)
483 {
484 __m128i i8_01;
485 __m256i i32_0;
486
487 i8_01 = _mm_loadu_si128 (src_vec++);
488
489 i32_0 = _mm256_cvtepu8_epi32 (i8_01);
490 i32_0 += offset;
491 *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
492
493 i8_01 = _mm_shuffle_epi32 (i8_01, _MM_SHUFFLE (1, 0, 3, 2));
494
495 i32_0 = _mm256_cvtepu8_epi32 (i8_01);
496 i32_0 += offset;
497 *dst_vec++ = _mm256_i32gather_ps (gamma_to_linear, i32_0, 4);
498
499 samples -= 4;
500 }
501
502 src = (const uint8_t *) src_vec;
503 dst = (float *) dst_vec;
504
505 while (samples > 0)
506 {
507 CVT1 (src, dst);
508 CVT1 (src, dst);
509 CVT1 (src, dst);
510 CVTA1 (src, dst);
511
512 samples--;
513 }
514 }
515
516 #undef CVT1
517 #undef CVTA1
518
519 #endif /* defined(USE_AVX2) */
520
int init (void);

/* Extension entry point.
 *
 * When built with USE_AVX2, creates the linear float and gamma u8 formats
 * for Y, YA, RGB and RGBA, and -- only if the CPU reports AVX2 support at
 * run time -- registers the conversions defined in this file between them,
 * in both directions.  Without USE_AVX2 it does nothing.
 *
 * Always returns 0.
 */
int
init (void)
{
#if defined(USE_AVX2)

  const Babl *yF_linear    = babl_format_new (babl_model ("Y"),
                                              babl_type ("float"),
                                              babl_component ("Y"),
                                              NULL);
  const Babl *y8_gamma     = babl_format_new (babl_model ("Y'"),
                                              babl_type ("u8"),
                                              babl_component ("Y'"),
                                              NULL);
  const Babl *yaF_linear   = babl_format_new (babl_model ("YA"),
                                              babl_type ("float"),
                                              babl_component ("Y"),
                                              babl_component ("A"),
                                              NULL);
  const Babl *ya8_gamma    = babl_format_new (babl_model ("Y'A"),
                                              babl_type ("u8"),
                                              babl_component ("Y'"),
                                              babl_component ("A"),
                                              NULL);
  const Babl *rgbF_linear  = babl_format_new (babl_model ("RGB"),
                                              babl_type ("float"),
                                              babl_component ("R"),
                                              babl_component ("G"),
                                              babl_component ("B"),
                                              NULL);
  const Babl *rgb8_gamma   = babl_format_new (babl_model ("R'G'B'"),
                                              babl_type ("u8"),
                                              babl_component ("R'"),
                                              babl_component ("G'"),
                                              babl_component ("B'"),
                                              NULL);
  const Babl *rgbaF_linear = babl_format_new (babl_model ("RGBA"),
                                              babl_type ("float"),
                                              babl_component ("R"),
                                              babl_component ("G"),
                                              babl_component ("B"),
                                              babl_component ("A"),
                                              NULL);
  const Babl *rgba8_gamma  = babl_format_new (babl_model ("R'G'B'A"),
                                              babl_type ("u8"),
                                              babl_component ("R'"),
                                              babl_component ("G'"),
                                              babl_component ("B'"),
                                              babl_component ("A"),
                                              NULL);

  /* the conversions use AVX2 instructions: leave them unregistered on
   * CPUs that do not report support for them
   */
  if (! (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_AVX2))
    return 0;

  /* Y */
  babl_conversion_new (yF_linear, y8_gamma, "linear",
                       conv_yF_linear_y8_gamma, NULL);
  babl_conversion_new (y8_gamma, yF_linear, "linear",
                       conv_y8_gamma_yF_linear, NULL);

  /* YA */
  babl_conversion_new (yaF_linear, ya8_gamma, "linear",
                       conv_yaF_linear_ya8_gamma, NULL);
  babl_conversion_new (ya8_gamma, yaF_linear, "linear",
                       conv_ya8_gamma_yaF_linear, NULL);

  /* RGB */
  babl_conversion_new (rgbF_linear, rgb8_gamma, "linear",
                       conv_rgbF_linear_rgb8_gamma, NULL);
  babl_conversion_new (rgb8_gamma, rgbF_linear, "linear",
                       conv_rgb8_gamma_rgbF_linear, NULL);

  /* RGBA */
  babl_conversion_new (rgbaF_linear, rgba8_gamma, "linear",
                       conv_rgbaF_linear_rgba8_gamma, NULL);
  babl_conversion_new (rgba8_gamma, rgbaF_linear, "linear",
                       conv_rgba8_gamma_rgbaF_linear, NULL);

#endif /* defined(USE_AVX2) */

  return 0;
}
610
611