1 // Copyright 2021 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // SSE41 variant of methods for lossless decoder
11 
12 #include "src/dsp/dsp.h"
13 
14 #if defined(WEBP_USE_SSE41)
15 
16 #include "src/dsp/common_sse41.h"
17 #include "src/dsp/lossless.h"
18 #include "src/dsp/lossless_common.h"
19 
20 //------------------------------------------------------------------------------
21 // Color-space conversion functions
22 
TransformColorInverse_SSE41(const VP8LMultipliers * const m,const uint32_t * const src,int num_pixels,uint32_t * dst)23 static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
24                                         const uint32_t* const src,
25                                         int num_pixels, uint32_t* dst) {
26 // sign-extended multiplying constants, pre-shifted by 5.
27 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
28   const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
29                                           (CST(green_to_blue_) & 0xffff));
30   const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
31 #undef CST
32   const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
33   const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
34                                       -1, 9, -1, 9, -1, 13, -1, 13);
35   const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
36                                       -1, 10, -1, -1, -1, 14, -1, -1);
37   int i;
38   for (i = 0; i + 4 <= num_pixels; i += 4) {
39     const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
40     const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
41     const __m128i C = _mm_mulhi_epi16(B, mults_rb);
42     const __m128i D = _mm_add_epi8(A, C);
43     const __m128i E = _mm_shuffle_epi8(D, perm2);
44     const __m128i F = _mm_mulhi_epi16(E, mults_b2);
45     const __m128i G = _mm_add_epi8(D, F);
46     const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
47     _mm_storeu_si128((__m128i*)&dst[i], out);
48   }
49   // Fall-back to C-version for left-overs.
50   if (i != num_pixels) {
51     VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
52   }
53 }
54 
55 //------------------------------------------------------------------------------
56 
// Converts 16 pixels (four 16-byte loads) per iteration into 48 packed bytes
// (three 16-byte stores), i.e. three output bytes per four-byte input pixel.
// The expansion site must provide:
//   in, out      : '__m128i' pointers to the current input / output position,
//   num_pixels   : the number of pixels still to convert,
//   perm0..perm3 : shuffle masks selecting the three kept bytes of each pixel
//                  and positioning each vector's 12 bytes for the blends.
// 'in', 'out' and 'num_pixels' are updated in place; fewer than 16 pixels
// remain when the loop exits.
#define ARGB_TO_RGB_SSE41 do {                                              \
  for (; num_pixels >= 16; num_pixels -= 16, in += 4, out += 3) {           \
    const __m128i px0 = _mm_shuffle_epi8(_mm_loadu_si128(in + 0), perm0);   \
    const __m128i px1 = _mm_shuffle_epi8(_mm_loadu_si128(in + 1), perm1);   \
    const __m128i px2 = _mm_shuffle_epi8(_mm_loadu_si128(in + 2), perm2);   \
    const __m128i px3 = _mm_shuffle_epi8(_mm_loadu_si128(in + 3), perm3);   \
    _mm_storeu_si128(out + 0, _mm_blend_epi16(px0, px1, 0xc0));             \
    _mm_storeu_si128(out + 1, _mm_blend_epi16(px1, px2, 0xf0));             \
    _mm_storeu_si128(out + 2, _mm_blend_epi16(px2, px3, 0xfc));             \
  }                                                                         \
} while (0)
78 
ConvertBGRAToRGB_SSE41(const uint32_t * src,int num_pixels,uint8_t * dst)79 static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
80                                    uint8_t* dst) {
81   const __m128i* in = (const __m128i*)src;
82   __m128i* out = (__m128i*)dst;
83   const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
84                                       8, 14, 13, 12, -1, -1, -1, -1);
85   const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
86   const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
87   const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
88 
89   ARGB_TO_RGB_SSE41;
90 
91   // left-overs
92   if (num_pixels > 0) {
93     VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
94   }
95 }
96 
ConvertBGRAToBGR_SSE41(const uint32_t * src,int num_pixels,uint8_t * dst)97 static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
98                                    int num_pixels, uint8_t* dst) {
99   const __m128i* in = (const __m128i*)src;
100   __m128i* out = (__m128i*)dst;
101   const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
102                                       12, 13, 14, -1, -1, -1, -1);
103   const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
104   const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
105   const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
106 
107   ARGB_TO_RGB_SSE41;
108 
109   // left-overs
110   if (num_pixels > 0) {
111     VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
112   }
113 }
114 
115 #undef ARGB_TO_RGB_SSE41
116 
117 //------------------------------------------------------------------------------
118 // Entry point
119 
120 extern void VP8LDspInitSSE41(void);
121 
VP8LDspInitSSE41(void)122 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
123   VP8LTransformColorInverse = TransformColorInverse_SSE41;
124   VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
125   VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
126 }
127 
128 #else  // !WEBP_USE_SSE41
129 
// Branch taken when WEBP_USE_SSE41 is not defined: none of the SSE4.1 code
// above is compiled, and VP8LDspInitSSE41 is supplied through the stub macro.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; presumably it
// emits an empty VP8LDspInitSSE41() -- confirm against its definition.
130 WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
131 
132 #endif  // WEBP_USE_SSE41
133