// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_SSE2)

#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include <assert.h>
#include <emmintrin.h>

//------------------------------------------------------------------------------
// Predictor Transform

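// Predictor 12 helper: per 8-bit channel, computes c0 + c1 - c2 clamped to
// [0, 255] (the final _mm_packus_epi16 provides the saturation).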
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}

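// Predictor 13 helper: per 8-bit channel, computes
//   clamp(avg + (avg - c2) / 2)  with  avg = (c0 + c1) / 2,
// where the division rounds toward zero, as in the C reference code.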
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
                                                        uint32_t c1,
                                                        uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
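  // The arithmetic shift below rounds toward -infinity; when B0 > A0 the
  // difference is negative, so subtracting the all-ones compare mask
  // (i.e. adding 1) makes the halving round toward zero, matching C division.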
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  const uint32_t output = _mm_cvtsi128_si32(A5);
  return output;
}

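// Select predictor: returns 'a' when the summed per-channel distance
// |a - c| is not larger than |b - c|, and 'b' otherwise.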
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}

static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
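  // _mm_avg_epu8 computes the rounded-up average (a + b + 1) >> 1; the low
  // bit of a ^ b equals the low bit of a + b, so subtracting it yields the
  // truncating average that the predictors expect.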
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
                                             const uint32_t a1,
                                             __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128(a0);
  const __m128i A1 = _mm_cvtsi32_si128(a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}

static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
  __m128i output;
  Average2_uint32_SSE2(a0, a1, &output);
  return _mm_cvtsi128_si32(output);
}

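// Average3(a0, a1, a2) = Average2(Average2(a0, a2), a1), computed on 16-bit
// lanes so the intermediate sums cannot overflow 8 bits.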
static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}

static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}

static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(left, top[-1]);
  return pred;
}
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(left, top[0]);
  return pred;
}
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2_SSE2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
  return pred;
}
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
  return pred;
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  __m128i prev = _mm_set1_epi32(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Macro generating a batch predictor that adds the 32-bit pixels from IN to
// the input, using mod-256 arithmetic per 8-bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                  int num_pixels, uint32_t* out) {            \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));             \
    const __m128i res = _mm_add_epi8(src, other);                             \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
  }                                                                           \
}

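// For instance, GENERATE_PREDICTOR_1(2, upper[i]) below expands to a
// PredictorAdd2_SSE2() computing out[i] = in[i] + upper[i] four pixels at a
// time, deferring the remaining (num_pixels % 4) pixels to the C code.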
// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Because the averages are computed with integer arithmetic, values cannot
// be accumulated in parallel for predictors 5 to 7.
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)

#define GENERATE_PREDICTOR_2(X, IN)                                           \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels, uint32_t* out) {           \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);             \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    __m128i avg, res;                                                         \
    Average2_m128i(&T, &Tother, &avg);                                        \
    res = _mm_add_epi8(avg, src);                                             \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
  }                                                                           \
}
// Predictor8: average of TL and T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average of T and TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(OUT) do {               \
  __m128i avgLTL, avg;                    \
  Average2_m128i(&L, &TL, &avgLTL);       \
  Average2_m128i(&avgTTR, &avgLTL, &avg); \
  L = _mm_add_epi8(avg, src);             \
  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
} while (0)

#define DO_PRED10_SHIFT do {                                 \
  /* Shift the pre-computed values for the next iteration.*/ \
  avgTTR = _mm_srli_si128(avgTTR, 4);                        \
  TL = _mm_srli_si128(TL, 4);                                \
  src = _mm_srli_si128(src, 4);                              \
} while (0)

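// Each output pixel depends on the decoded pixel to its left, so the four
// lanes of a load are resolved serially: DO_PRED10 emits one pixel and
// DO_PRED10_SHIFT slides the pre-computed values down to the next lane.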
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    DO_PRED10(0);
    DO_PRED10_SHIFT;
    DO_PRED10(1);
    DO_PRED10_SHIFT;
    DO_PRED10(2);
    DO_PRED10_SHIFT;
    DO_PRED10(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT

// Predictor11: select.
#define DO_PRED11(OUT) do {                                             \
  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                        \
  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                      \
  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/    \
  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                         \
  const __m128i A = _mm_and_si128(mask, L);                             \
  const __m128i B = _mm_andnot_si128(mask, T);                          \
  const __m128i pred = _mm_or_si128(A, B); /* pred = (pb > pa)? L : T*/ \
  L = _mm_add_epi8(src, pred);                                          \
  out[i + (OUT)] = _mm_cvtsi128_si32(L);                                \
} while (0)

#define DO_PRED11_SHIFT do {                                \
  /* Shift the pre-computed value for the next iteration.*/ \
  T = _mm_srli_si128(T, 4);                                 \
  TL = _mm_srli_si128(TL, 4);                               \
  src = _mm_srli_si128(src, 4);                             \
  pa = _mm_srli_si128(pa, 4);                               \
} while (0)

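// As with predictor 10, the dependency on the left pixel forces lane-by-lane
// resolution; pa = sum |T-TL| for all four lanes is hoisted out of the
// per-lane sequence and consumed one lane at a time.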
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) do {            \
  const __m128i all = _mm_add_epi16(L, (DIFF));    \
  const __m128i alls = _mm_packus_epi16(all, all); \
  const __m128i res = _mm_add_epi8(src, alls);     \
  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
  L = _mm_unpacklo_epi8(res, zero);                \
} while (0)

#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
  /* Shift the pre-computed value for the next iteration.*/ \
  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
  src = _mm_srli_si128(src, 4);                             \
} while (0)

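// L is kept as eight 16-bit channels so that L + (T - TL) can be computed
// without overflow; _mm_packus_epi16 then clamps each channel to [0, 255].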
static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Because the average is computed with integer arithmetic, values cannot be
// accumulated in parallel for predictor 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)

//------------------------------------------------------------------------------
// Subtract-Green Transform

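// Adds the green channel to the blue and red channels of each pixel, undoing
// the encoder's subtract-green transform (all arithmetic is mod 256).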
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color Transform

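// Inverse color transform: with g and r' treated as signed 8-bit values and
// the multipliers sign-extended from 8 bits (see CST below), computes
//   r' = r + ((green_to_red_  * g ) >> 5)
//   b' = b + ((green_to_blue_ * g ) >> 5) + ((red_to_blue_ * r') >> 5)
// per pixel, mod 256 per channel.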
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

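// Converts 32 pixels per iteration: the BGRA data is first deinterleaved
// into per-channel planes, then the R/G/B planes are repacked into 96 bytes
// of 24-bit RGB.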
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
                                  uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  while (num_pixels >= 32) {
    // Load the BGRA buffers.
    __m128i in0 = _mm_loadu_si128(in + 0);
    __m128i in1 = _mm_loadu_si128(in + 1);
    __m128i in2 = _mm_loadu_si128(in + 2);
    __m128i in3 = _mm_loadu_si128(in + 3);
    __m128i in4 = _mm_loadu_si128(in + 4);
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    // At this point, in1/in5 contain red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
    _mm_storeu_si128(out + 3, in6);
    _mm_storeu_si128(out + 4, in3);
    _mm_storeu_si128(out + 5, in7);
    in += 8;
    out += 6;
    num_pixels -= 32;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

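// BGRA -> RGBA only swaps the R and B bytes: mask them out, exchange them
// with 16-bit shuffles, then OR the untouched G and A bytes back in.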
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

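// The unpack cascade below transposes 8 BGRA pixels into per-channel rows
// (b0..b7|g0..g7 and r0..r7|a0..a7) before the high nibble of each channel
// is merged into the 4444 output.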
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

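// Same transpose as RGBA4444 above; each pixel then keeps 5 bits of R and B
// and 6 bits of G, the green bits being split 3/3 across the two output
// bytes (rrrrrggg | gggbbbbb).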
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                     int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);    // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

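// Each 64-bit half of a register holds two pixels; masking out the alpha
// bytes and OR-ing in a shifted copy glues the two 3-byte triplets together.
// The stores are 8 bytes wide but only 6 bytes apart, so each store
// deliberately overwrites the 2 garbage bytes of the previous one; the
// dst + 26 <= end loop guard accounts for the final overhang.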
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);          // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);          // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst +   0), c0);
    _mm_storel_epi64((__m128i*)(dst +   6), c2);
    _mm_storel_epi64((__m128i*)(dst +  12), c4);
    _mm_storel_epi64((__m128i*)(dst +  18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[5] = Predictor5_SSE2;
  VP8LPredictors[6] = Predictor6_SSE2;
  VP8LPredictors[7] = Predictor7_SSE2;
  VP8LPredictors[8] = Predictor8_SSE2;
  VP8LPredictors[9] = Predictor9_SSE2;
  VP8LPredictors[10] = Predictor10_SSE2;
  VP8LPredictors[11] = Predictor11_SSE2;
  VP8LPredictors[12] = Predictor12_SSE2;
  VP8LPredictors[13] = Predictor13_SSE2;

  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
  VP8LTransformColorInverse = TransformColorInverse_SSE2;

  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)

#endif  // WEBP_USE_SSE2