/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla::gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i LoadRemainder_SSE2(const uint8_t* aSrc,
                                                    size_t aLength) {
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(
          px,
          _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void StoreRemainder_SSE2(uint8_t* aDst, size_t aLength,
                                                  const __m128i& aSrc) {
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
          _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}

// Premultiply vector of 4 pixels using splayed math.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }
  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));

  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));
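  // Together with the final ">> 8" below, this approximates division by 255
  // without an actual divide. Worked example (illustrative): for a component
  // of 0x80 and alpha 0x80, the product is 0x4000; adding 0xFF gives 0x40FF,
  // adding (0x40FF >> 8) = 0x40 gives 0x413F, and 0x413F >> 8 = 0x41, within
  // one step of the exact 0x4000 / 255 = 64.25.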

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}

// Premultiply a run of pixels: aAlignedRow bytes of whole 4-pixel chunks
// followed by aRemainder (0-3) trailing pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
                                                    uint8_t*& aDst,
                                                    int32_t aAlignedRow,
                                                    int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}

// Premultiply a row of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                               remainder);
}

template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                      int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold the remainder into the stride gap, since the chunk helper does not
  // advance the pointers past the 1-3 trailing remainder pixels.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                                 remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
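
// Illustrative usage sketch (hypothetical caller, not part of this file):
// the gap arguments are the per-row padding in bytes, i.e. the stride minus
// the bytes occupied by the row's pixels:
//
//   int32_t srcGap = srcStride - 4 * size.width;
//   int32_t dstGap = dstStride - 4 * size.width;
//   Premultiply_SSE2<false, false>(srcData, srcGap, dstData, dstGap, size);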

// Force instantiation of premultiply variants here.
template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
                                                int32_t);
template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
                                              int32_t);
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
                                             int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
                                           int32_t, IntSize);

// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than for larger alphas. We take advantage of this by
// shifting the reciprocal down by either 3 or 8 bits depending on whether
// the alpha value is less than 0x20. This is easy to then undo by multiplying
// the color component to be unpremultiplied by either 8 or 0x100,
// respectively. The 16 bit reciprocal is duplicated into both words of a
// uint32_t here to reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) \
  (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
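// Worked example of the macro above (illustrative): for alpha = 0xFF,
// UNPREMULQ_SSE2(0xFF) = 0x10001 * (0xFF0220 / (0xFF * 8))
// = 0x10001 * 0x2000 = 0x20002000. During unpremultiplication the color
// component is first multiplied by 8 (since 0xFF >= 0x20), then by the
// 16-bit reciprocal 0x2000 keeping only the high word, so a component c
// maps to (c * 8 * 0x2000) >> 16 = c, as expected for a fully opaque pixel.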
static const uint32_t sUnpremultiplyTable_SSE2[256] = {0,
                                                       UNPREMULQ_SSE2(1),
                                                       UNPREMULQ_SSE2_2(2),
                                                       UNPREMULQ_SSE2_4(4),
                                                       UNPREMULQ_SSE2_8(8),
                                                       UNPREMULQ_SSE2_16(16),
                                                       UNPREMULQ_SSE2_32(32),
                                                       UNPREMULQ_SSE2_32(64),
                                                       UNPREMULQ_SSE2_32(96),
                                                       UNPREMULQ_SSE2_32(128),
                                                       UNPREMULQ_SSE2_32(160),
                                                       UNPREMULQ_SSE2_32(192),
                                                       UNPREMULQ_SSE2_32(224)};

// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i UnpremultiplyVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // Each reciprocal is duplicated into both 16-bit words of its uint32_t
  // entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 =
      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 =
      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);

  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
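  // Illustrative trace of the bit trick above: where a < 0x20, the compare
  // yields a word of 0xFFFF, and (0xFFFF ^ 0x0008) & 0x0108 = 0x0100; where
  // a >= 0x20, the word is 0x0000, and (0x0000 ^ 0x0008) & 0x0108 = 0x0008.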
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);

  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}

template <bool aSwapRB>
static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_SSE2(const uint8_t*& aSrc,
                                                      uint8_t*& aDst,
                                                      int32_t aAlignedRow,
                                                      int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}

template <bool aSwapRB>
void UnpremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
}

template <bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of unpremultiply variants here.
template void UnpremultiplyRow_SSE2<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpremultiplyRow_SSE2<true>(const uint8_t*, uint8_t*, int32_t);
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

// Swizzle a vector of 4 pixels, swapping R and B and optionally forcing the
// alpha channel to opaque.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}
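
// Illustrative effect on one pixel (not from the original comments): a dword
// 0xAARRGGBB becomes 0xAABBGGRR, i.e. the R and B channels trade places while
// G and A are untouched (with A forced to 0xFF when aOpaqueAlpha is set).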

#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, true>(const __m128i& aSrc)
{
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template<>
MOZ_ALWAYS_INLINE __m128i
SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
{
  return aSrc;
}
#endif

template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
                                                uint8_t*& aDst,
                                                int32_t aAlignedRow,
                                                int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}

template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}

template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                  int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of swizzle variants here.
template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);
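
// Illustrative usage (hypothetical call, not part of this file): swapping the
// R and B channels of one row of 32-bit pixels, leaving alpha untouched:
//
//   SwizzleRow_SSE2<true, false>(srcRow, dstRow, widthInPixels);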

}  // namespace mozilla::gfx