1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "FilterProcessing.h"
8 
9 #include "SIMD.h"
10 #include "SVGTurbulenceRenderer-inl.h"
11 
12 namespace mozilla {
13 namespace gfx {
14 
15 template <typename u8x16_t>
ConvertToB8G8R8A8_SIMD(SourceSurface * aSurface)16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD(
17     SourceSurface* aSurface) {
18   IntSize size = aSurface->GetSize();
19   RefPtr<DataSourceSurface> output =
20       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
21   if (!output) {
22     return nullptr;
23   }
24 
25   RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
26   DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ);
27   DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE);
28   uint8_t* inputData = inputMap.GetData();
29   uint8_t* outputData = outputMap.GetData();
30   int32_t inputStride = inputMap.GetStride();
31   int32_t outputStride = outputMap.GetStride();
32   switch (input->GetFormat()) {
33     case SurfaceFormat::B8G8R8A8:
34       output = input;
35       break;
36     case SurfaceFormat::B8G8R8X8:
37       for (int32_t y = 0; y < size.height; y++) {
38         for (int32_t x = 0; x < size.width; x++) {
39           int32_t inputIndex = y * inputStride + 4 * x;
40           int32_t outputIndex = y * outputStride + 4 * x;
41           outputData[outputIndex + 0] = inputData[inputIndex + 0];
42           outputData[outputIndex + 1] = inputData[inputIndex + 1];
43           outputData[outputIndex + 2] = inputData[inputIndex + 2];
44           outputData[outputIndex + 3] = 255;
45         }
46       }
47       break;
48     case SurfaceFormat::R8G8B8A8:
49       for (int32_t y = 0; y < size.height; y++) {
50         for (int32_t x = 0; x < size.width; x++) {
51           int32_t inputIndex = y * inputStride + 4 * x;
52           int32_t outputIndex = y * outputStride + 4 * x;
53           outputData[outputIndex + 2] = inputData[inputIndex + 0];
54           outputData[outputIndex + 1] = inputData[inputIndex + 1];
55           outputData[outputIndex + 0] = inputData[inputIndex + 2];
56           outputData[outputIndex + 3] = inputData[inputIndex + 3];
57         }
58       }
59       break;
60     case SurfaceFormat::R8G8B8X8:
61       for (int32_t y = 0; y < size.height; y++) {
62         for (int32_t x = 0; x < size.width; x++) {
63           int32_t inputIndex = y * inputStride + 4 * x;
64           int32_t outputIndex = y * outputStride + 4 * x;
65           outputData[outputIndex + 2] = inputData[inputIndex + 0];
66           outputData[outputIndex + 1] = inputData[inputIndex + 1];
67           outputData[outputIndex + 0] = inputData[inputIndex + 2];
68           outputData[outputIndex + 3] = 255;
69         }
70       }
71       break;
72     case SurfaceFormat::A8:
73       for (int32_t y = 0; y < size.height; y++) {
74         for (int32_t x = 0; x < size.width; x += 16) {
75           int32_t inputIndex = y * inputStride + x;
76           int32_t outputIndex = y * outputStride + 4 * x;
77           u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
78           // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
79           // interleaving with 0000000000000000 twice.
80           u8x16_t zero = simd::FromZero8<u8x16_t>();
81           u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
82           u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
83           u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
84           u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
85           u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
86           u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
87           simd::Store8(&outputData[outputIndex], p1To4);
88           if ((x + 4) * 4 < outputStride) {
89             simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
90           }
91           if ((x + 8) * 4 < outputStride) {
92             simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
93           }
94           if ((x + 12) * 4 < outputStride) {
95             simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
96           }
97         }
98       }
99       break;
100     default:
101       output = nullptr;
102       break;
103   }
104   return output.forget();
105 }
106 
107 template <typename u8x16_t>
ExtractAlpha_SIMD(const IntSize & size,uint8_t * sourceData,int32_t sourceStride,uint8_t * alphaData,int32_t alphaStride)108 inline void ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData,
109                               int32_t sourceStride, uint8_t* alphaData,
110                               int32_t alphaStride) {
111   for (int32_t y = 0; y < size.height; y++) {
112     for (int32_t x = 0; x < size.width; x += 16) {
113       // Process 16 pixels at a time.
114       // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of
115       // AAAAAAAAAAAAAAAA.
116       int32_t sourceIndex = y * sourceStride + 4 * x;
117       int32_t targetIndex = y * alphaStride + x;
118 
119       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
120       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
121       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
122 
123       u8x16_t bgrabgrabgrabgra1 =
124           simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
125       if (4 * (x + 4) < sourceStride) {
126         bgrabgrabgrabgra2 =
127             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
128       }
129       if (4 * (x + 8) < sourceStride) {
130         bgrabgrabgrabgra3 =
131             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
132       }
133       if (4 * (x + 12) < sourceStride) {
134         bgrabgrabgrabgra4 =
135             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
136       }
137 
138       u8x16_t bbggrraabbggrraa1 =
139           simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
140       u8x16_t bbggrraabbggrraa2 =
141           simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
142       u8x16_t bbggrraabbggrraa3 =
143           simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
144       u8x16_t bbggrraabbggrraa4 =
145           simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
146       u8x16_t bbbbggggrrrraaaa1 =
147           simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
148       u8x16_t bbbbggggrrrraaaa2 =
149           simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
150       u8x16_t bbbbggggrrrraaaa3 =
151           simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
152       u8x16_t bbbbggggrrrraaaa4 =
153           simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
154       u8x16_t rrrrrrrraaaaaaaa1 =
155           simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
156       u8x16_t rrrrrrrraaaaaaaa2 =
157           simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
158       u8x16_t aaaaaaaaaaaaaaaa =
159           simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
160 
161       simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
162     }
163   }
164 }
165 
166 // This function calculates the result color values for four pixels, but for
167 // only two color channels - either b & r or g & a. However, the a result will
168 // not be used.
169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
171 // alpha of all four pixels (and both aaaa's are the same).
172 // blendendComponent1 and blendedComponent2 are the out parameters.
173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
BlendTwoComponentsOfFourPixels(i16x8_t source,i16x8_t sourceAlpha,i16x8_t dest,const i16x8_t & destAlpha,i32x4_t & blendedComponent1,i32x4_t & blendedComponent2)174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
175                                            i16x8_t dest,
176                                            const i16x8_t& destAlpha,
177                                            i32x4_t& blendedComponent1,
178                                            i32x4_t& blendedComponent2) {
179   i16x8_t x255 = simd::FromI16<i16x8_t>(255);
180 
181   switch (aBlendMode) {
182     case BLEND_MODE_MULTIPLY: {
183       // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) *
184       // dest);
185       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
186       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
187       i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource =
188           simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
189 
190       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
191       i16x8_t leftFactor1 = simd::InterleaveLo16(
192           twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
193       blendedComponent1 =
194           simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
195       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
196 
197       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
198       i16x8_t leftFactor2 = simd::InterleaveHi16(
199           twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
200       blendedComponent2 =
201           simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
202       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
203 
204       break;
205     }
206 
207     case BLEND_MODE_SCREEN: {
208       // val = 255 * (source + dest) + (0 - dest) * source;
209       i16x8_t sourcePlusDest = simd::Add16(source, dest);
210       i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
211 
212       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 =
213           simd::InterleaveLo16(x255, zeroMinusDest);
214       i16x8_t sourcePlusDestInterleavedWithSource1 =
215           simd::InterleaveLo16(sourcePlusDest, source);
216       blendedComponent1 =
217           simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1,
218                                    sourcePlusDestInterleavedWithSource1);
219       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
220 
221       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 =
222           simd::InterleaveHi16(x255, zeroMinusDest);
223       i16x8_t sourcePlusDestInterleavedWithSource2 =
224           simd::InterleaveHi16(sourcePlusDest, source);
225       blendedComponent2 =
226           simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2,
227                                    sourcePlusDestInterleavedWithSource2);
228       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
229 
230       break;
231     }
232 
233     case BLEND_MODE_DARKEN:
234     case BLEND_MODE_LIGHTEN: {
235       // Darken:
236       // val = min((255 - destAlpha) * source + 255                 * dest,
237       //           255               * source + (255 - sourceAlpha) * dest);
238       //
239       // Lighten:
240       // val = max((255 - destAlpha) * source + 255                 * dest,
241       //           255               * source + (255 - sourceAlpha) * dest);
242 
243       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
244       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
245 
246       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 =
247           simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
248       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 =
249           simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
250       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
251       i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(
252           twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1,
253           sourceInterleavedWithDest1);
254       i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(
255           twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1,
256           sourceInterleavedWithDest1);
257       blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN
258                               ? simd::Min32(product1_1, product1_2)
259                               : simd::Max32(product1_1, product1_2);
260       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
261 
262       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 =
263           simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
264       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 =
265           simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
266       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
267       i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(
268           twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2,
269           sourceInterleavedWithDest2);
270       i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(
271           twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2,
272           sourceInterleavedWithDest2);
273       blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN
274                               ? simd::Min32(product2_1, product2_2)
275                               : simd::Max32(product2_1, product2_2);
276       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
277 
278       break;
279     }
280   }
281 }
282 
283 // The alpha channel is subject to a different calculation than the RGB
284 // channels, and this calculation is the same for all blend modes:
285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
286 template <typename i16x8_t, typename i32x4_t>
BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,i16x8_t d_rrrraaaa1234)287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,
288                                       i16x8_t d_rrrraaaa1234) {
289   // clang-format off
290   // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
291   // appropriately. The calculation is rewritten as follows:
292   // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
293   //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
294   //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
295   //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
296   // clang-format on
297   i16x8_t zeroInterleavedWithSourceAlpha =
298       simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
299   i16x8_t fiveTenInterleavedWithDestAlpha =
300       simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
301   i16x8_t f1 =
302       simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
303   i16x8_t f2 =
304       simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
305   return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
306 }
307 
308 template <typename u8x16_t, typename i16x8_t>
UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,i16x8_t & bbbbgggg1234,i16x8_t & rrrraaaa1234)309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
310                                        i16x8_t& bbbbgggg1234,
311                                        i16x8_t& rrrraaaa1234) {
312   // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
313   i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
314   i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
315   i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
316   i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
317   bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
318   rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
319 }
320 
321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ShuffleAndPackComponents(i32x4_t bbbb1234,i32x4_t gggg1234,i32x4_t rrrr1234,const i32x4_t & aaaa1234)322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
323                                         i32x4_t rrrr1234,
324                                         const i32x4_t& aaaa1234) {
325   // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
326   i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
327   i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
328   i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
329   i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
330   i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
331   i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
332   return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
333 }
334 
335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
ApplyBlending_SIMD(const DataSourceSurface::ScopedMap & aInputMap1,const DataSourceSurface::ScopedMap & aInputMap2,const DataSourceSurface::ScopedMap & aOutputMap,const IntSize & aSize)336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1,
337                                const DataSourceSurface::ScopedMap& aInputMap2,
338                                const DataSourceSurface::ScopedMap& aOutputMap,
339                                const IntSize& aSize) {
340   uint8_t* source1Data = aInputMap1.GetData();
341   uint8_t* source2Data = aInputMap2.GetData();
342   uint8_t* targetData = aOutputMap.GetData();
343   int32_t targetStride = aOutputMap.GetStride();
344   int32_t source1Stride = aInputMap1.GetStride();
345   int32_t source2Stride = aInputMap2.GetStride();
346 
347   for (int32_t y = 0; y < aSize.height; y++) {
348     for (int32_t x = 0; x < aSize.width; x += 4) {
349       int32_t targetIndex = y * targetStride + 4 * x;
350       int32_t source1Index = y * source1Stride + 4 * x;
351       int32_t source2Index = y * source2Stride + 4 * x;
352 
353       u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
354       u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
355 
356       // The blending calculation for the RGB channels all need access to the
357       // alpha channel of their pixel, and the alpha calculation is different,
358       // so it makes sense to separate by channel.
359 
360       i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
361       i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
362       UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
363       UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
364       i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234);
365       i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234);
366 
367       // We only use blendedB, blendedG and blendedR.
368       i32x4_t blendedB, blendedG, blendedR, blendedA;
369       BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
370           s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234,
371           blendedB, blendedG);
372       BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
373           s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234,
374           blendedR, blendedA);
375 
376       // Throw away blendedA and overwrite it with the correct blended alpha.
377       blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234,
378                                                           d_rrrraaaa1234);
379 
380       u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>(
381           blendedB, blendedG, blendedR, blendedA);
382       simd::Store8(&targetData[targetIndex], result1234);
383     }
384   }
385 }
386 
387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
ApplyBlending_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2)388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
389     DataSourceSurface* aInput1, DataSourceSurface* aInput2) {
390   IntSize size = aInput1->GetSize();
391   RefPtr<DataSourceSurface> target =
392       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
393   if (!target) {
394     return nullptr;
395   }
396 
397   DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
398   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
399   if (aInput1->Equals(aInput2)) {
400     ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1,
401                                                         outputMap, size);
402   } else {
403     DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
404     ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2,
405                                                         outputMap, size);
406   }
407 
408   return target.forget();
409 }
410 
411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyBlending_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2,BlendMode aBlendMode)412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
413     DataSourceSurface* aInput1, DataSourceSurface* aInput2,
414     BlendMode aBlendMode) {
415   switch (aBlendMode) {
416     case BLEND_MODE_MULTIPLY:
417       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>(
418           aInput1, aInput2);
419     case BLEND_MODE_SCREEN:
420       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>(
421           aInput1, aInput2);
422     case BLEND_MODE_DARKEN:
423       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>(
424           aInput1, aInput2);
425     case BLEND_MODE_LIGHTEN:
426       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>(
427           aInput1, aInput2);
428     default:
429       return nullptr;
430   }
431 }
432 
433 template <MorphologyOperator Operator, typename u8x16_t>
Morph8(u8x16_t a,u8x16_t b)434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) {
435   return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b)
436                                                : simd::Max8(a, b);
437 }
438 
439 // Set every pixel to the per-component minimum or maximum of the pixels around
440 // it that are up to aRadius pixels away from it (horizontally).
441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
ApplyMorphologyHorizontal_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius)442 inline void ApplyMorphologyHorizontal_SIMD(
443     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
444     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
445   static_assert(
446       op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
447       "unexpected morphology operator");
448 
449   int32_t kernelSize = aRadius + 1 + aRadius;
450   MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
451   MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
452   int32_t completeKernelSizeForFourPixels = kernelSize + 3;
453   MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
454              completeKernelSizeForFourPixels % 4 == 2);
455 
456   // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
457   // the way we need them to be.
458 
459   IntRect sourceRect = aDestRect;
460   sourceRect.Inflate(aRadius, 0);
461 
462   for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) {
463     int32_t kernelStartX = aDestRect.X() - aRadius;
464     for (int32_t x = aDestRect.X(); x < aDestRect.XMost();
465          x += 4, kernelStartX += 4) {
466       // We process four pixels (16 color values) at a time.
467       // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
468       // source values can be read beyond that because the source is extended
469       // by aRadius pixels.
470 
471       int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
472       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
473       u8x16_t m1234 = p1234;
474 
475       for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
476         u8x16_t p5678 =
477             (kernelStartX + i < sourceRect.XMost())
478                 ? simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i])
479                 : simd::FromZero8<u8x16_t>();
480         u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
481         u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
482         m1234 = Morph8<op, u8x16_t>(m1234, p2345);
483         m1234 = Morph8<op, u8x16_t>(m1234, p3456);
484         if (i + 2 < completeKernelSizeForFourPixels) {
485           u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
486           m1234 = Morph8<op, u8x16_t>(m1234, p4567);
487           m1234 = Morph8<op, u8x16_t>(m1234, p5678);
488         }
489         p1234 = p5678;
490       }
491 
492       int32_t destIndex = y * aDestStride + 4 * x;
493       simd::Store8(&aDestData[destIndex], m1234);
494     }
495   }
496 }
497 
498 template <typename i16x8_t, typename u8x16_t>
ApplyMorphologyHorizontal_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius,MorphologyOperator aOp)499 inline void ApplyMorphologyHorizontal_SIMD(
500     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
501     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
502     MorphologyOperator aOp) {
503   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
504     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
505         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
506   } else {
507     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t,
508                                    u8x16_t>(
509         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
510   }
511 }
512 
513 // Set every pixel to the per-component minimum or maximum of the pixels around
514 // it that are up to aRadius pixels away from it (vertically).
515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
ApplyMorphologyVertical_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius)516 static void ApplyMorphologyVertical_SIMD(
517     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
518     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
519   static_assert(
520       op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
521       "unexpected morphology operator");
522 
523   int32_t startY = aDestRect.Y() - aRadius;
524   int32_t endY = aDestRect.Y() + aRadius;
525   for (int32_t y = aDestRect.Y(); y < aDestRect.YMost();
526        y++, startY++, endY++) {
527     for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) {
528       int32_t sourceIndex = startY * aSourceStride + 4 * x;
529       u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
530       sourceIndex += aSourceStride;
531       for (int32_t iy = startY + 1; iy <= endY;
532            iy++, sourceIndex += aSourceStride) {
533         u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
534         u = Morph8<op, u8x16_t>(u, u2);
535       }
536 
537       int32_t destIndex = y * aDestStride + 4 * x;
538       simd::Store8(&aDestData[destIndex], u);
539     }
540   }
541 }
542 
543 template <typename i16x8_t, typename u8x16_t>
ApplyMorphologyVertical_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius,MorphologyOperator aOp)544 inline void ApplyMorphologyVertical_SIMD(
545     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
546     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
547     MorphologyOperator aOp) {
548   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
549     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
550         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
551   } else {
552     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>(
553         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
554   }
555 }
556 
557 template <typename i32x4_t, typename i16x8_t>
ColorMatrixMultiply(i16x8_t p,i16x8_t rows_bg,i16x8_t rows_ra,const i32x4_t & bias)558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra,
559                                    const i32x4_t& bias) {
560   // int16_t p[8] == { b, g, r, a, b, g, r, a }.
561   // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
562   // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
563   // int32_t bias[4] == { _B, _G, _R, _A }.
564 
565   i32x4_t sum = bias;
566 
567   // int16_t bg[8] = { b, g, b, g, b, g, b, g };
568   i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p));
569   // int32_t prodsum_bg[4] =
570   //   { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
571   i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
572   sum = simd::Add32(sum, prodsum_bg);
573 
574   // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
575   i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p));
576   // int32_t prodsum_ra[4] =
577   //   { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
578   i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
579   sum = simd::Add32(sum, prodsum_ra);
580 
581   // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
582   return sum;
583 }
584 
585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyColorMatrix_SIMD(DataSourceSurface * aInput,const Matrix5x4 & aMatrix)586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD(
587     DataSourceSurface* aInput, const Matrix5x4& aMatrix) {
588   IntSize size = aInput->GetSize();
589   RefPtr<DataSourceSurface> target =
590       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
591   if (!target) {
592     return nullptr;
593   }
594 
595   DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ);
596   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
597 
598   uint8_t* sourceData = inputMap.GetData();
599   uint8_t* targetData = outputMap.GetData();
600   int32_t sourceStride = inputMap.GetStride();
601   int32_t targetStride = outputMap.GetStride();
602 
603   const int16_t factor = 128;
604   const Float floatElementMax = INT16_MAX / factor;  // 255
605   MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX,
606              "badly chosen float-to-int scale");
607 
608   const Float* floats = &aMatrix._11;
609 
610   ptrdiff_t componentOffsets[4] = {
611       B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G,
612       B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A};
613 
614   // We store the color matrix in rows_bgra in the following format:
615   // { bB, bG, bR, bA, gB, gG, gR, gA }.
616   // { bB, gB, bG, gG, bR, gR, bA, gA }
617   // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
618   // which works especially well for our use case.
619   int16_t rows_bgra[2][8];
620   for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
621     for (size_t colIndex = 0; colIndex < 4; colIndex++) {
622       const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
623       Float clampedFloatMatrixElement = std::min(
624           std::max(floatMatrixElement, -floatElementMax), floatElementMax);
625       int16_t scaledIntMatrixElement =
626           int16_t(clampedFloatMatrixElement * factor + 0.5);
627       int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
628       int8_t g_or_a = componentOffsets[rowIndex] % 2;
629       int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
630       rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] =
631           scaledIntMatrixElement;
632     }
633   }
634 
635   int32_t rowBias[4];
636   Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
637   for (size_t colIndex = 0; colIndex < 4; colIndex++) {
638     size_t rowIndex = 4;
639     const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
640     Float clampedFloatMatrixElement =
641         std::min(std::max(floatMatrixElement, -biasMax), biasMax);
642     int32_t scaledIntMatrixElement =
643         int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
644     rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
645   }
646 
647   i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
648       rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
649       rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
650 
651   i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
652       rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
653       rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
654 
655   i32x4_t rowsBias_v =
656       simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
657 
658   for (int32_t y = 0; y < size.height; y++) {
659     for (int32_t x = 0; x < size.width; x += 4) {
660       MOZ_ASSERT(sourceStride >= 4 * (x + 4),
661                  "need to be able to read 4 pixels at this position");
662       MOZ_ASSERT(targetStride >= 4 * (x + 4),
663                  "need to be able to write 4 pixels at this position");
664       int32_t sourceIndex = y * sourceStride + 4 * x;
665       int32_t targetIndex = y * targetStride + 4 * x;
666 
667       // We load 4 pixels, unpack them, process them 1 pixel at a time, and
668       // finally pack and store the 4 result pixels.
669 
670       u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
671 
672       // Splat needed to get each pixel twice into i16x8
673       i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
674       i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
675       i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
676       i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
677 
678       i32x4_t result_p1 =
679           ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
680       i32x4_t result_p2 =
681           ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
682       i32x4_t result_p3 =
683           ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
684       i32x4_t result_p4 =
685           ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
686 
687       static_assert(factor == 1 << 7,
688                     "Please adapt the calculation in the lines below for a "
689                     "different factor.");
690       u8x16_t result_p1234 = simd::PackAndSaturate32To8(
691           simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2),
692           simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
693       simd::Store8(&targetData[targetIndex], result_p1234);
694     }
695   }
696 
697   return target.forget();
698 }
699 
700 // source / dest: bgra bgra
701 // sourceAlpha / destAlpha: aaaa aaaa
702 // result: bgra bgra
703 template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
CompositeTwoPixels(u16x8_t source,u16x8_t sourceAlpha,u16x8_t dest,const u16x8_t & destAlpha)704 static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
705                                          u16x8_t dest,
706                                          const u16x8_t& destAlpha) {
707   u16x8_t x255 = simd::FromU16<u16x8_t>(255);
708 
709   switch (aCompositeOperator) {
710     case COMPOSITE_OPERATOR_OVER: {
711       // val = dest * (255 - sourceAlpha) + source * 255;
712       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
713 
714       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
715       u16x8_t rightFactor1 =
716           simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
717       i32x4_t result1 =
718           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
719 
720       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
721       u16x8_t rightFactor2 =
722           simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
723       i32x4_t result2 =
724           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
725 
726       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
727                                           simd::FastDivideBy255(result2));
728     }
729 
730     case COMPOSITE_OPERATOR_IN: {
731       // val = source * destAlpha;
732       return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
733     }
734 
735     case COMPOSITE_OPERATOR_OUT: {
736       // val = source * (255 - destAlpha);
737       u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
738       return simd::FastDivideBy255_16(prod);
739     }
740 
741     case COMPOSITE_OPERATOR_ATOP: {
742       // val = dest * (255 - sourceAlpha) + source * destAlpha;
743       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
744 
745       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
746       u16x8_t rightFactor1 =
747           simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
748       i32x4_t result1 =
749           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
750 
751       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
752       u16x8_t rightFactor2 =
753           simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
754       i32x4_t result2 =
755           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
756 
757       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
758                                           simd::FastDivideBy255(result2));
759     }
760 
761     case COMPOSITE_OPERATOR_XOR: {
762       // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
763       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
764       u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
765 
766       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
767       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
768                                                   twoFiftyFiveMinusDestAlpha);
769       i32x4_t result1 =
770           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
771 
772       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
773       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
774                                                   twoFiftyFiveMinusDestAlpha);
775       i32x4_t result2 =
776           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
777 
778       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
779                                           simd::FastDivideBy255(result2));
780     }
781 
782     case COMPOSITE_OPERATOR_LIGHTER: {
783       // val = dest * sourceAlpha + source * destAlpha;
784       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
785       u16x8_t rightFactor1 = simd::InterleaveLo16(sourceAlpha, destAlpha);
786       i32x4_t result1 =
787           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
788 
789       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
790       u16x8_t rightFactor2 = simd::InterleaveHi16(sourceAlpha, destAlpha);
791       i32x4_t result2 =
792           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
793 
794       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
795                                           simd::FastDivideBy255(result2));
796     }
797 
798     default:
799       return simd::FromU16<u16x8_t>(0);
800   }
801 }
802 
803 template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
ApplyComposition(DataSourceSurface * aSource,DataSourceSurface * aDest)804 static void ApplyComposition(DataSourceSurface* aSource,
805                              DataSourceSurface* aDest) {
806   IntSize size = aDest->GetSize();
807 
808   DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
809   DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);
810 
811   uint8_t* sourceData = input.GetData();
812   uint8_t* destData = output.GetData();
813   uint32_t sourceStride = input.GetStride();
814   uint32_t destStride = output.GetStride();
815 
816   for (int32_t y = 0; y < size.height; y++) {
817     for (int32_t x = 0; x < size.width; x += 4) {
818       uint32_t sourceIndex = y * sourceStride + 4 * x;
819       uint32_t destIndex = y * destStride + 4 * x;
820 
821       u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
822       u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
823 
824       u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
825       u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
826       u16x8_t sa12 = simd::Splat16<3, 3>(s12);
827       u16x8_t da12 = simd::Splat16<3, 3>(d12);
828       u16x8_t result12 =
829           CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);
830 
831       u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
832       u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
833       u16x8_t sa34 = simd::Splat16<3, 3>(s34);
834       u16x8_t da34 = simd::Splat16<3, 3>(d34);
835       u16x8_t result34 =
836           CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);
837 
838       u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
839       simd::Store8(&destData[destIndex], result1234);
840     }
841   }
842 }
843 
844 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyComposition_SIMD(DataSourceSurface * aSource,DataSourceSurface * aDest,CompositeOperator aOperator)845 static void ApplyComposition_SIMD(DataSourceSurface* aSource,
846                                   DataSourceSurface* aDest,
847                                   CompositeOperator aOperator) {
848   switch (aOperator) {
849     case COMPOSITE_OPERATOR_OVER:
850       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
851           aSource, aDest);
852       break;
853     case COMPOSITE_OPERATOR_IN:
854       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
855           aSource, aDest);
856       break;
857     case COMPOSITE_OPERATOR_OUT:
858       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
859           aSource, aDest);
860       break;
861     case COMPOSITE_OPERATOR_ATOP:
862       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
863           aSource, aDest);
864       break;
865     case COMPOSITE_OPERATOR_XOR:
866       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
867           aSource, aDest);
868       break;
869     case COMPOSITE_OPERATOR_LIGHTER:
870       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_LIGHTER>(
871           aSource, aDest);
872       break;
873     default:
874       MOZ_CRASH("GFX: Incomplete switch");
875   }
876 }
877 
// Splits an interleaved 4-bytes-per-pixel image into four single-channel
// planes: byte 0 of every pixel goes to channel0Data, byte 1 to channel1Data,
// byte 2 to channel2Data and byte 3 to channel3Data (the local variable names
// assume B, G, R, A byte order).
//
// sourceStride is the row pitch of sourceData in bytes; channelStride is the
// shared row pitch of the four channel planes.
//
// Each iteration handles 16 pixels. The first 16-byte load and all four
// 16-byte channel stores are unconditional, so the buffers must be padded
// accordingly; only the second to fourth source loads are skipped (and
// replaced by zeros) when they would start at or beyond sourceStride.
template <typename u8x16_t>
static void SeparateColorChannels_SIMD(
    const IntSize& size, uint8_t* sourceData, int32_t sourceStride,
    uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
    uint8_t* channel3Data, int32_t channelStride) {
  for (int32_t y = 0; y < size.height; y++) {
    for (int32_t x = 0; x < size.width; x += 16) {
      // Process 16 pixels at a time.
      int32_t sourceIndex = y * sourceStride + 4 * x;
      int32_t targetIndex = y * channelStride + x;

      // Chunks 2-4 default to zero in case their loads are skipped below.
      u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
      u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
      u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();

      // Each chunk holds 4 pixels (16 bytes). The first one is always loaded.
      u8x16_t bgrabgrabgrabgra1 =
          simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
      if (4 * (x + 4) < sourceStride) {
        bgrabgrabgrabgra2 =
            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
      }
      if (4 * (x + 8) < sourceStride) {
        bgrabgrabgrabgra3 =
            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
      }
      if (4 * (x + 12) < sourceStride) {
        bgrabgrabgrabgra4 =
            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
      }

      // Four rounds of byte interleaving gather the bytes of each channel;
      // the variable names spell out the intended byte layout after each
      // round (presumably InterleaveLo8/Hi8 alternate the bytes of the low /
      // high halves of their two arguments -- see SIMD.h).
      u8x16_t bbggrraabbggrraa1 =
          simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
      u8x16_t bbggrraabbggrraa2 =
          simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
      u8x16_t bbggrraabbggrraa3 =
          simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
      u8x16_t bbggrraabbggrraa4 =
          simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
      u8x16_t bbbbggggrrrraaaa1 =
          simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
      u8x16_t bbbbggggrrrraaaa2 =
          simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
      u8x16_t bbbbggggrrrraaaa3 =
          simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
      u8x16_t bbbbggggrrrraaaa4 =
          simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
      u8x16_t bbbbbbbbgggggggg1 =
          simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
      u8x16_t rrrrrrrraaaaaaaa1 =
          simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
      u8x16_t bbbbbbbbgggggggg2 =
          simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
      u8x16_t rrrrrrrraaaaaaaa2 =
          simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
      u8x16_t bbbbbbbbbbbbbbbb =
          simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
      u8x16_t gggggggggggggggg =
          simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
      u8x16_t rrrrrrrrrrrrrrrr =
          simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
      u8x16_t aaaaaaaaaaaaaaaa =
          simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);

      // Write 16 bytes to each plane, regardless of how many pixels remain.
      simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
      simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
      simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
      simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
    }
  }
}
948 
// Inverse of SeparateColorChannels_SIMD: merges four single-channel planes
// into one interleaved 4-bytes-per-pixel image. channel0Data supplies byte 0
// of every pixel, channel1Data byte 1, channel2Data byte 2 and channel3Data
// byte 3 (the local variable names assume B, G, R, A byte order).
//
// resultStride is the row pitch of resultData in bytes; channelStride is the
// shared row pitch of the four channel planes.
//
// Each iteration handles 16 pixels. The four 16-byte channel loads and the
// first 16-byte result store are unconditional, so the buffers must be padded
// accordingly; only the second to fourth result stores are skipped when they
// would start at or beyond resultStride.
template <typename u8x16_t>
static void CombineColorChannels_SIMD(
    const IntSize& size, int32_t resultStride, uint8_t* resultData,
    int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data,
    uint8_t* channel2Data, uint8_t* channel3Data) {
  for (int32_t y = 0; y < size.height; y++) {
    for (int32_t x = 0; x < size.width; x += 16) {
      // Process 16 pixels at a time.
      int32_t resultIndex = y * resultStride + 4 * x;
      int32_t channelIndex = y * channelStride + x;

      // One 16-byte vector per channel, i.e. one byte for each of 16 pixels.
      u8x16_t bbbbbbbbbbbbbbbb =
          simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
      u8x16_t gggggggggggggggg =
          simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
      u8x16_t rrrrrrrrrrrrrrrr =
          simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
      u8x16_t aaaaaaaaaaaaaaaa =
          simd::Load8<u8x16_t>(&channel3Data[channelIndex]);

      // First round: pair up channels 0/2 and 1/3 so that the second round
      // can interleave those pairs into full pixels. The variable names spell
      // out the intended byte layout after each round.
      u8x16_t brbrbrbrbrbrbrbr1 =
          simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
      u8x16_t brbrbrbrbrbrbrbr2 =
          simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
      u8x16_t gagagagagagagaga1 =
          simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
      u8x16_t gagagagagagagaga2 =
          simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);

      // Second round: each vector now holds 4 complete pixels.
      u8x16_t bgrabgrabgrabgra1 =
          simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
      u8x16_t bgrabgrabgrabgra2 =
          simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
      u8x16_t bgrabgrabgrabgra3 =
          simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
      u8x16_t bgrabgrabgrabgra4 =
          simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);

      // The first 4 pixels are always written; later chunks only if they
      // start inside the row.
      simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
      if (4 * (x + 4) < resultStride) {
        simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
      }
      if (4 * (x + 8) < resultStride) {
        simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
      }
      if (4 * (x + 12) < resultStride) {
        simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
      }
    }
  }
}
1000 
1001 template <typename i32x4_t, typename u16x8_t, typename u8x16_t>
DoPremultiplicationCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride)1002 static void DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
1003                                                 uint8_t* aTargetData,
1004                                                 int32_t aTargetStride,
1005                                                 uint8_t* aSourceData,
1006                                                 int32_t aSourceStride) {
1007   const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff,
1008                                                  0, 0, 0, 0xff, 0, 0, 0, 0xff);
1009   for (int32_t y = 0; y < aSize.height; y++) {
1010     for (int32_t x = 0; x < aSize.width; x += 4) {
1011       int32_t inputIndex = y * aSourceStride + 4 * x;
1012       int32_t targetIndex = y * aTargetStride + 4 * x;
1013 
1014       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1015       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1016       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1017 
1018       // Multiply all components with alpha.
1019       p12 = simd::Mul16(p12, simd::Splat16<3, 3>(p12));
1020       p34 = simd::Mul16(p34, simd::Splat16<3, 3>(p34));
1021 
1022       // Divide by 255 and pack.
1023       u8x16_t result = simd::PackAndSaturate16To8(
1024           simd::FastDivideBy255_16(p12), simd::FastDivideBy255_16(p34));
1025 
1026       // Get the original alpha channel value back from p1234.
1027       result = simd::Pick(alphaMask, result, p1234);
1028 
1029       simd::Store8(&aTargetData[targetIndex], result);
1030     }
1031   }
1032 }
1033 
// We use a table of precomputed factors for unpremultiplying.
// We want to compute round(r / (alpha / 255.0f)) for arbitrary values of
// r and alpha in constant time. This table of factors has the property that
// (r * sAlphaFactors[alpha] + 128) >> 8 roughly gives the result we want (with
// a maximum deviation of 1).
//
// sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
//
// The entry for alpha == 0 is 0, so fully transparent pixels unpremultiply to
// 0 instead of dividing by zero. The entry for alpha == 255 is 256, i.e. the
// identity factor.
//
// This table has been created using the python code
// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha
// in range(256))
static const uint16_t sAlphaFactors[256] = {
    0,    65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528,
    5935, 5440,  5022,  4663,  4352,  4080,  3840,  3627, 3436, 3264, 3109,
    2967, 2838,  2720,  2611,  2511,  2418,  2331,  2251, 2176, 2106, 2040,
    1978, 1920,  1865,  1813,  1764,  1718,  1674,  1632, 1592, 1554, 1518,
    1484, 1451,  1419,  1389,  1360,  1332,  1306,  1280, 1255, 1232, 1209,
    1187, 1166,  1145,  1126,  1106,  1088,  1070,  1053, 1036, 1020, 1004,
    989,  974,   960,   946,   933,   919,   907,   894,  882,  870,  859,
    848,  837,   826,   816,   806,   796,   787,   777,  768,  759,  750,
    742,  733,   725,   717,   710,   702,   694,   687,  680,  673,  666,
    659,  653,   646,   640,   634,   628,   622,   616,  610,  604,  599,
    593,  588,   583,   578,   573,   568,   563,   558,  553,  549,  544,
    540,  535,   531,   526,   522,   518,   514,   510,  506,  502,  498,
    495,  491,   487,   484,   480,   476,   473,   470,  466,  463,  460,
    457,  453,   450,   447,   444,   441,   438,   435,  432,  429,  427,
    424,  421,   418,   416,   413,   411,   408,   405,  403,  400,  398,
    396,  393,   391,   389,   386,   384,   382,   380,  377,  375,  373,
    371,  369,   367,   365,   363,   361,   359,   357,  355,  353,  351,
    349,  347,   345,   344,   342,   340,   338,   336,  335,  333,  331,
    330,  328,   326,   325,   323,   322,   320,   318,  317,  315,  314,
    312,  311,   309,   308,   306,   305,   304,   302,  301,  299,  298,
    297,  295,   294,   293,   291,   290,   289,   288,  286,  285,  284,
    283,  281,   280,   279,   278,   277,   275,   274,  273,  272,  271,
    270,  269,   268,   266,   265,   264,   263,   262,  261,  260,  259,
    258,  257,   256};
1070 
1071 template <typename u16x8_t, typename u8x16_t>
DoUnpremultiplicationCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride)1072 static void DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
1073                                                   uint8_t* aTargetData,
1074                                                   int32_t aTargetStride,
1075                                                   uint8_t* aSourceData,
1076                                                   int32_t aSourceStride) {
1077   for (int32_t y = 0; y < aSize.height; y++) {
1078     for (int32_t x = 0; x < aSize.width; x += 4) {
1079       int32_t inputIndex = y * aSourceStride + 4 * x;
1080       int32_t targetIndex = y * aTargetStride + 4 * x;
1081       union {
1082         u8x16_t p1234;
1083         uint8_t u8[4][4];
1084       };
1085       p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1086 
1087       // Prepare the alpha factors.
1088       uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1089       uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1090       uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1091       uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1092       u16x8_t aF12 =
1093           simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
1094       u16x8_t aF34 =
1095           simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
1096 
1097       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1098       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1099 
1100       // Multiply with the alpha factors, add 128 for rounding, and shift right
1101       // by 8 bits.
1102       p12 = simd::ShiftRight16<8>(
1103           simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
1104       p34 = simd::ShiftRight16<8>(
1105           simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
1106 
1107       u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
1108       simd::Store8(&aTargetData[targetIndex], result);
1109     }
1110   }
1111 }
1112 
1113 template <typename u16x8_t, typename u8x16_t>
DoOpacityCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride,Float aOpacity)1114 static void DoOpacityCalculation_SIMD(const IntSize& aSize,
1115                                       uint8_t* aTargetData,
1116                                       int32_t aTargetStride,
1117                                       uint8_t* aSourceData,
1118                                       int32_t aSourceStride, Float aOpacity) {
1119   uint8_t alphaValue = uint8_t(roundf(255.f * aOpacity));
1120   u16x8_t alphaValues =
1121       simd::FromU16<u16x8_t>(alphaValue, alphaValue, alphaValue, alphaValue,
1122                              alphaValue, alphaValue, alphaValue, alphaValue);
1123   for (int32_t y = 0; y < aSize.height; y++) {
1124     for (int32_t x = 0; x < aSize.width; x += 4) {
1125       int32_t inputIndex = y * aSourceStride + 4 * x;
1126       int32_t targetIndex = y * aTargetStride + 4 * x;
1127 
1128       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1129       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1130       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1131 
1132       // Multiply all components with alpha.
1133       p12 = simd::Mul16(p12, alphaValues);
1134       p34 = simd::Mul16(p34, alphaValues);
1135 
1136       // Divide by 255 and pack.
1137       u8x16_t result = simd::PackAndSaturate16To8(simd::ShiftRight16<8>(p12),
1138                                                   simd::ShiftRight16<8>(p34));
1139 
1140       simd::Store8(&aTargetData[targetIndex], result);
1141     }
1142   }
1143 }
1144 
1145 template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
RenderTurbulence_SIMD(const IntSize & aSize,const Point & aOffset,const Size & aBaseFrequency,int32_t aSeed,int aNumOctaves,TurbulenceType aType,bool aStitch,const Rect & aTileRect)1146 static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
1147     const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
1148     int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
1149     const Rect& aTileRect) {
1150 #define RETURN_TURBULENCE(Type, Stitch)                                    \
1151   SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
1152       aBaseFrequency, aSeed, aNumOctaves, aTileRect);                      \
1153   return renderer.Render(aSize, aOffset);
1154 
1155   switch (aType) {
1156     case TURBULENCE_TYPE_TURBULENCE: {
1157       if (aStitch) {
1158         RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
1159       }
1160       RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
1161     }
1162     case TURBULENCE_TYPE_FRACTAL_NOISE: {
1163       if (aStitch) {
1164         RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
1165       }
1166       RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
1167     }
1168   }
1169   return nullptr;
1170 #undef RETURN_TURBULENCE
1171 }
1172 
1173 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1174 template <typename i32x4_t, typename i16x8_t>
ArithmeticCombineTwoPixels(i16x8_t in1,i16x8_t in2,const i16x8_t & k1And4,const i16x8_t & k2And3)1175 static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
1176     i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
1177   // Calculate input product: inProd = (in1 * in2) / 255.
1178   i32x4_t inProd_1, inProd_2;
1179   simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
1180   i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
1181                                                simd::FastDivideBy255(inProd_2));
1182 
1183   // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
1184   i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
1185   i16x8_t inProd1AndOneTwentyEight =
1186       simd::InterleaveLo16(inProd, oneTwentyEight);
1187   i16x8_t inProd2AndOneTwentyEight =
1188       simd::InterleaveHi16(inProd, oneTwentyEight);
1189   i32x4_t inProdTimesK1PlusK4_1 =
1190       simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1191   i32x4_t inProdTimesK1PlusK4_2 =
1192       simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1193 
1194   // Calculate k2 * in1 + k3 * in2
1195   i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1196   i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1197   i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1198   i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1199 
1200   // Sum everything up and truncate the fractional part.
1201   i32x4_t result_1 =
1202       simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1203   i32x4_t result_2 =
1204       simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1205   return simd::PackAndSaturate32To16(result_1, result_2);
1206 }
1207 
1208 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyArithmeticCombine_SIMD(const DataSourceSurface::ScopedMap & aInputMap1,const DataSourceSurface::ScopedMap & aInputMap2,const DataSourceSurface::ScopedMap & aOutputMap,const IntSize & aSize,Float aK1,Float aK2,Float aK3,Float aK4)1209 static void ApplyArithmeticCombine_SIMD(
1210     const DataSourceSurface::ScopedMap& aInputMap1,
1211     const DataSourceSurface::ScopedMap& aInputMap2,
1212     const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
1213     Float aK1, Float aK2, Float aK3, Float aK4) {
1214   uint8_t* source1Data = aInputMap1.GetData();
1215   uint8_t* source2Data = aInputMap2.GetData();
1216   uint8_t* targetData = aOutputMap.GetData();
1217   uint32_t source1Stride = aInputMap1.GetStride();
1218   uint32_t source2Stride = aInputMap2.GetStride();
1219   uint32_t targetStride = aOutputMap.GetStride();
1220 
1221   // The arithmetic combine filter does the following calculation:
1222   // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1223   //
1224   // Or, with in1/2 integers between 0 and 255:
1225   // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1226   //
1227   // We want the whole calculation to happen in integer, with 16-bit factors.
1228   // So we convert our factors to fixed-point with precision 1.8.7.
1229   // K4 is premultiplied with 255, and it will be multiplied with 128 later
1230   // during the actual calculation, because premultiplying it with 255 * 128
1231   // would overflow int16.
1232 
1233   i16x8_t k1 = simd::FromI16<i16x8_t>(
1234       int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1235   i16x8_t k2 = simd::FromI16<i16x8_t>(
1236       int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1237   i16x8_t k3 = simd::FromI16<i16x8_t>(
1238       int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1239   i16x8_t k4 = simd::FromI16<i16x8_t>(
1240       int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1241 
1242   i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1243   i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1244 
1245   for (int32_t y = 0; y < aSize.height; y++) {
1246     for (int32_t x = 0; x < aSize.width; x += 4) {
1247       uint32_t source1Index = y * source1Stride + 4 * x;
1248       uint32_t source2Index = y * source2Stride + 4 * x;
1249       uint32_t targetIndex = y * targetStride + 4 * x;
1250 
1251       // Load and unpack.
1252       u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1253       u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1254       i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1255       i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1256       i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1257       i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1258 
1259       // Multiply and add.
1260       i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1261           in1_12, in2_12, k1And4, k2And3);
1262       i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1263           in1_34, in2_34, k1And4, k2And3);
1264 
1265       // Pack and store.
1266       simd::Store8(&targetData[targetIndex],
1267                    simd::PackAndSaturate16To8(result_12, result_34));
1268     }
1269   }
1270 }
1271 
1272 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyArithmeticCombine_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2,Float aK1,Float aK2,Float aK3,Float aK4)1273 static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
1274     DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
1275     Float aK2, Float aK3, Float aK4) {
1276   IntSize size = aInput1->GetSize();
1277   RefPtr<DataSourceSurface> target =
1278       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1279   if (!target) {
1280     return nullptr;
1281   }
1282 
1283   DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
1284   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
1285 
1286   if (aInput1->Equals(aInput2)) {
1287     ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1288         inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
1289   } else {
1290     DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
1291     ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1292         inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
1293   }
1294 
1295   return target.forget();
1296 }
1297 
1298 }  // namespace gfx
1299 }  // namespace mozilla
1300