1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "FilterProcessing.h"
8 
9 #include "SIMD.h"
10 #include "SVGTurbulenceRenderer-inl.h"
11 
12 namespace mozilla {
13 namespace gfx {
14 
15 template <typename u8x16_t>
ConvertToB8G8R8A8_SIMD(SourceSurface * aSurface)16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD(
17     SourceSurface* aSurface) {
18   IntSize size = aSurface->GetSize();
19   RefPtr<DataSourceSurface> output =
20       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
21   if (!output) {
22     return nullptr;
23   }
24 
25   RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
26   DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ);
27   DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE);
28   uint8_t* inputData = inputMap.GetData();
29   uint8_t* outputData = outputMap.GetData();
30   int32_t inputStride = inputMap.GetStride();
31   int32_t outputStride = outputMap.GetStride();
32   switch (input->GetFormat()) {
33     case SurfaceFormat::B8G8R8A8:
34       output = input;
35       break;
36     case SurfaceFormat::B8G8R8X8:
37       for (int32_t y = 0; y < size.height; y++) {
38         for (int32_t x = 0; x < size.width; x++) {
39           int32_t inputIndex = y * inputStride + 4 * x;
40           int32_t outputIndex = y * outputStride + 4 * x;
41           outputData[outputIndex + 0] = inputData[inputIndex + 0];
42           outputData[outputIndex + 1] = inputData[inputIndex + 1];
43           outputData[outputIndex + 2] = inputData[inputIndex + 2];
44           outputData[outputIndex + 3] = 255;
45         }
46       }
47       break;
48     case SurfaceFormat::R8G8B8A8:
49       for (int32_t y = 0; y < size.height; y++) {
50         for (int32_t x = 0; x < size.width; x++) {
51           int32_t inputIndex = y * inputStride + 4 * x;
52           int32_t outputIndex = y * outputStride + 4 * x;
53           outputData[outputIndex + 2] = inputData[inputIndex + 0];
54           outputData[outputIndex + 1] = inputData[inputIndex + 1];
55           outputData[outputIndex + 0] = inputData[inputIndex + 2];
56           outputData[outputIndex + 3] = inputData[inputIndex + 3];
57         }
58       }
59       break;
60     case SurfaceFormat::R8G8B8X8:
61       for (int32_t y = 0; y < size.height; y++) {
62         for (int32_t x = 0; x < size.width; x++) {
63           int32_t inputIndex = y * inputStride + 4 * x;
64           int32_t outputIndex = y * outputStride + 4 * x;
65           outputData[outputIndex + 2] = inputData[inputIndex + 0];
66           outputData[outputIndex + 1] = inputData[inputIndex + 1];
67           outputData[outputIndex + 0] = inputData[inputIndex + 2];
68           outputData[outputIndex + 3] = 255;
69         }
70       }
71       break;
72     case SurfaceFormat::A8:
73       for (int32_t y = 0; y < size.height; y++) {
74         for (int32_t x = 0; x < size.width; x += 16) {
75           int32_t inputIndex = y * inputStride + x;
76           int32_t outputIndex = y * outputStride + 4 * x;
77           u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
78           // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
79           // interleaving with 0000000000000000 twice.
80           u8x16_t zero = simd::FromZero8<u8x16_t>();
81           u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
82           u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
83           u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
84           u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
85           u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
86           u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
87           simd::Store8(&outputData[outputIndex], p1To4);
88           if ((x + 4) * 4 < outputStride) {
89             simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
90           }
91           if ((x + 8) * 4 < outputStride) {
92             simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
93           }
94           if ((x + 12) * 4 < outputStride) {
95             simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
96           }
97         }
98       }
99       break;
100     default:
101       output = nullptr;
102       break;
103   }
104   return output.forget();
105 }
106 
107 template <typename u8x16_t>
ExtractAlpha_SIMD(const IntSize & size,uint8_t * sourceData,int32_t sourceStride,uint8_t * alphaData,int32_t alphaStride)108 inline void ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData,
109                               int32_t sourceStride, uint8_t* alphaData,
110                               int32_t alphaStride) {
111   for (int32_t y = 0; y < size.height; y++) {
112     for (int32_t x = 0; x < size.width; x += 16) {
113       // Process 16 pixels at a time.
114       // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of
115       // AAAAAAAAAAAAAAAA.
116       int32_t sourceIndex = y * sourceStride + 4 * x;
117       int32_t targetIndex = y * alphaStride + x;
118 
119       u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
120       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
121       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
122       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
123 
124       bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
125       if (4 * (x + 4) < sourceStride) {
126         bgrabgrabgrabgra2 =
127             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
128       }
129       if (4 * (x + 8) < sourceStride) {
130         bgrabgrabgrabgra3 =
131             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
132       }
133       if (4 * (x + 12) < sourceStride) {
134         bgrabgrabgrabgra4 =
135             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
136       }
137 
138       u8x16_t bbggrraabbggrraa1 =
139           simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
140       u8x16_t bbggrraabbggrraa2 =
141           simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
142       u8x16_t bbggrraabbggrraa3 =
143           simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
144       u8x16_t bbggrraabbggrraa4 =
145           simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
146       u8x16_t bbbbggggrrrraaaa1 =
147           simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
148       u8x16_t bbbbggggrrrraaaa2 =
149           simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
150       u8x16_t bbbbggggrrrraaaa3 =
151           simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
152       u8x16_t bbbbggggrrrraaaa4 =
153           simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
154       u8x16_t rrrrrrrraaaaaaaa1 =
155           simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
156       u8x16_t rrrrrrrraaaaaaaa2 =
157           simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
158       u8x16_t aaaaaaaaaaaaaaaa =
159           simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
160 
161       simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
162     }
163   }
164 }
165 
166 // This function calculates the result color values for four pixels, but for
167 // only two color channels - either b & r or g & a. However, the a result will
168 // not be used.
169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
171 // alpha of all four pixels (and both aaaa's are the same).
172 // blendendComponent1 and blendedComponent2 are the out parameters.
173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
BlendTwoComponentsOfFourPixels(i16x8_t source,i16x8_t sourceAlpha,i16x8_t dest,const i16x8_t & destAlpha,i32x4_t & blendedComponent1,i32x4_t & blendedComponent2)174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
175                                            i16x8_t dest,
176                                            const i16x8_t& destAlpha,
177                                            i32x4_t& blendedComponent1,
178                                            i32x4_t& blendedComponent2) {
179   i16x8_t x255 = simd::FromI16<i16x8_t>(255);
180 
181   switch (aBlendMode) {
182     case BLEND_MODE_MULTIPLY: {
183       // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) *
184       // dest);
185       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
186       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
187       i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource =
188           simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
189 
190       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
191       i16x8_t leftFactor1 = simd::InterleaveLo16(
192           twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
193       blendedComponent1 =
194           simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
195       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
196 
197       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
198       i16x8_t leftFactor2 = simd::InterleaveHi16(
199           twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
200       blendedComponent2 =
201           simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
202       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
203 
204       break;
205     }
206 
207     case BLEND_MODE_SCREEN: {
208       // val = 255 * (source + dest) + (0 - dest) * source;
209       i16x8_t sourcePlusDest = simd::Add16(source, dest);
210       i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
211 
212       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 =
213           simd::InterleaveLo16(x255, zeroMinusDest);
214       i16x8_t sourcePlusDestInterleavedWithSource1 =
215           simd::InterleaveLo16(sourcePlusDest, source);
216       blendedComponent1 =
217           simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1,
218                                    sourcePlusDestInterleavedWithSource1);
219       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
220 
221       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 =
222           simd::InterleaveHi16(x255, zeroMinusDest);
223       i16x8_t sourcePlusDestInterleavedWithSource2 =
224           simd::InterleaveHi16(sourcePlusDest, source);
225       blendedComponent2 =
226           simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2,
227                                    sourcePlusDestInterleavedWithSource2);
228       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
229 
230       break;
231     }
232 
233     case BLEND_MODE_DARKEN:
234     case BLEND_MODE_LIGHTEN: {
235       // Darken:
236       // val = min((255 - destAlpha) * source + 255                 * dest,
237       //           255               * source + (255 - sourceAlpha) * dest);
238       //
239       // Lighten:
240       // val = max((255 - destAlpha) * source + 255                 * dest,
241       //           255               * source + (255 - sourceAlpha) * dest);
242 
243       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
244       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
245 
246       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 =
247           simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
248       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 =
249           simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
250       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
251       i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(
252           twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1,
253           sourceInterleavedWithDest1);
254       i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(
255           twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1,
256           sourceInterleavedWithDest1);
257       blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN
258                               ? simd::Min32(product1_1, product1_2)
259                               : simd::Max32(product1_1, product1_2);
260       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
261 
262       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 =
263           simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
264       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 =
265           simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
266       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
267       i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(
268           twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2,
269           sourceInterleavedWithDest2);
270       i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(
271           twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2,
272           sourceInterleavedWithDest2);
273       blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN
274                               ? simd::Min32(product2_1, product2_2)
275                               : simd::Max32(product2_1, product2_2);
276       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
277 
278       break;
279     }
280   }
281 }
282 
283 // The alpha channel is subject to a different calculation than the RGB
284 // channels, and this calculation is the same for all blend modes:
285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
286 template <typename i16x8_t, typename i32x4_t>
BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,i16x8_t d_rrrraaaa1234)287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,
288                                       i16x8_t d_rrrraaaa1234) {
289   // clang-format off
290   // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
291   // appropriately. The calculation is rewritten as follows:
292   // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
293   //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
294   //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
295   //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
296   // clang-format on
297   i16x8_t zeroInterleavedWithSourceAlpha =
298       simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
299   i16x8_t fiveTenInterleavedWithDestAlpha =
300       simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
301   i16x8_t f1 =
302       simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
303   i16x8_t f2 =
304       simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
305   return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
306 }
307 
308 template <typename u8x16_t, typename i16x8_t>
UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,i16x8_t & bbbbgggg1234,i16x8_t & rrrraaaa1234)309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
310                                        i16x8_t& bbbbgggg1234,
311                                        i16x8_t& rrrraaaa1234) {
312   // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
313   i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
314   i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
315   i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
316   i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
317   bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
318   rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
319 }
320 
321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ShuffleAndPackComponents(i32x4_t bbbb1234,i32x4_t gggg1234,i32x4_t rrrr1234,const i32x4_t & aaaa1234)322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
323                                         i32x4_t rrrr1234,
324                                         const i32x4_t& aaaa1234) {
325   // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
326   i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
327   i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
328   i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
329   i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
330   i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
331   i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
332   return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
333 }
334 
335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
ApplyBlending_SIMD(const DataSourceSurface::ScopedMap & aInputMap1,const DataSourceSurface::ScopedMap & aInputMap2,const DataSourceSurface::ScopedMap & aOutputMap,const IntSize & aSize)336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1,
337                                const DataSourceSurface::ScopedMap& aInputMap2,
338                                const DataSourceSurface::ScopedMap& aOutputMap,
339                                const IntSize& aSize) {
340   uint8_t* source1Data = aInputMap1.GetData();
341   uint8_t* source2Data = aInputMap2.GetData();
342   uint8_t* targetData = aOutputMap.GetData();
343   int32_t targetStride = aOutputMap.GetStride();
344   int32_t source1Stride = aInputMap1.GetStride();
345   int32_t source2Stride = aInputMap2.GetStride();
346 
347   for (int32_t y = 0; y < aSize.height; y++) {
348     for (int32_t x = 0; x < aSize.width; x += 4) {
349       int32_t targetIndex = y * targetStride + 4 * x;
350       int32_t source1Index = y * source1Stride + 4 * x;
351       int32_t source2Index = y * source2Stride + 4 * x;
352 
353       u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
354       u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
355 
356       // The blending calculation for the RGB channels all need access to the
357       // alpha channel of their pixel, and the alpha calculation is different,
358       // so it makes sense to separate by channel.
359 
360       i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
361       i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
362       UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
363       UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
364       i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234);
365       i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234);
366 
367       // We only use blendedB, blendedG and blendedR.
368       i32x4_t blendedB, blendedG, blendedR, blendedA;
369       BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
370           s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234,
371           blendedB, blendedG);
372       BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
373           s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234,
374           blendedR, blendedA);
375 
376       // Throw away blendedA and overwrite it with the correct blended alpha.
377       blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234,
378                                                           d_rrrraaaa1234);
379 
380       u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>(
381           blendedB, blendedG, blendedR, blendedA);
382       simd::Store8(&targetData[targetIndex], result1234);
383     }
384   }
385 }
386 
387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
ApplyBlending_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2)388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
389     DataSourceSurface* aInput1, DataSourceSurface* aInput2) {
390   IntSize size = aInput1->GetSize();
391   RefPtr<DataSourceSurface> target =
392       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
393   if (!target) {
394     return nullptr;
395   }
396 
397   DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
398   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
399   if (aInput1->Equals(aInput2)) {
400     ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1,
401                                                         outputMap, size);
402   } else {
403     DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
404     ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2,
405                                                         outputMap, size);
406   }
407 
408   return target.forget();
409 }
410 
411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyBlending_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2,BlendMode aBlendMode)412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
413     DataSourceSurface* aInput1, DataSourceSurface* aInput2,
414     BlendMode aBlendMode) {
415   switch (aBlendMode) {
416     case BLEND_MODE_MULTIPLY:
417       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>(
418           aInput1, aInput2);
419     case BLEND_MODE_SCREEN:
420       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>(
421           aInput1, aInput2);
422     case BLEND_MODE_DARKEN:
423       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>(
424           aInput1, aInput2);
425     case BLEND_MODE_LIGHTEN:
426       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>(
427           aInput1, aInput2);
428     default:
429       return nullptr;
430   }
431 }
432 
433 template <MorphologyOperator Operator, typename u8x16_t>
Morph8(u8x16_t a,u8x16_t b)434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) {
435   return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b)
436                                                : simd::Max8(a, b);
437 }
438 
439 // Set every pixel to the per-component minimum or maximum of the pixels around
440 // it that are up to aRadius pixels away from it (horizontally).
441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
ApplyMorphologyHorizontal_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius)442 inline void ApplyMorphologyHorizontal_SIMD(
443     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
444     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
445   static_assert(
446       op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
447       "unexpected morphology operator");
448 
449   int32_t kernelSize = aRadius + 1 + aRadius;
450   MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
451   MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
452   int32_t completeKernelSizeForFourPixels = kernelSize + 3;
453   MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
454              completeKernelSizeForFourPixels % 4 == 2);
455 
456   // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
457   // the way we need them to be.
458 
459   IntRect sourceRect = aDestRect;
460   sourceRect.Inflate(aRadius, 0);
461 
462   for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) {
463     int32_t kernelStartX = aDestRect.X() - aRadius;
464     for (int32_t x = aDestRect.X(); x < aDestRect.XMost();
465          x += 4, kernelStartX += 4) {
466       // We process four pixels (16 color values) at a time.
467       // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
468       // source values can be read beyond that because the source is extended
469       // by aRadius pixels.
470 
471       int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
472       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
473       u8x16_t m1234 = p1234;
474 
475       for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
476         u8x16_t p5678 =
477             (kernelStartX + i < sourceRect.XMost())
478                 ? simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i])
479                 : simd::FromZero8<u8x16_t>();
480         u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
481         u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
482         m1234 = Morph8<op, u8x16_t>(m1234, p2345);
483         m1234 = Morph8<op, u8x16_t>(m1234, p3456);
484         if (i + 2 < completeKernelSizeForFourPixels) {
485           u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
486           m1234 = Morph8<op, u8x16_t>(m1234, p4567);
487           m1234 = Morph8<op, u8x16_t>(m1234, p5678);
488         }
489         p1234 = p5678;
490       }
491 
492       int32_t destIndex = y * aDestStride + 4 * x;
493       simd::Store8(&aDestData[destIndex], m1234);
494     }
495   }
496 }
497 
498 template <typename i16x8_t, typename u8x16_t>
ApplyMorphologyHorizontal_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius,MorphologyOperator aOp)499 inline void ApplyMorphologyHorizontal_SIMD(
500     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
501     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
502     MorphologyOperator aOp) {
503   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
504     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
505         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
506   } else {
507     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t,
508                                    u8x16_t>(
509         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
510   }
511 }
512 
513 // Set every pixel to the per-component minimum or maximum of the pixels around
514 // it that are up to aRadius pixels away from it (vertically).
515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
ApplyMorphologyVertical_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius)516 static void ApplyMorphologyVertical_SIMD(
517     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
518     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
519   static_assert(
520       op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
521       "unexpected morphology operator");
522 
523   int32_t startY = aDestRect.Y() - aRadius;
524   int32_t endY = aDestRect.Y() + aRadius;
525   for (int32_t y = aDestRect.Y(); y < aDestRect.YMost();
526        y++, startY++, endY++) {
527     for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) {
528       int32_t sourceIndex = startY * aSourceStride + 4 * x;
529       u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
530       sourceIndex += aSourceStride;
531       for (int32_t iy = startY + 1; iy <= endY;
532            iy++, sourceIndex += aSourceStride) {
533         u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
534         u = Morph8<op, u8x16_t>(u, u2);
535       }
536 
537       int32_t destIndex = y * aDestStride + 4 * x;
538       simd::Store8(&aDestData[destIndex], u);
539     }
540   }
541 }
542 
543 template <typename i16x8_t, typename u8x16_t>
ApplyMorphologyVertical_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius,MorphologyOperator aOp)544 inline void ApplyMorphologyVertical_SIMD(
545     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
546     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
547     MorphologyOperator aOp) {
548   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
549     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
550         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
551   } else {
552     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>(
553         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
554   }
555 }
556 
557 template <typename i32x4_t, typename i16x8_t>
ColorMatrixMultiply(i16x8_t p,i16x8_t rows_bg,i16x8_t rows_ra,const i32x4_t & bias)558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra,
559                                    const i32x4_t& bias) {
560   // int16_t p[8] == { b, g, r, a, b, g, r, a }.
561   // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
562   // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
563   // int32_t bias[4] == { _B, _G, _R, _A }.
564 
565   i32x4_t sum = bias;
566 
567   // int16_t bg[8] = { b, g, b, g, b, g, b, g };
568   i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p));
569   // int32_t prodsum_bg[4] =
570   //   { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
571   i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
572   sum = simd::Add32(sum, prodsum_bg);
573 
574   // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
575   i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p));
576   // int32_t prodsum_ra[4] =
577   //   { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
578   i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
579   sum = simd::Add32(sum, prodsum_ra);
580 
581   // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
582   return sum;
583 }
584 
585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyColorMatrix_SIMD(DataSourceSurface * aInput,const Matrix5x4 & aMatrix)586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD(
587     DataSourceSurface* aInput, const Matrix5x4& aMatrix) {
588   IntSize size = aInput->GetSize();
589   RefPtr<DataSourceSurface> target =
590       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
591   if (!target) {
592     return nullptr;
593   }
594 
595   DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ);
596   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
597 
598   uint8_t* sourceData = inputMap.GetData();
599   uint8_t* targetData = outputMap.GetData();
600   int32_t sourceStride = inputMap.GetStride();
601   int32_t targetStride = outputMap.GetStride();
602 
603   const int16_t factor = 128;
604   const Float floatElementMax = INT16_MAX / factor;  // 255
605   MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX,
606              "badly chosen float-to-int scale");
607 
608   const Float* floats = &aMatrix._11;
609 
610   ptrdiff_t componentOffsets[4] = {
611       B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G,
612       B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A};
613 
614   // We store the color matrix in rows_bgra in the following format:
615   // { bB, bG, bR, bA, gB, gG, gR, gA }.
616   // { bB, gB, bG, gG, bR, gR, bA, gA }
617   // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
618   // which works especially well for our use case.
619   int16_t rows_bgra[2][8];
620   for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
621     for (size_t colIndex = 0; colIndex < 4; colIndex++) {
622       const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
623       Float clampedFloatMatrixElement = std::min(
624           std::max(floatMatrixElement, -floatElementMax), floatElementMax);
625       int16_t scaledIntMatrixElement =
626           int16_t(clampedFloatMatrixElement * factor + 0.5);
627       int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
628       int8_t g_or_a = componentOffsets[rowIndex] % 2;
629       int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
630       rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] =
631           scaledIntMatrixElement;
632     }
633   }
634 
635   int32_t rowBias[4];
636   Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
637   for (size_t colIndex = 0; colIndex < 4; colIndex++) {
638     size_t rowIndex = 4;
639     const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
640     Float clampedFloatMatrixElement =
641         std::min(std::max(floatMatrixElement, -biasMax), biasMax);
642     int32_t scaledIntMatrixElement =
643         int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
644     rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
645   }
646 
647   i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
648       rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
649       rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
650 
651   i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
652       rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
653       rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
654 
655   i32x4_t rowsBias_v =
656       simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
657 
658   for (int32_t y = 0; y < size.height; y++) {
659     for (int32_t x = 0; x < size.width; x += 4) {
660       MOZ_ASSERT(sourceStride >= 4 * (x + 4),
661                  "need to be able to read 4 pixels at this position");
662       MOZ_ASSERT(targetStride >= 4 * (x + 4),
663                  "need to be able to write 4 pixels at this position");
664       int32_t sourceIndex = y * sourceStride + 4 * x;
665       int32_t targetIndex = y * targetStride + 4 * x;
666 
667       // We load 4 pixels, unpack them, process them 1 pixel at a time, and
668       // finally pack and store the 4 result pixels.
669 
670       u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
671 
672       // Splat needed to get each pixel twice into i16x8
673       i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
674       i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
675       i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
676       i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
677 
678       i32x4_t result_p1 =
679           ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
680       i32x4_t result_p2 =
681           ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
682       i32x4_t result_p3 =
683           ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
684       i32x4_t result_p4 =
685           ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
686 
687       static_assert(factor == 1 << 7,
688                     "Please adapt the calculation in the lines below for a "
689                     "different factor.");
690       u8x16_t result_p1234 = simd::PackAndSaturate32To8(
691           simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2),
692           simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
693       simd::Store8(&targetData[targetIndex], result_p1234);
694     }
695   }
696 
697   return target.forget();
698 }
699 
700 // source / dest: bgra bgra
701 // sourceAlpha / destAlpha: aaaa aaaa
702 // result: bgra bgra
703 template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
CompositeTwoPixels(u16x8_t source,u16x8_t sourceAlpha,u16x8_t dest,const u16x8_t & destAlpha)704 static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
705                                          u16x8_t dest,
706                                          const u16x8_t& destAlpha) {
707   u16x8_t x255 = simd::FromU16<u16x8_t>(255);
708 
709   switch (aCompositeOperator) {
710     case COMPOSITE_OPERATOR_OVER: {
711       // val = dest * (255 - sourceAlpha) + source * 255;
712       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
713 
714       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
715       u16x8_t rightFactor1 =
716           simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
717       i32x4_t result1 =
718           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
719 
720       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
721       u16x8_t rightFactor2 =
722           simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
723       i32x4_t result2 =
724           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
725 
726       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
727                                           simd::FastDivideBy255(result2));
728     }
729 
730     case COMPOSITE_OPERATOR_IN: {
731       // val = source * destAlpha;
732       return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
733     }
734 
735     case COMPOSITE_OPERATOR_OUT: {
736       // val = source * (255 - destAlpha);
737       u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
738       return simd::FastDivideBy255_16(prod);
739     }
740 
741     case COMPOSITE_OPERATOR_ATOP: {
742       // val = dest * (255 - sourceAlpha) + source * destAlpha;
743       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
744 
745       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
746       u16x8_t rightFactor1 =
747           simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
748       i32x4_t result1 =
749           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
750 
751       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
752       u16x8_t rightFactor2 =
753           simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
754       i32x4_t result2 =
755           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
756 
757       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
758                                           simd::FastDivideBy255(result2));
759     }
760 
761     case COMPOSITE_OPERATOR_XOR: {
762       // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
763       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
764       u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
765 
766       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
767       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
768                                                   twoFiftyFiveMinusDestAlpha);
769       i32x4_t result1 =
770           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
771 
772       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
773       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
774                                                   twoFiftyFiveMinusDestAlpha);
775       i32x4_t result2 =
776           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
777 
778       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
779                                           simd::FastDivideBy255(result2));
780     }
781 
782     default:
783       return simd::FromU16<u16x8_t>(0);
784   }
785 }
786 
787 template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
ApplyComposition(DataSourceSurface * aSource,DataSourceSurface * aDest)788 static void ApplyComposition(DataSourceSurface* aSource,
789                              DataSourceSurface* aDest) {
790   IntSize size = aDest->GetSize();
791 
792   DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
793   DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);
794 
795   uint8_t* sourceData = input.GetData();
796   uint8_t* destData = output.GetData();
797   uint32_t sourceStride = input.GetStride();
798   uint32_t destStride = output.GetStride();
799 
800   for (int32_t y = 0; y < size.height; y++) {
801     for (int32_t x = 0; x < size.width; x += 4) {
802       uint32_t sourceIndex = y * sourceStride + 4 * x;
803       uint32_t destIndex = y * destStride + 4 * x;
804 
805       u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
806       u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
807 
808       u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
809       u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
810       u16x8_t sa12 = simd::Splat16<3, 3>(s12);
811       u16x8_t da12 = simd::Splat16<3, 3>(d12);
812       u16x8_t result12 =
813           CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);
814 
815       u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
816       u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
817       u16x8_t sa34 = simd::Splat16<3, 3>(s34);
818       u16x8_t da34 = simd::Splat16<3, 3>(d34);
819       u16x8_t result34 =
820           CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);
821 
822       u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
823       simd::Store8(&destData[destIndex], result1234);
824     }
825   }
826 }
827 
828 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyComposition_SIMD(DataSourceSurface * aSource,DataSourceSurface * aDest,CompositeOperator aOperator)829 static void ApplyComposition_SIMD(DataSourceSurface* aSource,
830                                   DataSourceSurface* aDest,
831                                   CompositeOperator aOperator) {
832   switch (aOperator) {
833     case COMPOSITE_OPERATOR_OVER:
834       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
835           aSource, aDest);
836       break;
837     case COMPOSITE_OPERATOR_IN:
838       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
839           aSource, aDest);
840       break;
841     case COMPOSITE_OPERATOR_OUT:
842       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
843           aSource, aDest);
844       break;
845     case COMPOSITE_OPERATOR_ATOP:
846       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
847           aSource, aDest);
848       break;
849     case COMPOSITE_OPERATOR_XOR:
850       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
851           aSource, aDest);
852       break;
853     default:
854       MOZ_CRASH("GFX: Incomplete switch");
855   }
856 }
857 
858 template <typename u8x16_t>
SeparateColorChannels_SIMD(const IntSize & size,uint8_t * sourceData,int32_t sourceStride,uint8_t * channel0Data,uint8_t * channel1Data,uint8_t * channel2Data,uint8_t * channel3Data,int32_t channelStride)859 static void SeparateColorChannels_SIMD(
860     const IntSize& size, uint8_t* sourceData, int32_t sourceStride,
861     uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
862     uint8_t* channel3Data, int32_t channelStride) {
863   for (int32_t y = 0; y < size.height; y++) {
864     for (int32_t x = 0; x < size.width; x += 16) {
865       // Process 16 pixels at a time.
866       int32_t sourceIndex = y * sourceStride + 4 * x;
867       int32_t targetIndex = y * channelStride + x;
868 
869       u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
870       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
871       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
872       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
873 
874       bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
875       if (4 * (x + 4) < sourceStride) {
876         bgrabgrabgrabgra2 =
877             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
878       }
879       if (4 * (x + 8) < sourceStride) {
880         bgrabgrabgrabgra3 =
881             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
882       }
883       if (4 * (x + 12) < sourceStride) {
884         bgrabgrabgrabgra4 =
885             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
886       }
887 
888       u8x16_t bbggrraabbggrraa1 =
889           simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
890       u8x16_t bbggrraabbggrraa2 =
891           simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
892       u8x16_t bbggrraabbggrraa3 =
893           simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
894       u8x16_t bbggrraabbggrraa4 =
895           simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
896       u8x16_t bbbbggggrrrraaaa1 =
897           simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
898       u8x16_t bbbbggggrrrraaaa2 =
899           simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
900       u8x16_t bbbbggggrrrraaaa3 =
901           simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
902       u8x16_t bbbbggggrrrraaaa4 =
903           simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
904       u8x16_t bbbbbbbbgggggggg1 =
905           simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
906       u8x16_t rrrrrrrraaaaaaaa1 =
907           simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
908       u8x16_t bbbbbbbbgggggggg2 =
909           simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
910       u8x16_t rrrrrrrraaaaaaaa2 =
911           simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
912       u8x16_t bbbbbbbbbbbbbbbb =
913           simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
914       u8x16_t gggggggggggggggg =
915           simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
916       u8x16_t rrrrrrrrrrrrrrrr =
917           simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
918       u8x16_t aaaaaaaaaaaaaaaa =
919           simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
920 
921       simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
922       simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
923       simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
924       simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
925     }
926   }
927 }
928 
929 template <typename u8x16_t>
CombineColorChannels_SIMD(const IntSize & size,int32_t resultStride,uint8_t * resultData,int32_t channelStride,uint8_t * channel0Data,uint8_t * channel1Data,uint8_t * channel2Data,uint8_t * channel3Data)930 static void CombineColorChannels_SIMD(
931     const IntSize& size, int32_t resultStride, uint8_t* resultData,
932     int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data,
933     uint8_t* channel2Data, uint8_t* channel3Data) {
934   for (int32_t y = 0; y < size.height; y++) {
935     for (int32_t x = 0; x < size.width; x += 16) {
936       // Process 16 pixels at a time.
937       int32_t resultIndex = y * resultStride + 4 * x;
938       int32_t channelIndex = y * channelStride + x;
939 
940       u8x16_t bbbbbbbbbbbbbbbb =
941           simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
942       u8x16_t gggggggggggggggg =
943           simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
944       u8x16_t rrrrrrrrrrrrrrrr =
945           simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
946       u8x16_t aaaaaaaaaaaaaaaa =
947           simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
948 
949       u8x16_t brbrbrbrbrbrbrbr1 =
950           simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
951       u8x16_t brbrbrbrbrbrbrbr2 =
952           simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
953       u8x16_t gagagagagagagaga1 =
954           simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
955       u8x16_t gagagagagagagaga2 =
956           simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
957 
958       u8x16_t bgrabgrabgrabgra1 =
959           simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
960       u8x16_t bgrabgrabgrabgra2 =
961           simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
962       u8x16_t bgrabgrabgrabgra3 =
963           simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
964       u8x16_t bgrabgrabgrabgra4 =
965           simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
966 
967       simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
968       if (4 * (x + 4) < resultStride) {
969         simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
970       }
971       if (4 * (x + 8) < resultStride) {
972         simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
973       }
974       if (4 * (x + 12) < resultStride) {
975         simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
976       }
977     }
978   }
979 }
980 
981 template <typename i32x4_t, typename u16x8_t, typename u8x16_t>
DoPremultiplicationCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride)982 static void DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
983                                                 uint8_t* aTargetData,
984                                                 int32_t aTargetStride,
985                                                 uint8_t* aSourceData,
986                                                 int32_t aSourceStride) {
987   const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff,
988                                                  0, 0, 0, 0xff, 0, 0, 0, 0xff);
989   for (int32_t y = 0; y < aSize.height; y++) {
990     for (int32_t x = 0; x < aSize.width; x += 4) {
991       int32_t inputIndex = y * aSourceStride + 4 * x;
992       int32_t targetIndex = y * aTargetStride + 4 * x;
993 
994       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
995       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
996       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
997 
998       // Multiply all components with alpha.
999       p12 = simd::Mul16(p12, simd::Splat16<3, 3>(p12));
1000       p34 = simd::Mul16(p34, simd::Splat16<3, 3>(p34));
1001 
1002       // Divide by 255 and pack.
1003       u8x16_t result = simd::PackAndSaturate16To8(
1004           simd::FastDivideBy255_16(p12), simd::FastDivideBy255_16(p34));
1005 
1006       // Get the original alpha channel value back from p1234.
1007       result = simd::Pick(alphaMask, result, p1234);
1008 
1009       simd::Store8(&aTargetData[targetIndex], result);
1010     }
1011   }
1012 }
1013 
// Lookup table of fixed-point reciprocals used for unpremultiplying.
//
// Unpremultiplying means computing round(r / (alpha / 255.0f)) for arbitrary
// r and alpha. To do that in constant time without a division, the table
// stores, for every alpha,
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
//
// (and 0 for alpha == 0), so that (r * sAlphaFactors[alpha] + 128) >> 8 gives
// roughly the wanted result, with a maximum deviation of 1.
//
// The values were generated with the python expression
// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha
// in range(256))
//
// Layout below: 8 entries per row, i.e. row k starts at alpha == 8 * k.
static const uint16_t sAlphaFactors[256] = {
    0,    65280, 32640, 21760, 16320, 13056, 10880, 9326,
    8160, 7253,  6528,  5935,  5440,  5022,  4663,  4352,
    4080, 3840,  3627,  3436,  3264,  3109,  2967,  2838,
    2720, 2611,  2511,  2418,  2331,  2251,  2176,  2106,
    2040, 1978,  1920,  1865,  1813,  1764,  1718,  1674,
    1632, 1592,  1554,  1518,  1484,  1451,  1419,  1389,
    1360, 1332,  1306,  1280,  1255,  1232,  1209,  1187,
    1166, 1145,  1126,  1106,  1088,  1070,  1053,  1036,
    1020, 1004,  989,   974,   960,   946,   933,   919,
    907,  894,   882,   870,   859,   848,   837,   826,
    816,  806,   796,   787,   777,   768,   759,   750,
    742,  733,   725,   717,   710,   702,   694,   687,
    680,  673,   666,   659,   653,   646,   640,   634,
    628,  622,   616,   610,   604,   599,   593,   588,
    583,  578,   573,   568,   563,   558,   553,   549,
    544,  540,   535,   531,   526,   522,   518,   514,
    510,  506,   502,   498,   495,   491,   487,   484,
    480,  476,   473,   470,   466,   463,   460,   457,
    453,  450,   447,   444,   441,   438,   435,   432,
    429,  427,   424,   421,   418,   416,   413,   411,
    408,  405,   403,   400,   398,   396,   393,   391,
    389,  386,   384,   382,   380,   377,   375,   373,
    371,  369,   367,   365,   363,   361,   359,   357,
    355,  353,   351,   349,   347,   345,   344,   342,
    340,  338,   336,   335,   333,   331,   330,   328,
    326,  325,   323,   322,   320,   318,   317,   315,
    314,  312,   311,   309,   308,   306,   305,   304,
    302,  301,   299,   298,   297,   295,   294,   293,
    291,  290,   289,   288,   286,   285,   284,   283,
    281,  280,   279,   278,   277,   275,   274,   273,
    272,  271,   270,   269,   268,   266,   265,   264,
    263,  262,   261,   260,   259,   258,   257,   256};
1050 
1051 template <typename u16x8_t, typename u8x16_t>
DoUnpremultiplicationCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride)1052 static void DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
1053                                                   uint8_t* aTargetData,
1054                                                   int32_t aTargetStride,
1055                                                   uint8_t* aSourceData,
1056                                                   int32_t aSourceStride) {
1057   for (int32_t y = 0; y < aSize.height; y++) {
1058     for (int32_t x = 0; x < aSize.width; x += 4) {
1059       int32_t inputIndex = y * aSourceStride + 4 * x;
1060       int32_t targetIndex = y * aTargetStride + 4 * x;
1061       union {
1062         u8x16_t p1234;
1063         uint8_t u8[4][4];
1064       };
1065       p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1066 
1067       // Prepare the alpha factors.
1068       uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1069       uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1070       uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1071       uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1072       u16x8_t aF12 =
1073           simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
1074       u16x8_t aF34 =
1075           simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
1076 
1077       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1078       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1079 
1080       // Multiply with the alpha factors, add 128 for rounding, and shift right
1081       // by 8 bits.
1082       p12 = simd::ShiftRight16<8>(
1083           simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
1084       p34 = simd::ShiftRight16<8>(
1085           simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
1086 
1087       u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
1088       simd::Store8(&aTargetData[targetIndex], result);
1089     }
1090   }
1091 }
1092 
1093 template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
RenderTurbulence_SIMD(const IntSize & aSize,const Point & aOffset,const Size & aBaseFrequency,int32_t aSeed,int aNumOctaves,TurbulenceType aType,bool aStitch,const Rect & aTileRect)1094 static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
1095     const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
1096     int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
1097     const Rect& aTileRect) {
1098 #define RETURN_TURBULENCE(Type, Stitch)                                    \
1099   SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
1100       aBaseFrequency, aSeed, aNumOctaves, aTileRect);                      \
1101   return renderer.Render(aSize, aOffset);
1102 
1103   switch (aType) {
1104     case TURBULENCE_TYPE_TURBULENCE: {
1105       if (aStitch) {
1106         RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
1107       }
1108       RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
1109     }
1110     case TURBULENCE_TYPE_FRACTAL_NOISE: {
1111       if (aStitch) {
1112         RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
1113       }
1114       RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
1115     }
1116   }
1117   return nullptr;
1118 #undef RETURN_TURBULENCE
1119 }
1120 
1121 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1122 template <typename i32x4_t, typename i16x8_t>
ArithmeticCombineTwoPixels(i16x8_t in1,i16x8_t in2,const i16x8_t & k1And4,const i16x8_t & k2And3)1123 static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
1124     i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
1125   // Calculate input product: inProd = (in1 * in2) / 255.
1126   i32x4_t inProd_1, inProd_2;
1127   simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
1128   i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
1129                                                simd::FastDivideBy255(inProd_2));
1130 
1131   // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
1132   i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
1133   i16x8_t inProd1AndOneTwentyEight =
1134       simd::InterleaveLo16(inProd, oneTwentyEight);
1135   i16x8_t inProd2AndOneTwentyEight =
1136       simd::InterleaveHi16(inProd, oneTwentyEight);
1137   i32x4_t inProdTimesK1PlusK4_1 =
1138       simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1139   i32x4_t inProdTimesK1PlusK4_2 =
1140       simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1141 
1142   // Calculate k2 * in1 + k3 * in2
1143   i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1144   i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1145   i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1146   i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1147 
1148   // Sum everything up and truncate the fractional part.
1149   i32x4_t result_1 =
1150       simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1151   i32x4_t result_2 =
1152       simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1153   return simd::PackAndSaturate32To16(result_1, result_2);
1154 }
1155 
1156 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyArithmeticCombine_SIMD(const DataSourceSurface::ScopedMap & aInputMap1,const DataSourceSurface::ScopedMap & aInputMap2,const DataSourceSurface::ScopedMap & aOutputMap,const IntSize & aSize,Float aK1,Float aK2,Float aK3,Float aK4)1157 static void ApplyArithmeticCombine_SIMD(
1158     const DataSourceSurface::ScopedMap& aInputMap1,
1159     const DataSourceSurface::ScopedMap& aInputMap2,
1160     const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
1161     Float aK1, Float aK2, Float aK3, Float aK4) {
1162   uint8_t* source1Data = aInputMap1.GetData();
1163   uint8_t* source2Data = aInputMap2.GetData();
1164   uint8_t* targetData = aOutputMap.GetData();
1165   uint32_t source1Stride = aInputMap1.GetStride();
1166   uint32_t source2Stride = aInputMap2.GetStride();
1167   uint32_t targetStride = aOutputMap.GetStride();
1168 
1169   // The arithmetic combine filter does the following calculation:
1170   // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1171   //
1172   // Or, with in1/2 integers between 0 and 255:
1173   // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1174   //
1175   // We want the whole calculation to happen in integer, with 16-bit factors.
1176   // So we convert our factors to fixed-point with precision 1.8.7.
1177   // K4 is premultiplied with 255, and it will be multiplied with 128 later
1178   // during the actual calculation, because premultiplying it with 255 * 128
1179   // would overflow int16.
1180 
1181   i16x8_t k1 = simd::FromI16<i16x8_t>(
1182       int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1183   i16x8_t k2 = simd::FromI16<i16x8_t>(
1184       int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1185   i16x8_t k3 = simd::FromI16<i16x8_t>(
1186       int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1187   i16x8_t k4 = simd::FromI16<i16x8_t>(
1188       int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1189 
1190   i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1191   i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1192 
1193   for (int32_t y = 0; y < aSize.height; y++) {
1194     for (int32_t x = 0; x < aSize.width; x += 4) {
1195       uint32_t source1Index = y * source1Stride + 4 * x;
1196       uint32_t source2Index = y * source2Stride + 4 * x;
1197       uint32_t targetIndex = y * targetStride + 4 * x;
1198 
1199       // Load and unpack.
1200       u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1201       u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1202       i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1203       i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1204       i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1205       i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1206 
1207       // Multiply and add.
1208       i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1209           in1_12, in2_12, k1And4, k2And3);
1210       i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1211           in1_34, in2_34, k1And4, k2And3);
1212 
1213       // Pack and store.
1214       simd::Store8(&targetData[targetIndex],
1215                    simd::PackAndSaturate16To8(result_12, result_34));
1216     }
1217   }
1218 }
1219 
1220 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyArithmeticCombine_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2,Float aK1,Float aK2,Float aK3,Float aK4)1221 static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
1222     DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
1223     Float aK2, Float aK3, Float aK4) {
1224   IntSize size = aInput1->GetSize();
1225   RefPtr<DataSourceSurface> target =
1226       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1227   if (!target) {
1228     return nullptr;
1229   }
1230 
1231   DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
1232   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
1233 
1234   if (aInput1->Equals(aInput2)) {
1235     ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1236         inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
1237   } else {
1238     DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
1239     ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1240         inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
1241   }
1242 
1243   return target.forget();
1244 }
1245 
1246 }  // namespace gfx
1247 }  // namespace mozilla
1248