1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "FilterProcessing.h"
8
9 #include "SIMD.h"
10 #include "SVGTurbulenceRenderer-inl.h"
11
12 namespace mozilla {
13 namespace gfx {
14
15 template <typename u8x16_t>
ConvertToB8G8R8A8_SIMD(SourceSurface * aSurface)16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD(
17 SourceSurface* aSurface) {
18 IntSize size = aSurface->GetSize();
19 RefPtr<DataSourceSurface> output =
20 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
21 if (!output) {
22 return nullptr;
23 }
24
25 RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
26 DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ);
27 DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE);
28 uint8_t* inputData = inputMap.GetData();
29 uint8_t* outputData = outputMap.GetData();
30 int32_t inputStride = inputMap.GetStride();
31 int32_t outputStride = outputMap.GetStride();
32 switch (input->GetFormat()) {
33 case SurfaceFormat::B8G8R8A8:
34 output = input;
35 break;
36 case SurfaceFormat::B8G8R8X8:
37 for (int32_t y = 0; y < size.height; y++) {
38 for (int32_t x = 0; x < size.width; x++) {
39 int32_t inputIndex = y * inputStride + 4 * x;
40 int32_t outputIndex = y * outputStride + 4 * x;
41 outputData[outputIndex + 0] = inputData[inputIndex + 0];
42 outputData[outputIndex + 1] = inputData[inputIndex + 1];
43 outputData[outputIndex + 2] = inputData[inputIndex + 2];
44 outputData[outputIndex + 3] = 255;
45 }
46 }
47 break;
48 case SurfaceFormat::R8G8B8A8:
49 for (int32_t y = 0; y < size.height; y++) {
50 for (int32_t x = 0; x < size.width; x++) {
51 int32_t inputIndex = y * inputStride + 4 * x;
52 int32_t outputIndex = y * outputStride + 4 * x;
53 outputData[outputIndex + 2] = inputData[inputIndex + 0];
54 outputData[outputIndex + 1] = inputData[inputIndex + 1];
55 outputData[outputIndex + 0] = inputData[inputIndex + 2];
56 outputData[outputIndex + 3] = inputData[inputIndex + 3];
57 }
58 }
59 break;
60 case SurfaceFormat::R8G8B8X8:
61 for (int32_t y = 0; y < size.height; y++) {
62 for (int32_t x = 0; x < size.width; x++) {
63 int32_t inputIndex = y * inputStride + 4 * x;
64 int32_t outputIndex = y * outputStride + 4 * x;
65 outputData[outputIndex + 2] = inputData[inputIndex + 0];
66 outputData[outputIndex + 1] = inputData[inputIndex + 1];
67 outputData[outputIndex + 0] = inputData[inputIndex + 2];
68 outputData[outputIndex + 3] = 255;
69 }
70 }
71 break;
72 case SurfaceFormat::A8:
73 for (int32_t y = 0; y < size.height; y++) {
74 for (int32_t x = 0; x < size.width; x += 16) {
75 int32_t inputIndex = y * inputStride + x;
76 int32_t outputIndex = y * outputStride + 4 * x;
77 u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
78 // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
79 // interleaving with 0000000000000000 twice.
80 u8x16_t zero = simd::FromZero8<u8x16_t>();
81 u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
82 u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
83 u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
84 u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
85 u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
86 u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
87 simd::Store8(&outputData[outputIndex], p1To4);
88 if ((x + 4) * 4 < outputStride) {
89 simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
90 }
91 if ((x + 8) * 4 < outputStride) {
92 simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
93 }
94 if ((x + 12) * 4 < outputStride) {
95 simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
96 }
97 }
98 }
99 break;
100 default:
101 output = nullptr;
102 break;
103 }
104 return output.forget();
105 }
106
107 template <typename u8x16_t>
ExtractAlpha_SIMD(const IntSize & size,uint8_t * sourceData,int32_t sourceStride,uint8_t * alphaData,int32_t alphaStride)108 inline void ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData,
109 int32_t sourceStride, uint8_t* alphaData,
110 int32_t alphaStride) {
111 for (int32_t y = 0; y < size.height; y++) {
112 for (int32_t x = 0; x < size.width; x += 16) {
113 // Process 16 pixels at a time.
114 // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of
115 // AAAAAAAAAAAAAAAA.
116 int32_t sourceIndex = y * sourceStride + 4 * x;
117 int32_t targetIndex = y * alphaStride + x;
118
119 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
120 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
121 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
122 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
123
124 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
125 if (4 * (x + 4) < sourceStride) {
126 bgrabgrabgrabgra2 =
127 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
128 }
129 if (4 * (x + 8) < sourceStride) {
130 bgrabgrabgrabgra3 =
131 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
132 }
133 if (4 * (x + 12) < sourceStride) {
134 bgrabgrabgrabgra4 =
135 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
136 }
137
138 u8x16_t bbggrraabbggrraa1 =
139 simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
140 u8x16_t bbggrraabbggrraa2 =
141 simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
142 u8x16_t bbggrraabbggrraa3 =
143 simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
144 u8x16_t bbggrraabbggrraa4 =
145 simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
146 u8x16_t bbbbggggrrrraaaa1 =
147 simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
148 u8x16_t bbbbggggrrrraaaa2 =
149 simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
150 u8x16_t bbbbggggrrrraaaa3 =
151 simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
152 u8x16_t bbbbggggrrrraaaa4 =
153 simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
154 u8x16_t rrrrrrrraaaaaaaa1 =
155 simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
156 u8x16_t rrrrrrrraaaaaaaa2 =
157 simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
158 u8x16_t aaaaaaaaaaaaaaaa =
159 simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
160
161 simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
162 }
163 }
164 }
165
166 // This function calculates the result color values for four pixels, but for
167 // only two color channels - either b & r or g & a. However, the a result will
168 // not be used.
169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
171 // alpha of all four pixels (and both aaaa's are the same).
172 // blendendComponent1 and blendedComponent2 are the out parameters.
173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
BlendTwoComponentsOfFourPixels(i16x8_t source,i16x8_t sourceAlpha,i16x8_t dest,const i16x8_t & destAlpha,i32x4_t & blendedComponent1,i32x4_t & blendedComponent2)174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
175 i16x8_t dest,
176 const i16x8_t& destAlpha,
177 i32x4_t& blendedComponent1,
178 i32x4_t& blendedComponent2) {
179 i16x8_t x255 = simd::FromI16<i16x8_t>(255);
180
181 switch (aBlendMode) {
182 case BLEND_MODE_MULTIPLY: {
183 // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) *
184 // dest);
185 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
186 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
187 i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource =
188 simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
189
190 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
191 i16x8_t leftFactor1 = simd::InterleaveLo16(
192 twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
193 blendedComponent1 =
194 simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
195 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
196
197 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
198 i16x8_t leftFactor2 = simd::InterleaveHi16(
199 twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
200 blendedComponent2 =
201 simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
202 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
203
204 break;
205 }
206
207 case BLEND_MODE_SCREEN: {
208 // val = 255 * (source + dest) + (0 - dest) * source;
209 i16x8_t sourcePlusDest = simd::Add16(source, dest);
210 i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
211
212 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 =
213 simd::InterleaveLo16(x255, zeroMinusDest);
214 i16x8_t sourcePlusDestInterleavedWithSource1 =
215 simd::InterleaveLo16(sourcePlusDest, source);
216 blendedComponent1 =
217 simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1,
218 sourcePlusDestInterleavedWithSource1);
219 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
220
221 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 =
222 simd::InterleaveHi16(x255, zeroMinusDest);
223 i16x8_t sourcePlusDestInterleavedWithSource2 =
224 simd::InterleaveHi16(sourcePlusDest, source);
225 blendedComponent2 =
226 simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2,
227 sourcePlusDestInterleavedWithSource2);
228 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
229
230 break;
231 }
232
233 case BLEND_MODE_DARKEN:
234 case BLEND_MODE_LIGHTEN: {
235 // Darken:
236 // val = min((255 - destAlpha) * source + 255 * dest,
237 // 255 * source + (255 - sourceAlpha) * dest);
238 //
239 // Lighten:
240 // val = max((255 - destAlpha) * source + 255 * dest,
241 // 255 * source + (255 - sourceAlpha) * dest);
242
243 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
244 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
245
246 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 =
247 simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
248 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 =
249 simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
250 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
251 i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(
252 twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1,
253 sourceInterleavedWithDest1);
254 i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(
255 twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1,
256 sourceInterleavedWithDest1);
257 blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN
258 ? simd::Min32(product1_1, product1_2)
259 : simd::Max32(product1_1, product1_2);
260 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
261
262 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 =
263 simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
264 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 =
265 simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
266 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
267 i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(
268 twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2,
269 sourceInterleavedWithDest2);
270 i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(
271 twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2,
272 sourceInterleavedWithDest2);
273 blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN
274 ? simd::Min32(product2_1, product2_2)
275 : simd::Max32(product2_1, product2_2);
276 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
277
278 break;
279 }
280 }
281 }
282
283 // The alpha channel is subject to a different calculation than the RGB
284 // channels, and this calculation is the same for all blend modes:
285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
286 template <typename i16x8_t, typename i32x4_t>
BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,i16x8_t d_rrrraaaa1234)287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,
288 i16x8_t d_rrrraaaa1234) {
289 // clang-format off
290 // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
291 // appropriately. The calculation is rewritten as follows:
292 // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
293 // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
294 // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
295 // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
296 // clang-format on
297 i16x8_t zeroInterleavedWithSourceAlpha =
298 simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
299 i16x8_t fiveTenInterleavedWithDestAlpha =
300 simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
301 i16x8_t f1 =
302 simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
303 i16x8_t f2 =
304 simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
305 return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
306 }
307
308 template <typename u8x16_t, typename i16x8_t>
UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,i16x8_t & bbbbgggg1234,i16x8_t & rrrraaaa1234)309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
310 i16x8_t& bbbbgggg1234,
311 i16x8_t& rrrraaaa1234) {
312 // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
313 i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
314 i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
315 i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
316 i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
317 bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
318 rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
319 }
320
321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ShuffleAndPackComponents(i32x4_t bbbb1234,i32x4_t gggg1234,i32x4_t rrrr1234,const i32x4_t & aaaa1234)322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
323 i32x4_t rrrr1234,
324 const i32x4_t& aaaa1234) {
325 // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
326 i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
327 i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
328 i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
329 i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
330 i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
331 i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
332 return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
333 }
334
335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
ApplyBlending_SIMD(const DataSourceSurface::ScopedMap & aInputMap1,const DataSourceSurface::ScopedMap & aInputMap2,const DataSourceSurface::ScopedMap & aOutputMap,const IntSize & aSize)336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1,
337 const DataSourceSurface::ScopedMap& aInputMap2,
338 const DataSourceSurface::ScopedMap& aOutputMap,
339 const IntSize& aSize) {
340 uint8_t* source1Data = aInputMap1.GetData();
341 uint8_t* source2Data = aInputMap2.GetData();
342 uint8_t* targetData = aOutputMap.GetData();
343 int32_t targetStride = aOutputMap.GetStride();
344 int32_t source1Stride = aInputMap1.GetStride();
345 int32_t source2Stride = aInputMap2.GetStride();
346
347 for (int32_t y = 0; y < aSize.height; y++) {
348 for (int32_t x = 0; x < aSize.width; x += 4) {
349 int32_t targetIndex = y * targetStride + 4 * x;
350 int32_t source1Index = y * source1Stride + 4 * x;
351 int32_t source2Index = y * source2Stride + 4 * x;
352
353 u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
354 u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
355
356 // The blending calculation for the RGB channels all need access to the
357 // alpha channel of their pixel, and the alpha calculation is different,
358 // so it makes sense to separate by channel.
359
360 i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
361 i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
362 UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
363 UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
364 i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234);
365 i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234);
366
367 // We only use blendedB, blendedG and blendedR.
368 i32x4_t blendedB, blendedG, blendedR, blendedA;
369 BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
370 s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234,
371 blendedB, blendedG);
372 BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
373 s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234,
374 blendedR, blendedA);
375
376 // Throw away blendedA and overwrite it with the correct blended alpha.
377 blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234,
378 d_rrrraaaa1234);
379
380 u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>(
381 blendedB, blendedG, blendedR, blendedA);
382 simd::Store8(&targetData[targetIndex], result1234);
383 }
384 }
385 }
386
387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
ApplyBlending_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2)388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
389 DataSourceSurface* aInput1, DataSourceSurface* aInput2) {
390 IntSize size = aInput1->GetSize();
391 RefPtr<DataSourceSurface> target =
392 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
393 if (!target) {
394 return nullptr;
395 }
396
397 DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
398 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
399 if (aInput1->Equals(aInput2)) {
400 ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1,
401 outputMap, size);
402 } else {
403 DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
404 ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2,
405 outputMap, size);
406 }
407
408 return target.forget();
409 }
410
411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyBlending_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2,BlendMode aBlendMode)412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
413 DataSourceSurface* aInput1, DataSourceSurface* aInput2,
414 BlendMode aBlendMode) {
415 switch (aBlendMode) {
416 case BLEND_MODE_MULTIPLY:
417 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>(
418 aInput1, aInput2);
419 case BLEND_MODE_SCREEN:
420 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>(
421 aInput1, aInput2);
422 case BLEND_MODE_DARKEN:
423 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>(
424 aInput1, aInput2);
425 case BLEND_MODE_LIGHTEN:
426 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>(
427 aInput1, aInput2);
428 default:
429 return nullptr;
430 }
431 }
432
433 template <MorphologyOperator Operator, typename u8x16_t>
Morph8(u8x16_t a,u8x16_t b)434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) {
435 return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b)
436 : simd::Max8(a, b);
437 }
438
439 // Set every pixel to the per-component minimum or maximum of the pixels around
440 // it that are up to aRadius pixels away from it (horizontally).
441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
ApplyMorphologyHorizontal_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius)442 inline void ApplyMorphologyHorizontal_SIMD(
443 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
444 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
445 static_assert(
446 op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
447 "unexpected morphology operator");
448
449 int32_t kernelSize = aRadius + 1 + aRadius;
450 MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
451 MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
452 int32_t completeKernelSizeForFourPixels = kernelSize + 3;
453 MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
454 completeKernelSizeForFourPixels % 4 == 2);
455
456 // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
457 // the way we need them to be.
458
459 IntRect sourceRect = aDestRect;
460 sourceRect.Inflate(aRadius, 0);
461
462 for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) {
463 int32_t kernelStartX = aDestRect.X() - aRadius;
464 for (int32_t x = aDestRect.X(); x < aDestRect.XMost();
465 x += 4, kernelStartX += 4) {
466 // We process four pixels (16 color values) at a time.
467 // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
468 // source values can be read beyond that because the source is extended
469 // by aRadius pixels.
470
471 int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
472 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
473 u8x16_t m1234 = p1234;
474
475 for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
476 u8x16_t p5678 =
477 (kernelStartX + i < sourceRect.XMost())
478 ? simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i])
479 : simd::FromZero8<u8x16_t>();
480 u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
481 u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
482 m1234 = Morph8<op, u8x16_t>(m1234, p2345);
483 m1234 = Morph8<op, u8x16_t>(m1234, p3456);
484 if (i + 2 < completeKernelSizeForFourPixels) {
485 u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
486 m1234 = Morph8<op, u8x16_t>(m1234, p4567);
487 m1234 = Morph8<op, u8x16_t>(m1234, p5678);
488 }
489 p1234 = p5678;
490 }
491
492 int32_t destIndex = y * aDestStride + 4 * x;
493 simd::Store8(&aDestData[destIndex], m1234);
494 }
495 }
496 }
497
498 template <typename i16x8_t, typename u8x16_t>
ApplyMorphologyHorizontal_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius,MorphologyOperator aOp)499 inline void ApplyMorphologyHorizontal_SIMD(
500 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
501 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
502 MorphologyOperator aOp) {
503 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
504 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
505 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
506 } else {
507 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t,
508 u8x16_t>(
509 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
510 }
511 }
512
513 // Set every pixel to the per-component minimum or maximum of the pixels around
514 // it that are up to aRadius pixels away from it (vertically).
515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
ApplyMorphologyVertical_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius)516 static void ApplyMorphologyVertical_SIMD(
517 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
518 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
519 static_assert(
520 op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
521 "unexpected morphology operator");
522
523 int32_t startY = aDestRect.Y() - aRadius;
524 int32_t endY = aDestRect.Y() + aRadius;
525 for (int32_t y = aDestRect.Y(); y < aDestRect.YMost();
526 y++, startY++, endY++) {
527 for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) {
528 int32_t sourceIndex = startY * aSourceStride + 4 * x;
529 u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
530 sourceIndex += aSourceStride;
531 for (int32_t iy = startY + 1; iy <= endY;
532 iy++, sourceIndex += aSourceStride) {
533 u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
534 u = Morph8<op, u8x16_t>(u, u2);
535 }
536
537 int32_t destIndex = y * aDestStride + 4 * x;
538 simd::Store8(&aDestData[destIndex], u);
539 }
540 }
541 }
542
543 template <typename i16x8_t, typename u8x16_t>
ApplyMorphologyVertical_SIMD(uint8_t * aSourceData,int32_t aSourceStride,uint8_t * aDestData,int32_t aDestStride,const IntRect & aDestRect,int32_t aRadius,MorphologyOperator aOp)544 inline void ApplyMorphologyVertical_SIMD(
545 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
546 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
547 MorphologyOperator aOp) {
548 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
549 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
550 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
551 } else {
552 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>(
553 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
554 }
555 }
556
557 template <typename i32x4_t, typename i16x8_t>
ColorMatrixMultiply(i16x8_t p,i16x8_t rows_bg,i16x8_t rows_ra,const i32x4_t & bias)558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra,
559 const i32x4_t& bias) {
560 // int16_t p[8] == { b, g, r, a, b, g, r, a }.
561 // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
562 // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
563 // int32_t bias[4] == { _B, _G, _R, _A }.
564
565 i32x4_t sum = bias;
566
567 // int16_t bg[8] = { b, g, b, g, b, g, b, g };
568 i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p));
569 // int32_t prodsum_bg[4] =
570 // { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
571 i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
572 sum = simd::Add32(sum, prodsum_bg);
573
574 // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
575 i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p));
576 // int32_t prodsum_ra[4] =
577 // { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
578 i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
579 sum = simd::Add32(sum, prodsum_ra);
580
581 // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
582 return sum;
583 }
584
585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyColorMatrix_SIMD(DataSourceSurface * aInput,const Matrix5x4 & aMatrix)586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD(
587 DataSourceSurface* aInput, const Matrix5x4& aMatrix) {
588 IntSize size = aInput->GetSize();
589 RefPtr<DataSourceSurface> target =
590 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
591 if (!target) {
592 return nullptr;
593 }
594
595 DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ);
596 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
597
598 uint8_t* sourceData = inputMap.GetData();
599 uint8_t* targetData = outputMap.GetData();
600 int32_t sourceStride = inputMap.GetStride();
601 int32_t targetStride = outputMap.GetStride();
602
603 const int16_t factor = 128;
604 const Float floatElementMax = INT16_MAX / factor; // 255
605 MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX,
606 "badly chosen float-to-int scale");
607
608 const Float* floats = &aMatrix._11;
609
610 ptrdiff_t componentOffsets[4] = {
611 B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G,
612 B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A};
613
614 // We store the color matrix in rows_bgra in the following format:
615 // { bB, bG, bR, bA, gB, gG, gR, gA }.
616 // { bB, gB, bG, gG, bR, gR, bA, gA }
617 // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
618 // which works especially well for our use case.
619 int16_t rows_bgra[2][8];
620 for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
621 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
622 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
623 Float clampedFloatMatrixElement = std::min(
624 std::max(floatMatrixElement, -floatElementMax), floatElementMax);
625 int16_t scaledIntMatrixElement =
626 int16_t(clampedFloatMatrixElement * factor + 0.5);
627 int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
628 int8_t g_or_a = componentOffsets[rowIndex] % 2;
629 int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
630 rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] =
631 scaledIntMatrixElement;
632 }
633 }
634
635 int32_t rowBias[4];
636 Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
637 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
638 size_t rowIndex = 4;
639 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
640 Float clampedFloatMatrixElement =
641 std::min(std::max(floatMatrixElement, -biasMax), biasMax);
642 int32_t scaledIntMatrixElement =
643 int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
644 rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
645 }
646
647 i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
648 rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
649 rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
650
651 i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
652 rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
653 rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
654
655 i32x4_t rowsBias_v =
656 simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
657
658 for (int32_t y = 0; y < size.height; y++) {
659 for (int32_t x = 0; x < size.width; x += 4) {
660 MOZ_ASSERT(sourceStride >= 4 * (x + 4),
661 "need to be able to read 4 pixels at this position");
662 MOZ_ASSERT(targetStride >= 4 * (x + 4),
663 "need to be able to write 4 pixels at this position");
664 int32_t sourceIndex = y * sourceStride + 4 * x;
665 int32_t targetIndex = y * targetStride + 4 * x;
666
667 // We load 4 pixels, unpack them, process them 1 pixel at a time, and
668 // finally pack and store the 4 result pixels.
669
670 u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
671
672 // Splat needed to get each pixel twice into i16x8
673 i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
674 i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
675 i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
676 i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
677
678 i32x4_t result_p1 =
679 ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
680 i32x4_t result_p2 =
681 ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
682 i32x4_t result_p3 =
683 ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
684 i32x4_t result_p4 =
685 ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
686
687 static_assert(factor == 1 << 7,
688 "Please adapt the calculation in the lines below for a "
689 "different factor.");
690 u8x16_t result_p1234 = simd::PackAndSaturate32To8(
691 simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2),
692 simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
693 simd::Store8(&targetData[targetIndex], result_p1234);
694 }
695 }
696
697 return target.forget();
698 }
699
700 // source / dest: bgra bgra
701 // sourceAlpha / destAlpha: aaaa aaaa
702 // result: bgra bgra
703 template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
CompositeTwoPixels(u16x8_t source,u16x8_t sourceAlpha,u16x8_t dest,const u16x8_t & destAlpha)704 static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
705 u16x8_t dest,
706 const u16x8_t& destAlpha) {
707 u16x8_t x255 = simd::FromU16<u16x8_t>(255);
708
709 switch (aCompositeOperator) {
710 case COMPOSITE_OPERATOR_OVER: {
711 // val = dest * (255 - sourceAlpha) + source * 255;
712 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
713
714 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
715 u16x8_t rightFactor1 =
716 simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
717 i32x4_t result1 =
718 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
719
720 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
721 u16x8_t rightFactor2 =
722 simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
723 i32x4_t result2 =
724 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
725
726 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
727 simd::FastDivideBy255(result2));
728 }
729
730 case COMPOSITE_OPERATOR_IN: {
731 // val = source * destAlpha;
732 return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
733 }
734
735 case COMPOSITE_OPERATOR_OUT: {
736 // val = source * (255 - destAlpha);
737 u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
738 return simd::FastDivideBy255_16(prod);
739 }
740
741 case COMPOSITE_OPERATOR_ATOP: {
742 // val = dest * (255 - sourceAlpha) + source * destAlpha;
743 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
744
745 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
746 u16x8_t rightFactor1 =
747 simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
748 i32x4_t result1 =
749 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
750
751 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
752 u16x8_t rightFactor2 =
753 simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
754 i32x4_t result2 =
755 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
756
757 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
758 simd::FastDivideBy255(result2));
759 }
760
761 case COMPOSITE_OPERATOR_XOR: {
762 // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
763 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
764 u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
765
766 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
767 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
768 twoFiftyFiveMinusDestAlpha);
769 i32x4_t result1 =
770 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
771
772 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
773 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
774 twoFiftyFiveMinusDestAlpha);
775 i32x4_t result2 =
776 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
777
778 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
779 simd::FastDivideBy255(result2));
780 }
781
782 default:
783 return simd::FromU16<u16x8_t>(0);
784 }
785 }
786
787 template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
ApplyComposition(DataSourceSurface * aSource,DataSourceSurface * aDest)788 static void ApplyComposition(DataSourceSurface* aSource,
789 DataSourceSurface* aDest) {
790 IntSize size = aDest->GetSize();
791
792 DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
793 DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);
794
795 uint8_t* sourceData = input.GetData();
796 uint8_t* destData = output.GetData();
797 uint32_t sourceStride = input.GetStride();
798 uint32_t destStride = output.GetStride();
799
800 for (int32_t y = 0; y < size.height; y++) {
801 for (int32_t x = 0; x < size.width; x += 4) {
802 uint32_t sourceIndex = y * sourceStride + 4 * x;
803 uint32_t destIndex = y * destStride + 4 * x;
804
805 u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
806 u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
807
808 u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
809 u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
810 u16x8_t sa12 = simd::Splat16<3, 3>(s12);
811 u16x8_t da12 = simd::Splat16<3, 3>(d12);
812 u16x8_t result12 =
813 CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);
814
815 u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
816 u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
817 u16x8_t sa34 = simd::Splat16<3, 3>(s34);
818 u16x8_t da34 = simd::Splat16<3, 3>(d34);
819 u16x8_t result34 =
820 CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);
821
822 u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
823 simd::Store8(&destData[destIndex], result1234);
824 }
825 }
826 }
827
828 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyComposition_SIMD(DataSourceSurface * aSource,DataSourceSurface * aDest,CompositeOperator aOperator)829 static void ApplyComposition_SIMD(DataSourceSurface* aSource,
830 DataSourceSurface* aDest,
831 CompositeOperator aOperator) {
832 switch (aOperator) {
833 case COMPOSITE_OPERATOR_OVER:
834 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
835 aSource, aDest);
836 break;
837 case COMPOSITE_OPERATOR_IN:
838 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
839 aSource, aDest);
840 break;
841 case COMPOSITE_OPERATOR_OUT:
842 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
843 aSource, aDest);
844 break;
845 case COMPOSITE_OPERATOR_ATOP:
846 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
847 aSource, aDest);
848 break;
849 case COMPOSITE_OPERATOR_XOR:
850 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
851 aSource, aDest);
852 break;
853 default:
854 MOZ_CRASH("GFX: Incomplete switch");
855 }
856 }
857
858 template <typename u8x16_t>
SeparateColorChannels_SIMD(const IntSize & size,uint8_t * sourceData,int32_t sourceStride,uint8_t * channel0Data,uint8_t * channel1Data,uint8_t * channel2Data,uint8_t * channel3Data,int32_t channelStride)859 static void SeparateColorChannels_SIMD(
860 const IntSize& size, uint8_t* sourceData, int32_t sourceStride,
861 uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
862 uint8_t* channel3Data, int32_t channelStride) {
863 for (int32_t y = 0; y < size.height; y++) {
864 for (int32_t x = 0; x < size.width; x += 16) {
865 // Process 16 pixels at a time.
866 int32_t sourceIndex = y * sourceStride + 4 * x;
867 int32_t targetIndex = y * channelStride + x;
868
869 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
870 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
871 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
872 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
873
874 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
875 if (4 * (x + 4) < sourceStride) {
876 bgrabgrabgrabgra2 =
877 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
878 }
879 if (4 * (x + 8) < sourceStride) {
880 bgrabgrabgrabgra3 =
881 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
882 }
883 if (4 * (x + 12) < sourceStride) {
884 bgrabgrabgrabgra4 =
885 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
886 }
887
888 u8x16_t bbggrraabbggrraa1 =
889 simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
890 u8x16_t bbggrraabbggrraa2 =
891 simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
892 u8x16_t bbggrraabbggrraa3 =
893 simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
894 u8x16_t bbggrraabbggrraa4 =
895 simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
896 u8x16_t bbbbggggrrrraaaa1 =
897 simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
898 u8x16_t bbbbggggrrrraaaa2 =
899 simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
900 u8x16_t bbbbggggrrrraaaa3 =
901 simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
902 u8x16_t bbbbggggrrrraaaa4 =
903 simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
904 u8x16_t bbbbbbbbgggggggg1 =
905 simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
906 u8x16_t rrrrrrrraaaaaaaa1 =
907 simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
908 u8x16_t bbbbbbbbgggggggg2 =
909 simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
910 u8x16_t rrrrrrrraaaaaaaa2 =
911 simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
912 u8x16_t bbbbbbbbbbbbbbbb =
913 simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
914 u8x16_t gggggggggggggggg =
915 simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
916 u8x16_t rrrrrrrrrrrrrrrr =
917 simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
918 u8x16_t aaaaaaaaaaaaaaaa =
919 simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
920
921 simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
922 simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
923 simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
924 simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
925 }
926 }
927 }
928
929 template <typename u8x16_t>
CombineColorChannels_SIMD(const IntSize & size,int32_t resultStride,uint8_t * resultData,int32_t channelStride,uint8_t * channel0Data,uint8_t * channel1Data,uint8_t * channel2Data,uint8_t * channel3Data)930 static void CombineColorChannels_SIMD(
931 const IntSize& size, int32_t resultStride, uint8_t* resultData,
932 int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data,
933 uint8_t* channel2Data, uint8_t* channel3Data) {
934 for (int32_t y = 0; y < size.height; y++) {
935 for (int32_t x = 0; x < size.width; x += 16) {
936 // Process 16 pixels at a time.
937 int32_t resultIndex = y * resultStride + 4 * x;
938 int32_t channelIndex = y * channelStride + x;
939
940 u8x16_t bbbbbbbbbbbbbbbb =
941 simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
942 u8x16_t gggggggggggggggg =
943 simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
944 u8x16_t rrrrrrrrrrrrrrrr =
945 simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
946 u8x16_t aaaaaaaaaaaaaaaa =
947 simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
948
949 u8x16_t brbrbrbrbrbrbrbr1 =
950 simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
951 u8x16_t brbrbrbrbrbrbrbr2 =
952 simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
953 u8x16_t gagagagagagagaga1 =
954 simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
955 u8x16_t gagagagagagagaga2 =
956 simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
957
958 u8x16_t bgrabgrabgrabgra1 =
959 simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
960 u8x16_t bgrabgrabgrabgra2 =
961 simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
962 u8x16_t bgrabgrabgrabgra3 =
963 simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
964 u8x16_t bgrabgrabgrabgra4 =
965 simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
966
967 simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
968 if (4 * (x + 4) < resultStride) {
969 simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
970 }
971 if (4 * (x + 8) < resultStride) {
972 simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
973 }
974 if (4 * (x + 12) < resultStride) {
975 simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
976 }
977 }
978 }
979 }
980
981 template <typename i32x4_t, typename u16x8_t, typename u8x16_t>
DoPremultiplicationCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride)982 static void DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
983 uint8_t* aTargetData,
984 int32_t aTargetStride,
985 uint8_t* aSourceData,
986 int32_t aSourceStride) {
987 const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff,
988 0, 0, 0, 0xff, 0, 0, 0, 0xff);
989 for (int32_t y = 0; y < aSize.height; y++) {
990 for (int32_t x = 0; x < aSize.width; x += 4) {
991 int32_t inputIndex = y * aSourceStride + 4 * x;
992 int32_t targetIndex = y * aTargetStride + 4 * x;
993
994 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
995 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
996 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
997
998 // Multiply all components with alpha.
999 p12 = simd::Mul16(p12, simd::Splat16<3, 3>(p12));
1000 p34 = simd::Mul16(p34, simd::Splat16<3, 3>(p34));
1001
1002 // Divide by 255 and pack.
1003 u8x16_t result = simd::PackAndSaturate16To8(
1004 simd::FastDivideBy255_16(p12), simd::FastDivideBy255_16(p34));
1005
1006 // Get the original alpha channel value back from p1234.
1007 result = simd::Pick(alphaMask, result, p1234);
1008
1009 simd::Store8(&aTargetData[targetIndex], result);
1010 }
1011 }
1012 }
1013
// Precomputed fixed-point factors for unpremultiplying.
//
// The goal is to evaluate round(r / (alpha / 255.0f)) for arbitrary r and
// alpha in constant time. With this table,
//   (r * sAlphaFactors[alpha] + 128) >> 8
// roughly gives that result (with a maximum deviation of 1).
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
//
// and sAlphaFactors[0] is 0. The values were generated with the Python code
//   ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0)
//             for alpha in range(256))
//
// Eight entries per row; the trailing comment gives the alpha range of the
// row.
// clang-format off
static const uint16_t sAlphaFactors[256] = {
    0,    65280, 32640, 21760, 16320, 13056, 10880, 9326,  // 0 - 7
    8160, 7253,  6528,  5935,  5440,  5022,  4663,  4352,  // 8 - 15
    4080, 3840,  3627,  3436,  3264,  3109,  2967,  2838,  // 16 - 23
    2720, 2611,  2511,  2418,  2331,  2251,  2176,  2106,  // 24 - 31
    2040, 1978,  1920,  1865,  1813,  1764,  1718,  1674,  // 32 - 39
    1632, 1592,  1554,  1518,  1484,  1451,  1419,  1389,  // 40 - 47
    1360, 1332,  1306,  1280,  1255,  1232,  1209,  1187,  // 48 - 55
    1166, 1145,  1126,  1106,  1088,  1070,  1053,  1036,  // 56 - 63
    1020, 1004,  989,   974,   960,   946,   933,   919,   // 64 - 71
    907,  894,   882,   870,   859,   848,   837,   826,   // 72 - 79
    816,  806,   796,   787,   777,   768,   759,   750,   // 80 - 87
    742,  733,   725,   717,   710,   702,   694,   687,   // 88 - 95
    680,  673,   666,   659,   653,   646,   640,   634,   // 96 - 103
    628,  622,   616,   610,   604,   599,   593,   588,   // 104 - 111
    583,  578,   573,   568,   563,   558,   553,   549,   // 112 - 119
    544,  540,   535,   531,   526,   522,   518,   514,   // 120 - 127
    510,  506,   502,   498,   495,   491,   487,   484,   // 128 - 135
    480,  476,   473,   470,   466,   463,   460,   457,   // 136 - 143
    453,  450,   447,   444,   441,   438,   435,   432,   // 144 - 151
    429,  427,   424,   421,   418,   416,   413,   411,   // 152 - 159
    408,  405,   403,   400,   398,   396,   393,   391,   // 160 - 167
    389,  386,   384,   382,   380,   377,   375,   373,   // 168 - 175
    371,  369,   367,   365,   363,   361,   359,   357,   // 176 - 183
    355,  353,   351,   349,   347,   345,   344,   342,   // 184 - 191
    340,  338,   336,   335,   333,   331,   330,   328,   // 192 - 199
    326,  325,   323,   322,   320,   318,   317,   315,   // 200 - 207
    314,  312,   311,   309,   308,   306,   305,   304,   // 208 - 215
    302,  301,   299,   298,   297,   295,   294,   293,   // 216 - 223
    291,  290,   289,   288,   286,   285,   284,   283,   // 224 - 231
    281,  280,   279,   278,   277,   275,   274,   273,   // 232 - 239
    272,  271,   270,   269,   268,   266,   265,   264,   // 240 - 247
    263,  262,   261,   260,   259,   258,   257,   256};  // 248 - 255
// clang-format on
1050
1051 template <typename u16x8_t, typename u8x16_t>
DoUnpremultiplicationCalculation_SIMD(const IntSize & aSize,uint8_t * aTargetData,int32_t aTargetStride,uint8_t * aSourceData,int32_t aSourceStride)1052 static void DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
1053 uint8_t* aTargetData,
1054 int32_t aTargetStride,
1055 uint8_t* aSourceData,
1056 int32_t aSourceStride) {
1057 for (int32_t y = 0; y < aSize.height; y++) {
1058 for (int32_t x = 0; x < aSize.width; x += 4) {
1059 int32_t inputIndex = y * aSourceStride + 4 * x;
1060 int32_t targetIndex = y * aTargetStride + 4 * x;
1061 union {
1062 u8x16_t p1234;
1063 uint8_t u8[4][4];
1064 };
1065 p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1066
1067 // Prepare the alpha factors.
1068 uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1069 uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1070 uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1071 uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1072 u16x8_t aF12 =
1073 simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
1074 u16x8_t aF34 =
1075 simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
1076
1077 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1078 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1079
1080 // Multiply with the alpha factors, add 128 for rounding, and shift right
1081 // by 8 bits.
1082 p12 = simd::ShiftRight16<8>(
1083 simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
1084 p34 = simd::ShiftRight16<8>(
1085 simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
1086
1087 u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
1088 simd::Store8(&aTargetData[targetIndex], result);
1089 }
1090 }
1091 }
1092
1093 template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
RenderTurbulence_SIMD(const IntSize & aSize,const Point & aOffset,const Size & aBaseFrequency,int32_t aSeed,int aNumOctaves,TurbulenceType aType,bool aStitch,const Rect & aTileRect)1094 static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
1095 const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
1096 int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
1097 const Rect& aTileRect) {
1098 #define RETURN_TURBULENCE(Type, Stitch) \
1099 SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
1100 aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
1101 return renderer.Render(aSize, aOffset);
1102
1103 switch (aType) {
1104 case TURBULENCE_TYPE_TURBULENCE: {
1105 if (aStitch) {
1106 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
1107 }
1108 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
1109 }
1110 case TURBULENCE_TYPE_FRACTAL_NOISE: {
1111 if (aStitch) {
1112 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
1113 }
1114 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
1115 }
1116 }
1117 return nullptr;
1118 #undef RETURN_TURBULENCE
1119 }
1120
1121 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1122 template <typename i32x4_t, typename i16x8_t>
ArithmeticCombineTwoPixels(i16x8_t in1,i16x8_t in2,const i16x8_t & k1And4,const i16x8_t & k2And3)1123 static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
1124 i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
1125 // Calculate input product: inProd = (in1 * in2) / 255.
1126 i32x4_t inProd_1, inProd_2;
1127 simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
1128 i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
1129 simd::FastDivideBy255(inProd_2));
1130
1131 // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
1132 i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
1133 i16x8_t inProd1AndOneTwentyEight =
1134 simd::InterleaveLo16(inProd, oneTwentyEight);
1135 i16x8_t inProd2AndOneTwentyEight =
1136 simd::InterleaveHi16(inProd, oneTwentyEight);
1137 i32x4_t inProdTimesK1PlusK4_1 =
1138 simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1139 i32x4_t inProdTimesK1PlusK4_2 =
1140 simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1141
1142 // Calculate k2 * in1 + k3 * in2
1143 i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1144 i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1145 i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1146 i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1147
1148 // Sum everything up and truncate the fractional part.
1149 i32x4_t result_1 =
1150 simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1151 i32x4_t result_2 =
1152 simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1153 return simd::PackAndSaturate32To16(result_1, result_2);
1154 }
1155
1156 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyArithmeticCombine_SIMD(const DataSourceSurface::ScopedMap & aInputMap1,const DataSourceSurface::ScopedMap & aInputMap2,const DataSourceSurface::ScopedMap & aOutputMap,const IntSize & aSize,Float aK1,Float aK2,Float aK3,Float aK4)1157 static void ApplyArithmeticCombine_SIMD(
1158 const DataSourceSurface::ScopedMap& aInputMap1,
1159 const DataSourceSurface::ScopedMap& aInputMap2,
1160 const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
1161 Float aK1, Float aK2, Float aK3, Float aK4) {
1162 uint8_t* source1Data = aInputMap1.GetData();
1163 uint8_t* source2Data = aInputMap2.GetData();
1164 uint8_t* targetData = aOutputMap.GetData();
1165 uint32_t source1Stride = aInputMap1.GetStride();
1166 uint32_t source2Stride = aInputMap2.GetStride();
1167 uint32_t targetStride = aOutputMap.GetStride();
1168
1169 // The arithmetic combine filter does the following calculation:
1170 // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1171 //
1172 // Or, with in1/2 integers between 0 and 255:
1173 // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1174 //
1175 // We want the whole calculation to happen in integer, with 16-bit factors.
1176 // So we convert our factors to fixed-point with precision 1.8.7.
1177 // K4 is premultiplied with 255, and it will be multiplied with 128 later
1178 // during the actual calculation, because premultiplying it with 255 * 128
1179 // would overflow int16.
1180
1181 i16x8_t k1 = simd::FromI16<i16x8_t>(
1182 int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1183 i16x8_t k2 = simd::FromI16<i16x8_t>(
1184 int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1185 i16x8_t k3 = simd::FromI16<i16x8_t>(
1186 int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1187 i16x8_t k4 = simd::FromI16<i16x8_t>(
1188 int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1189
1190 i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1191 i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1192
1193 for (int32_t y = 0; y < aSize.height; y++) {
1194 for (int32_t x = 0; x < aSize.width; x += 4) {
1195 uint32_t source1Index = y * source1Stride + 4 * x;
1196 uint32_t source2Index = y * source2Stride + 4 * x;
1197 uint32_t targetIndex = y * targetStride + 4 * x;
1198
1199 // Load and unpack.
1200 u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1201 u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1202 i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1203 i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1204 i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1205 i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1206
1207 // Multiply and add.
1208 i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1209 in1_12, in2_12, k1And4, k2And3);
1210 i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1211 in1_34, in2_34, k1And4, k2And3);
1212
1213 // Pack and store.
1214 simd::Store8(&targetData[targetIndex],
1215 simd::PackAndSaturate16To8(result_12, result_34));
1216 }
1217 }
1218 }
1219
1220 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
ApplyArithmeticCombine_SIMD(DataSourceSurface * aInput1,DataSourceSurface * aInput2,Float aK1,Float aK2,Float aK3,Float aK4)1221 static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
1222 DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
1223 Float aK2, Float aK3, Float aK4) {
1224 IntSize size = aInput1->GetSize();
1225 RefPtr<DataSourceSurface> target =
1226 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1227 if (!target) {
1228 return nullptr;
1229 }
1230
1231 DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
1232 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
1233
1234 if (aInput1->Equals(aInput2)) {
1235 ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1236 inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
1237 } else {
1238 DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
1239 ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1240 inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
1241 }
1242
1243 return target.forget();
1244 }
1245
1246 } // namespace gfx
1247 } // namespace mozilla
1248