1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "Blur.h"
8 
9 #include <algorithm>
10 #include <math.h>
11 #include <string.h>
12 
13 #include "mozilla/CheckedInt.h"
14 #include "NumericTools.h"
15 
16 #include "2D.h"
17 #include "DataSurfaceHelpers.h"
18 #include "Tools.h"
19 
20 #ifdef USE_NEON
21 #  include "mozilla/arm.h"
22 #endif
23 
24 namespace mozilla {
25 namespace gfx {
26 
27 /**
28  * Helper function to process each row of the box blur.
29  * It takes care of transposing the data on input or output depending
30  * on whether we intend a horizontal or vertical blur, and whether we're
31  * reading from the initial source or writing to the final destination.
32  * It allows starting or ending anywhere within the row to accomodate
33  * a skip rect.
34  */
35 template <bool aTransposeInput, bool aTransposeOutput>
BoxBlurRow(const uint8_t * aInput,uint8_t * aOutput,int32_t aLeftLobe,int32_t aRightLobe,int32_t aWidth,int32_t aStride,int32_t aStart,int32_t aEnd)36 static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput,
37                               int32_t aLeftLobe, int32_t aRightLobe,
38                               int32_t aWidth, int32_t aStride, int32_t aStart,
39                               int32_t aEnd) {
40   // If the input or output is transposed, then we will move down a row
41   // for each step, instead of moving over a column. Since these values
42   // only depend on a template parameter, they will more easily get
43   // copy-propagated in the non-transposed case, which is why they
44   // are not passed as parameters.
45   const int32_t inputStep = aTransposeInput ? aStride : 1;
46   const int32_t outputStep = aTransposeOutput ? aStride : 1;
47 
48   // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
49   // to the right of the current position, then average them. So this is
50   // the size of the total width of this filter.
51   const int32_t boxSize = aLeftLobe + aRightLobe + 1;
52 
53   // Instead of dividing the pixel sum by boxSize to average, we can just
54   // compute a scale that will normalize the result so that it can be quickly
55   // shifted into the desired range.
56   const uint32_t reciprocal = (1 << 24) / boxSize;
57 
58   // The shift would normally truncate the result, whereas we would rather
59   // prefer to round the result to the closest increment. By adding 0.5 units
60   // to the initial sum, we bias the sum so that it will be rounded by the
61   // truncation instead.
62   uint32_t alphaSum = (boxSize + 1) / 2;
63 
64   // We process the row with a moving filter, keeping a sum (alphaSum) of
65   // boxSize pixels. As we move over a pixel, we need to add on a pixel
66   // from the right extreme of the window that moved into range, and subtract
67   // off a pixel from the left extreme of window that moved out of range.
68   // But first, we need to initialization alphaSum to the contents of
69   // the window before we can get going. If the window moves out of bounds
70   // of the row, we clamp each sample to be the closest pixel from within
71   // row bounds, so the 0th and aWidth-1th pixel.
72   int32_t initLeft = aStart - aLeftLobe;
73   if (initLeft < 0) {
74     // If the left lobe samples before the row, add in clamped samples.
75     alphaSum += -initLeft * aInput[0];
76     initLeft = 0;
77   }
78   int32_t initRight = aStart + boxSize - aLeftLobe;
79   if (initRight > aWidth) {
80     // If the right lobe samples after the row, add in clamped samples.
81     alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep];
82     initRight = aWidth;
83   }
84   // Finally, add in all the valid, non-clamped samples to fill up the
85   // rest of the window.
86   const uint8_t* src = &aInput[initLeft * inputStep];
87   const uint8_t* iterEnd = &aInput[initRight * inputStep];
88 
89 #define INIT_ITER   \
90   alphaSum += *src; \
91   src += inputStep;
92 
93   // We unroll the per-pixel loop here substantially. The amount of work
94   // done per sample is so small that the cost of a loop condition check
95   // and a branch can substantially add to or even dominate the performance
96   // of the loop.
97   while (src + 16 * inputStep <= iterEnd) {
98     INIT_ITER;
99     INIT_ITER;
100     INIT_ITER;
101     INIT_ITER;
102     INIT_ITER;
103     INIT_ITER;
104     INIT_ITER;
105     INIT_ITER;
106     INIT_ITER;
107     INIT_ITER;
108     INIT_ITER;
109     INIT_ITER;
110     INIT_ITER;
111     INIT_ITER;
112     INIT_ITER;
113     INIT_ITER;
114   }
115   while (src < iterEnd) {
116     INIT_ITER;
117   }
118 
119   // Now we start moving the window over the row. We will be accessing
120   // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
121   // out of bounds of the row. To avoid having to check within the inner
122   // loops if we are in bound, we instead compute the points at which
123   // we will move out of bounds of the row on the left side (splitLeft)
124   // and right side (splitRight).
125   int32_t splitLeft = std::min(std::max(aLeftLobe, aStart), aEnd);
126   int32_t splitRight =
127       std::min(std::max(aWidth - (boxSize - aLeftLobe), aStart), aEnd);
128   // If the filter window is actually large than the size of the row,
129   // there will be a middle area of overlap where the leftmost and rightmost
130   // pixel of the filter will both be outside the row. In this case, we need
131   // to invert the splits so that splitLeft <= splitRight.
132   if (boxSize > aWidth) {
133     std::swap(splitLeft, splitRight);
134   }
135 
136   // Process all pixels up to splitLeft that would sample before the start of
137   // the row. Note that because inputStep and outputStep may not be a const 1
138   // value, it is more performant to increment pointers here for the source and
139   // destination rather than use a loop counter, since doing so would entail an
140   // expensive multiplication that significantly slows down the loop.
141   uint8_t* dst = &aOutput[aStart * outputStep];
142   iterEnd = &aOutput[splitLeft * outputStep];
143   src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep];
144   uint8_t firstVal = aInput[0];
145 
146 #define LEFT_ITER                       \
147   *dst = (alphaSum * reciprocal) >> 24; \
148   alphaSum += *src - firstVal;          \
149   dst += outputStep;                    \
150   src += inputStep;
151 
152   while (dst + 16 * outputStep <= iterEnd) {
153     LEFT_ITER;
154     LEFT_ITER;
155     LEFT_ITER;
156     LEFT_ITER;
157     LEFT_ITER;
158     LEFT_ITER;
159     LEFT_ITER;
160     LEFT_ITER;
161     LEFT_ITER;
162     LEFT_ITER;
163     LEFT_ITER;
164     LEFT_ITER;
165     LEFT_ITER;
166     LEFT_ITER;
167     LEFT_ITER;
168     LEFT_ITER;
169   }
170   while (dst < iterEnd) {
171     LEFT_ITER;
172   }
173 
174   // Process all pixels between splitLeft and splitRight.
175   iterEnd = &aOutput[splitRight * outputStep];
176   if (boxSize <= aWidth) {
177     // The filter window is smaller than the row size, so the leftmost and
178     // rightmost samples are both within row bounds.
179     src = &aInput[(splitLeft - aLeftLobe) * inputStep];
180     int32_t boxStep = boxSize * inputStep;
181 
182 #define CENTER_ITER                     \
183   *dst = (alphaSum * reciprocal) >> 24; \
184   alphaSum += src[boxStep] - *src;      \
185   dst += outputStep;                    \
186   src += inputStep;
187 
188     while (dst + 16 * outputStep <= iterEnd) {
189       CENTER_ITER;
190       CENTER_ITER;
191       CENTER_ITER;
192       CENTER_ITER;
193       CENTER_ITER;
194       CENTER_ITER;
195       CENTER_ITER;
196       CENTER_ITER;
197       CENTER_ITER;
198       CENTER_ITER;
199       CENTER_ITER;
200       CENTER_ITER;
201       CENTER_ITER;
202       CENTER_ITER;
203       CENTER_ITER;
204       CENTER_ITER;
205     }
206     while (dst < iterEnd) {
207       CENTER_ITER;
208     }
209   } else {
210     // The filter window is larger than the row size, and we're in the area of
211     // split overlap. So the leftmost and rightmost samples are both out of
212     // bounds and need to be clamped. We can just precompute the difference here
213     // consequently.
214     int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0];
215     while (dst < iterEnd) {
216       *dst = (alphaSum * reciprocal) >> 24;
217       alphaSum += firstLastDiff;
218       dst += outputStep;
219     }
220   }
221 
222   // Process all remaining pixels after splitRight that would sample after the
223   // row end.
224   iterEnd = &aOutput[aEnd * outputStep];
225   src = &aInput[(splitRight - aLeftLobe) * inputStep];
226   uint8_t lastVal = aInput[(aWidth - 1) * inputStep];
227 
228 #define RIGHT_ITER                      \
229   *dst = (alphaSum * reciprocal) >> 24; \
230   alphaSum += lastVal - *src;           \
231   dst += outputStep;                    \
232   src += inputStep;
233 
234   while (dst + 16 * outputStep <= iterEnd) {
235     RIGHT_ITER;
236     RIGHT_ITER;
237     RIGHT_ITER;
238     RIGHT_ITER;
239     RIGHT_ITER;
240     RIGHT_ITER;
241     RIGHT_ITER;
242     RIGHT_ITER;
243     RIGHT_ITER;
244     RIGHT_ITER;
245     RIGHT_ITER;
246     RIGHT_ITER;
247     RIGHT_ITER;
248     RIGHT_ITER;
249     RIGHT_ITER;
250     RIGHT_ITER;
251   }
252   while (dst < iterEnd) {
253     RIGHT_ITER;
254   }
255 }
256 
257 /**
258  * Box blur involves looking at one pixel, and setting its value to the average
259  * of its neighbouring pixels. This is meant to provide a 3-pass approximation
260  * of a Gaussian blur.
261  * @param aTranspose Whether to transpose the buffer when reading and writing
262  *                   to it.
263  * @param aData The buffer to be blurred.
264  * @param aLobes The number of pixels to blend on the left and right for each of
265  *               3 passes.
266  * @param aWidth The number of columns in the buffers.
267  * @param aRows The number of rows in the buffers.
268  * @param aStride The stride of the buffer.
269  */
270 template <bool aTranspose>
BoxBlur(uint8_t * aData,const int32_t aLobes[3][2],int32_t aWidth,int32_t aRows,int32_t aStride,IntRect aSkipRect)271 static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth,
272                     int32_t aRows, int32_t aStride, IntRect aSkipRect) {
273   if (aTranspose) {
274     std::swap(aWidth, aRows);
275     aSkipRect.Swap();
276   }
277 
278   MOZ_ASSERT(aWidth > 0);
279 
280   // All three passes of the box blur that approximate the Gaussian are done
281   // on each row in turn, so we only need two temporary row buffers to process
282   // each row, instead of a full-sized buffer. Data moves from the source to the
283   // first temporary, from the first temporary to the second, then from the
284   // second back to the destination. This way is more cache-friendly than
285   // processing whe whole buffer in each pass and thus yields a nice speedup.
286   uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth];
287   if (!tmpRow) {
288     return;
289   }
290   uint8_t* tmpRow2 = tmpRow + aWidth;
291 
292   const int32_t stride = aTranspose ? 1 : aStride;
293   bool skipRectCoversWholeRow =
294       0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
295 
296   for (int32_t y = 0; y < aRows; y++) {
297     // Check whether the skip rect intersects this row. If the skip
298     // rect covers the whole surface in this row, we can avoid
299     // this row entirely (and any others along the skip rect).
300     bool inSkipRectY = aSkipRect.ContainsY(y);
301     if (inSkipRectY && skipRectCoversWholeRow) {
302       aData += stride * (aSkipRect.YMost() - y);
303       y = aSkipRect.YMost() - 1;
304       continue;
305     }
306 
307     // Read in data from the source transposed if necessary.
308     BoxBlurRow<aTranspose, false>(aData, tmpRow, aLobes[0][0], aLobes[0][1],
309                                   aWidth, aStride, 0, aWidth);
310 
311     // For the middle pass, the data is already pre-transposed and does not need
312     // to be post-transposed yet.
313     BoxBlurRow<false, false>(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1],
314                              aWidth, aStride, 0, aWidth);
315 
316     // Write back data to the destination transposed if necessary too.
317     // Make sure not to overwrite the skip rect by only outputting to the
318     // destination before and after the skip rect, if requested.
319     int32_t skipStart =
320         inSkipRectY ? std::min(std::max(aSkipRect.X(), 0), aWidth) : aWidth;
321     int32_t skipEnd = std::max(skipStart, aSkipRect.XMost());
322     if (skipStart > 0) {
323       BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
324                                     aWidth, aStride, 0, skipStart);
325     }
326     if (skipEnd < aWidth) {
327       BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
328                                     aWidth, aStride, skipEnd, aWidth);
329     }
330 
331     aData += stride;
332   }
333 
334   delete[] tmpRow;
335 }
336 
ComputeLobes(int32_t aRadius,int32_t aLobes[3][2])337 static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) {
338   int32_t major, minor, final;
339 
340   /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
341    * some notes about approximating the Gaussian blur with box-blurs.
342    * The comments below are in the terminology of that page.
343    */
344   int32_t z = aRadius / 3;
345   switch (aRadius % 3) {
346     case 0:
347       // aRadius = z*3; choose d = 2*z + 1
348       major = minor = final = z;
349       break;
350     case 1:
351       // aRadius = z*3 + 1
352       // This is a tricky case since there is no value of d which will
353       // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
354       // for some integer k, then the radius will be 3*k. If d is even,
355       // i.e. d=2*k, then the radius will be 3*k - 1.
356       // So we have to choose values that don't match the standard
357       // algorithm.
358       major = z + 1;
359       minor = final = z;
360       break;
361     case 2:
362       // aRadius = z*3 + 2; choose d = 2*z + 2
363       major = final = z + 1;
364       minor = z;
365       break;
366     default:
367       // Mathematical impossibility!
368       MOZ_ASSERT(false);
369       major = minor = final = 0;
370   }
371   MOZ_ASSERT(major + minor + final == aRadius);
372 
373   aLobes[0][0] = major;
374   aLobes[0][1] = minor;
375   aLobes[1][0] = minor;
376   aLobes[1][1] = major;
377   aLobes[2][0] = final;
378   aLobes[2][1] = final;
379 }
380 
SpreadHorizontal(uint8_t * aInput,uint8_t * aOutput,int32_t aRadius,int32_t aWidth,int32_t aRows,int32_t aStride,const IntRect & aSkipRect)381 static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
382                              int32_t aWidth, int32_t aRows, int32_t aStride,
383                              const IntRect& aSkipRect) {
384   if (aRadius == 0) {
385     memcpy(aOutput, aInput, aStride * aRows);
386     return;
387   }
388 
389   bool skipRectCoversWholeRow =
390       0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
391   for (int32_t y = 0; y < aRows; y++) {
392     // Check whether the skip rect intersects this row. If the skip
393     // rect covers the whole surface in this row, we can avoid
394     // this row entirely (and any others along the skip rect).
395     bool inSkipRectY = aSkipRect.ContainsY(y);
396     if (inSkipRectY && skipRectCoversWholeRow) {
397       y = aSkipRect.YMost() - 1;
398       continue;
399     }
400 
401     for (int32_t x = 0; x < aWidth; x++) {
402       // Check whether we are within the skip rect. If so, go
403       // to the next point outside the skip rect.
404       if (inSkipRectY && aSkipRect.ContainsX(x)) {
405         x = aSkipRect.XMost();
406         if (x >= aWidth) break;
407       }
408 
409       int32_t sMin = std::max(x - aRadius, 0);
410       int32_t sMax = std::min(x + aRadius, aWidth - 1);
411       int32_t v = 0;
412       for (int32_t s = sMin; s <= sMax; ++s) {
413         v = std::max<int32_t>(v, aInput[aStride * y + s]);
414       }
415       aOutput[aStride * y + x] = v;
416     }
417   }
418 }
419 
SpreadVertical(uint8_t * aInput,uint8_t * aOutput,int32_t aRadius,int32_t aWidth,int32_t aRows,int32_t aStride,const IntRect & aSkipRect)420 static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
421                            int32_t aWidth, int32_t aRows, int32_t aStride,
422                            const IntRect& aSkipRect) {
423   if (aRadius == 0) {
424     memcpy(aOutput, aInput, aStride * aRows);
425     return;
426   }
427 
428   bool skipRectCoversWholeColumn =
429       0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost();
430   for (int32_t x = 0; x < aWidth; x++) {
431     bool inSkipRectX = aSkipRect.ContainsX(x);
432     if (inSkipRectX && skipRectCoversWholeColumn) {
433       x = aSkipRect.XMost() - 1;
434       continue;
435     }
436 
437     for (int32_t y = 0; y < aRows; y++) {
438       // Check whether we are within the skip rect. If so, go
439       // to the next point outside the skip rect.
440       if (inSkipRectX && aSkipRect.ContainsY(y)) {
441         y = aSkipRect.YMost();
442         if (y >= aRows) break;
443       }
444 
445       int32_t sMin = std::max(y - aRadius, 0);
446       int32_t sMax = std::min(y + aRadius, aRows - 1);
447       int32_t v = 0;
448       for (int32_t s = sMin; s <= sMax; ++s) {
449         v = std::max<int32_t>(v, aInput[aStride * s + x]);
450       }
451       aOutput[aStride * y + x] = v;
452     }
453   }
454 }
455 
RoundUpToMultipleOf4(int32_t aVal)456 CheckedInt<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) {
457   CheckedInt<int32_t> val(aVal);
458 
459   val += 3;
460   val /= 4;
461   val *= 4;
462 
463   return val;
464 }
465 
AlphaBoxBlur(const Rect & aRect,const IntSize & aSpreadRadius,const IntSize & aBlurRadius,const Rect * aDirtyRect,const Rect * aSkipRect)466 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius,
467                            const IntSize& aBlurRadius, const Rect* aDirtyRect,
468                            const Rect* aSkipRect)
469     : mStride(0), mSurfaceAllocationSize(0) {
470   Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect);
471 }
472 
AlphaBoxBlur()473 AlphaBoxBlur::AlphaBoxBlur()
474     : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
475 
Init(const Rect & aRect,const IntSize & aSpreadRadius,const IntSize & aBlurRadius,const Rect * aDirtyRect,const Rect * aSkipRect)476 void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius,
477                         const IntSize& aBlurRadius, const Rect* aDirtyRect,
478                         const Rect* aSkipRect) {
479   mSpreadRadius = aSpreadRadius;
480   mBlurRadius = aBlurRadius;
481 
482   Rect rect(aRect);
483   rect.Inflate(Size(aBlurRadius + aSpreadRadius));
484   rect.RoundOut();
485 
486   if (aDirtyRect) {
487     // If we get passed a dirty rect from layout, we can minimize the
488     // shadow size and make painting faster.
489     mHasDirtyRect = true;
490     mDirtyRect = *aDirtyRect;
491     Rect requiredBlurArea = mDirtyRect.Intersect(rect);
492     requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
493     rect = requiredBlurArea.Intersect(rect);
494   } else {
495     mHasDirtyRect = false;
496   }
497 
498   mRect = TruncatedToInt(rect);
499   if (mRect.IsEmpty()) {
500     return;
501   }
502 
503   if (aSkipRect) {
504     // If we get passed a skip rect, we can lower the amount of
505     // blurring/spreading we need to do. We convert it to IntRect to avoid
506     // expensive int<->float conversions if we were to use Rect instead.
507     Rect skipRect = *aSkipRect;
508     skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
509     mSkipRect = RoundedIn(skipRect);
510     mSkipRect = mSkipRect.Intersect(mRect);
511     if (mSkipRect.IsEqualInterior(mRect)) {
512       return;
513     }
514 
515     mSkipRect -= mRect.TopLeft();
516     // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
517     // accesses later are aligned as well, see bug 1622113.
518     mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4));
519     mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4));
520     if (mSkipRect.IsEmpty()) {
521       mSkipRect = IntRect();
522     }
523   } else {
524     mSkipRect = IntRect();
525   }
526 
527   CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.Width());
528   if (stride.isValid()) {
529     mStride = stride.value();
530 
531     // We need to leave room for an additional 3 bytes for a potential overrun
532     // in our blurring code.
533     size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3);
534     if (size != 0) {
535       mSurfaceAllocationSize = size;
536     }
537   }
538 }
539 
AlphaBoxBlur(const Rect & aRect,int32_t aStride,float aSigmaX,float aSigmaY)540 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX,
541                            float aSigmaY)
542     : mRect(TruncatedToInt(aRect)),
543       mSpreadRadius(),
544       mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
545       mStride(aStride),
546       mSurfaceAllocationSize(0),
547       mHasDirtyRect(false) {
548   IntRect intRect;
549   if (aRect.ToIntRect(&intRect)) {
550     size_t minDataSize =
551         BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height());
552     if (minDataSize != 0) {
553       mSurfaceAllocationSize = minDataSize;
554     }
555   }
556 }
557 
558 AlphaBoxBlur::~AlphaBoxBlur() = default;
559 
GetSize() const560 IntSize AlphaBoxBlur::GetSize() const {
561   IntSize size(mRect.Width(), mRect.Height());
562   return size;
563 }
564 
GetStride() const565 int32_t AlphaBoxBlur::GetStride() const { return mStride; }
566 
GetRect() const567 IntRect AlphaBoxBlur::GetRect() const { return mRect; }
568 
GetDirtyRect()569 Rect* AlphaBoxBlur::GetDirtyRect() {
570   if (mHasDirtyRect) {
571     return &mDirtyRect;
572   }
573 
574   return nullptr;
575 }
576 
GetSurfaceAllocationSize() const577 size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
578   return mSurfaceAllocationSize;
579 }
580 
Blur(uint8_t * aData) const581 void AlphaBoxBlur::Blur(uint8_t* aData) const {
582   if (!aData) {
583     return;
584   }
585 
586   // no need to do all this if not blurring or spreading
587   if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) {
588     int32_t stride = GetStride();
589 
590     IntSize size = GetSize();
591 
592     if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
593       // No need to use CheckedInt here - we have validated it in the
594       // constructor.
595       size_t szB = stride * size.height;
596       uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
597 
598       if (!tmpData) {
599         return;
600       }
601 
602       memset(tmpData, 0, szB);
603 
604       SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width,
605                        size.height, stride, mSkipRect);
606       SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width,
607                      size.height, stride, mSkipRect);
608 
609       delete[] tmpData;
610     }
611 
612     int32_t horizontalLobes[3][2];
613     ComputeLobes(mBlurRadius.width, horizontalLobes);
614     int32_t verticalLobes[3][2];
615     ComputeLobes(mBlurRadius.height, verticalLobes);
616 
617     // We want to allow for some extra space on the left for alignment reasons.
618     int32_t maxLeftLobe =
619         RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
620 
621     IntSize integralImageSize(
622         size.width + maxLeftLobe + horizontalLobes[1][1],
623         size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
624 
625     if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
626       // Fallback to old blurring code when the surface is so large it may
627       // overflow our integral image!
628       if (mBlurRadius.width > 0) {
629         BoxBlur<false>(aData, horizontalLobes, size.width, size.height, stride,
630                        mSkipRect);
631       }
632       if (mBlurRadius.height > 0) {
633         BoxBlur<true>(aData, verticalLobes, size.width, size.height, stride,
634                       mSkipRect);
635       }
636     } else {
637       size_t integralImageStride =
638           GetAlignedStride<16>(integralImageSize.width, 4);
639       if (integralImageStride == 0) {
640         return;
641       }
642 
643       // We need to leave room for an additional 12 bytes for a maximum overrun
644       // of 3 pixels in the blurring code.
645       size_t bufLen = BufferSizeFromStrideAndHeight(
646           integralImageStride, integralImageSize.height, 12);
647       if (bufLen == 0) {
648         return;
649       }
650       // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
651       // we divide by 4.
652       AlignedArray<uint32_t> integralImage((bufLen / 4) +
653                                            ((bufLen % 4) ? 1 : 0));
654 
655       if (!integralImage) {
656         return;
657       }
658 
659 #ifdef USE_SSE2
660       if (Factory::HasSSE2()) {
661         BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1],
662                      verticalLobes[0][0], verticalLobes[0][1], integralImage,
663                      integralImageStride);
664         BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1],
665                      verticalLobes[1][0], verticalLobes[1][1], integralImage,
666                      integralImageStride);
667         BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1],
668                      verticalLobes[2][0], verticalLobes[2][1], integralImage,
669                      integralImageStride);
670       } else
671 #endif
672 #ifdef USE_NEON
673           if (mozilla::supports_neon()) {
674         BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1],
675                      verticalLobes[0][0], verticalLobes[0][1], integralImage,
676                      integralImageStride);
677         BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1],
678                      verticalLobes[1][0], verticalLobes[1][1], integralImage,
679                      integralImageStride);
680         BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1],
681                      verticalLobes[2][0], verticalLobes[2][1], integralImage,
682                      integralImageStride);
683       } else
684 #endif
685       {
686 #ifdef _MIPS_ARCH_LOONGSON3A
687         BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1],
688                     verticalLobes[0][0], verticalLobes[0][1], integralImage,
689                     integralImageStride);
690         BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1],
691                     verticalLobes[1][0], verticalLobes[1][1], integralImage,
692                     integralImageStride);
693         BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1],
694                     verticalLobes[2][0], verticalLobes[2][1], integralImage,
695                     integralImageStride);
696 #else
697         BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1],
698                   verticalLobes[0][0], verticalLobes[0][1], integralImage,
699                   integralImageStride);
700         BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1],
701                   verticalLobes[1][0], verticalLobes[1][1], integralImage,
702                   integralImageStride);
703         BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1],
704                   verticalLobes[2][0], verticalLobes[2][1], integralImage,
705                   integralImageStride);
706 #endif
707       }
708     }
709   }
710 }
711 
GenerateIntegralRow(uint32_t * aDest,const uint8_t * aSource,uint32_t * aPreviousRow,const uint32_t & aSourceWidth,const uint32_t & aLeftInflation,const uint32_t & aRightInflation)712 MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest,
713                                            const uint8_t* aSource,
714                                            uint32_t* aPreviousRow,
715                                            const uint32_t& aSourceWidth,
716                                            const uint32_t& aLeftInflation,
717                                            const uint32_t& aRightInflation) {
718   uint32_t currentRowSum = 0;
719   uint32_t pixel = aSource[0];
720   for (uint32_t x = 0; x < aLeftInflation; x++) {
721     currentRowSum += pixel;
722     *aDest++ = currentRowSum + *aPreviousRow++;
723   }
724   for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation);
725        x += 4) {
726     uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
727 #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
728     currentRowSum += (alphaValues >> 24) & 0xff;
729     *aDest++ = *aPreviousRow++ + currentRowSum;
730     currentRowSum += (alphaValues >> 16) & 0xff;
731     *aDest++ = *aPreviousRow++ + currentRowSum;
732     currentRowSum += (alphaValues >> 8) & 0xff;
733     *aDest++ = *aPreviousRow++ + currentRowSum;
734     currentRowSum += alphaValues & 0xff;
735     *aDest++ = *aPreviousRow++ + currentRowSum;
736 #else
737     currentRowSum += alphaValues & 0xff;
738     *aDest++ = *aPreviousRow++ + currentRowSum;
739     alphaValues >>= 8;
740     currentRowSum += alphaValues & 0xff;
741     *aDest++ = *aPreviousRow++ + currentRowSum;
742     alphaValues >>= 8;
743     currentRowSum += alphaValues & 0xff;
744     *aDest++ = *aPreviousRow++ + currentRowSum;
745     alphaValues >>= 8;
746     currentRowSum += alphaValues & 0xff;
747     *aDest++ = *aPreviousRow++ + currentRowSum;
748 #endif
749   }
750   pixel = aSource[aSourceWidth - 1];
751   for (uint32_t x = (aSourceWidth + aLeftInflation);
752        x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
753     currentRowSum += pixel;
754     *aDest++ = currentRowSum + *aPreviousRow++;
755   }
756 }
757 
GenerateIntegralImage_C(int32_t aLeftInflation,int32_t aRightInflation,int32_t aTopInflation,int32_t aBottomInflation,uint32_t * aIntegralImage,size_t aIntegralImageStride,uint8_t * aSource,int32_t aSourceStride,const IntSize & aSize)758 MOZ_ALWAYS_INLINE void GenerateIntegralImage_C(
759     int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
760     int32_t aBottomInflation, uint32_t* aIntegralImage,
761     size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
762     const IntSize& aSize) {
763   uint32_t stride32bit = aIntegralImageStride / 4;
764 
765   IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
766                             aSize.height + aTopInflation + aBottomInflation);
767 
768   memset(aIntegralImage, 0, aIntegralImageStride);
769 
770   GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width,
771                       aLeftInflation, aRightInflation);
772   for (int y = 1; y < aTopInflation + 1; y++) {
773     GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
774                         aIntegralImage + (y - 1) * stride32bit, aSize.width,
775                         aLeftInflation, aRightInflation);
776   }
777 
778   for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
779     GenerateIntegralRow(aIntegralImage + (y * stride32bit),
780                         aSource + aSourceStride * (y - aTopInflation),
781                         aIntegralImage + (y - 1) * stride32bit, aSize.width,
782                         aLeftInflation, aRightInflation);
783   }
784 
785   if (aBottomInflation) {
786     for (int y = (aSize.height + aTopInflation); y < integralImageSize.height;
787          y++) {
788       GenerateIntegralRow(aIntegralImage + (y * stride32bit),
789                           aSource + ((aSize.height - 1) * aSourceStride),
790                           aIntegralImage + (y - 1) * stride32bit, aSize.width,
791                           aLeftInflation, aRightInflation);
792     }
793   }
794 }
795 
796 /**
797  * Attempt to do an in-place box blur using an integral image.
798  */
BoxBlur_C(uint8_t * aData,int32_t aLeftLobe,int32_t aRightLobe,int32_t aTopLobe,int32_t aBottomLobe,uint32_t * aIntegralImage,size_t aIntegralImageStride) const799 void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe,
800                              int32_t aRightLobe, int32_t aTopLobe,
801                              int32_t aBottomLobe, uint32_t* aIntegralImage,
802                              size_t aIntegralImageStride) const {
803   IntSize size = GetSize();
804 
805   MOZ_ASSERT(size.width > 0);
806 
807   // Our 'left' or 'top' lobe will include the current pixel. i.e. when
808   // looking at an integral image the value of a pixel at 'x,y' is calculated
809   // using the value of the integral image values above/below that.
810   aLeftLobe++;
811   aTopLobe++;
812   int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
813 
814   MOZ_ASSERT(boxSize > 0);
815 
816   if (boxSize == 1) {
817     return;
818   }
819 
820   int32_t stride32bit = aIntegralImageStride / 4;
821 
822   int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
823 
824   GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
825                           aIntegralImage, aIntegralImageStride, aData, mStride,
826                           size);
827 
828   uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
829 
830   uint32_t* innerIntegral =
831       aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
832 
833   // Storing these locally makes this about 30% faster! Presumably the compiler
834   // can't be sure we're not altering the member variables in this loop.
835   IntRect skipRect = mSkipRect;
836   uint8_t* data = aData;
837   int32_t stride = mStride;
838   for (int32_t y = 0; y < size.height; y++) {
839     // Not using ContainsY(y) because we do not skip y == skipRect.Y()
840     // although that may not be done on purpose
841     bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost();
842 
843     uint32_t* topLeftBase =
844         innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
845     uint32_t* topRightBase =
846         innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
847     uint32_t* bottomRightBase =
848         innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
849     uint32_t* bottomLeftBase =
850         innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
851 
852     for (int32_t x = 0; x < size.width; x++) {
853       // Not using ContainsX(x) because we do not skip x == skipRect.X()
854       // although that may not be done on purpose
855       if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) {
856         x = skipRect.XMost() - 1;
857         // Trigger early jump on coming loop iterations, this will be reset
858         // next line anyway.
859         inSkipRectY = false;
860         continue;
861       }
862       int32_t topLeft = topLeftBase[x];
863       int32_t topRight = topRightBase[x];
864       int32_t bottomRight = bottomRightBase[x];
865       int32_t bottomLeft = bottomLeftBase[x];
866 
867       uint32_t value = bottomRight - topRight - bottomLeft;
868       value += topLeft;
869 
870       data[stride * y + x] =
871           (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
872     }
873   }
874 }
875 
876 /**
877  * Compute the box blur size (which we're calling the blur radius) from
878  * the standard deviation.
879  *
880  * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
881  * approximating a Gaussian using box blurs.  This yields quite a good
882  * approximation for a Gaussian.  Then we multiply this by 1.5 since our
883  * code wants the radius of the entire triple-box-blur kernel instead of
884  * the diameter of an individual box blur.  For more details, see:
885  *   http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
886  *   https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
887  */
888 static const Float GAUSSIAN_SCALE_FACTOR =
889     Float((3 * sqrt(2 * M_PI) / 4) * 1.5);
890 
CalculateBlurRadius(const Point & aStd)891 IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) {
892   IntSize size(
893       static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
894       static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));
895 
896   return size;
897 }
898 
CalculateBlurSigma(int32_t aBlurRadius)899 Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) {
900   return aBlurRadius / GAUSSIAN_SCALE_FACTOR;
901 }
902 
903 }  // namespace gfx
904 }  // namespace mozilla
905