1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/core/SkMaskBlurFilter.h"
9 
10 #include "include/core/SkColorPriv.h"
11 #include "include/private/SkMalloc.h"
12 #include "include/private/SkNx.h"
13 #include "include/private/SkTemplates.h"
14 #include "include/private/SkTo.h"
15 #include "src/core/SkArenaAlloc.h"
16 #include "src/core/SkGaussFilter.h"
17 
18 #include <cmath>
19 #include <climits>
20 
21 namespace {
22 static const double kPi = 3.14159265358979323846264338327950288;
23 
24 class PlanGauss final {
25 public:
PlanGauss(double sigma)26     explicit PlanGauss(double sigma) {
27         auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
28         auto window = std::max(1, possibleWindow);
29 
30         fPass0Size = window - 1;
31         fPass1Size = window - 1;
32         fPass2Size = (window & 1) == 1 ? window - 1 : window;
33 
34         // Calculating the border is tricky. I will go through the odd case which is simpler, and
35         // then through the even case. Given a stack of filters seven wide for the odd case of
36         // three passes.
37         //
38         //        S
39         //     aaaAaaa
40         //     bbbBbbb
41         //     cccCccc
42         //        D
43         //
44         // The furthest changed pixel is when the filters are in the following configuration.
45         //
46         //                 S
47         //           aaaAaaa
48         //        bbbBbbb
49         //     cccCccc
50         //        D
51         //
52         //  The A pixel is calculated using the value S, the B uses A, and the C uses B, and
53         // finally D is C. So, with a window size of seven the border is nine. In general, the
54         // border is 3*((window - 1)/2).
55         //
56         // For even cases the filter stack is more complicated. The spec specifies two passes
57         // of even filters and a final pass of odd filters. A stack for a width of six looks like
58         // this.
59         //
60         //       S
61         //    aaaAaa
62         //     bbBbbb
63         //    cccCccc
64         //       D
65         //
66         // The furthest pixel looks like this.
67         //
68         //               S
69         //          aaaAaa
70         //        bbBbbb
71         //    cccCccc
72         //       D
73         //
74         // For a window of size, the border value is seven. In general the border is 3 *
75         // (window/2) -1.
76         fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
77         fSlidingWindow = 2 * fBorder + 1;
78 
79         // If the window is odd then the divisor is just window ^ 3 otherwise,
80         // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
81         auto window2 = window * window;
82         auto window3 = window2 * window;
83         auto divisor = (window & 1) == 1 ? window3 : window3 + window2;
84 
85         fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
86     }
87 
bufferSize() const88     size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; }
89 
border() const90     int    border()     const { return fBorder; }
91 
92 public:
93     class Scan {
94     public:
Scan(uint64_t weight,int noChangeCount,uint32_t * buffer0,uint32_t * buffer0End,uint32_t * buffer1,uint32_t * buffer1End,uint32_t * buffer2,uint32_t * buffer2End)95         Scan(uint64_t weight, int noChangeCount,
96              uint32_t* buffer0, uint32_t* buffer0End,
97              uint32_t* buffer1, uint32_t* buffer1End,
98              uint32_t* buffer2, uint32_t* buffer2End)
99             : fWeight{weight}
100             , fNoChangeCount{noChangeCount}
101             , fBuffer0{buffer0}
102             , fBuffer0End{buffer0End}
103             , fBuffer1{buffer1}
104             , fBuffer1End{buffer1End}
105             , fBuffer2{buffer2}
106             , fBuffer2End{buffer2End}
107         { }
108 
blur(const AlphaIter srcBegin,const AlphaIter srcEnd,uint8_t * dst,int dstStride,uint8_t * dstEnd) const109         template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd,
110                     uint8_t* dst, int dstStride, uint8_t* dstEnd) const {
111             auto buffer0Cursor = fBuffer0;
112             auto buffer1Cursor = fBuffer1;
113             auto buffer2Cursor = fBuffer2;
114 
115             memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
116 
117             uint32_t sum0 = 0;
118             uint32_t sum1 = 0;
119             uint32_t sum2 = 0;
120 
121             // Consume the source generating pixels.
122             for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) {
123                 uint32_t leadingEdge = *src;
124                 sum0 += leadingEdge;
125                 sum1 += sum0;
126                 sum2 += sum1;
127 
128                 *dst = this->finalScale(sum2);
129 
130                 sum2 -= *buffer2Cursor;
131                 *buffer2Cursor = sum1;
132                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
133 
134                 sum1 -= *buffer1Cursor;
135                 *buffer1Cursor = sum0;
136                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
137 
138                 sum0 -= *buffer0Cursor;
139                 *buffer0Cursor = leadingEdge;
140                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
141             }
142 
143             // The leading edge is off the right side of the mask.
144             for (int i = 0; i < fNoChangeCount; i++) {
145                 uint32_t leadingEdge = 0;
146                 sum0 += leadingEdge;
147                 sum1 += sum0;
148                 sum2 += sum1;
149 
150                 *dst = this->finalScale(sum2);
151 
152                 sum2 -= *buffer2Cursor;
153                 *buffer2Cursor = sum1;
154                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
155 
156                 sum1 -= *buffer1Cursor;
157                 *buffer1Cursor = sum0;
158                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
159 
160                 sum0 -= *buffer0Cursor;
161                 *buffer0Cursor = leadingEdge;
162                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
163 
164                 dst += dstStride;
165             }
166 
167             // Starting from the right, fill in the rest of the buffer.
168             memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
169 
170             sum0 = sum1 = sum2 = 0;
171 
172             uint8_t* dstCursor = dstEnd;
173             AlphaIter src = srcEnd;
174             while (dstCursor > dst) {
175                 dstCursor -= dstStride;
176                 uint32_t leadingEdge = *(--src);
177                 sum0 += leadingEdge;
178                 sum1 += sum0;
179                 sum2 += sum1;
180 
181                 *dstCursor = this->finalScale(sum2);
182 
183                 sum2 -= *buffer2Cursor;
184                 *buffer2Cursor = sum1;
185                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
186 
187                 sum1 -= *buffer1Cursor;
188                 *buffer1Cursor = sum0;
189                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
190 
191                 sum0 -= *buffer0Cursor;
192                 *buffer0Cursor = leadingEdge;
193                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
194             }
195         }
196 
197     private:
198         static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
199 
finalScale(uint32_t sum) const200         uint8_t finalScale(uint32_t sum) const {
201             return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
202         }
203 
204         uint64_t  fWeight;
205         int       fNoChangeCount;
206         uint32_t* fBuffer0;
207         uint32_t* fBuffer0End;
208         uint32_t* fBuffer1;
209         uint32_t* fBuffer1End;
210         uint32_t* fBuffer2;
211         uint32_t* fBuffer2End;
212     };
213 
makeBlurScan(int width,uint32_t * buffer) const214     Scan makeBlurScan(int width, uint32_t* buffer) const {
215         uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
216         buffer0 = buffer;
217         buffer0End = buffer1 = buffer0 + fPass0Size;
218         buffer1End = buffer2 = buffer1 + fPass1Size;
219         buffer2End = buffer2 + fPass2Size;
220         int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;
221 
222         return Scan(
223             fWeight, noChangeCount,
224             buffer0, buffer0End,
225             buffer1, buffer1End,
226             buffer2, buffer2End);
227     }
228 
229     uint64_t fWeight;
230     int      fBorder;
231     int      fSlidingWindow;
232     int      fPass0Size;
233     int      fPass1Size;
234     int      fPass2Size;
235 };
236 
237 } // namespace
238 
239 // NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
240 // using the Gauss filter. It also limits the size of buffers used hold intermediate values. The
241 // additional + 1 added to window represents adding one more leading element before subtracting the
242 // trailing element.
243 // Explanation of maximums:
244 //   sum0 = (window + 1) * 255
245 //   sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255
246 //   sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255
247 //
248 //   The value (window + 1)^3 * 255 must fit in a uint32_t. So,
249 //      (window + 1)^3 * 255 < 2^32. window = 255.
250 //
251 //   window = floor(sigma * 3 * sqrt(2 * kPi) / 4)
252 //   For window <= 255, the largest value for sigma is 135.
SkMaskBlurFilter(double sigmaW,double sigmaH)253 SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
254     : fSigmaW{SkTPin(sigmaW, 0.0, 135.0)}
255     , fSigmaH{SkTPin(sigmaH, 0.0, 135.0)}
256 {
257     SkASSERT(sigmaW >= 0);
258     SkASSERT(sigmaH >= 0);
259 }
260 
hasNoBlur() const261 bool SkMaskBlurFilter::hasNoBlur() const {
262     return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1);
263 }
264 
265 // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first.
266 // Each of these converts width (up to 8) mask values to A8.
bw_to_a8(uint8_t * a8,const uint8_t * from,int width)267 static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) {
268     SkASSERT(0 < width && width <= 8);
269 
270     uint8_t masks = *from;
271     for (int i = 0; i < width; ++i) {
272         a8[i] = (masks >> (7 - i)) & 1 ? 0xFF
273                                        : 0x00;
274     }
275 }
lcd_to_a8(uint8_t * a8,const uint8_t * from,int width)276 static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) {
277     SkASSERT(0 < width && width <= 8);
278 
279     for (int i = 0; i < width; ++i) {
280         unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i],
281                    r = SkPacked16ToR32(rgb),
282                    g = SkPacked16ToG32(rgb),
283                    b = SkPacked16ToB32(rgb);
284         a8[i] = (r + g + b) / 3;
285     }
286 }
argb32_to_a8(uint8_t * a8,const uint8_t * from,int width)287 static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
288     SkASSERT(0 < width && width <= 8);
289     for (int i = 0; i < width; ++i) {
290         uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i];
291         a8[i] = SkGetPackedA32(rgba);
292     }
293 }
294 using ToA8 = decltype(bw_to_a8);
295 
load(const uint8_t * from,int width,ToA8 * toA8)296 static Sk8h load(const uint8_t* from, int width, ToA8* toA8) {
297     // Our fast path is a full 8-byte load of A8.
298     // So we'll conditionally handle the two slow paths using tmp:
299     //    - if we have a function to convert another mask to A8, use it;
300     //    - if not but we have less than 8 bytes to load, load them one at a time.
301     uint8_t tmp[8] = {0,0,0,0, 0,0,0,0};
302     if (toA8) {
303         toA8(tmp, from, width);
304         from = tmp;
305     } else if (width < 8) {
306         for (int i = 0; i < width; ++i) {
307             tmp[i] = from[i];
308         }
309         from = tmp;
310     }
311 
312     // Load A8 and convert to 8.8 fixed-point.
313     return SkNx_cast<uint16_t>(Sk8b::Load(from)) << 8;
314 }
315 
store(uint8_t * to,const Sk8h & v,int width)316 static void store(uint8_t* to, const Sk8h& v, int width) {
317     Sk8b b = SkNx_cast<uint8_t>(v >> 8);
318     if (width == 8) {
319         b.store(to);
320     } else {
321         uint8_t buffer[8];
322         b.store(buffer);
323         for (int i = 0; i < width; i++) {
324             to[i] = buffer[i];
325         }
326     }
327 };
328 
329 static constexpr uint16_t _____ = 0u;
330 static constexpr uint16_t kHalf = 0x80u;
331 
332 // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
333 // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
334 // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits
335 // resulting in a 8.8 format.
336 //
337 // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
338 // system is setup to minimize the number of multiplies needed.
339 //
340 // Explanation:
341 //    Blurring a specific mask value is given by the following equation where D_n is the resulting
342 // mask value and S_n is the source value. The example below is for a filter with a radius of 1
343 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
344 // aligned. The filter is given by G_n where n is the symmetric filter value.
345 //
346 //   D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
347 //
348 // We can start the source index at an offset relative to the destination separated by the
349 // radius. This results in a non-traditional restating of the above filter.
350 //
351 //  D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
352 //
353 // If we look at three specific consecutive destinations the following equations result:
354 //
355 //   D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
356 //   D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
357 //   D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
358 //
359 // In the above equations, notice that S[7] is used in all three. In particular, two values are
360 // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
361 // D[7].
362 //
363 // From the point of view of a source value we end up with the following three equations.
364 //
365 // Given S[7]:
366 //   D[5] += S[7]*G[1]
367 //   D[6] += S[7]*G[0]
368 //   D[7] += S[7]*G[1]
369 //
370 // In General:
371 //   D[n]   += S[n]*G[1]
372 //   D[n+1] += S[n]*G[0]
373 //   D[n+2] += S[n]*G[1]
374 //
375 // Now these equations can be ganged using SIMD to form:
376 //   D[n..n+7]   += S[n..n+7]*G[1]
377 //   D[n+1..n+8] += S[n..n+7]*G[0]
378 //   D[n+2..n+9] += S[n..n+7]*G[1]
379 // The next set of values becomes.
380 //   D[n+8..n+15]  += S[n+8..n+15]*G[1]
381 //   D[n+9..n+16]  += S[n+8..n+15]*G[0]
382 //   D[n+10..n+17] += S[n+8..n+15]*G[1]
383 // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
384 // S[n..7] and S[n+8..n+15].
385 //
386 // Just one more transformation allows the code to maintain all working values in
387 // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
388 // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
389 //
390 //   D[n..n+7]  += S[n..n+7] * G[1]
391 //   D[n..n+8]  += {0, S[n..n+7] * G[0]}
392 //   D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
393 //
394 // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
395 // register d8. In addition, S[0..n+7] becomes s0.
396 //
397 // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
398 //
399 // Sk8h v0 = s0*G[0]
400 // Sk8h v1 = s0*G[1]
401 // /* D[n..n+7]  += S[n..n+7] * G[1] */
402 // d0 += v1;
403 // /* D[n..n+8]  += {0, S[n..n+7] * G[0]} */
404 // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
405 // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
406 // /* D[n..n+9]  += {0, 0, S[n..n+7] * G[1]} */
407 // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
408 // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
409 // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
410 
blur_x_radius_1(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h &,const Sk8h &,const Sk8h &,Sk8h * d0,Sk8h * d8)411 static void blur_x_radius_1(
412         const Sk8h& s0,
413         const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
414         Sk8h* d0, Sk8h* d8) {
415 
416     auto v1 = s0.mulHi(g1);
417     auto v0 = s0.mulHi(g0);
418 
419     // D[n..n+7]  += S[n..n+7] * G[1]
420     *d0 += v1;
421 
422     //D[n..n+8]  += {0, S[n..n+7] * G[0]}
423     *d0 += Sk8h{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
424     *d8 += Sk8h{v0[7], _____, _____, _____, _____, _____, _____, _____};
425 
426     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
427     *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
428     *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
429 
430 }
431 
blur_x_radius_2(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h &,const Sk8h &,Sk8h * d0,Sk8h * d8)432 static void blur_x_radius_2(
433         const Sk8h& s0,
434         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
435         Sk8h* d0, Sk8h* d8) {
436     auto v0 = s0.mulHi(g0);
437     auto v1 = s0.mulHi(g1);
438     auto v2 = s0.mulHi(g2);
439 
440     // D[n..n+7]  += S[n..n+7] * G[2]
441     *d0 += v2;
442 
443     // D[n..n+8]  += {0, S[n..n+7] * G[1]}
444     *d0 += Sk8h{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
445     *d8 += Sk8h{v1[7], _____, _____, _____, _____, _____, _____, _____};
446 
447     // D[n..n+9]  += {0, 0, S[n..n+7] * G[0]}
448     *d0 += Sk8h{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
449     *d8 += Sk8h{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
450 
451     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
452     *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
453     *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
454 
455     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[2]}
456     *d0 += Sk8h{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
457     *d8 += Sk8h{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
458 }
459 
blur_x_radius_3(const Sk8h & s0,const Sk8h & gauss0,const Sk8h & gauss1,const Sk8h & gauss2,const Sk8h & gauss3,const Sk8h &,Sk8h * d0,Sk8h * d8)460 static void blur_x_radius_3(
461         const Sk8h& s0,
462         const Sk8h& gauss0, const Sk8h& gauss1, const Sk8h& gauss2, const Sk8h& gauss3, const Sk8h&,
463         Sk8h* d0, Sk8h* d8) {
464     auto v0 = s0.mulHi(gauss0);
465     auto v1 = s0.mulHi(gauss1);
466     auto v2 = s0.mulHi(gauss2);
467     auto v3 = s0.mulHi(gauss3);
468 
469     // D[n..n+7]  += S[n..n+7] * G[3]
470     *d0 += v3;
471 
472     // D[n..n+8]  += {0, S[n..n+7] * G[2]}
473     *d0 += Sk8h{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
474     *d8 += Sk8h{v2[7], _____, _____, _____, _____, _____, _____, _____};
475 
476     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
477     *d0 += Sk8h{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
478     *d8 += Sk8h{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
479 
480     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[0]}
481     *d0 += Sk8h{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
482     *d8 += Sk8h{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
483 
484     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[1]}
485     *d0 += Sk8h{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
486     *d8 += Sk8h{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
487 
488     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
489     *d0 += Sk8h{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
490     *d8 += Sk8h{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
491 
492     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
493     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
494     *d8 += Sk8h{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
495 }
496 
blur_x_radius_4(const Sk8h & s0,const Sk8h & gauss0,const Sk8h & gauss1,const Sk8h & gauss2,const Sk8h & gauss3,const Sk8h & gauss4,Sk8h * d0,Sk8h * d8)497 static void blur_x_radius_4(
498         const Sk8h& s0,
499         const Sk8h& gauss0,
500         const Sk8h& gauss1,
501         const Sk8h& gauss2,
502         const Sk8h& gauss3,
503         const Sk8h& gauss4,
504         Sk8h* d0, Sk8h* d8) {
505     auto v0 = s0.mulHi(gauss0);
506     auto v1 = s0.mulHi(gauss1);
507     auto v2 = s0.mulHi(gauss2);
508     auto v3 = s0.mulHi(gauss3);
509     auto v4 = s0.mulHi(gauss4);
510 
511     // D[n..n+7]  += S[n..n+7] * G[4]
512     *d0 += v4;
513 
514     // D[n..n+8]  += {0, S[n..n+7] * G[3]}
515     *d0 += Sk8h{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
516     *d8 += Sk8h{v3[7], _____, _____, _____, _____, _____, _____, _____};
517 
518     // D[n..n+9]  += {0, 0, S[n..n+7] * G[2]}
519     *d0 += Sk8h{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
520     *d8 += Sk8h{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
521 
522     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
523     *d0 += Sk8h{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
524     *d8 += Sk8h{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
525 
526     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[0]}
527     *d0 += Sk8h{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
528     *d8 += Sk8h{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
529 
530     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
531     *d0 += Sk8h{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
532     *d8 += Sk8h{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
533 
534     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
535     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
536     *d8 += Sk8h{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
537 
538     // D[n..n+14]  += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
539     *d0 += Sk8h{_____, _____, _____, _____, _____, _____, _____, v3[0]};
540     *d8 += Sk8h{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
541 
542     // D[n..n+15]  += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
543     *d8 += v4;
544 }
545 
546 using BlurX = decltype(blur_x_radius_1);
547 
548 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
blur_row(BlurX blur,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h & g3,const Sk8h & g4,const uint8_t * src,int srcW,uint8_t * dst,int dstW)549 static void blur_row(
550         BlurX blur,
551         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
552         const uint8_t* src, int srcW,
553               uint8_t* dst, int dstW) {
554     // Clear the buffer to handle summing wider than source.
555     Sk8h d0{kHalf}, d8{kHalf};
556 
557     // Go by multiples of 8 in src.
558     int x = 0;
559     for (; x <= srcW - 8; x += 8) {
560         blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
561 
562         store(dst, d0, 8);
563 
564         d0 = d8;
565         d8 = Sk8h{kHalf};
566 
567         src += 8;
568         dst += 8;
569     }
570 
571     // There are src values left, but the remainder of src values is not a multiple of 8.
572     int srcTail = srcW - x;
573     if (srcTail > 0) {
574 
575         blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
576 
577         int dstTail = std::min(8, dstW - x);
578         store(dst, d0, dstTail);
579 
580         d0 = d8;
581         dst += dstTail;
582         x += dstTail;
583     }
584 
585     // There are dst mask values to complete.
586     int dstTail = dstW - x;
587     if (dstTail > 0) {
588         store(dst, d0, dstTail);
589     }
590 }
591 
592 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
blur_x_rect(BlurX blur,uint16_t * gauss,const uint8_t * src,size_t srcStride,int srcW,uint8_t * dst,size_t dstStride,int dstW,int dstH)593 static void blur_x_rect(BlurX blur,
594                         uint16_t* gauss,
595                         const uint8_t* src, size_t srcStride, int srcW,
596                         uint8_t* dst, size_t dstStride, int dstW, int dstH) {
597 
598     Sk8h g0{gauss[0]},
599          g1{gauss[1]},
600          g2{gauss[2]},
601          g3{gauss[3]},
602          g4{gauss[4]};
603 
604     // Blur *ALL* the rows.
605     for (int y = 0; y < dstH; y++) {
606         blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
607         src += srcStride;
608         dst += dstStride;
609     }
610 }
611 
direct_blur_x(int radius,uint16_t * gauss,const uint8_t * src,size_t srcStride,int srcW,uint8_t * dst,size_t dstStride,int dstW,int dstH)612 static void direct_blur_x(int radius, uint16_t* gauss,
613                           const uint8_t* src, size_t srcStride, int srcW,
614                           uint8_t* dst, size_t dstStride, int dstW, int dstH) {
615 
616     switch (radius) {
617         case 1:
618             blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
619             break;
620 
621         case 2:
622             blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
623             break;
624 
625         case 3:
626             blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
627             break;
628 
629         case 4:
630             blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
631             break;
632 
633         default:
634             SkASSERTF(false, "The radius %d is not handled\n", radius);
635     }
636 }
637 
638 // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
639 // functions, but end up being simpler because there is no complicated shift of registers. We
640 // start with the non-traditional form of the gaussian filter. In the following r is the value
641 // when added generates the next value in the column.
642 //
643 //   D[n+0r] = S[n+0r]*G[1]
644 //           + S[n+1r]*G[0]
645 //           + S[n+2r]*G[1]
646 //
647 // Expanding out in a way similar to blur_x_radius_N for specific values of n.
648 //
649 //   D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
650 //   D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
651 //   D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
652 //
653 // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
654 // can look at the calculation form the point of view of a source value.
655 //
656 //   Given S[n+0r]:
657 //   D[n+0r] += S[n+0r]*G[1];
658 //   /* D[n+0r] is done and can be stored now. */
659 //   D[n+1r] += S[n+0r]*G[0];
660 //   D[n+2r]  = S[n+0r]*G[1];
661 //
662 // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
663 // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
664 // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
665 // how values flow in the following two iterations in source.
666 //
667 //   D[n+0r] += S[n+0r]*G[1]
668 //   D[n+1r] += S[n+0r]*G[0]
669 //   D[n+2r]  = S[n+0r]*G[1]
670 //   /* ------- */
671 //   D[n+1r] += S[n+1r]*G[1]
672 //   D[n+2r] += S[n+1r]*G[0]
673 //   D[n+3r]  = S[n+1r]*G[1]
674 //
675 // Instead of using memory we can introduce temporaries d01 and d12. The update step changes
676 // to the following.
677 //
678 //   answer = d01 + S[n+0r]*G[1]
679 //   d01    = d12 + S[n+0r]*G[0]
680 //   d12    =       S[n+0r]*G[1]
681 //   return answer
682 //
683 // Finally, this can be ganged into SIMD style.
684 //   answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
685 //   d01[0..7]    = d12[0..7] + S[n+0r..n+0r+7]*G[0]
686 //   d12[0..7]    =             S[n+0r..n+0r+7]*G[1]
687 //   return answer[0..7]
blur_y_radius_1(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h &,const Sk8h &,const Sk8h &,Sk8h * d01,Sk8h * d12,Sk8h *,Sk8h *,Sk8h *,Sk8h *,Sk8h *,Sk8h *)688 static Sk8h blur_y_radius_1(
689         const Sk8h& s0,
690         const Sk8h& g0, const Sk8h& g1, const Sk8h&, const Sk8h&, const Sk8h&,
691         Sk8h* d01, Sk8h* d12, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
692     auto v0 = s0.mulHi(g0);
693     auto v1 = s0.mulHi(g1);
694 
695     Sk8h answer = *d01 + v1;
696            *d01 = *d12 + v0;
697            *d12 =        v1 + kHalf;
698 
699     return answer;
700 }
701 
blur_y_radius_2(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h &,const Sk8h &,Sk8h * d01,Sk8h * d12,Sk8h * d23,Sk8h * d34,Sk8h *,Sk8h *,Sk8h *,Sk8h *)702 static Sk8h blur_y_radius_2(
703         const Sk8h& s0,
704         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h&, const Sk8h&,
705         Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h*, Sk8h*, Sk8h*, Sk8h*) {
706     auto v0 = s0.mulHi(g0);
707     auto v1 = s0.mulHi(g1);
708     auto v2 = s0.mulHi(g2);
709 
710     Sk8h answer = *d01 + v2;
711            *d01 = *d12 + v1;
712            *d12 = *d23 + v0;
713            *d23 = *d34 + v1;
714            *d34 =        v2 + kHalf;
715 
716     return answer;
717 }
718 
blur_y_radius_3(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h & g3,const Sk8h &,Sk8h * d01,Sk8h * d12,Sk8h * d23,Sk8h * d34,Sk8h * d45,Sk8h * d56,Sk8h *,Sk8h *)719 static Sk8h blur_y_radius_3(
720         const Sk8h& s0,
721         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h&,
722         Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h*, Sk8h*) {
723     auto v0 = s0.mulHi(g0);
724     auto v1 = s0.mulHi(g1);
725     auto v2 = s0.mulHi(g2);
726     auto v3 = s0.mulHi(g3);
727 
728     Sk8h answer = *d01 + v3;
729            *d01 = *d12 + v2;
730            *d12 = *d23 + v1;
731            *d23 = *d34 + v0;
732            *d34 = *d45 + v1;
733            *d45 = *d56 + v2;
734            *d56 =        v3 + kHalf;
735 
736     return answer;
737 }
738 
blur_y_radius_4(const Sk8h & s0,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h & g3,const Sk8h & g4,Sk8h * d01,Sk8h * d12,Sk8h * d23,Sk8h * d34,Sk8h * d45,Sk8h * d56,Sk8h * d67,Sk8h * d78)739 static Sk8h blur_y_radius_4(
740     const Sk8h& s0,
741     const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
742     Sk8h* d01, Sk8h* d12, Sk8h* d23, Sk8h* d34, Sk8h* d45, Sk8h* d56, Sk8h* d67, Sk8h* d78) {
743     auto v0 = s0.mulHi(g0);
744     auto v1 = s0.mulHi(g1);
745     auto v2 = s0.mulHi(g2);
746     auto v3 = s0.mulHi(g3);
747     auto v4 = s0.mulHi(g4);
748 
749     Sk8h answer = *d01 + v4;
750            *d01 = *d12 + v3;
751            *d12 = *d23 + v2;
752            *d23 = *d34 + v1;
753            *d34 = *d45 + v0;
754            *d45 = *d56 + v1;
755            *d56 = *d67 + v2;
756            *d67 = *d78 + v3;
757            *d78 =        v4 + kHalf;
758 
759     return answer;
760 }
761 
762 using BlurY = decltype(blur_y_radius_1);
763 
764 // BlurY will be one of blur_y_radius_(1|2|3|4).
blur_column(ToA8 toA8,BlurY blur,int radius,int width,const Sk8h & g0,const Sk8h & g1,const Sk8h & g2,const Sk8h & g3,const Sk8h & g4,const uint8_t * src,size_t srcRB,int srcH,uint8_t * dst,size_t dstRB)765 static void blur_column(
766         ToA8 toA8,
767         BlurY blur, int radius, int width,
768         const Sk8h& g0, const Sk8h& g1, const Sk8h& g2, const Sk8h& g3, const Sk8h& g4,
769         const uint8_t* src, size_t srcRB, int srcH,
770         uint8_t* dst, size_t dstRB) {
771     Sk8h d01{kHalf}, d12{kHalf}, d23{kHalf}, d34{kHalf},
772          d45{kHalf}, d56{kHalf}, d67{kHalf}, d78{kHalf};
773 
774     auto flush = [&](uint8_t* to, const Sk8h& v0, const Sk8h& v1) {
775         store(to, v0, width);
776         to += dstRB;
777         store(to, v1, width);
778         return to + dstRB;
779     };
780 
781     for (int y = 0; y < srcH; y += 1) {
782         auto s = load(src, width, toA8);
783         auto b = blur(s,
784                       g0, g1, g2, g3, g4,
785                       &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
786         store(dst, b, width);
787         src += srcRB;
788         dst += dstRB;
789     }
790 
791     if (radius >= 1) {
792         dst = flush(dst, d01, d12);
793     }
794     if (radius >= 2) {
795         dst = flush(dst, d23, d34);
796     }
797     if (radius >= 3) {
798         dst = flush(dst, d45, d56);
799     }
800     if (radius >= 4) {
801               flush(dst, d67, d78);
802     }
803 }
804 
805 // BlurY will be one of blur_y_radius_(1|2|3|4).
blur_y_rect(ToA8 toA8,const int strideOf8,BlurY blur,int radius,uint16_t * gauss,const uint8_t * src,size_t srcRB,int srcW,int srcH,uint8_t * dst,size_t dstRB)806 static void blur_y_rect(ToA8 toA8, const int strideOf8,
807                         BlurY blur, int radius, uint16_t *gauss,
808                         const uint8_t *src, size_t srcRB, int srcW, int srcH,
809                         uint8_t *dst, size_t dstRB) {
810 
811     Sk8h g0{gauss[0]},
812          g1{gauss[1]},
813          g2{gauss[2]},
814          g3{gauss[3]},
815          g4{gauss[4]};
816 
817     int x = 0;
818     for (; x <= srcW - 8; x += 8) {
819         blur_column(toA8, blur, radius, 8,
820                     g0, g1, g2, g3, g4,
821                     src, srcRB, srcH,
822                     dst, dstRB);
823         src += strideOf8;
824         dst += 8;
825     }
826 
827     int xTail = srcW - x;
828     if (xTail > 0) {
829         blur_column(toA8, blur, radius, xTail,
830                     g0, g1, g2, g3, g4,
831                     src, srcRB, srcH,
832                     dst, dstRB);
833     }
834 }
835 
direct_blur_y(ToA8 toA8,const int strideOf8,int radius,uint16_t * gauss,const uint8_t * src,size_t srcRB,int srcW,int srcH,uint8_t * dst,size_t dstRB)836 static void direct_blur_y(ToA8 toA8, const int strideOf8,
837                           int radius, uint16_t* gauss,
838                           const uint8_t* src, size_t srcRB, int srcW, int srcH,
839                           uint8_t* dst, size_t dstRB) {
840 
841     switch (radius) {
842         case 1:
843             blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss,
844                         src, srcRB, srcW, srcH,
845                         dst, dstRB);
846             break;
847 
848         case 2:
849             blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss,
850                         src, srcRB, srcW, srcH,
851                         dst, dstRB);
852             break;
853 
854         case 3:
855             blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss,
856                         src, srcRB, srcW, srcH,
857                         dst, dstRB);
858             break;
859 
860         case 4:
861             blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss,
862                         src, srcRB, srcW, srcH,
863                         dst, dstRB);
864             break;
865 
866         default:
867             SkASSERTF(false, "The radius %d is not handled\n", radius);
868     }
869 }
870 
small_blur(double sigmaX,double sigmaY,const SkMask & src,SkMask * dst)871 static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMask* dst) {
872     SkASSERT(sigmaX == sigmaY); // TODO
873     SkASSERT(0.01 <= sigmaX && sigmaX < 2);
874     SkASSERT(0.01 <= sigmaY && sigmaY < 2);
875 
876     SkGaussFilter filterX{sigmaX},
877                   filterY{sigmaY};
878 
879     int radiusX = filterX.radius(),
880         radiusY = filterY.radius();
881 
882     SkASSERT(radiusX <= 4 && radiusY <= 4);
883 
884     auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
885         int i = 0;
886         for (double d : filter) {
887             factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
888         }
889     };
890 
891     uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax],
892              gaussFactorsY[SkGaussFilter::kGaussArrayMax];
893 
894     prepareGauss(filterX, gaussFactorsX);
895     prepareGauss(filterY, gaussFactorsY);
896 
897     *dst = SkMask::PrepareDestination(radiusX, radiusY, src);
898     if (src.fImage == nullptr) {
899         return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
900     }
901     if (dst->fImage == nullptr) {
902         dst->fBounds.setEmpty();
903         return {0, 0};
904     }
905 
906     int srcW = src.fBounds.width(),
907         srcH = src.fBounds.height();
908 
909     int dstW = dst->fBounds.width(),
910         dstH = dst->fBounds.height();
911 
912     size_t srcRB = src.fRowBytes,
913            dstRB = dst->fRowBytes;
914 
915     //TODO: handle bluring in only one direction.
916 
917     // Blur vertically and copy to destination.
918     switch (src.fFormat) {
919         case SkMask::kBW_Format:
920             direct_blur_y(bw_to_a8, 1,
921                           radiusY, gaussFactorsY,
922                           src.fImage, srcRB, srcW, srcH,
923                           dst->fImage + radiusX, dstRB);
924             break;
925         case SkMask::kA8_Format:
926             direct_blur_y(nullptr, 8,
927                           radiusY, gaussFactorsY,
928                           src.fImage, srcRB, srcW, srcH,
929                           dst->fImage + radiusX, dstRB);
930             break;
931         case SkMask::kARGB32_Format:
932             direct_blur_y(argb32_to_a8, 32,
933                           radiusY, gaussFactorsY,
934                           src.fImage, srcRB, srcW, srcH,
935                           dst->fImage + radiusX, dstRB);
936             break;
937         case SkMask::kLCD16_Format:
938             direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY,
939                           src.fImage, srcRB, srcW, srcH,
940                           dst->fImage + radiusX, dstRB);
941             break;
942         default:
943             SK_ABORT("Unhandled format.");
944     }
945 
946     // Blur horizontally in place.
947     direct_blur_x(radiusX, gaussFactorsX,
948                   dst->fImage + radiusX,  dstRB, srcW,
949                   dst->fImage,            dstRB, dstW, dstH);
950 
951     return {radiusX, radiusY};
952 }
953 
954 // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
955 // API forces the sigmas to be the same.
blur(const SkMask & src,SkMask * dst) const956 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMask* dst) const {
957 
958     if (fSigmaW < 2.0 && fSigmaH < 2.0) {
959         return small_blur(fSigmaW, fSigmaH, src, dst);
960     }
961 
962     // 1024 is a place holder guess until more analysis can be done.
963     SkSTArenaAlloc<1024> alloc;
964 
965     PlanGauss planW(fSigmaW);
966     PlanGauss planH(fSigmaH);
967 
968     int borderW = planW.border(),
969         borderH = planH.border();
970     SkASSERT(borderH >= 0 && borderW >= 0);
971 
972     *dst = SkMask::PrepareDestination(borderW, borderH, src);
973     if (src.fImage == nullptr) {
974         return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
975     }
976     if (dst->fImage == nullptr) {
977         dst->fBounds.setEmpty();
978         return {0, 0};
979     }
980 
981     int srcW = src.fBounds.width(),
982         srcH = src.fBounds.height(),
983         dstW = dst->fBounds.width(),
984         dstH = dst->fBounds.height();
985     SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);
986 
987     auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize());
988     auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);
989 
990     // Blur both directions.
991     int tmpW = srcH,
992         tmpH = dstW;
993 
994     auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
995 
996     // Blur horizontally, and transpose.
997     const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer);
998     switch (src.fFormat) {
999         case SkMask::kBW_Format: {
1000             const uint8_t* bwStart = src.fImage;
1001             auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0);
1002             auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8);
1003             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1004                 auto tmpStart = &tmp[y];
1005                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1006             }
1007         } break;
1008         case SkMask::kA8_Format: {
1009             const uint8_t* a8Start = src.fImage;
1010             auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start);
1011             auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW);
1012             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1013                 auto tmpStart = &tmp[y];
1014                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1015             }
1016         } break;
1017         case SkMask::kARGB32_Format: {
1018             const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage);
1019             auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart);
1020             auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW);
1021             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1022                 auto tmpStart = &tmp[y];
1023                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1024             }
1025         } break;
1026         case SkMask::kLCD16_Format: {
1027             const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage);
1028             auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart);
1029             auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW);
1030             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1031                 auto tmpStart = &tmp[y];
1032                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1033             }
1034         } break;
1035         default:
1036             SK_ABORT("Unhandled format.");
1037     }
1038 
1039     // Blur vertically (scan in memory order because of the transposition),
1040     // and transpose back to the original orientation.
1041     const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer);
1042     for (int y = 0; y < tmpH; y++) {
1043         auto tmpStart = &tmp[y * tmpW];
1044         auto dstStart = &dst->fImage[y];
1045 
1046         scanH.blur(tmpStart, tmpStart + tmpW,
1047                    dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
1048     }
1049 
1050     return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
1051 }
1052