/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"
#include "saturate_cast.hpp"
#include "separable_filter.hpp"

namespace CAROTENE_NS {

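// The hand-written NEON path below processes 8 pixels per iteration, so rows
// narrower than 8 pixels are rejected; only CONSTANT and REPLICATE borders
// are implemented by this 3x3 kernel.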
bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)
{
    return isSupportedConfiguration() && size.width >= 8 &&
        (border == BORDER_MODE_CONSTANT ||
            border == BORDER_MODE_REPLICATE);
}

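// 3x3 Gaussian blur with the separable kernel [1 2 1]^T * [1 2 1] / 16.
// Each row pass computes the vertical sum up + 2*mid + down into 16-bit
// lanes (at most 255 * 4, so u16 cannot overflow); the horizontal [1 2 1]
// sum is then formed by combining shifted copies of adjacent vectors and
// normalized with a single shift right by 4.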
void gaussianBlur3x3(const Size2D &size,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));
#ifdef CAROTENE_NEON
    const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
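        // bwidth trims the vector loop for the bottom two rows: there an
        // 8-byte over-read past the row end could leave the image allocation,
        // so the scalar tail below finishes the row instead; for interior
        // rows the over-read stays inside the image and its result is
        // recomputed by the tail anyway.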

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value
            tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));

            // make extrapolation for the first elements
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x4;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them
            t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));
            vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));
        }

        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue << 2;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + (srow1[x] << 1) + srow0[x];
            }
            else
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        (srow1[x + 1] << 1) +
                        (srow0 ? srow0[x + 1] : borderValue);
            s32 val = (prevx + (currx << 1) + nextx) >> 4;
            drow[x] = internal::saturate_cast<u8>(val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
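
// Editorial usage sketch (not part of the original source): how a caller
// might invoke the 3x3 blur on a tightly packed 8-bit grayscale image.
// The names src, dst, width and height are hypothetical.
//
//     CAROTENE_NS::Size2D size(width, height);
//     if (CAROTENE_NS::isGaussianBlur3x3Supported(size, CAROTENE_NS::BORDER_MODE_REPLICATE))
//         CAROTENE_NS::gaussianBlur3x3(size,
//                                      src, width,   // source pointer and stride in bytes
//                                      dst, width,   // destination pointer and stride
//                                      CAROTENE_NS::BORDER_MODE_REPLICATE, 0);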

bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)
{
    return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);
}

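// The margin-aware 3x3 blur reuses the generic separable-filter machinery:
// RowFilter3x3S16_121 and ColFilter3x3U8_121 apply the same horizontal and
// vertical [1 2 1] passes while honoring the supplied border margins.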
void gaussianBlur3x3Margin(const Size2D &size,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));
#ifdef CAROTENE_NEON
    internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(
                           size, srcBase, srcStride, dstBase, dstStride,
                           0, 0, border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}

bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)
{
    return isSupportedConfiguration() &&
           cn > 0 && cn <= 4 &&
           size.width >= 8 && size.height >= 2 &&
           (border == BORDER_MODE_CONSTANT ||
            border == BORDER_MODE_REFLECT101 ||
            border == BORDER_MODE_REFLECT ||
            border == BORDER_MODE_REPLICATE ||
            border == BORDER_MODE_WRAP);
}

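// 5x5 Gaussian blur with the separable kernel [1 4 6 4 1]^T * [1 4 6 4 1] / 256.
// For each destination row, the vertical pass accumulates the five weighted
// source rows into a widened one-line buffer (u16 here: at most 255 * 16 =
// 4080), the horizontal pass applies the same weights (at most 4080 * 16 =
// 65280, still within u16), and the result is normalized with a rounding
// shift right by 8.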
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }
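    // When set, tmp points into a row pre-filled with borderValue; it stands
    // in for any of the four vertical neighbor rows that fall outside the
    // image and its margins.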

    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
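    // The idx_* values above are the (channel-scaled) columns that positions
    // -2, -1, width and width + 1 map to under the chosen border mode; they
    // fill the lane buffer's guard cells after each vertical pass.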

    //1-line buffer
    std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[cn << 1], 32);
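    // lane points 2*cn elements into _buf and is aligned to 32 bytes; the
    // guard cells on both sides let the horizontal pass read
    // lane[-2*cn .. colsn + 2*cn - 1] without branching.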

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);
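    // Broadcast kernel weights: 6 for the center tap and 4 for the +/-1
    // taps (the +/-2 taps have weight 1 and are plain additions); used by
    // the vmlal/vmla steps below.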

    for (size_t i = 0; i < size.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 8; x += 8)
        {
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x <= colsn - 8; x += 8)
            {
                internal::prefetch(lane + x);

                uint16x8_t lane0 = vld1q_u16(lane + x - 2);
                uint16x8_t lane4 = vld1q_u16(lane + x + 2);
                uint16x8_t lane1 = vld1q_u16(lane + x - 1);
                uint16x8_t lane3 = vld1q_u16(lane + x + 1);
                uint16x8_t lane2 = vld1q_u16(lane + x + 0);

                uint16x8_t ln04 = vaddq_u16(lane0, lane4);
                uint16x8_t ln13 = vaddq_u16(lane1, lane3);

                uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);
                uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);

                uint8x8_t ls = vrshrn_n_u16(lsw, 8);

                vst1_u8(dst + x, ls);
            }
            break;
        case 2:
            for (; x <= colsn - 8*2; x += 8*2)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*2;
                u16* lidx1 = lane + x - 1*2;
                u16* lidx3 = lane + x + 1*2;
                u16* lidx4 = lane + x + 2*2;
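                // Inline assembly here works around weak codegen for the
                // interleaved vld2/vld3/vld4 intrinsics on pre-4.7 GCC for
                // 32-bit ARM (presumably the motivation; newer compilers
                // take the intrinsics path below).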
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
                __asm__ __volatile__ (
                    "vld2.16 {d0, d2}, [%[in0]]!                              \n\t"
                    "vld2.16 {d1, d3}, [%[in0]]                               \n\t"
                    "vld2.16 {d8, d10}, [%[in4]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in4]]                              \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in1]]!                            \n\t"
                    "vld2.16 {d17, d19}, [%[in1]]                             \n\t"
                    "vld2.16 {d8, d10}, [%[in3]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in3]]                              \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in2]]                             \n\t"
                    "vld2.16 {d17, d19}, [%[in22]]                            \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vst2.8 {d8-d9}, [%[out]]                                 \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x2_t vLane0 = vld2q_u16(lidx0);
                uint16x8x2_t vLane1 = vld2q_u16(lidx1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + x);
                uint16x8x2_t vLane3 = vld2q_u16(lidx3);
                uint16x8x2_t vLane4 = vld2q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);

                uint8x8x2_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vst2_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
            for (; x <= colsn - 8*3; x += 8*3)
            {
                internal::prefetch(lane + x);

                u16* lidx0 = lane + x - 2*3;
                u16* lidx1 = lane + x - 1*3;
                u16* lidx3 = lane + x + 1*3;
                u16* lidx4 = lane + x + 2*3;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld3.16 {d0, d2, d4}, [%[in0]]!                          \n\t"
                    "vld3.16 {d1, d3, d5}, [%[in0]]                           \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in4]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in4]]                         \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in1]]!                       \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in1]]                        \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in3]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in3]]                         \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in2]]                        \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in22]]                       \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vst3.8 {d8-d10}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*3),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x3_t vLane0 = vld3q_u16(lidx0);
                uint16x8x3_t vLane1 = vld3q_u16(lidx1);
                uint16x8x3_t vLane2 = vld3q_u16(lane + x);
                uint16x8x3_t vLane3 = vld3q_u16(lidx3);
                uint16x8x3_t vLane4 = vld3q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]);

                uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);
                uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);
                uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);

                uint8x8x3_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);

                vst3_u8(dst + x, vRes);
#endif
            }
            break;
        case 4:
            for (; x <= colsn - 8*4; x += 8*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                u16* lidx0 = lane + x - 2*4;
                u16* lidx1 = lane + x - 1*4;
                u16* lidx3 = lane + x + 1*4;
                u16* lidx4 = lane + x + 2*4;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld4.16 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.16 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vadd.i16 q3, q7                                          \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vadd.i16 q7, q11                                         \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i16 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vrshrn.u16 d11, q3, #8                                   \n\t"
                    "vst4.8 {d8-d11}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*4),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                uint16x8x4_t vLane0 = vld4q_u16(lidx0);
                uint16x8x4_t vLane4 = vld4q_u16(lidx4);
                uint16x8x4_t vLane1 = vld4q_u16(lidx1);
                uint16x8x4_t vLane3 = vld4q_u16(lidx3);
                uint16x8x4_t vLane2 = vld4q_u16(lane + x);

                uint16x8_t vSum_0_4  = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5  = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6  = vaddq_u16(vLane0.val[2], vLane4.val[2]);
                uint16x8_t vSum_3_7  = vaddq_u16(vLane0.val[3], vLane4.val[3]);

                uint16x8_t vSum_4_8  = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9  = vaddq_u16(vLane1.val[1], vLane3.val[1]);
                uint16x8_t vSum_6_10 = vaddq_u16(vLane1.val[2], vLane3.val[2]);
                uint16x8_t vSum_7_11 = vaddq_u16(vLane1.val[3], vLane3.val[3]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vLane2.val[3], vc6u16);

                uint8x8x4_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);
                vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);

                vst4_u8(dst + x, vRes);
#endif
            }
            break;
        }
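        // Scalar tail: finish whatever columns the vector loop left, per
        // channel, applying the same weights with a rounding bias of 1 << 7
        // before the normalizing shift by 8.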
        for (s32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]
                               + u16(4) * (ln[k-cn] + ln[k+cn])
                               + u16(6) * ln[k] + (1 << 7)) >> 8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}

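// u16 variant of the 5x5 blur: same structure as the u8 version, but the
// one-line buffer widens to u32 (65535 * 16 still fits comfortably) and the
// result is narrowed back to u16 with a rounding shift by 8.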
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u16 * srcBase, ptrdiff_t srcStride,
                     u16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    std::vector<u16> _tmp;
    u16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32));
    u32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    uint16x4_t vc6u16 = vmov_n_u16(6);
    uint32x4_t vc6u32 = vmovq_n_u32(6);
    uint32x4_t vc4u32 = vmovq_n_u32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint16x4_t v0 = vld1_u16(ln0+x);
            uint16x4_t v1 = vld1_u16(ln1+x);
            uint16x4_t v2 = vld1_u16(ln2+x);
            uint16x4_t v3 = vld1_u16(ln3+x);
            uint16x4_t v4 = vld1_u16(ln4+x);

            uint32x4_t v = vaddl_u16(v0, v4);
            uint32x4_t v13 = vaddl_u16(v1, v3);

            v = vmlal_u16(v, v2, vc6u16);
            v = vmlaq_u32(v, v13, vc4u32);

            vst1q_u32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            uint32x4_t lane0 = vld1q_u32(lane + x - 2);
            uint32x4_t lane4 = vld1q_u32(lane + x + 2);
            uint32x4_t lane1 = vld1q_u32(lane + x - 1);
            uint32x4_t lane3 = vld1q_u32(lane + x + 1);
            uint32x4_t lane2 = vld1q_u32(lane + x + 0);

            uint32x4_t ln04 = vaddq_u32(lane0, lane4);
            uint32x4_t ln13 = vaddq_u32(lane1, lane3);

            uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);
            uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);

            uint16x4_t ls = vrshrn_n_u32(lsw, 8);

            vst1_u16(dst + x, ls);
        }
        for (s32 h = 0; h < cn; ++h)
        {
            u32* ln = lane + h;
            u16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}

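// s16 variant: identical layout with signed s32 accumulators; the narrowing
// uses vrshrn_n_s32, again a rounding shift by 8.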
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s16 * srcBase, ptrdiff_t srcStride,
                     s16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    std::vector<s16> _tmp;
    s16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int16x4_t v0 = vld1_s16(ln0+x);
            int16x4_t v1 = vld1_s16(ln1+x);
            int16x4_t v2 = vld1_s16(ln2+x);
            int16x4_t v3 = vld1_s16(ln3+x);
            int16x4_t v4 = vld1_s16(ln4+x);

            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        switch(cn)
        {
        case 1:
        case 2:
        case 3:
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        case 4:
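            // A hand-written vld4.32 assembly path for the 4-channel case was
            // left here disabled (see the block comment below); all channel
            // counts currently share the same generic intrinsics loop.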
/*            for (; x <= colsn - 4*4; x += 4*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                ptrdiff_t* lidx0 = lane + x - 2*4;
                ptrdiff_t* lidx1 = lane + x - 1*4;
                ptrdiff_t* lidx3 = lane + x + 1*4;
                ptrdiff_t* lidx4 = lane + x + 2*4;

                __asm__ __volatile__ (
                    "vld4.32 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.32 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i32 q0, q4                                          \n\t"
                    "vadd.i32 q1, q5                                          \n\t"
                    "vadd.i32 q2, q6                                          \n\t"
                    "vadd.i32 q3, q7                                          \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i32 q4, q8                                          \n\t"
                    "vadd.i32 q5, q9                                          \n\t"
                    "vadd.i32 q6, q10                                         \n\t"
                    "vadd.i32 q7, q11                                         \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i32 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i32 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i32 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i32 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i32 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i32 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i32 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i32 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.i32 d8, q0, #8                                    \n\t"
                    "vrshrn.i32 d9, q1, #8                                    \n\t"
                    "vrshrn.i32 d10, q2, #8                                   \n\t"
                    "vrshrn.i32 d11, q3, #8                                   \n\t"
                    "vst4.16 {d8-d11}, [%[out]]                               \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
*/
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        }
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}

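// s32 variant: unlike the narrower overloads, the result is stored as the
// raw weighted sum with no rounding or shift by 8, so outputs come out
// scaled by 256; this appears intentional, leaving normalization to the
// caller.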
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s32 * srcBase, ptrdiff_t srcStride,
                     s32 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    size_t colsn = size.width * cn;

    std::vector<s32> _tmp;
    s32 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s32* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int32x4_t v0 = vld1q_s32(ln0+x);
            int32x4_t v1 = vld1q_s32(ln1+x);
            int32x4_t v2 = vld1q_s32(ln2+x);
            int32x4_t v3 = vld1q_s32(ln3+x);
            int32x4_t v4 = vld1q_s32(ln4+x);

            int32x4_t v = vaddq_s32(v0, v4);
            int32x4_t v13 = vaddq_s32(v1, v3);

            v = vmlaq_s32(v, v2, vc6s32);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            int32x4_t lane0 = vld1q_s32(lane + x - 2);
            int32x4_t lane4 = vld1q_s32(lane + x + 2);
            int32x4_t lane1 = vld1q_s32(lane + x - 1);
            int32x4_t lane3 = vld1q_s32(lane + x + 1);
            int32x4_t lane2 = vld1q_s32(lane + x + 0);

            int32x4_t ln04 = vaddq_s32(lane0, lane4);
            int32x4_t ln13 = vaddq_s32(lane1, lane3);

            int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
            int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

            vst1q_s32(dst + x, lsw);
        }
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s32* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}

} // namespace CAROTENE_NS