1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "saturate_cast.hpp"
42 #include "separable_filter.hpp"
43
44 namespace CAROTENE_NS {
45
isGaussianBlur3x3Supported(const Size2D & size,BORDER_MODE border)46 bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border)
47 {
48 return isSupportedConfiguration() && size.width >= 8 &&
49 (border == BORDER_MODE_CONSTANT ||
50 border == BORDER_MODE_REPLICATE);
51 }
52
void gaussianBlur3x3(const Size2D &size,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border));
#ifdef CAROTENE_NEON
    // 3x3 Gaussian blur: the separable kernel [1 2 1] is applied vertically,
    // then horizontally; the sum of all nine weights is 16 (final >> 4).
    // A fully-out-of-image column contributes borderValue * 4 after the
    // vertical pass, hence v_border_x4.
    const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2);
    const uint16x8_t v_zero = vdupq_n_u16(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);

    // Sliding window of three 8-lane chunks of vertical-pass results
    // (u16 holds the maximum 255 * 4 without overflow).
    uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
    uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Rows above/at/below y.  A NULL pointer means "outside the image
        // with a constant border"; REPLICATE clamps to the first/last row.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar vertical-pass values feeding the tail loop below.
        s16 prevx = 0, currx = 0, nextx = 0;
        ptrdiff_t x = 0;
        // NOTE(review): the vector loop is shortened by 8 on the last two
        // rows — presumably so the 8-byte loads cannot run past the end of
        // the source buffer; confirm against the caller's allocation.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                // x3 is the first column the scalar tail will handle,
                // x4 the column to its left (border-interpolated).
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx = borderValue;
                else
                    prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue);

                currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue);
            }

            // make shift: advance the three-chunk window by one chunk
            if (x)
            {
                tprev = tcurr;
                tcurr = tnext;
            }

            // and calculate next value: 1*above + 2*center + 1*below
            tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1));

            // make extrapolation for the first elements
            if (!x)
            {
                // make border: seed tcurr with the left out-of-image column
                if (border == BORDER_MODE_CONSTANT)
                    tcurr = v_border_x4;
                else if (border == BORDER_MODE_REPLICATE)
                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));

                continue;
            }

            // combine 3 "shifted" vectors: left/center/right neighbours of
            // the 8 output pixels drow[x-8 .. x-1]
            t0 = vextq_u16(tprev, tcurr, 7);
            t1 = tcurr;
            t2 = vextq_u16(tcurr, tnext, 1);

            // and add them with weights [1 2 1], then >> 4 for the total 16
            t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2));
            vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4));
        }

        // Scalar tail: step back to the last chunk start and finish the row.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                    nextx = borderValue << 2;
                else if (border == BORDER_MODE_REPLICATE)
                    nextx = srow2[x] + (srow1[x] << 1) + srow0[x];
            }
            else
                nextx = (srow2 ? srow2[x + 1] : borderValue) +
                        (srow1[x + 1] << 1) +
                        (srow0 ? srow0[x + 1] : borderValue);

            // Horizontal pass and /16.  NOTE(review): the intermediate is an
            // f32 although the expression is pure integer — looks like a
            // leftover from a float implementation; behaviour is unaffected.
            f32 val = (prevx + (currx << 1) + nextx) >> 4;
            drow[x] = internal::saturate_cast<u8>((s32)val);

            // make shift
            prevx = currx;
            currx = nextx;
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
172
isGaussianBlur3x3MarginSupported(const Size2D & size,BORDER_MODE border,Margin borderMargin)173 bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin)
174 {
175 return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin);
176 }
177
void gaussianBlur3x3Margin(const Size2D &size,
                           const u8 * srcBase, ptrdiff_t srcStride,
                           u8 * dstBase, ptrdiff_t dstStride,
                           BORDER_MODE border, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin));
#ifdef CAROTENE_NEON
    // Delegate to the generic separable 3x3 engine instantiated with the
    // [1 2 1] row and column smoothing filters.
    // NOTE(review): the two 0 arguments are additional filter parameters
    // apparently unused by the *_121 kernels — confirm against
    // sepFilter3x3::process.
    internal::sepFilter3x3<internal::RowFilter3x3S16_121, internal::ColFilter3x3U8_121>::process(
        size, srcBase, srcStride, dstBase, dstStride,
        0, 0, border, borderValue, borderMargin);
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
#endif
}
196
isGaussianBlur5x5Supported(const Size2D & size,s32 cn,BORDER_MODE border)197 bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border)
198 {
199 return isSupportedConfiguration() &&
200 cn > 0 && cn <= 4 &&
201 size.width >= 8 && size.height >= 2 &&
202 (border == BORDER_MODE_CONSTANT ||
203 border == BORDER_MODE_REFLECT101 ||
204 border == BORDER_MODE_REFLECT ||
205 border == BORDER_MODE_REPLICATE ||
206 border == BORDER_MODE_WRAP);
207 }
208
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u8 * srcBase, ptrdiff_t srcStride,
                     u8 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    // 5x5 Gaussian for u8 images with 1..4 interleaved channels.
    // The separable kernel [1 4 6 4 1] is applied vertically into a u16 line
    // buffer, then horizontally; the total weight is 256, applied with
    // round-to-nearest (bias 1 << 7, shift >> 8).
    size_t colsn = size.width * cn;   // row length in elements (pixels * channels)

    // Replacement row used for source rows that fall fully outside the image
    // when the border is constant; it has 2*cn guard elements on each side.
    std::vector<u8> _tmp;
    u8 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Element offsets of the in-row columns that feed the left/right guard
    // cells of the line buffer for the non-constant border modes.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer: vertical-pass results for one row, 32-byte aligned,
    //with 2*cn guard cells before and after the payload.
    std::vector<u16> _buf(cn * (size.width + 4) + 32 / sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            // NOTE(review): guard cells hold the raw borderValue, not the
            // vertically-convolved border (16 * borderValue) — confirm this
            // is the intended constant-border behaviour.
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel-tap constants: 6 for the centre, 4 for the +/-1 taps.
    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u8* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: pick the five source rows i-2 .. i+2,
        //border-interpolated; rows outside even the margin use tmp.
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 8; x += 8)
        {
            // NOTE(review): prefetch of a row near the current position; the
            // "x % 5 - 2" row offset looks heuristic — confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint8x8_t v0 = vld1_u8(ln0+x);
            uint8x8_t v1 = vld1_u8(ln1+x);
            uint8x8_t v2 = vld1_u8(ln2+x);
            uint8x8_t v3 = vld1_u8(ln3+x);
            uint8x8_t v4 = vld1_u8(ln4+x);

            // v = 1*(row-2 + row+2) + 6*row0 + 4*(row-1 + row+1)
            uint16x8_t v = vaddl_u8(v0, v4);
            uint16x8_t v13 = vaddl_u8(v1, v3);

            v = vmlal_u8(v, v2, vc6u8);
            v = vmlaq_u16(v, v13, vc4u16);

            vst1q_u16(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x];

        //left&right borders: fill the guard cells from the interpolated
        //columns (constant border guards were seeded before the loop)
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution: per channel count so that the strided
        //(vld2q/vld3q/vld4q) loads keep each channel's taps together;
        //32-bit ARM GCC builds use hand-written inline assembly instead.
        x = 0;
        switch(cn)
        {
        case 1:
            for (; x <= colsn - 8; x += 8)
            {
                internal::prefetch(lane + x);

                // Five shifted views of the line buffer: taps -2..+2.
                uint16x8_t lane0 = vld1q_u16(lane + x - 2);
                uint16x8_t lane4 = vld1q_u16(lane + x + 2);
                uint16x8_t lane1 = vld1q_u16(lane + x - 1);
                uint16x8_t lane3 = vld1q_u16(lane + x + 1);
                uint16x8_t lane2 = vld1q_u16(lane + x + 0);

                uint16x8_t ln04 = vaddq_u16(lane0, lane4);
                uint16x8_t ln13 = vaddq_u16(lane1, lane3);

                uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16);
                uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16);

                // Rounding narrow: (sum + 128) >> 8 back to u8.
                uint8x8_t ls = vrshrn_n_u16(lsw, 8);

                vst1_u8(dst + x, ls);
            }
            break;
        case 2:
            for (; x <= colsn - 8*2; x += 8*2)
            {
                internal::prefetch(lane + x);

                // Tap pointers, offset by whole pixels (2 elements each).
                u16* lidx0 = lane + x - 2*2;
                u16* lidx1 = lane + x - 1*2;
                u16* lidx3 = lane + x + 1*2;
                u16* lidx4 = lane + x + 2*2;
// NOTE(review): asm path only for 32-bit GCC < 4.7 here, while the 3- and
// 4-channel cases use it for any 32-bit ARM GCC — confirm the discrepancy.
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
                __asm__ __volatile__ (
                    "vld2.16 {d0, d2}, [%[in0]]!                              \n\t"
                    "vld2.16 {d1, d3}, [%[in0]]                               \n\t"
                    "vld2.16 {d8, d10}, [%[in4]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in4]]                              \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in1]]!                            \n\t"
                    "vld2.16 {d17, d19}, [%[in1]]                             \n\t"
                    "vld2.16 {d8, d10}, [%[in3]]!                             \n\t"
                    "vld2.16 {d9, d11}, [%[in3]]                              \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vld2.16 {d16, d18}, [%[in2]]                             \n\t"
                    "vld2.16 {d17, d19}, [%[in22]]                            \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vst2.8 {d8-d9}, [%[out]]                                 \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // De-interleaved loads: val[0]/val[1] are the two channels.
                uint16x8x2_t vLane0 = vld2q_u16(lidx0);
                uint16x8x2_t vLane1 = vld2q_u16(lidx1);
                uint16x8x2_t vLane2 = vld2q_u16(lane + x);
                uint16x8x2_t vLane3 = vld2q_u16(lidx3);
                uint16x8x2_t vLane4 = vld2q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);

                uint8x8x2_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vst2_u8(dst + x, vRes);
#endif
            }
            break;
        case 3:
            for (; x <= colsn - 8*3; x += 8*3)
            {
                internal::prefetch(lane + x);

                // Tap pointers, offset by whole pixels (3 elements each).
                u16* lidx0 = lane + x - 2*3;
                u16* lidx1 = lane + x - 1*3;
                u16* lidx3 = lane + x + 1*3;
                u16* lidx4 = lane + x + 2*3;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld3.16 {d0, d2, d4}, [%[in0]]!                          \n\t"
                    "vld3.16 {d1, d3, d5}, [%[in0]]                           \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in4]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in4]]                         \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in1]]!                       \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in1]]                        \n\t"
                    "vld3.16 {d8, d10, d12}, [%[in3]]!                        \n\t"
                    "vld3.16 {d9, d11, d13}, [%[in3]]                         \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vld3.16 {d16, d18, d20}, [%[in2]]                        \n\t"
                    "vld3.16 {d17, d19, d21}, [%[in22]]                       \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vst3.8 {d8-d10}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*3),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // De-interleaved loads: val[0..2] are the three channels.
                uint16x8x3_t vLane0 = vld3q_u16(lidx0);
                uint16x8x3_t vLane1 = vld3q_u16(lidx1);
                uint16x8x3_t vLane2 = vld3q_u16(lane + x);
                uint16x8x3_t vLane3 = vld3q_u16(lidx3);
                uint16x8x3_t vLane4 = vld3q_u16(lidx4);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]);

                uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]);
                uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]);
                uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16);

                uint8x8x3_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);

                vst3_u8(dst + x, vRes);
#endif
            }
            break;
        case 4:
            for (; x <= colsn - 8*4; x += 8*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                // Tap pointers, offset by whole pixels (4 elements each).
                u16* lidx0 = lane + x - 2*4;
                u16* lidx1 = lane + x - 1*4;
                u16* lidx3 = lane + x + 1*4;
                u16* lidx4 = lane + x + 2*4;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
                __asm__ __volatile__ (
                    "vld4.16 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.16 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i16 q0, q4                                          \n\t"
                    "vadd.i16 q1, q5                                          \n\t"
                    "vadd.i16 q2, q6                                          \n\t"
                    "vadd.i16 q3, q7                                          \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.16 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.16 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i16 q4, q8                                          \n\t"
                    "vadd.i16 q5, q9                                          \n\t"
                    "vadd.i16 q6, q10                                         \n\t"
                    "vadd.i16 q7, q11                                         \n\t"
                    "vld4.16 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.16 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i16 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i16 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i16 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i16 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i16 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i16 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i16 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i16 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.u16 d8, q0, #8                                    \n\t"
                    "vrshrn.u16 d9, q1, #8                                    \n\t"
                    "vrshrn.u16 d10, q2, #8                                   \n\t"
                    "vrshrn.u16 d11, q3, #8                                   \n\t"
                    "vst4.8 {d8-d11}, [%[out]]                                \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*4),
                      [c4] "w" (vc4u16), [c6] "w" (vc6u16)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
#else
                // De-interleaved loads.  NOTE(review): the variable names do
                // not match tap order here — vLane2 is the +2 tap, vLane4
                // the -1 tap and vLane6 the +1 tap; the arithmetic below is
                // nevertheless symmetric, so the result is unaffected.
                uint16x8x4_t vLane0 = vld4q_u16(lidx0);
                uint16x8x4_t vLane2 = vld4q_u16(lidx4);
                uint16x8x4_t vLane4 = vld4q_u16(lidx1);
                uint16x8x4_t vLane6 = vld4q_u16(lidx3);
                uint16x8x4_t vLane8 = vld4q_u16(lane + x);

                uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
                uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
                uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
                uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]);

                uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]);
                uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]);
                uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]);
                uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16);

                vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16);
                vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16);
                vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16);
                vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16);

                uint8x8x4_t vRes;
                vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8);
                vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8);
                vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8);
                vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8);

                vst4_u8(dst + x, vRes);
#endif
            }
            break;
        }
        // Scalar tail of the horizontal pass, one channel at a time,
        // with the same rounding (+128, >> 8) as the vector path.
        for (s32 h = 0; h < cn; ++h)
        {
            u16* ln = lane + h;
            u8* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn]
                            + u16(4) * (ln[k-cn] + ln[k+cn])
                            + u16(6) * ln[k] + (1 << 7)) >> 8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
591
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const u16 * srcBase, ptrdiff_t srcStride,
                     u16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, u16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    // 5x5 Gaussian for u16 data; same separable [1 4 6 4 1] scheme as the u8
    // overload, but the vertical pass is widened into a u32 line buffer and
    // the horizontal pass works on 4-lane vectors for every channel count.
    size_t colsn = size.width * cn;   // row length in elements

    // Replacement row for fully-out-of-image rows (constant border only).
    std::vector<u16> _tmp;
    u16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Element offsets of the columns feeding the left/right guard cells.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer of vertical-pass sums, aligned, with 2*cn guards per side
    std::vector<u32> _buf(cn * (size.width + 4) + 32 / sizeof(u32));
    u32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            // NOTE(review): guards hold the raw borderValue, not the
            // vertically-convolved border (16 * borderValue) — confirm.
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel-tap constants.
    uint16x4_t vc6u16 = vmov_n_u16(6);
    uint32x4_t vc6u32 = vmovq_n_u32(6);
    uint32x4_t vc4u32 = vmovq_n_u32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        u16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: rows i-2 .. i+2, border-interpolated
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): the "x % 5 - 2" row offset in this prefetch
            // looks heuristic — confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            uint16x4_t v0 = vld1_u16(ln0+x);
            uint16x4_t v1 = vld1_u16(ln1+x);
            uint16x4_t v2 = vld1_u16(ln2+x);
            uint16x4_t v3 = vld1_u16(ln3+x);
            uint16x4_t v4 = vld1_u16(ln4+x);

            // v = 1*(row-2 + row+2) + 6*row0 + 4*(row-1 + row+1)
            uint32x4_t v = vaddl_u16(v0, v4);
            uint32x4_t v13 = vaddl_u16(v1, v3);

            v = vmlal_u16(v, v2, vc6u16);
            v = vmlaq_u32(v, v13, vc4u32);

            vst1q_u32(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders: fill guard cells from interpolated columns
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution: element-wise loads at offsets -2..+2
        //(channel interleave does not matter since taps are symmetric per
        //element), rounding narrow (+128, >> 8) back to u16
        x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            internal::prefetch(lane + x);

            uint32x4_t lane0 = vld1q_u32(lane + x - 2);
            uint32x4_t lane4 = vld1q_u32(lane + x + 2);
            uint32x4_t lane1 = vld1q_u32(lane + x - 1);
            uint32x4_t lane3 = vld1q_u32(lane + x + 1);
            uint32x4_t lane2 = vld1q_u32(lane + x + 0);

            uint32x4_t ln04 = vaddq_u32(lane0, lane4);
            uint32x4_t ln13 = vaddq_u32(lane1, lane3);

            uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32);
            uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32);

            uint16x4_t ls = vrshrn_n_u32(lsw, 8);

            vst1_u16(dst + x, ls);
        }
        // Scalar tail of the horizontal pass, per channel.
        for (s32 h = 0; h < cn; ++h)
        {
            u32* ln = lane + h;
            u16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
719
void gaussianBlur5x5(const Size2D &size, s32 cn,
                     const s16 * srcBase, ptrdiff_t srcStride,
                     s16 * dstBase, ptrdiff_t dstStride,
                     BORDER_MODE borderType, s16 borderValue, Margin borderMargin)
{
    internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
#ifdef CAROTENE_NEON
    // 5x5 Gaussian for s16 data; signed twin of the u16 overload.  The
    // separable [1 4 6 4 1] kernel is applied vertically into an s32 line
    // buffer and horizontally with rounding (bias 1 << 7, shift >> 8).
    size_t colsn = size.width * cn;   // row length in elements

    // Replacement row for fully-out-of-image rows (constant border only).
    std::vector<s16> _tmp;
    s16 *tmp = 0;
    if (borderType == BORDER_MODE_CONSTANT)
    {
        _tmp.assign(colsn + 4*cn, borderValue);
        tmp = &_tmp[cn << 1];
    }

    // Element offsets of the columns feeding the left/right guard cells.
    ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
    ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;

    //1-line buffer of vertical-pass sums, aligned, with 2*cn guards per side
    std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
    s32* lane = internal::alignPtr(&_buf[cn << 1], 32);

    if (borderType == BORDER_MODE_CONSTANT)
        for (s32 k = 0; k < cn; ++k)
        {
            // NOTE(review): guards hold the raw borderValue, not the
            // vertically-convolved border (16 * borderValue) — confirm.
            lane[-cn+k] = borderValue;
            lane[-cn-cn+k] = borderValue;
            lane[colsn+k] = borderValue;
            lane[colsn+cn+k] = borderValue;
        }

    // Kernel-tap constants.
    int16x4_t vc6s16 = vmov_n_s16(6);
    int32x4_t vc6s32 = vmovq_n_s32(6);
    int32x4_t vc4s32 = vmovq_n_s32(4);

    for (size_t i = 0; i < size.height; ++i)
    {
        s16* dst = internal::getRowPtr(dstBase, dstStride, i);
        //vertical convolution: rows i-2 .. i+2, border-interpolated
        ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
        ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);

        const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
        const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
        const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i);
        const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
        const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;

        size_t x = 0;
        for (; x <= colsn - 4; x += 4)
        {
            // NOTE(review): the "x % 5 - 2" row offset in this prefetch
            // looks heuristic — confirm intent.
            internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
            int16x4_t v0 = vld1_s16(ln0+x);
            int16x4_t v1 = vld1_s16(ln1+x);
            int16x4_t v2 = vld1_s16(ln2+x);
            int16x4_t v3 = vld1_s16(ln3+x);
            int16x4_t v4 = vld1_s16(ln4+x);

            // v = 1*(row-2 + row+2) + 6*row0 + 4*(row-1 + row+1)
            int32x4_t v = vaddl_s16(v0, v4);
            int32x4_t v13 = vaddl_s16(v1, v3);

            v = vmlal_s16(v, v2, vc6s16);
            v = vmlaq_s32(v, v13, vc4s32);

            vst1q_s32(lane + x, v);
        }
        // Scalar tail of the vertical pass.
        for (; x < colsn; ++x)
            lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];

        //left&right borders: fill guard cells from interpolated columns
        if (borderType != BORDER_MODE_CONSTANT)
            for (s32 k = 0; k < cn; ++k)
            {
                lane[-cn+k] = lane[idx_l1 + k];
                lane[-cn-cn+k] = lane[idx_l2 + k];

                lane[colsn+k] = lane[idx_r1 + k];
                lane[colsn+cn+k] = lane[idx_r2 + k];
            }

        //horizontal convolution.  NOTE(review): the per-channel-count split
        //is vestigial here — the case 4 asm path below is commented out, so
        //cases 1..3 and case 4 run identical 4-lane intrinsic loops.
        x = 0;
        switch(cn)
        {
        case 1:
        case 2:
        case 3:
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                // Element-wise loads at offsets -2..+2.
                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        case 4:
            // Disabled 32-bit asm variant for the 4-channel case, kept for
            // reference; the intrinsic loop below is used instead.
/*            for (; x <= colsn - 4*4; x += 4*4)
            {
                internal::prefetch(lane + x);
                internal::prefetch(lane + x + 16);

                ptrdiff_t* lidx0 = lane + x - 2*4;
                ptrdiff_t* lidx1 = lane + x - 1*4;
                ptrdiff_t* lidx3 = lane + x + 1*4;
                ptrdiff_t* lidx4 = lane + x + 2*4;

                __asm__ __volatile__ (
                    "vld4.32 {d0, d2, d4, d6}, [%[in0]]!                      \n\t"
                    "vld4.32 {d1, d3, d5, d7}, [%[in0]]                       \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in4]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in4]]                    \n\t"
                    "vadd.i32 q0, q4                                          \n\t"
                    "vadd.i32 q1, q5                                          \n\t"
                    "vadd.i32 q2, q6                                          \n\t"
                    "vadd.i32 q3, q7                                          \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in1]]!                  \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in1]]                   \n\t"
                    "vld4.32 {d8, d10, d12, d14}, [%[in3]]!                   \n\t"
                    "vld4.32 {d9, d11, d13, d15}, [%[in3]]                    \n\t"
                    "vadd.i32 q4, q8                                          \n\t"
                    "vadd.i32 q5, q9                                          \n\t"
                    "vadd.i32 q6, q10                                         \n\t"
                    "vadd.i32 q7, q11                                         \n\t"
                    "vld4.32 {d16, d18, d20, d22}, [%[in2],:256]              \n\t"
                    "vld4.32 {d17, d19, d21, d23}, [%[in22],:256]             \n\t"
                    "vmla.i32 q0, q4, %q[c4]                                  \n\t"
                    "vmla.i32 q1, q5, %q[c4]                                  \n\t"
                    "vmla.i32 q2, q6, %q[c4]                                  \n\t"
                    "vmla.i32 q3, q7, %q[c4]                                  \n\t"
                    "vmla.i32 q0, q8, %q[c6]                                  \n\t"
                    "vmla.i32 q1, q9, %q[c6]                                  \n\t"
                    "vmla.i32 q2, q10, %q[c6]                                 \n\t"
                    "vmla.i32 q3, q11, %q[c6]                                 \n\t"
                    "vrshrn.i32 d8, q0, #8                                    \n\t"
                    "vrshrn.i32 d9, q1, #8                                    \n\t"
                    "vrshrn.i32 d10, q2, #8                                   \n\t"
                    "vrshrn.i32 d11, q3, #8                                   \n\t"
                    "vst4.16 {d8-d11}, [%[out]]                               \n\t"
                    : [in0] "=r" (lidx0),
                      [in1] "=r" (lidx1),
                      [in3] "=r" (lidx3),
                      [in4] "=r" (lidx4)
                    : [out] "r" (dst + x),
                      "0" (lidx0),
                      "1" (lidx1),
                      "2" (lidx3),
                      "3" (lidx4),
                      [in2] "r" (lane + x),
                      [in22] "r" (lane + x + 4*2),
                      [c4] "w" (vc4s32), [c6] "w" (vc6s32)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
                );
*/
            for (; x <= colsn - 4; x += 4)
            {
                internal::prefetch(lane + x);

                int32x4_t lane0 = vld1q_s32(lane + x - 2);
                int32x4_t lane4 = vld1q_s32(lane + x + 2);
                int32x4_t lane1 = vld1q_s32(lane + x - 1);
                int32x4_t lane3 = vld1q_s32(lane + x + 1);
                int32x4_t lane2 = vld1q_s32(lane + x + 0);

                int32x4_t ln04 = vaddq_s32(lane0, lane4);
                int32x4_t ln13 = vaddq_s32(lane1, lane3);

                int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
                int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);

                int16x4_t ls = vrshrn_n_s32(lsw, 8);

                vst1_s16(dst + x, ls);
            }
            break;
        }
        // Scalar tail of the horizontal pass, per channel.
        for (s32 h = 0; h < cn; ++h)
        {
            s32* ln = lane + h;
            s16* dt = dst + h;
            for (size_t k = x; k < colsn; k += cn)
            {
                dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8);
            }
        }
    }
#else
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)borderValue;
    (void)borderMargin;
#endif
}
933
gaussianBlur5x5(const Size2D & size,s32 cn,const s32 * srcBase,ptrdiff_t srcStride,s32 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,s32 borderValue,Margin borderMargin)934 void gaussianBlur5x5(const Size2D &size, s32 cn,
935 const s32 * srcBase, ptrdiff_t srcStride,
936 s32 * dstBase, ptrdiff_t dstStride,
937 BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
938 {
939 internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType));
940 #ifdef CAROTENE_NEON
941 size_t colsn = size.width * cn;
942
943 std::vector<s32> _tmp;
944 s32 *tmp = 0;
945 if (borderType == BORDER_MODE_CONSTANT)
946 {
947 _tmp.assign(colsn + 4*cn, borderValue);
948 tmp = &_tmp[cn << 1];
949 }
950
951 ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
952 ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
953 ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
954 ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
955
956 //1-line buffer
957 std::vector<s32> _buf(cn * (size.width + 4) + 32 / sizeof(s32));
958 s32* lane = internal::alignPtr(&_buf[cn << 1], 32);
959
960 if (borderType == BORDER_MODE_CONSTANT)
961 for (s32 k = 0; k < cn; ++k)
962 {
963 lane[-cn+k] = borderValue;
964 lane[-cn-cn+k] = borderValue;
965 lane[colsn+k] = borderValue;
966 lane[colsn+cn+k] = borderValue;
967 }
968
969 int32x4_t vc6s32 = vmovq_n_s32(6);
970 int32x4_t vc4s32 = vmovq_n_s32(4);
971
972 for (size_t i = 0; i < size.height; ++i)
973 {
974 s32* dst = internal::getRowPtr(dstBase, dstStride, i);
975 //vertical convolution
976 ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
977 ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
978 ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
979 ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
980
981 const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp;
982 const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
983 const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i);
984 const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
985 const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
986
987 size_t x = 0;
988 for (; x <= colsn - 4; x += 4)
989 {
990 internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2));
991 int32x4_t v0 = vld1q_s32(ln0+x);
992 int32x4_t v1 = vld1q_s32(ln1+x);
993 int32x4_t v2 = vld1q_s32(ln2+x);
994 int32x4_t v3 = vld1q_s32(ln3+x);
995 int32x4_t v4 = vld1q_s32(ln4+x);
996
997 int32x4_t v = vaddq_s32(v0, v4);
998 int32x4_t v13 = vaddq_s32(v1, v3);
999
1000 v = vmlaq_s32(v, v2, vc6s32);
1001 v = vmlaq_s32(v, v13, vc4s32);
1002
1003 vst1q_s32(lane + x, v);
1004 }
1005 for (; x < colsn; ++x)
1006 lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x];
1007
1008 //left&right borders
1009 if (borderType != BORDER_MODE_CONSTANT)
1010 for (s32 k = 0; k < cn; ++k)
1011 {
1012 lane[-cn+k] = lane[idx_l1 + k];
1013 lane[-cn-cn+k] = lane[idx_l2 + k];
1014
1015 lane[colsn+k] = lane[idx_r1 + k];
1016 lane[colsn+cn+k] = lane[idx_r2 + k];
1017 }
1018
1019 //horizontal convolution
1020 x = 0;
1021 for (; x <= colsn - 4; x += 4)
1022 {
1023 internal::prefetch(lane + x);
1024
1025 int32x4_t lane0 = vld1q_s32(lane + x - 2);
1026 int32x4_t lane4 = vld1q_s32(lane + x + 2);
1027 int32x4_t lane1 = vld1q_s32(lane + x - 1);
1028 int32x4_t lane3 = vld1q_s32(lane + x + 1);
1029 int32x4_t lane2 = vld1q_s32(lane + x + 0);
1030
1031 int32x4_t ln04 = vaddq_s32(lane0, lane4);
1032 int32x4_t ln13 = vaddq_s32(lane1, lane3);
1033
1034 int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32);
1035 int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32);
1036
1037 vst1q_s32(dst + x, lsw);
1038 }
1039 for (s32 h = 0; h < cn; ++h)
1040 {
1041 s32* ln = lane + h;
1042 s32* dt = dst + h;
1043 for (size_t k = x; k < colsn; k += cn)
1044 {
1045 dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k];
1046 }
1047 }
1048 }
1049 #else
1050 (void)srcBase;
1051 (void)srcStride;
1052 (void)dstBase;
1053 (void)dstStride;
1054 (void)borderValue;
1055 (void)borderMargin;
1056 #endif
1057 }
1058
1059 } // namespace CAROTENE_NS
1060