1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #ifndef CAROTENE_SRC_SEPARABLE_FILTER_HPP
41 #define CAROTENE_SRC_SEPARABLE_FILTER_HPP
42 
43 #include "common.hpp"
44 
45 #include <carotene/types.hpp>
46 
47 #include <vector>
48 
49 #ifdef CAROTENE_NEON
50 
51 namespace CAROTENE_NS {
52 
53 namespace internal {
54 
55 struct RowFilter3x3S16Base
56 {
57     typedef u8 srcType;
58      /*
59      Various border types, image boundaries are denoted with '|'
60 
61      * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
62      * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
63      * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
64      * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
65      * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
66      */
RowFilter3x3S16BaseCAROTENE_NS::internal::RowFilter3x3S16Base67     inline RowFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue, const ptrdiff_t borderxl, const ptrdiff_t borderxr):
68                                borderType(_borderType),borderValue(_borderValue)
69     {
70         if (borderType == BORDER_MODE_CONSTANT)
71         {
72             vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x00ffFFffFFffFFffULL : 0x0100FFffFFffFFffULL));
73             vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0xFF07060504030201ULL : 0x0706050403020100ULL));
74         }
75         else if (borderType == BORDER_MODE_REFLECT101)
76         {
77             vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0001FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
78             vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0607060504030201ULL : 0x0706050403020100ULL));
79         }
80         else //if (borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE)
81         {
82             vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
83             vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0707060504030201ULL : 0x0706050403020100ULL));
84         }
85         lookLeft = offsetk - borderxl;
86         lookRight = offsetk - borderxr;
87     }
88 
89     uint8x8_t vfmask;
90     uint8x8_t vtmask;
91     enum { offsetk = 1};
92     ptrdiff_t lookLeft;
93     ptrdiff_t lookRight;
94     const BORDER_MODE borderType;
95     const srcType borderValue;
96 };
97 
98 struct ColFilter3x3S16Base
99 {
100     typedef s16 srcType;
101 
ColFilter3x3S16BaseCAROTENE_NS::internal::ColFilter3x3S16Base102     inline ColFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue):
103                                borderType(_borderType),borderValue(_borderValue) {}
104 
105     enum { offsetk = 1};
106     const BORDER_MODE borderType;
107     const srcType borderValue;
108 };
109 
110 struct RowFilter3x3S16Generic : public RowFilter3x3S16Base
111 {
112     typedef s16 dstType;
113 
RowFilter3x3S16GenericCAROTENE_NS::internal::RowFilter3x3S16Generic114     inline RowFilter3x3S16Generic(BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16 *w):
115                                   RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter( (w[0]+w[1]+w[2]) * borderValue )
116     {
117         vw0 = vdupq_n_s16(w[0]);
118         vw1 = vdupq_n_s16(w[1]);
119         vw2 = vdupq_n_s16(w[2]);
120     }
121 
122     int16x8_t vw0;
123     int16x8_t vw1;
124     int16x8_t vw2;
125     const dstType borderFilter;
126 
operator ()CAROTENE_NS::internal::RowFilter3x3S16Generic127     inline void operator()(const u8* src, s16* dst, ptrdiff_t width)
128     {
129         uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
130         if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
131             l = vset_lane_u8(borderValue, l, 6);
132 
133         ptrdiff_t i = 0;
134         for (; i < width - 16 + lookRight; i += 16)
135         {
136             internal::prefetch(src + i);
137             uint8x8_t l18u = vld1_u8(src + i + 1);
138             vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0),
139                                                    vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1),
140                                                    vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2)));
141             l = vld1_u8(src + i + 9);
142             vst1q_s16(dst + i + 8, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 6))), vw0),
143                                                    vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 7))), vw1),
144                                                    vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l)), vw2)));
145         }
146         if (i < width - 8 + lookRight)
147         {
148             uint8x8_t l18u = vld1_u8(src + i + 1);
149             vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0),
150                                                    vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1),
151                                                    vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2)));
152             i += 8;
153         }
154 
155         //tail
156         if (lookRight == 0 || i != width)
157         {
158             uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
159             uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
160             if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
161                 tail2 = vset_lane_u8(borderValue, tail2, 7);
162             uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
163 
164             int16x8_t l0 = vreinterpretq_s16_u16(vmovl_u8(tail0));
165             int16x8_t l1 = vreinterpretq_s16_u16(vmovl_u8(tail1));
166             int16x8_t l2 = vreinterpretq_s16_u16(vmovl_u8(tail2));
167 
168             int16x8_t l0w = vmulq_s16(l0, vw0);
169             int16x8_t l2w = vmulq_s16(l2, vw2);
170             int16x8_t ls = vaddq_s16(vmlaq_s16(l0w, l1, vw1), l2w);
171 
172             vst1q_s16(dst + (width - 8), ls);
173         }
174     }
175 };
176 
177 struct RowFilter3x3S16_m101 : public RowFilter3x3S16Base
178 {
179     typedef s16 dstType;
180 
RowFilter3x3S16_m101CAROTENE_NS::internal::RowFilter3x3S16_m101181     inline RowFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*):
182                                 RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {}
183 
184     const dstType borderFilter;
185 
operator ()CAROTENE_NS::internal::RowFilter3x3S16_m101186     inline void operator()(const u8* src, s16* dst, ptrdiff_t width)
187     {
188         uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
189         if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
190             l = vset_lane_u8(borderValue, l, 6);
191 
192         ptrdiff_t i = 0;
193         for (; i < width - 16 + lookRight; i += 16)
194         {
195             internal::prefetch(src + i);
196 
197             uint8x8_t l2 = vld1_u8(src + i + 1);
198             vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6))));
199 
200             l = vld1_u8(src + i + 9);
201             vst1q_s16(dst + i + 8, vreinterpretq_s16_u16(vsubl_u8(l, vext_u8(l2, l, 6))));
202         }
203 
204         if (i < width - 8 + lookRight)
205         {
206             uint8x8_t l2 = vld1_u8(src + i + 1);
207             vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6))));
208             i += 8;
209         }
210 
211         //tail
212         if (lookRight == 0 || i != width)
213         {
214             uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
215             uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
216             if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
217                 tail2 = vset_lane_u8(borderValue, tail2, 7);
218 
219             int16x8_t ls = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0));
220 
221             vst1q_s16(dst + (width - 8), ls);
222         }
223     }
224 };
225 
226 struct RowFilter3x3S16_121 : public RowFilter3x3S16Base
227 {
228     typedef s16 dstType;
229 
RowFilter3x3S16_121CAROTENE_NS::internal::RowFilter3x3S16_121230     inline RowFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*):
231                                RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(borderValue << 2) {}
232 
233     const dstType borderFilter;
234 
operator ()CAROTENE_NS::internal::RowFilter3x3S16_121235     inline void operator()(const u8* src, s16* dst, ptrdiff_t width)
236     {
237         uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
238         if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
239             l = vset_lane_u8(borderValue, l, 6);
240 
241         ptrdiff_t i = 0;
242         for (; i < width - 16 + lookRight; i += 16)
243         {
244             internal::prefetch(src + i);
245 
246             uint8x8_t l2 = vld1_u8(src + i + 1);
247             vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)),
248                                           vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1))));
249 
250             l = vld1_u8(src + i + 9);
251             vst1q_s16(dst + i + 8, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)),
252                                               vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1))));
253         }
254 
255         if (i < width - 8 + lookRight)
256         {
257             uint8x8_t l2 = vld1_u8(src + i + 1);
258             vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)),
259                                           vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1))));
260             i += 8;
261         }
262 
263         //tail
264         if (lookRight == 0 || i != width)
265         {
266             uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
267             uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
268             if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
269                 tail2 = vset_lane_u8(borderValue, tail2, 7);
270             uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
271 
272             int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2));
273             int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
274 
275             int16x8_t ls = vqaddq_s16(tail02, tail1x2);
276 
277             vst1q_s16(dst + (width - 8), ls);
278         }
279     }
280 };
281 
282 struct RowFilter3x3S16_1m21 : public RowFilter3x3S16Base
283 {
284     typedef s16 dstType;
285 
RowFilter3x3S16_1m21CAROTENE_NS::internal::RowFilter3x3S16_1m21286     inline RowFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*):
287                                 RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {}
288 
289     const dstType borderFilter;
290 
operator ()CAROTENE_NS::internal::RowFilter3x3S16_1m21291     inline void operator()(const u8* src, s16* dst, ptrdiff_t width)
292     {
293         uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask);
294         if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
295             l = vset_lane_u8(borderValue, l, 6);
296 
297         ptrdiff_t i = 0;
298         for (; i < width - 16 + lookRight; i += 16)
299         {
300             internal::prefetch(src + i);
301 
302             uint8x8_t l2 = vld1_u8(src + i + 1);
303             vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)),
304                                           vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1))));
305 
306             l = vld1_u8(src + i + 9);
307             vst1q_s16(dst + i + 8, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)),
308                                               vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1))));
309         }
310 
311         if (i < width - 8 + lookRight)
312         {
313             uint8x8_t l2 = vld1_u8(src + i + 1);
314             vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)),
315                                           vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1))));
316             i += 8;
317         }
318 
319         //tail
320         if (lookRight == 0 || i != width)
321         {
322             uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1
323             uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask);
324             if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT)
325                 tail2 = vset_lane_u8(borderValue, tail2, 7);
326             uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7);
327 
328             int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2));
329             int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1));
330 
331             int16x8_t ls = vqsubq_s16(tail02, tail1x2);
332 
333             vst1q_s16(dst + (width - 8), ls);
334         }
335     }
336 };
337 
338 struct ColFilter3x3S16Generic : public ColFilter3x3S16Base
339 {
340     typedef s16 dstType;
341 
ColFilter3x3S16GenericCAROTENE_NS::internal::ColFilter3x3S16Generic342     inline ColFilter3x3S16Generic(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *w):
343                                   ColFilter3x3S16Base(_borderType, _borderValue)
344     {
345         vw0 = vdupq_n_s16(w[0]);
346         vw1 = vdupq_n_s16(w[1]);
347         vw2 = vdupq_n_s16(w[2]);
348     }
349 
350     int16x8_t vw0;
351     int16x8_t vw1;
352     int16x8_t vw2;
353 
operator ()CAROTENE_NS::internal::ColFilter3x3S16Generic354     inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width)
355     {
356         ptrdiff_t j = 0;
357         for (; j <= width - 16; j += 16)
358         {
359             int16x8_t line1 = vld1q_s16(src1 + j);
360             int16x8_t line2 = vld1q_s16(src2 + j);
361             vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2));
362             vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1));
363 
364             line1 = vld1q_s16(src1 + j + 8);
365             line2 = vld1q_s16(src2 + j + 8);
366             vst1q_s16(dst0 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), line1, vw1), line2, vw2));
367             vst1q_s16(dst1 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j + 8), vw2), line1, vw0), line2, vw1));
368         }
369         if (j <= width - 8)
370         {
371             int16x8_t line1 = vld1q_s16(src1 + j);
372             int16x8_t line2 = vld1q_s16(src2 + j);
373             vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2));
374             vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1));
375             j += 8;
376         }
377         if (j != width)
378         {
379             j = width - 8;
380             int16x8_t line1 = vld1q_s16(src1 + j);
381             int16x8_t line2 = vld1q_s16(src2 + j);
382             vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2));
383             vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1));
384         }
385     }
386 
operator ()CAROTENE_NS::internal::ColFilter3x3S16Generic387     inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width)
388     {
389         if (src0 == 0 || src2 == 0)
390         {
391             int16x8_t vwl1 = vw0;
392             int16x8_t vwl2 = vw2;
393             if (src2 == 0)
394             {
395                 src2 = src0;
396                 vwl1 = vw2;
397                 vwl2 = vw0;
398             }
399 
400             int16x8_t v_border = vdupq_n_s16(0);
401             if (borderType == BORDER_MODE_CONSTANT)
402             {
403                 v_border = vmulq_s16(vdupq_n_s16(borderValue), vwl1);
404                 vwl1 = vw1;
405             }
406             else if (borderType == BORDER_MODE_REFLECT101)
407             {
408                 vwl1 = vw1;
409                 vwl2 = vaddq_s16(vw0, vw2);
410             }
411             else //replicate\reflect
412                 vwl1 = vaddq_s16(vwl1, vw1);
413 
414             ptrdiff_t j = 0;
415             for (; j <= width - 16; j += 16)
416             {
417                 vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1),
418                                              vmulq_s16(vld1q_s16(src2 + j), vwl2)));
419                 vst1q_s16(dst + j + 8, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j + 8), vwl1),
420                                              vmulq_s16(vld1q_s16(src2 + j + 8), vwl2)));
421             }
422             if (j <= width - 8)
423             {
424                 vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1),
425                                              vmulq_s16(vld1q_s16(src2 + j), vwl2)));
426                 j += 8;
427             }
428             if (j != width)
429             {
430                 j = width - 8;
431                 vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1),
432                                              vmulq_s16(vld1q_s16(src2 + j), vwl2)));
433             }
434         }
435         else
436         {
437             ptrdiff_t j = 0;
438             for (; j <= width - 16; j += 16)
439             {
440                 vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0),
441                                                                  vld1q_s16(src1 + j), vw1),
442                                                                  vld1q_s16(src2 + j), vw2));
443                 vst1q_s16(dst + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0),
444                                                                      vld1q_s16(src1 + j + 8), vw1),
445                                                                      vld1q_s16(src2 + j + 8), vw2));
446             }
447             if (j <= width - 8)
448             {
449                 vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0),
450                                                                  vld1q_s16(src1 + j), vw1),
451                                                                  vld1q_s16(src2 + j), vw2));
452                 j += 8;
453             }
454             if (j != width)
455             {
456                 j = width - 8;
457                 vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0),
458                                                                  vld1q_s16(src1 + j), vw1),
459                                                                  vld1q_s16(src2 + j), vw2));
460             }
461         }
462     }
463 };
464 
465 struct ColFilter3x3S16_m101 : public ColFilter3x3S16Base
466 {
467     typedef s16 dstType;
468 
ColFilter3x3S16_m101CAROTENE_NS::internal::ColFilter3x3S16_m101469     inline ColFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *):
470                                 ColFilter3x3S16Base(_borderType, _borderValue) {}
471 
operator ()CAROTENE_NS::internal::ColFilter3x3S16_m101472     inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width)
473     {
474         ptrdiff_t j = 0;
475         for (; j <= width - 16; j += 16)
476         {
477             vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
478             vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j)));
479             vst1q_s16(dst0 + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8)));
480             vst1q_s16(dst1 + j + 8, vqsubq_s16(vld1q_s16(src3 + j + 8), vld1q_s16(src1 + j + 8)));
481         }
482         if (j <= width - 8)
483         {
484             vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
485             vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j)));
486             j += 8;
487         }
488         if (j != width)
489         {
490             j = width - 8;
491             vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
492             vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j)));
493         }
494     }
495 
operator ()CAROTENE_NS::internal::ColFilter3x3S16_m101496     inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width)
497     {
498         if (src0 == 0 || src2 == 0)
499         {
500             if (borderType == BORDER_MODE_CONSTANT)
501             {
502                 int16x8_t v_border = vdupq_n_s16(borderValue);
503                 if (src0 == 0)
504                 {
505                     ptrdiff_t j = 0;
506                     for (; j <= width - 16; j += 16)
507                     {
508                         vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border));
509                         vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), v_border));
510                     }
511                     if (j <= width - 8)
512                     {
513                         vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border));
514                         j += 8;
515                     }
516                     if (j != width)
517                     {
518                         j = width - 8;
519                         vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border));
520                     }
521                 }
522                 else
523                 {
524                     ptrdiff_t j = 0;
525                     for (; j <= width - 16; j += 16)
526                     {
527                         vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j)));
528                         vst1q_s16(dst + j + 8, vqsubq_s16(v_border, vld1q_s16(src0 + j + 8)));
529                     }
530                     if (j <= width - 8)
531                     {
532                         vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j)));
533                         j += 8;
534                     }
535                     if (j != width)
536                     {
537                         j = width - 8;
538                         vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j)));
539                     }
540                 }
541             }
542             else if (borderType == BORDER_MODE_REFLECT101)
543             {
544                 int16x8_t vzero = vmovq_n_s16(0);
545                 ptrdiff_t j = 0;
546                 for (; j <= width - 16; j += 16)
547                 {
548                     vst1q_s16(dst + j, vzero);
549                     vst1q_s16(dst + j + 8, vzero);
550                 }
551                 if (j <= width - 8)
552                 {
553                     vst1q_s16(dst + j, vzero);
554                     j += 8;
555                 }
556                 if (j != width)
557                 {
558                     j = width - 8;
559                     vst1q_s16(dst + j, vzero);
560                 }
561             }
562             else //replicate\reflect
563             {
564                 if (src0 == 0) src0 = src1; else src2 = src1;
565                 ptrdiff_t j = 0;
566                 for (; j <= width - 16; j += 16)
567                 {
568                     vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
569                     vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8)));
570                 }
571                 if (j <= width - 8)
572                 {
573                     vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
574                     j += 8;
575                 }
576                 if (j != width)
577                 {
578                     j = width - 8;
579                     vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
580                 }
581             }
582         }
583         else
584         {
585             ptrdiff_t j = 0;
586             for (; j <= width - 16; j += 16)
587             {
588                 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
589                 vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8)));
590             }
591             if (j <= width - 8)
592             {
593                 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
594                 j += 8;
595             }
596             if (j != width)
597             {
598                 j = width - 8;
599                 vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j)));
600             }
601         }
602     }
603 };
604 
605 struct ColFilter3x3S16_121 : public ColFilter3x3S16Base
606 {
607     typedef s16 dstType;
608 
ColFilter3x3S16_121CAROTENE_NS::internal::ColFilter3x3S16_121609     inline ColFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*):
610                                ColFilter3x3S16Base(_borderType, _borderValue) {}
611 
operator ()CAROTENE_NS::internal::ColFilter3x3S16_121612     inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width)
613     {
614         ptrdiff_t j = 0;
615         //int16x8_t line0 = vld1q_s16(src0 + j);//1
616         //int16x8_t line1 = vld1q_s16(src1 + j);//11
617         //int16x8_t line2 = vld1q_s16(src2 + j);// 11
618         //int16x8_t line3 = vld1q_s16(src3 + j);//  1
619         for (; j <= width - 16; j += 16)
620         {
621             int16x8_t line1 = vld1q_s16(src1 + j);
622             int16x8_t line2 = vld1q_s16(src2 + j);
623 
624             int16x8_t l12 = vqaddq_s16(line1, line2);
625 
626             vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12));
627             vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j))));
628 
629             line1 = vld1q_s16(src1 + j + 8);
630             line2 = vld1q_s16(src2 + j + 8);
631 
632             l12 = vqaddq_s16(line1, line2);
633 
634             vst1q_s16(dst0 + j + 8, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), line1), l12));
635             vst1q_s16(dst1 + j + 8, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j + 8))));
636         }
637         if (j <= width - 8)
638         {
639             int16x8_t line1 = vld1q_s16(src1 + j);
640             int16x8_t line2 = vld1q_s16(src2 + j);
641 
642             int16x8_t l12 = vqaddq_s16(line1, line2);
643 
644             vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12));
645             vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j))));
646             j += 8;
647         }
648         if (j != width)
649         {
650             j = width - 8;
651             int16x8_t line1 = vld1q_s16(src1 + j);
652             int16x8_t line2 = vld1q_s16(src2 + j);
653 
654             int16x8_t l12 = vqaddq_s16(line1, line2);
655 
656             vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12));
657             vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j))));
658         }
659     }
660 
operator ()CAROTENE_NS::internal::ColFilter3x3S16_121661     inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width)
662     {
663         if (src0 == 0 || src2 == 0)
664         {
665             if (src2 == 0)
666                 src2 = src0;
667 
668             if (borderType == BORDER_MODE_CONSTANT)
669             {
670                 int16x8_t v_border = vdupq_n_s16(borderValue);
671                 ptrdiff_t j = 0;
672                 for (; j <= width - 16; j += 16)
673                 {
674                     vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1),
675                                                   vqaddq_s16(v_border, vld1q_s16(src2 + j))));
676                     vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1),
677                                                       vqaddq_s16(v_border, vld1q_s16(src2 + j + 8))));
678                 }
679                 if (j <= width - 8)
680                 {
681                     vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1),
682                                                   vqaddq_s16(v_border, vld1q_s16(src2 + j))));
683                     j += 8;
684                 }
685                 if (j != width)
686                 {
687                     j = width - 8;
688                     vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1),
689                                                   vqaddq_s16(v_border, vld1q_s16(src2 + j))));
690                 }
691             }
692             else if (borderType == BORDER_MODE_REFLECT101)
693             {
694                 ptrdiff_t j = 0;
695                 for (; j <= width - 16; j += 16)
696                 {
697                     vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j),
698                                                                vld1q_s16(src2 + j)), 1));
699                     vst1q_s16(dst + j + 8, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j + 8),
700                                                                    vld1q_s16(src2 + j + 8)), 1));
701                 }
702                 if (j <= width - 8)
703                 {
704                     vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j),
705                                                                vld1q_s16(src2 + j)), 1));
706                     j += 8;
707                 }
708                 if (j != width)
709                 {
710                     j = width - 8;
711                     vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j),
712                                                                vld1q_s16(src2 + j)), 1));
713                 }
714             }
715             else //replicate\reflect
716             {
717                 ptrdiff_t j = 0;
718                 for (; j <= width - 16; j += 16)
719                 {
720                     int16x8_t line1 = vld1q_s16(src1 + j);
721                     vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1),
722                                                   vqaddq_s16(line1, vld1q_s16(src2 + j))));
723 
724                     line1 = vld1q_s16(src1 + j + 8);
725                     vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(line1, 1),
726                                                       vqaddq_s16(line1, vld1q_s16(src2 + j + 8))));
727                 }
728                 if (j <= width - 8)
729                 {
730                     int16x8_t line1 = vld1q_s16(src1 + j);
731                     vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1),
732                                                   vqaddq_s16(line1, vld1q_s16(src2 + j))));
733                     j += 8;
734                 }
735                 if (j != width)
736                 {
737                     j = width - 8;
738                     int16x8_t line1 = vld1q_s16(src1 + j);
739                     vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1),
740                                                   vqaddq_s16(line1, vld1q_s16(src2 + j))));
741                 }
742             }
743         }
744         else
745         {
746             ptrdiff_t j = 0;
747             for (; j <= width - 16; j += 16)
748             {
749                 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1),
750                                               vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))));
751 
752                 vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1),
753                                               vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8))));
754             }
755             if (j <= width - 8)
756             {
757                 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1),
758                                               vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))));
759                 j += 8;
760             }
761             if (j != width)
762             {
763                 j = width - 8;
764                 vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1),
765                                               vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))));
766             }
767         }
768     }
769 };
770 
771 struct ColFilter3x3U8_121 : public ColFilter3x3S16Base
772 {
773     typedef u8 dstType;
774 
ColFilter3x3U8_121CAROTENE_NS::internal::ColFilter3x3U8_121775     inline ColFilter3x3U8_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*):
776                               ColFilter3x3S16Base(_borderType, _borderValue) {}
777 
operator ()CAROTENE_NS::internal::ColFilter3x3U8_121778     inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, const srcType* src3, dstType* dst0, dstType* dst1, ptrdiff_t width)
779     {
780         ptrdiff_t j = 0;
781         //int16x8_t line0 = vld1q_s16(src0 + j);//1
782         //int16x8_t line1 = vld1q_s16(src1 + j);//11
783         //int16x8_t line2 = vld1q_s16(src2 + j);// 11
784         //int16x8_t line3 = vld1q_s16(src3 + j);//  1
785         for (; j <= width - 16; j += 16)
786         {
787             int16x8_t line1 = vld1q_s16(src1 + j);
788             int16x8_t line2 = vld1q_s16(src2 + j);
789 
790             int16x8_t l12 = vaddq_s16(line1, line2);
791 
792             vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4));
793             vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4));
794 
795             line1 = vld1q_s16(src1 + j + 8);
796             line2 = vld1q_s16(src2 + j + 8);
797 
798             l12 = vaddq_s16(line1, line2);
799 
800             vst1_u8(dst0 + j + 8, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j + 8), line1), l12), 4));
801             vst1_u8(dst1 + j + 8, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j + 8))), 4));
802         }
803         if (j <= width - 8)
804         {
805             int16x8_t line1 = vld1q_s16(src1 + j);
806             int16x8_t line2 = vld1q_s16(src2 + j);
807 
808             int16x8_t l12 = vaddq_s16(line1, line2);
809 
810             vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4));
811             vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4));
812             j += 8;
813         }
814         if (j != width)
815         {
816             j = width - 8;
817             int16x8_t line1 = vld1q_s16(src1 + j);
818             int16x8_t line2 = vld1q_s16(src2 + j);
819 
820             int16x8_t l12 = vaddq_s16(line1, line2);
821 
822             vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4));
823             vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4));
824         }
825     }
826 
operator ()CAROTENE_NS::internal::ColFilter3x3U8_121827     inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, dstType* dst, ptrdiff_t width)
828     {
829         if (src0 == 0 || src2 == 0)
830         {
831             if (src2 == 0)
832                 src2 = src0;
833 
834             if (borderType == BORDER_MODE_CONSTANT)
835             {
836                 ptrdiff_t j = 0;
837                 int16x8_t v_border = vdupq_n_s16(borderValue);
838                 for (; j <= width - 16; j += 16)
839                 {
840                     //Store normalized result, essential for gaussianBlur
841                     vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1),
842                                                               vaddq_s16(v_border, vld1q_s16(src2 + j))), 4));
843 
844                     vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1),
845                                                                   vaddq_s16(v_border, vld1q_s16(src2 + j + 8))), 4));
846                 }
847                 if (j <= width - 8)
848                 {
849                     vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1),
850                                                               vaddq_s16(v_border, vld1q_s16(src2 + j))), 4));
851                     j += 8;
852                 }
853                 if (j != width)
854                 {
855                     j = width - 8;
856                     vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1),
857                                                               vaddq_s16(v_border, vld1q_s16(src2 + j))), 4));
858                 }
859             }
860             else if (borderType == BORDER_MODE_REFLECT101)
861             {
862                 ptrdiff_t j = 0;
863                 for (; j <= width - 16; j += 16)
864                 {
865                     vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j),
866                                                                           vld1q_s16(src2 + j)), 1), 4));
867                     vst1_u8(dst + j + 8, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j + 8),
868                                                                           vld1q_s16(src2 + j + 8)), 1), 4));
869                 }
870                 if (j <= width - 8)
871                 {
872                     vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j),
873                                                                           vld1q_s16(src2 + j)), 1), 4));
874                     j += 8;
875                 }
876                 if (j != width)
877                 {
878                     j = width - 8;
879                     vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j),
880                                                                           vld1q_s16(src2 + j)), 1), 4));
881                 }
882             }
883             else //replicate\reflect
884             {
885                 ptrdiff_t j = 0;
886                 for (; j <= width - 16; j += 16)
887                 {
888                     int16x8_t line1 = vld1q_s16(src1 + j);
889                     vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1),
890                                                               vaddq_s16(line1, vld1q_s16(src2 + j))), 4));
891 
892                     line1 = vld1q_s16(src1 + j + 8);
893                     vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1),
894                                                               vaddq_s16(line1, vld1q_s16(src2 + j + 8))), 4));
895                 }
896                 if (j <= width - 8)
897                 {
898                     int16x8_t line1 = vld1q_s16(src1 + j);
899                     vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1),
900                                                               vaddq_s16(line1, vld1q_s16(src2 + j))), 4));
901                     j += 8;
902                 }
903                 if (j != width)
904                 {
905                     j = width - 8;
906                     int16x8_t line1 = vld1q_s16(src1 + j);
907                     vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1),
908                                                               vaddq_s16(line1, vld1q_s16(src2 + j))), 4));
909                 }
910             }
911         }
912         else
913         {
914             ptrdiff_t j = 0;
915             for (; j <= width - 16; j += 16)
916             {
917                 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1),
918                                                           vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4));
919                 vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1),
920                                                           vaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8))), 4));
921             }
922             if (j <= width - 8)
923             {
924                 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1),
925                                                           vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4));
926                 j += 8;
927             }
928             if (j != width)
929             {
930                 j = width - 8;
931                 vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1),
932                                                           vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4));
933             }
934         }
935     }
936 };
937 
938 struct ColFilter3x3S16_1m21 : public ColFilter3x3S16Base
939 {
940     typedef s16 dstType;
941 
ColFilter3x3S16_1m21CAROTENE_NS::internal::ColFilter3x3S16_1m21942     inline ColFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, const s16*):
943                                 ColFilter3x3S16Base(_borderType, _borderValue) {}
944 
operator ()CAROTENE_NS::internal::ColFilter3x3S16_1m21945     inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width)
946     {
947         ptrdiff_t j = 0;
948         //int16x8_t line0 = vld1q_s16(src0 + j);// 1
949         //int16x8_t line1 = vld1q_s16(src1 + j);//-1 1
950         //int16x8_t line2 = vld1q_s16(src2 + j);//  -1 -1
951         //int16x8_t line3 = vld1q_s16(src3 + j);//      1
952         for (; j <= width - 16; j += 16)
953         {
954             int16x8_t line1 = vld1q_s16(src1 + j);
955             int16x8_t line2 = vld1q_s16(src2 + j);
956 
957             int16x8_t l12 = vqsubq_s16(line1, line2);
958 
959             vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12));
960             vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12));
961 
962             line1 = vld1q_s16(src1 + j + 8);
963             line2 = vld1q_s16(src2 + j + 8);
964 
965             l12 = vqsubq_s16(line1, line2);
966 
967             vst1q_s16(dst0 + j + 8, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j + 8), line1), l12));
968             vst1q_s16(dst1 + j + 8, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j + 8), line2), l12));
969         }
970         if (j <= width - 8)
971         {
972             int16x8_t line1 = vld1q_s16(src1 + j);
973             int16x8_t line2 = vld1q_s16(src2 + j);
974 
975             int16x8_t l12 = vqsubq_s16(line1, line2);
976 
977             vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12));
978             vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12));
979             j += 8;
980         }
981         if (j != width)
982         {
983             j = width - 8;
984             int16x8_t line1 = vld1q_s16(src1 + j);
985             int16x8_t line2 = vld1q_s16(src2 + j);
986 
987             int16x8_t l12 = vqsubq_s16(line1, line2);
988 
989             vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12));
990             vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12));
991         }
992     }
993 
operator ()CAROTENE_NS::internal::ColFilter3x3S16_1m21994     inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width)
995     {
996         if (src0 == 0 || src2 == 0)
997         {
998             if (src2 == 0)
999                 src2 = src0;
1000 
1001             if (borderType == BORDER_MODE_CONSTANT)
1002             {
1003                 ptrdiff_t j = 0;
1004                 int16x8_t v_border = vdupq_n_s16(borderValue);
1005                 for (; j <= width - 16; j += 16)
1006                 {
1007                     vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
1008                     vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)), vshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
1009                 }
1010                 if (j <= width - 8)
1011                 {
1012                     vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
1013                     j += 8;
1014                 }
1015                 if (j != width)
1016                 {
1017                     j = width - 8;
1018                     vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
1019                 }
1020             }
1021             else if (borderType == BORDER_MODE_REFLECT101)
1022             {
1023                 ptrdiff_t j = 0;
1024                 for (; j <= width - 16; j += 16)
1025                 {
1026                     vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
1027                     vst1q_s16(dst + j + 8, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)), 1));
1028                 }
1029                 if (j <= width - 8)
1030                 {
1031                     vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
1032                     j += 8;
1033                 }
1034                 if (j != width)
1035                 {
1036                     j = width - 8;
1037                     vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
1038                 }
1039             }
1040             else //replicate\reflect
1041             {
1042                 ptrdiff_t j = 0;
1043                 for (; j <= width - 16; j += 16)
1044                 {
1045                     vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
1046                     vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)));
1047                 }
1048                 if (j <= width - 8)
1049                 {
1050                     vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
1051                     j += 8;
1052                 }
1053                 if (j != width)
1054                 {
1055                     j = width - 8;
1056                     vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
1057                 }
1058             }
1059         }
1060         else
1061         {
1062             ptrdiff_t j = 0;
1063             for (; j <= width - 16; j += 16)
1064             {
1065                 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
1066                                               vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
1067                 vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)),
1068                                               vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
1069             }
1070             if (j <= width - 8)
1071             {
1072                 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
1073                                               vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
1074                 j += 8;
1075             }
1076             if (j != width)
1077             {
1078                 j = width - 8;
1079                 vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
1080                                               vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
1081             }
1082         }
1083     }
1084 };
1085 
1086 template<class RowFilter, class ColFilter> struct sepFilter3x3
1087 {
1088     typedef typename RowFilter::srcType srcType;
1089     typedef typename RowFilter::dstType tmpType;
1090     typedef typename ColFilter::dstType dstType;
1091 
processCAROTENE_NS::internal::sepFilter3x31092     static void process(const Size2D &ssize,
1093                         const srcType * srcBase, ptrdiff_t srcStride,
1094                         dstType * dstBase, ptrdiff_t dstStride,
1095                         const s16 *xw, const s16 *yw,
1096                         BORDER_MODE borderType, srcType borderValue, Margin borderMargin)
1097     {
1098         const ptrdiff_t offsetk = 1;
1099         ptrdiff_t borderxl, borderxr, borderyt, borderyb;
1100         borderxl = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left);
1101         borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
1102         borderxr = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right);
1103         borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
1104 
1105         std::vector<tmpType> _buf(ssize.width << 2);
1106         tmpType * buf = &_buf[0];
1107 
1108         RowFilter filterX(borderType, borderValue, borderxl, borderxr, xw);
1109         ColFilter filterY(borderType, filterX.borderFilter, yw);
1110         const ptrdiff_t lookTop = offsetk - borderyt;
1111         const ptrdiff_t lookBottom = offsetk - borderyb;
1112 
1113         const srcType* src = srcBase - lookTop * srcStride / sizeof(srcType);
1114         dstType* dst = dstBase;
1115 
1116         ptrdiff_t ridx = -lookTop;
1117         for (; ridx <= (ptrdiff_t)ssize.height + lookBottom - 2; ridx += 2)
1118         {
1119             for (ptrdiff_t bidx = 0; bidx < 2; ++bidx, src += srcStride / sizeof(srcType))
1120                 filterX(src, buf + ssize.width * ((4 + ridx + bidx) % 4), ssize.width);
1121 
1122             if (ridx <= 0)
1123             {
1124                 if (ridx == 0) //first row
1125                 {
1126                     filterY(0, buf + ssize.width * ((ridx + 4) % 4), buf + ssize.width * ((ridx + 1) % 4), dst, ssize.width);
1127                     dst += dstStride / sizeof(dstType);
1128                 }
1129                 continue;
1130             }
1131 
1132             filterY(buf + ssize.width * ((ridx + 2) % 4),
1133                     buf + ssize.width * ((ridx + 3) % 4),
1134                     buf + ssize.width * ((ridx + 4) % 4),
1135                     buf + ssize.width * ((ridx + 1) % 4),
1136                     dst, dst + dstStride / sizeof(dstType),  ssize.width);
1137 
1138             dst += dstStride * 2 / sizeof(dstType);
1139         }
1140 
1141         if (ridx < (ptrdiff_t)ssize.height + lookBottom)
1142         {
1143             filterX(src, buf + ssize.width * ((4 + ridx) % 4), ssize.width);
1144             filterY(buf + ssize.width * ((2 + ridx) % 4),
1145                     buf + ssize.width * ((3 + ridx) % 4),
1146                     buf + ssize.width * ((4 + ridx) % 4), dst, ssize.width);
1147             dst += dstStride / sizeof(dstType);
1148             ridx++;
1149         }
1150         if (lookBottom == 0)
1151             filterY(buf + ssize.width * ((ridx + 2) % 4), buf + ssize.width * ((ridx + 3) % 4), 0, dst, ssize.width);
1152     }
1153 };
1154 
1155 } //namespace internal
1156 
1157 } //namespace CAROTENE_NS
1158 
1159 #endif // CAROTENE_NEON
1160 
1161 #endif // CAROTENE_SRC_REMAP_HPP
1162