1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include <vector>
41 
42 #include "common.hpp"
43 
44 namespace CAROTENE_NS {
45 
isScharr3x3Supported(const Size2D & size,BORDER_MODE border,s32 dx,s32 dy,Margin borderMargin)46 bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
47 {
48     return (dx == 0 && dy == 1 &&
49                    isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
50            (dx == 1 && dy == 0 &&
51                    isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
52 }
53 
Scharr3x3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,s32 dx,s32 dy,BORDER_MODE border,u8 borderValue,Margin borderMargin)54 void Scharr3x3(const Size2D &size,
55                const u8 * srcBase, ptrdiff_t srcStride,
56                s16 * dstBase, ptrdiff_t dstStride,
57                s32 dx, s32 dy,
58                BORDER_MODE border, u8 borderValue, Margin borderMargin)
59 {
60     internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
61 #ifdef CAROTENE_NEON
62     static s16 dw[] = {3, 10, 3};
63 
64     if (dy == 1)
65         SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
66                            3, 1, dw, 0,
67                            border, borderValue, borderMargin);
68     else
69         SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
70                            1, 3, 0, dw,
71                            border, borderValue, borderMargin);
72 #else
73     (void)srcBase;
74     (void)srcStride;
75     (void)dstBase;
76     (void)dstStride;
77     (void)borderValue;
78 #endif
79 }
80 
ScharrDeriv(const Size2D & size,s32 cn,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)81 void ScharrDeriv(const Size2D &size, s32 cn,
82                  const u8 * srcBase, ptrdiff_t srcStride,
83                  s16 * dstBase, ptrdiff_t dstStride)
84 {
85     internal::assertSupportedConfiguration();
86 #ifdef CAROTENE_NEON
87     size_t colsn = size.width*cn;
88     size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
89 
90     ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
91     std::vector<s16> _tempBuf((delta << 1) + 64);
92     s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
93 
94     int16x8_t vc3 = vmovq_n_s16(3);
95     int16x8_t vc10 = vmovq_n_s16(10);
96     uint8x8_t v8c10 = vmov_n_u8(10);
97 
98     for(size_t y = 0; y < size.height; y++ )
99     {
100         const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
101         const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
102         const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
103         s16* drow = internal::getRowPtr(dstBase, dstStride, y);
104 
105         // do vertical convolution
106         size_t x = 0;
107         for( ; x < roiw8; x += 8 )
108         {
109             internal::prefetch(srow0 + x);
110             internal::prefetch(srow1 + x);
111             internal::prefetch(srow2 + x);
112 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
113             __asm__ (
114                 "vld1.8 {d0}, [%[src0]]                                \n\t"
115                 "vld1.8 {d2}, [%[src2]]                                \n\t"
116                 "vld1.8 {d1}, [%[src1]]                                \n\t"
117                 "vaddl.u8 q2, d2, d0                                   \n\t"
118                 "vmull.u8 q3, d1, %[vc10]                              \n\t"
119                 "vsubl.u8 q4, d2, d0                                   \n\t"
120                 "vmla.s16 q3, q2, %q[vc3]                              \n\t"
121                 "vst1.16 {d8-d9}, [%[out1],:128]                       \n\t"
122                 "vst1.16 {d6-d7}, [%[out0],:128]                       \n\t"
123                 :
124                 : [out0] "r" (trow0 + x),
125                   [out1] "r" (trow1 + x),
126                   [src0] "r" (srow0 + x),
127                   [src1] "r" (srow1 + x),
128                   [src2] "r" (srow2 + x),
129                   [vc10] "w" (v8c10), [vc3] "w" (vc3)
130                 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
131             );
132 #else
133             uint8x8_t s0 = vld1_u8(srow0 + x);
134             uint8x8_t s1 = vld1_u8(srow1 + x);
135             uint8x8_t s2 = vld1_u8(srow2 + x);
136 
137             int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
138             int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
139             int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
140             int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
141 
142             vst1q_s16(trow1 + x, t1);
143             vst1q_s16(trow0 + x, t0);
144 #endif
145         }
146         for( ; x < colsn; x++ )
147         {
148             trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
149             trow1[x] = (s16)(srow2[x] - srow0[x]);
150         }
151 
152         // make border
153         size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
154         for( s32 k = 0; k < cn; k++ )
155         {
156             trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
157             trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
158         }
159 
160         // do horizontal convolution, interleave the results and store them to dst
161         x = 0;
162         for( ; x < roiw8; x += 8 )
163         {
164 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 6 && !defined(__clang__)
165             __asm__ (
166                 "vld1.16 {d4-d5}, [%[s2ptr]]                           \n\t"
167                 "vld1.16 {d8-d9}, [%[s4ptr]]                           \n\t"
168                 "vld1.16 {d6-d7}, [%[s3ptr],:128]                      \n\t"
169                 "vld1.16 {d0-d1}, [%[s0ptr]]                           \n\t"
170                 "vld1.16 {d2-d3}, [%[s1ptr]]                           \n\t"
171                 "vadd.i16 q7, q2, q4                                   \n\t"
172                 "vmul.s16 q6, q3, %q[vc10]                             \n\t"
173                 "vsub.s16 q5, q1, q0                                   \n\t"
174                 "vmla.s16 q6, q7, %q[vc3]                              \n\t"
175                 "vst2.16 {d10-d13}, [%[out]]                           \n\t"
176                 :
177                 : [out] "r" (drow + x * 2),
178                   [s0ptr] "r" (trow0 + x - cn),
179                   [s1ptr] "r" (trow0 + x + cn),
180                   [s2ptr] "r" (trow1 + x - cn),
181                   [s3ptr] "r" (trow1 + x),
182                   [s4ptr] "r" (trow1 + x + cn),
183                   [vc10] "w" (vc10), [vc3] "w" (vc3)
184                 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
185             );
186 #else
187             int16x8_t s0 = vld1q_s16(trow0 + x - cn);
188             int16x8_t s1 = vld1q_s16(trow0 + x + cn);
189             int16x8_t s2 = vld1q_s16(trow1 + x - cn);
190             int16x8_t s3 = vld1q_s16(trow1 + x);
191             int16x8_t s4 = vld1q_s16(trow1 + x + cn);
192 
193             int16x8_t s3x10 = vmulq_s16(s3, vc10);
194             int16x8_t s24 = vaddq_s16(s2, s4);
195 
196             int16x8x2_t vr;
197             vr.val[0] = vsubq_s16(s1, s0);
198             vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
199 
200             vst2q_s16(drow + x*2, vr);
201 #endif
202         }
203         for( ; x < colsn; x++ )
204         {
205             drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
206             drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
207         }
208     }
209 #else
210     (void)size;
211     (void)cn;
212     (void)srcBase;
213     (void)srcStride;
214     (void)dstBase;
215     (void)dstStride;
216 #endif
217 }
218 
219 } // namespace CAROTENE_NS
220