1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include <vector>
41
42 #include "common.hpp"
43
44 namespace CAROTENE_NS {
45
isScharr3x3Supported(const Size2D & size,BORDER_MODE border,s32 dx,s32 dy,Margin borderMargin)46 bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
47 {
48 return (dx == 0 && dy == 1 &&
49 isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
50 (dx == 1 && dy == 0 &&
51 isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
52 }
53
Scharr3x3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,s32 dx,s32 dy,BORDER_MODE border,u8 borderValue,Margin borderMargin)54 void Scharr3x3(const Size2D &size,
55 const u8 * srcBase, ptrdiff_t srcStride,
56 s16 * dstBase, ptrdiff_t dstStride,
57 s32 dx, s32 dy,
58 BORDER_MODE border, u8 borderValue, Margin borderMargin)
59 {
60 internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
61 #ifdef CAROTENE_NEON
62 static s16 dw[] = {3, 10, 3};
63
64 if (dy == 1)
65 SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
66 3, 1, dw, 0,
67 border, borderValue, borderMargin);
68 else
69 SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
70 1, 3, 0, dw,
71 border, borderValue, borderMargin);
72 #else
73 (void)srcBase;
74 (void)srcStride;
75 (void)dstBase;
76 (void)dstStride;
77 (void)borderValue;
78 #endif
79 }
80
ScharrDeriv(const Size2D & size,s32 cn,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride)81 void ScharrDeriv(const Size2D &size, s32 cn,
82 const u8 * srcBase, ptrdiff_t srcStride,
83 s16 * dstBase, ptrdiff_t dstStride)
84 {
85 internal::assertSupportedConfiguration();
86 #ifdef CAROTENE_NEON
87 size_t colsn = size.width*cn;
88 size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
89
90 ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
91 std::vector<s16> _tempBuf((delta << 1) + 64);
92 s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
93
94 int16x8_t vc3 = vmovq_n_s16(3);
95 int16x8_t vc10 = vmovq_n_s16(10);
96 uint8x8_t v8c10 = vmov_n_u8(10);
97
98 for(size_t y = 0; y < size.height; y++ )
99 {
100 const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
101 const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
102 const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
103 s16* drow = internal::getRowPtr(dstBase, dstStride, y);
104
105 // do vertical convolution
106 size_t x = 0;
107 for( ; x < roiw8; x += 8 )
108 {
109 internal::prefetch(srow0 + x);
110 internal::prefetch(srow1 + x);
111 internal::prefetch(srow2 + x);
112 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
113 __asm__ (
114 "vld1.8 {d0}, [%[src0]] \n\t"
115 "vld1.8 {d2}, [%[src2]] \n\t"
116 "vld1.8 {d1}, [%[src1]] \n\t"
117 "vaddl.u8 q2, d2, d0 \n\t"
118 "vmull.u8 q3, d1, %[vc10] \n\t"
119 "vsubl.u8 q4, d2, d0 \n\t"
120 "vmla.s16 q3, q2, %q[vc3] \n\t"
121 "vst1.16 {d8-d9}, [%[out1],:128] \n\t"
122 "vst1.16 {d6-d7}, [%[out0],:128] \n\t"
123 :
124 : [out0] "r" (trow0 + x),
125 [out1] "r" (trow1 + x),
126 [src0] "r" (srow0 + x),
127 [src1] "r" (srow1 + x),
128 [src2] "r" (srow2 + x),
129 [vc10] "w" (v8c10), [vc3] "w" (vc3)
130 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
131 );
132 #else
133 uint8x8_t s0 = vld1_u8(srow0 + x);
134 uint8x8_t s1 = vld1_u8(srow1 + x);
135 uint8x8_t s2 = vld1_u8(srow2 + x);
136
137 int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
138 int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
139 int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
140 int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
141
142 vst1q_s16(trow1 + x, t1);
143 vst1q_s16(trow0 + x, t0);
144 #endif
145 }
146 for( ; x < colsn; x++ )
147 {
148 trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
149 trow1[x] = (s16)(srow2[x] - srow0[x]);
150 }
151
152 // make border
153 size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0);
154 for( s32 k = 0; k < cn; k++ )
155 {
156 trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
157 trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
158 }
159
160 // do horizontal convolution, interleave the results and store them to dst
161 x = 0;
162 for( ; x < roiw8; x += 8 )
163 {
164 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__clang__)
165 __asm__ (
166 "vld1.16 {d4-d5}, [%[s2ptr]] \n\t"
167 "vld1.16 {d8-d9}, [%[s4ptr]] \n\t"
168 "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t"
169 "vld1.16 {d0-d1}, [%[s0ptr]] \n\t"
170 "vld1.16 {d2-d3}, [%[s1ptr]] \n\t"
171 "vadd.i16 q7, q2, q4 \n\t"
172 "vmul.s16 q6, q3, %q[vc10] \n\t"
173 "vsub.s16 q5, q1, q0 \n\t"
174 "vmla.s16 q6, q7, %q[vc3] \n\t"
175 "vst2.16 {d10-d13}, [%[out]] \n\t"
176 :
177 : [out] "r" (drow + x * 2),
178 [s0ptr] "r" (trow0 + x - cn),
179 [s1ptr] "r" (trow0 + x + cn),
180 [s2ptr] "r" (trow1 + x - cn),
181 [s3ptr] "r" (trow1 + x),
182 [s4ptr] "r" (trow1 + x + cn),
183 [vc10] "w" (vc10), [vc3] "w" (vc3)
184 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
185 );
186 #else
187 int16x8_t s0 = vld1q_s16(trow0 + x - cn);
188 int16x8_t s1 = vld1q_s16(trow0 + x + cn);
189 int16x8_t s2 = vld1q_s16(trow1 + x - cn);
190 int16x8_t s3 = vld1q_s16(trow1 + x);
191 int16x8_t s4 = vld1q_s16(trow1 + x + cn);
192
193 int16x8_t s3x10 = vmulq_s16(s3, vc10);
194 int16x8_t s24 = vaddq_s16(s2, s4);
195
196 int16x8x2_t vr;
197 vr.val[0] = vsubq_s16(s1, s0);
198 vr.val[1] = vmlaq_s16(s3x10, s24, vc3);
199
200 vst2q_s16(drow + x*2, vr);
201 #endif
202 }
203 for( ; x < colsn; x++ )
204 {
205 drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]);
206 drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
207 }
208 }
209 #else
210 (void)size;
211 (void)cn;
212 (void)srcBase;
213 (void)srcStride;
214 (void)dstBase;
215 (void)dstStride;
216 #endif
217 }
218
219 } // namespace CAROTENE_NS
220