1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
// Precompute the sampling tables for 1-D linear interpolation from a source
// of length w to a destination of length outw.
//
//   w             source length
//   outw          destination length
//   xofs          [out] outw entries: index of the left source sample used
//                 by each destination sample
//   alpha         [out] 2 * outw entries: weights applied to the left and
//                 right source samples, stored as consecutive pairs
//   align_corner  non-zero: destination sample dx maps to source position
//                 dx * (w - 1) / (outw - 1), so the end samples coincide;
//                 zero: half-pixel-centre mapping (dx + 0.5) * w / outw - 0.5
//
// Positions that fall outside [0, w - 1) are clamped so that the pair
// (xofs, xofs + 1) always refers to the first or the last two source samples.
//
// NOTE(review): for w == 1 the upper clamp stores xofs = -1 with weights
// (0, 1); callers presumably special-case single-sample sources - confirm.
static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // With a single destination sample (w - 1) / (outw - 1) divides by
        // zero, which turns the coordinate below into NaN and makes the
        // float -> int conversion undefined. Pin that sample to source
        // position 0 instead.
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        // continuous source coordinate of destination sample dx
        float fx = align_corner ? (float)(dx * scale)
                                : (float)((dx + 0.5) * scale - 0.5);

        // split into integer sample index and fractional weight
        int sx = floor(fx);
        fx -= sx;

        // left of the first sample: use samples (0, 1) with all weight on 0
        if (sx < 0)
        {
            sx = 0;
            fx = 0.f;
        }
        // at or beyond the last sample: use samples (w - 2, w - 1) with all
        // weight on w - 1
        if (sx >= w - 1)
        {
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = 1.f - fx;
        alpha[dx * 2 + 1] = fx;
    }
}
51
resize_bilinear_image(const Mat & src,Mat & dst,float * alpha,int * xofs,float * beta,int * yofs)52 static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
53 {
54 int w = dst.w;
55 int h = dst.h;
56
57 // loop body
58 Mat rowsbuf0(w);
59 Mat rowsbuf1(w);
60 float* rows0 = rowsbuf0;
61 float* rows1 = rowsbuf1;
62
63 int prev_sy1 = -2;
64
65 for (int dy = 0; dy < h; dy++)
66 {
67 int sy = yofs[dy];
68
69 if (sy == prev_sy1)
70 {
71 // reuse all rows
72 }
73 else if (sy == prev_sy1 + 1)
74 {
75 // hresize one row
76 float* rows0_old = rows0;
77 rows0 = rows1;
78 rows1 = rows0_old;
79 const float* S1 = src.row(sy + 1);
80
81 const float* alphap = alpha;
82 float* rows1p = rows1;
83 int dx = 0;
84 #if __ARM_NEON
85 for (; dx + 1 < w; dx += 2)
86 {
87 int sx = xofs[dx];
88 int sxn = xofs[dx + 1];
89 const float* S1p = S1 + sx;
90 const float* S1np = S1 + sxn;
91
92 float32x4_t _a = vld1q_f32(alphap);
93 float32x2_t _S1 = vld1_f32(S1p);
94 float32x2_t _S1n = vld1_f32(S1np);
95
96 float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
97 float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
98 float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
99
100 vst1_f32(rows1p + dx, _rows1);
101
102 alphap += 4;
103 }
104 #endif // __ARM_NEON
105 for (; dx < w; dx++)
106 {
107 int sx = xofs[dx];
108 const float* S1p = S1 + sx;
109
110 float a0 = alphap[0];
111 float a1 = alphap[1];
112 rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
113
114 alphap += 2;
115 }
116 }
117 else
118 {
119 // hresize two rows
120 const float* S0 = src.row(sy);
121 const float* S1 = src.row(sy + 1);
122
123 const float* alphap = alpha;
124 float* rows0p = rows0;
125 float* rows1p = rows1;
126 int dx = 0;
127 #if __ARM_NEON
128 for (; dx + 1 < w; dx += 2)
129 {
130 int sx = xofs[dx];
131 int sxn = xofs[dx + 1];
132 const float* S0p = S0 + sx;
133 const float* S1p = S1 + sx;
134 const float* S0np = S0 + sxn;
135 const float* S1np = S1 + sxn;
136
137 float32x4_t _a = vld1q_f32(alphap);
138 float32x2_t _S0 = vld1_f32(S0p);
139 float32x2_t _S1 = vld1_f32(S1p);
140 float32x2_t _S0n = vld1_f32(S0np);
141 float32x2_t _S1n = vld1_f32(S1np);
142
143 float32x4_t _S0S0n = vcombine_f32(_S0, _S0n);
144 float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
145 float32x4_t _ms0 = vmulq_f32(_S0S0n, _a);
146 float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
147 float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0));
148 float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
149
150 vst1_f32(rows0p + dx, _rows0);
151 vst1_f32(rows1p + dx, _rows1);
152
153 alphap += 4;
154 }
155 #endif // __ARM_NEON
156 for (; dx < w; dx++)
157 {
158 int sx = xofs[dx];
159 const float* S0p = S0 + sx;
160 const float* S1p = S1 + sx;
161
162 float a0 = alphap[0];
163 float a1 = alphap[1];
164 rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
165 rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
166
167 alphap += 2;
168 }
169 }
170
171 prev_sy1 = sy;
172
173 // vresize
174 float b0 = beta[0];
175 float b1 = beta[1];
176
177 float* rows0p = rows0;
178 float* rows1p = rows1;
179 float* Dp = dst.row(dy);
180
181 #if __ARM_NEON
182 int nn = w >> 3;
183 #else
184 int nn = 0;
185 #endif
186 int remain = w - (nn << 3);
187
188 #if __ARM_NEON
189 float32x4_t _b0 = vdupq_n_f32(b0);
190 float32x4_t _b1 = vdupq_n_f32(b1);
191 for (; nn > 0; nn--)
192 {
193 float32x4_t _rows0 = vld1q_f32(rows0p);
194 float32x4_t _rows1 = vld1q_f32(rows1p);
195
196 float32x4_t _D = vmulq_f32(_rows0, _b0);
197 _D = vmlaq_f32(_D, _rows1, _b1);
198
199 vst1q_f32(Dp, _D);
200
201 float32x4_t _rows0n = vld1q_f32(rows0p + 4);
202 float32x4_t _rows1n = vld1q_f32(rows1p + 4);
203
204 float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
205 _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
206
207 vst1q_f32(Dp + 4, _Dn);
208
209 Dp += 8;
210 rows0p += 8;
211 rows1p += 8;
212 }
213 #endif // __ARM_NEON
214 for (; remain; --remain)
215 {
216 // D[x] = rows0[x]*b0 + rows1[x]*b1;
217 *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
218 }
219
220 beta += 2;
221 }
222 }
223