1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
// Precompute the two-tap linear interpolation table used to resize a line of
// w source samples to outw output samples.
//
// For every output index dx this writes:
//   xofs[dx]          index of the first source sample of the tap pair
//   alpha[dx * 2]     weight applied to source sample xofs[dx]
//   alpha[dx * 2 + 1] weight applied to source sample xofs[dx] + 1
//
// xofs must have room for outw ints and alpha for 2 * outw floats.
//
// align_corner == 0: output sample centres are mapped with the half-pixel
//                    formula (dx + 0.5) * w / outw - 0.5.
// align_corner != 0: output index dx is mapped to dx * (w - 1) / (outw - 1),
//                    so the first and last samples of both lines coincide.
//
// NOTE(review): for w == 1 the upper clamp yields xofs[dx] == -1 with weights
// (0, 1); consumers then touch the element before the row. Presumably callers
// guarantee w >= 2 - confirm.
static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // With a single output sample (outw - 1) is zero; dividing by it gave
        // an inf/NaN scale, a NaN coordinate and an undefined float-to-int
        // conversion below. Use scale 0 so the sample maps to source index 0.
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        // source-space coordinate of this output sample
        float fx = align_corner ? (float)(dx * scale) : (float)((dx + 0.5) * scale - 0.5);

        int sx = 0;
        if (fx < 0.f)
        {
            // left of the first sample: use sample 0 with full weight
            fx = 0.f;
        }
        else
        {
            // fx is non-negative here, so truncation is the same as floor
            sx = (int)fx;
            fx -= sx;
        }

        if (sx >= w - 1)
        {
            // at or beyond the last sample: use the last pair and put all the
            // weight on its second tap
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = 1.f - fx;
        alpha[dx * 2 + 1] = fx;
    }
}
51 
resize_bilinear_image(const Mat & src,Mat & dst,float * alpha,int * xofs,float * beta,int * yofs)52 static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
53 {
54     int w = dst.w;
55     int h = dst.h;
56 
57     // loop body
58     Mat rowsbuf0(w);
59     Mat rowsbuf1(w);
60     float* rows0 = rowsbuf0;
61     float* rows1 = rowsbuf1;
62 
63     int prev_sy1 = -2;
64 
65     for (int dy = 0; dy < h; dy++)
66     {
67         int sy = yofs[dy];
68 
69         if (sy == prev_sy1)
70         {
71             // reuse all rows
72         }
73         else if (sy == prev_sy1 + 1)
74         {
75             // hresize one row
76             float* rows0_old = rows0;
77             rows0 = rows1;
78             rows1 = rows0_old;
79             const float* S1 = src.row(sy + 1);
80 
81             const float* alphap = alpha;
82             float* rows1p = rows1;
83             int dx = 0;
84 #if __ARM_NEON
85             for (; dx + 1 < w; dx += 2)
86             {
87                 int sx = xofs[dx];
88                 int sxn = xofs[dx + 1];
89                 const float* S1p = S1 + sx;
90                 const float* S1np = S1 + sxn;
91 
92                 float32x4_t _a = vld1q_f32(alphap);
93                 float32x2_t _S1 = vld1_f32(S1p);
94                 float32x2_t _S1n = vld1_f32(S1np);
95 
96                 float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
97                 float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
98                 float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
99 
100                 vst1_f32(rows1p + dx, _rows1);
101 
102                 alphap += 4;
103             }
104 #endif // __ARM_NEON
105             for (; dx < w; dx++)
106             {
107                 int sx = xofs[dx];
108                 const float* S1p = S1 + sx;
109 
110                 float a0 = alphap[0];
111                 float a1 = alphap[1];
112                 rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
113 
114                 alphap += 2;
115             }
116         }
117         else
118         {
119             // hresize two rows
120             const float* S0 = src.row(sy);
121             const float* S1 = src.row(sy + 1);
122 
123             const float* alphap = alpha;
124             float* rows0p = rows0;
125             float* rows1p = rows1;
126             int dx = 0;
127 #if __ARM_NEON
128             for (; dx + 1 < w; dx += 2)
129             {
130                 int sx = xofs[dx];
131                 int sxn = xofs[dx + 1];
132                 const float* S0p = S0 + sx;
133                 const float* S1p = S1 + sx;
134                 const float* S0np = S0 + sxn;
135                 const float* S1np = S1 + sxn;
136 
137                 float32x4_t _a = vld1q_f32(alphap);
138                 float32x2_t _S0 = vld1_f32(S0p);
139                 float32x2_t _S1 = vld1_f32(S1p);
140                 float32x2_t _S0n = vld1_f32(S0np);
141                 float32x2_t _S1n = vld1_f32(S1np);
142 
143                 float32x4_t _S0S0n = vcombine_f32(_S0, _S0n);
144                 float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
145                 float32x4_t _ms0 = vmulq_f32(_S0S0n, _a);
146                 float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
147                 float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0));
148                 float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
149 
150                 vst1_f32(rows0p + dx, _rows0);
151                 vst1_f32(rows1p + dx, _rows1);
152 
153                 alphap += 4;
154             }
155 #endif // __ARM_NEON
156             for (; dx < w; dx++)
157             {
158                 int sx = xofs[dx];
159                 const float* S0p = S0 + sx;
160                 const float* S1p = S1 + sx;
161 
162                 float a0 = alphap[0];
163                 float a1 = alphap[1];
164                 rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
165                 rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
166 
167                 alphap += 2;
168             }
169         }
170 
171         prev_sy1 = sy;
172 
173         // vresize
174         float b0 = beta[0];
175         float b1 = beta[1];
176 
177         float* rows0p = rows0;
178         float* rows1p = rows1;
179         float* Dp = dst.row(dy);
180 
181 #if __ARM_NEON
182         int nn = w >> 3;
183 #else
184         int nn = 0;
185 #endif
186         int remain = w - (nn << 3);
187 
188 #if __ARM_NEON
189         float32x4_t _b0 = vdupq_n_f32(b0);
190         float32x4_t _b1 = vdupq_n_f32(b1);
191         for (; nn > 0; nn--)
192         {
193             float32x4_t _rows0 = vld1q_f32(rows0p);
194             float32x4_t _rows1 = vld1q_f32(rows1p);
195 
196             float32x4_t _D = vmulq_f32(_rows0, _b0);
197             _D = vmlaq_f32(_D, _rows1, _b1);
198 
199             vst1q_f32(Dp, _D);
200 
201             float32x4_t _rows0n = vld1q_f32(rows0p + 4);
202             float32x4_t _rows1n = vld1q_f32(rows1p + 4);
203 
204             float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
205             _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
206 
207             vst1q_f32(Dp + 4, _Dn);
208 
209             Dp += 8;
210             rows0p += 8;
211             rows1p += 8;
212         }
213 #endif // __ARM_NEON
214         for (; remain; --remain)
215         {
216             //             D[x] = rows0[x]*b0 + rows1[x]*b1;
217             *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
218         }
219 
220         beta += 2;
221     }
222 }
223