// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 BUG1989. All rights reserved.
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "requantize_arm.h"

#include <math.h>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#if __ARM_NEON
#include "requantize_leakyrelu_pack4.h"
#include "requantize_leakyrelu_pack8.h"
#include "requantize_relu_pack4.h"
#include "requantize_relu_pack8.h"
#endif // __ARM_NEON

Requantize_arm::Requantize_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON
}

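// Requantize converts an int32 accumulator blob back to int8, per element:
//   top = int8(activation(int32 * scale_in + bias) * scale_out)
// with the final conversion rounding and saturating to the int8 range.
// scale_in, scale_out and bias may each be a single scalar or one value per channel.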
int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int dims = bottom_blob.dims;
    int elempack = bottom_blob.elempack;

#if __ARM_NEON
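    // elempack == 8: each element carries eight int32 lanes, processed as two float32x4 halves (_v0 / _v1)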
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;

            top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

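            // specialize on scalar vs per-element scale/bias so broadcasts are hoisted out of the hot loops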
            if (scale_in_data_size == 1 && scale_out_data_size == 1)
            {
                float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);
                float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmulq_f32(_v0, _scale_in);
                        _v1 = vmulq_f32(_v1, _scale_in);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out);
                        _v1 = vmulq_f32(_v1, _scale_out);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias, _v0, _scale_in);
                        _v1 = vmlaq_f32(_bias, _v1, _scale_in);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out);
                        _v1 = vmulq_f32(_v1, _scale_out);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias0, _v0, _scale_in);
                        _v1 = vmlaq_f32(_bias1, _v1, _scale_in);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out);
                        _v1 = vmulq_f32(_v1, _scale_out);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
            }
            else if (scale_in_data_size == 1 && scale_out_data_size > 1)
            {
                float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmulq_f32(_v0, _scale_in);
                        _v1 = vmulq_f32(_v1, _scale_in);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias, _v0, _scale_in);
                        _v1 = vmlaq_f32(_bias, _v1, _scale_in);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias0, _v0, _scale_in);
                        _v1 = vmlaq_f32(_bias1, _v1, _scale_in);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
            }
            else if (scale_in_data_size > 1 && scale_out_data_size == 1)
            {
                float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmulq_f32(_v0, _scale_in0);
                        _v1 = vmulq_f32(_v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out);
                        _v1 = vmulq_f32(_v1, _scale_out);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias, _v0, _scale_in0);
                        _v1 = vmlaq_f32(_bias, _v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out);
                        _v1 = vmulq_f32(_v1, _scale_out);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                        _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out);
                        _v1 = vmulq_f32(_v1, _scale_out);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
            }
            else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmulq_f32(_v0, _scale_in0);
                        _v1 = vmulq_f32(_v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias, _v0, _scale_in0);
                        _v1 = vmlaq_f32(_bias, _v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                        _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));
                    }
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;

            top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    signed char* ptr = top_blob.row<signed char>(i);

                    float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                    float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                    float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                    float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);

                    for (int j = 0; j < w; j++)
                    {
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmulq_f32(_v0, _scale_in0);
                        _v1 = vmulq_f32(_v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    signed char* ptr = top_blob.row<signed char>(i);

                    float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                    float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                    float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                    float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                    float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                    float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);

                    for (int j = 0; j < w; j++)
                    {
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                        _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;

            top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

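            // fused fast paths: ReLU and LeakyReLU have dedicated pack8 kernels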
            if (activation_type == 1)
            {
                requantize_relu_pack8_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt);
                return 0;
            }

            if (activation_type == 2 && activation_params[0] > 0.f)
            {
                requantize_leakyrelu_pack8_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt);
                return 0;
            }

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    signed char* ptr = top_blob.channel(q);

                    float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
                    float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
                    float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
                    float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);

                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmulq_f32(_v0, _scale_in0);
                        _v1 = vmulq_f32(_v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    signed char* ptr = top_blob.channel(q);

                    float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
                    float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
                    float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
                    float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);
                    float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8);
                    float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4);

                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
                        _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                        _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                        _v0 = activation_ps(_v0, activation_type, activation_params);
                        _v1 = activation_ps(_v1, activation_type, activation_params);
                        _v0 = vmulq_f32(_v0, _scale_out0);
                        _v1 = vmulq_f32(_v1, _scale_out1);
                        vst1_s8(ptr, float2int8(_v0, _v1));

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
        }

        return 0;
    }

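    // elempack == 4: one float32x4 per element; repack to 8 on output when lane counts allow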
    if (elempack == 4)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;
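            // pack8 output when packing is enabled and the total lane count divides by 8, otherwise pack1;
            // for 1-D data only the element metadata differs, the int8 stores below are contiguous either way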
            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
            int outw = w * elempack / out_elempack;

            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (scale_in_data_size == 1 && scale_out_data_size == 1)
            {
                float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);
                float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
            else if (scale_in_data_size == 1 && scale_out_data_size > 1)
            {
                float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
            else if (scale_in_data_size > 1 && scale_out_data_size == 1)
            {
                float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
            else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
            int outh = h * elempack / out_elempack;

            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

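            // out_elempack == 8: interleave two pack4 input rows into one pack8 output row
            // out_elempack == 1: scatter the four lanes of each pack4 row into four planar rows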
            if (out_elempack == 8)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outh; i++)
                    {
                        const int* intptr0 = bottom_blob.row<const int>(i * 2);
                        const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
                        signed char* ptr = top_blob.row<signed char>(i);

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            _v0 = vmulq_f32(_v0, _scale_in0);
                            _v1 = vmulq_f32(_v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outh; i++)
                    {
                        const int* intptr0 = bottom_blob.row<const int>(i * 2);
                        const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
                        signed char* ptr = top_blob.row<signed char>(i);

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                            _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
            }
            if (out_elempack == 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < h; i++)
                    {
                        const int* intptr = bottom_blob.row<const int>(i);
                        signed char* ptr0 = top_blob.row<signed char>(i * 4);
                        signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
                        signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
                        signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmulq_f32(_v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < h; i++)
                    {
                        const int* intptr = bottom_blob.row<const int>(i);
                        signed char* ptr0 = top_blob.row<signed char>(i * 4);
                        signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
                        signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
                        signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmlaq_f32(_bias, _v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;
            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
            int outc = channels * elempack / out_elempack;

            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

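            // fused fast paths: ReLU and LeakyReLU have dedicated pack4 kernels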
            if (activation_type == 1)
            {
                requantize_relu_pack4_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt);
                return 0;
            }

            if (activation_type == 2 && activation_params[0] > 0.f)
            {
                requantize_leakyrelu_pack4_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt);
                return 0;
            }

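            // out_elempack == 8: fuse two pack4 input channels into one pack8 output channel
            // out_elempack == 1: scatter each pack4 channel into four planar channels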
            if (out_elempack == 8)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < outc; q++)
                    {
                        const int* intptr0 = bottom_blob.channel(q * 2);
                        const int* intptr1 = bottom_blob.channel(q * 2 + 1);
                        signed char* ptr = top_blob.channel(q);

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            _v0 = vmulq_f32(_v0, _scale_in0);
                            _v1 = vmulq_f32(_v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < outc; q++)
                    {
                        const int* intptr0 = bottom_blob.channel(q * 2);
                        const int* intptr1 = bottom_blob.channel(q * 2 + 1);
                        signed char* ptr = top_blob.channel(q);

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                            _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
            }
            if (out_elempack == 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < channels; q++)
                    {
                        const int* intptr = bottom_blob.channel(q);
                        signed char* ptr0 = top_blob.channel(q * 4);
                        signed char* ptr1 = top_blob.channel(q * 4 + 1);
                        signed char* ptr2 = top_blob.channel(q * 4 + 2);
                        signed char* ptr3 = top_blob.channel(q * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmulq_f32(_v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < channels; q++)
                    {
                        const int* intptr = bottom_blob.channel(q);
                        signed char* ptr0 = top_blob.channel(q * 4);
                        signed char* ptr1 = top_blob.channel(q * 4 + 1);
                        signed char* ptr2 = top_blob.channel(q * 4 + 2);
                        signed char* ptr3 = top_blob.channel(q * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4);
                        float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmlaq_f32(_bias, _v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

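    // generic scalar path (elempack == 1)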
    if (dims == 1)
    {
        int w = bottom_blob.w;

        top_blob.create(w, (size_t)1u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int* intptr = bottom_blob;
        signed char* ptr = top_blob;

        if (scale_in_data_size == 1 && scale_out_data_size == 1)
        {
            const float scale_in = scale_in_data[0];
            const float scale_out = scale_out_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else if (scale_in_data_size == 1 && scale_out_data_size > 1)
        {
            const float scale_in = scale_in_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
        }
        else if (scale_in_data_size > 1 && scale_out_data_size == 1)
        {
            const float scale_out = scale_out_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
        {
            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
        }
    }

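    // dims == 2: scale_in/scale_out/bias are indexed per row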
    if (dims == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (bias_data_size == 0)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];

                for (int j = 0; j < w; j++)
                {
                    float v = intptr[j] * scale_in;
                    ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];

                for (int j = 0; j < w; j++)
                {
                    float v = intptr[j] * scale_in + bias;
                    ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
    }

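    // dims == 3: scale_in/scale_out/bias are indexed per channel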
    if (dims == 3)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (bias_data_size == 0)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];

                for (int i = 0; i < size; i++)
                {
                    float v = intptr[i] * scale_in;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];

                for (int i = 0; i < size; i++)
                {
                    float v = intptr[i] * scale_in + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn