1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
5 //
6 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
7 // in compliance with the License. You may obtain a copy of the License at
8 //
9 // https://opensource.org/licenses/BSD-3-Clause
10 //
11 // Unless required by applicable law or agreed to in writing, software distributed
12 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
13 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
14 // specific language governing permissions and limitations under the License.
15 
16 #include "requantize_arm.h"
17 
18 #include <math.h>
19 
20 #if __ARM_NEON
21 #include <arm_neon.h>
22 #endif // __ARM_NEON
23 
24 #include "arm_activation.h"
25 #include "arm_usability.h"
26 
27 namespace ncnn {
28 
29 #if __ARM_NEON
30 #include "requantize_leakyrelu_pack4.h"
31 #include "requantize_leakyrelu_pack8.h"
32 #include "requantize_relu_pack4.h"
33 #include "requantize_relu_pack8.h"
34 #endif // __ARM_NEON
35 
// Constructor: advertise optional backend capabilities to the ncnn runtime.
// When built with NEON, this layer can consume/produce packed blobs
// (elempack 4/8), so the graph executor is told packing is supported;
// on non-NEON builds the base-class defaults (scalar path) are kept.
Requantize_arm::Requantize_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON
}
42 
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const43 int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
44 {
45     int dims = bottom_blob.dims;
46     int elempack = bottom_blob.elempack;
47 
48 #if __ARM_NEON
49     if (elempack == 8)
50     {
51         if (dims == 1)
52         {
53             int w = bottom_blob.w;
54 
55             top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
56             if (top_blob.empty())
57                 return -100;
58 
59             if (scale_in_data_size == 1 && scale_out_data_size == 1)
60             {
61                 float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);
62                 float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);
63 
64                 if (bias_data_size == 0)
65                 {
66                     #pragma omp parallel for num_threads(opt.num_threads)
67                     for (int i = 0; i < w; i++)
68                     {
69                         const int* intptr = (const int*)bottom_blob + i * 8;
70                         signed char* ptr = (signed char*)top_blob + i * 8;
71 
72                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
73                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
74                         _v0 = vmulq_f32(_v0, _scale_in);
75                         _v1 = vmulq_f32(_v1, _scale_in);
76                         _v0 = activation_ps(_v0, activation_type, activation_params);
77                         _v1 = activation_ps(_v1, activation_type, activation_params);
78                         _v0 = vmulq_f32(_v0, _scale_out);
79                         _v1 = vmulq_f32(_v1, _scale_out);
80                         vst1_s8(ptr, float2int8(_v0, _v1));
81                     }
82                 }
83                 else if (bias_data_size == 1)
84                 {
85                     float32x4_t _bias = vdupq_n_f32(bias_data[0]);
86 
87                     #pragma omp parallel for num_threads(opt.num_threads)
88                     for (int i = 0; i < w; i++)
89                     {
90                         const int* intptr = (const int*)bottom_blob + i * 8;
91                         signed char* ptr = (signed char*)top_blob + i * 8;
92 
93                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
94                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
95                         _v0 = vmlaq_f32(_bias, _v0, _scale_in);
96                         _v1 = vmlaq_f32(_bias, _v1, _scale_in);
97                         _v0 = activation_ps(_v0, activation_type, activation_params);
98                         _v1 = activation_ps(_v1, activation_type, activation_params);
99                         _v0 = vmulq_f32(_v0, _scale_out);
100                         _v1 = vmulq_f32(_v1, _scale_out);
101                         vst1_s8(ptr, float2int8(_v0, _v1));
102                     }
103                 }
104                 else
105                 {
106                     #pragma omp parallel for num_threads(opt.num_threads)
107                     for (int i = 0; i < w; i++)
108                     {
109                         const int* intptr = (const int*)bottom_blob + i * 8;
110                         signed char* ptr = (signed char*)top_blob + i * 8;
111 
112                         float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
113                         float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
114                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
115                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
116                         _v0 = vmlaq_f32(_bias0, _v0, _scale_in);
117                         _v1 = vmlaq_f32(_bias1, _v1, _scale_in);
118                         _v0 = activation_ps(_v0, activation_type, activation_params);
119                         _v1 = activation_ps(_v1, activation_type, activation_params);
120                         _v0 = vmulq_f32(_v0, _scale_out);
121                         _v1 = vmulq_f32(_v1, _scale_out);
122                         vst1_s8(ptr, float2int8(_v0, _v1));
123                     }
124                 }
125             }
126             else if (scale_in_data_size == 1 && scale_out_data_size > 1)
127             {
128                 float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);
129 
130                 if (bias_data_size == 0)
131                 {
132                     #pragma omp parallel for num_threads(opt.num_threads)
133                     for (int i = 0; i < w; i++)
134                     {
135                         const int* intptr = (const int*)bottom_blob + i * 8;
136                         signed char* ptr = (signed char*)top_blob + i * 8;
137 
138                         float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
139                         float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
140                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
141                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
142                         _v0 = vmulq_f32(_v0, _scale_in);
143                         _v1 = vmulq_f32(_v1, _scale_in);
144                         _v0 = activation_ps(_v0, activation_type, activation_params);
145                         _v1 = activation_ps(_v1, activation_type, activation_params);
146                         _v0 = vmulq_f32(_v0, _scale_out0);
147                         _v1 = vmulq_f32(_v1, _scale_out1);
148                         vst1_s8(ptr, float2int8(_v0, _v1));
149                     }
150                 }
151                 else if (bias_data_size == 1)
152                 {
153                     float32x4_t _bias = vdupq_n_f32(bias_data[0]);
154 
155                     #pragma omp parallel for num_threads(opt.num_threads)
156                     for (int i = 0; i < w; i++)
157                     {
158                         const int* intptr = (const int*)bottom_blob + i * 8;
159                         signed char* ptr = (signed char*)top_blob + i * 8;
160 
161                         float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
162                         float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
163                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
164                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
165                         _v0 = vmlaq_f32(_bias, _v0, _scale_in);
166                         _v1 = vmlaq_f32(_bias, _v1, _scale_in);
167                         _v0 = activation_ps(_v0, activation_type, activation_params);
168                         _v1 = activation_ps(_v1, activation_type, activation_params);
169                         _v0 = vmulq_f32(_v0, _scale_out0);
170                         _v1 = vmulq_f32(_v1, _scale_out1);
171                         vst1_s8(ptr, float2int8(_v0, _v1));
172                     }
173                 }
174                 else
175                 {
176                     #pragma omp parallel for num_threads(opt.num_threads)
177                     for (int i = 0; i < w; i++)
178                     {
179                         const int* intptr = (const int*)bottom_blob + i * 8;
180                         signed char* ptr = (signed char*)top_blob + i * 8;
181 
182                         float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
183                         float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
184                         float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
185                         float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
186                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
187                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
188                         _v0 = vmlaq_f32(_bias0, _v0, _scale_in);
189                         _v1 = vmlaq_f32(_bias1, _v1, _scale_in);
190                         _v0 = activation_ps(_v0, activation_type, activation_params);
191                         _v1 = activation_ps(_v1, activation_type, activation_params);
192                         _v0 = vmulq_f32(_v0, _scale_out0);
193                         _v1 = vmulq_f32(_v1, _scale_out1);
194                         vst1_s8(ptr, float2int8(_v0, _v1));
195                     }
196                 }
197             }
198             else if (scale_in_data_size > 1 && scale_out_data_size == 1)
199             {
200                 float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);
201 
202                 if (bias_data_size == 0)
203                 {
204                     #pragma omp parallel for num_threads(opt.num_threads)
205                     for (int i = 0; i < w; i++)
206                     {
207                         const int* intptr = (const int*)bottom_blob + i * 8;
208                         signed char* ptr = (signed char*)top_blob + i * 8;
209 
210                         float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
211                         float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
212                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
213                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
214                         _v0 = vmulq_f32(_v0, _scale_in0);
215                         _v1 = vmulq_f32(_v1, _scale_in1);
216                         _v0 = activation_ps(_v0, activation_type, activation_params);
217                         _v1 = activation_ps(_v1, activation_type, activation_params);
218                         _v0 = vmulq_f32(_v0, _scale_out);
219                         _v1 = vmulq_f32(_v1, _scale_out);
220                         vst1_s8(ptr, float2int8(_v0, _v1));
221                     }
222                 }
223                 else if (bias_data_size == 1)
224                 {
225                     float32x4_t _bias = vdupq_n_f32(bias_data[0]);
226 
227                     #pragma omp parallel for num_threads(opt.num_threads)
228                     for (int i = 0; i < w; i++)
229                     {
230                         const int* intptr = (const int*)bottom_blob + i * 8;
231                         signed char* ptr = (signed char*)top_blob + i * 8;
232 
233                         float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
234                         float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
235                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
236                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
237                         _v0 = vmlaq_f32(_bias, _v0, _scale_in0);
238                         _v1 = vmlaq_f32(_bias, _v1, _scale_in1);
239                         _v0 = activation_ps(_v0, activation_type, activation_params);
240                         _v1 = activation_ps(_v1, activation_type, activation_params);
241                         _v0 = vmulq_f32(_v0, _scale_out);
242                         _v1 = vmulq_f32(_v1, _scale_out);
243                         vst1_s8(ptr, float2int8(_v0, _v1));
244                     }
245                 }
246                 else
247                 {
248                     #pragma omp parallel for num_threads(opt.num_threads)
249                     for (int i = 0; i < w; i++)
250                     {
251                         const int* intptr = (const int*)bottom_blob + i * 8;
252                         signed char* ptr = (signed char*)top_blob + i * 8;
253 
254                         float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
255                         float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
256                         float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
257                         float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
258                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
259                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
260                         _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
261                         _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
262                         _v0 = activation_ps(_v0, activation_type, activation_params);
263                         _v1 = activation_ps(_v1, activation_type, activation_params);
264                         _v0 = vmulq_f32(_v0, _scale_out);
265                         _v1 = vmulq_f32(_v1, _scale_out);
266                         vst1_s8(ptr, float2int8(_v0, _v1));
267                     }
268                 }
269             }
270             else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
271             {
272                 if (bias_data_size == 0)
273                 {
274                     #pragma omp parallel for num_threads(opt.num_threads)
275                     for (int i = 0; i < w; i++)
276                     {
277                         const int* intptr = (const int*)bottom_blob + i * 8;
278                         signed char* ptr = (signed char*)top_blob + i * 8;
279 
280                         float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
281                         float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
282                         float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
283                         float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
284                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
285                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
286                         _v0 = vmulq_f32(_v0, _scale_in0);
287                         _v1 = vmulq_f32(_v1, _scale_in1);
288                         _v0 = activation_ps(_v0, activation_type, activation_params);
289                         _v1 = activation_ps(_v1, activation_type, activation_params);
290                         _v0 = vmulq_f32(_v0, _scale_out0);
291                         _v1 = vmulq_f32(_v1, _scale_out1);
292                         vst1_s8(ptr, float2int8(_v0, _v1));
293                     }
294                 }
295                 else if (bias_data_size == 1)
296                 {
297                     float32x4_t _bias = vdupq_n_f32(bias_data[0]);
298 
299                     #pragma omp parallel for num_threads(opt.num_threads)
300                     for (int i = 0; i < w; i++)
301                     {
302                         const int* intptr = (const int*)bottom_blob + i * 8;
303                         signed char* ptr = (signed char*)top_blob + i * 8;
304 
305                         float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
306                         float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
307                         float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
308                         float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
309                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
310                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
311                         _v0 = vmlaq_f32(_bias, _v0, _scale_in0);
312                         _v1 = vmlaq_f32(_bias, _v1, _scale_in1);
313                         _v0 = activation_ps(_v0, activation_type, activation_params);
314                         _v1 = activation_ps(_v1, activation_type, activation_params);
315                         _v0 = vmulq_f32(_v0, _scale_out0);
316                         _v1 = vmulq_f32(_v1, _scale_out1);
317                         vst1_s8(ptr, float2int8(_v0, _v1));
318                     }
319                 }
320                 else
321                 {
322                     #pragma omp parallel for num_threads(opt.num_threads)
323                     for (int i = 0; i < w; i++)
324                     {
325                         const int* intptr = (const int*)bottom_blob + i * 8;
326                         signed char* ptr = (signed char*)top_blob + i * 8;
327 
328                         float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
329                         float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
330                         float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
331                         float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
332                         float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
333                         float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
334                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
335                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
336                         _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
337                         _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
338                         _v0 = activation_ps(_v0, activation_type, activation_params);
339                         _v1 = activation_ps(_v1, activation_type, activation_params);
340                         _v0 = vmulq_f32(_v0, _scale_out0);
341                         _v1 = vmulq_f32(_v1, _scale_out1);
342                         vst1_s8(ptr, float2int8(_v0, _v1));
343                     }
344                 }
345             }
346         }
347 
348         if (dims == 2)
349         {
350             int w = bottom_blob.w;
351             int h = bottom_blob.h;
352 
353             top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
354             if (top_blob.empty())
355                 return -100;
356 
357             if (bias_data_size == 0)
358             {
359                 #pragma omp parallel for num_threads(opt.num_threads)
360                 for (int i = 0; i < h; i++)
361                 {
362                     const int* intptr = bottom_blob.row<const int>(i);
363                     signed char* ptr = top_blob.row<signed char>(i);
364 
365                     float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
366                     float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
367                     float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
368                     float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
369 
370                     for (int j = 0; j < w; j++)
371                     {
372                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
373                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
374                         _v0 = vmulq_f32(_v0, _scale_in0);
375                         _v1 = vmulq_f32(_v1, _scale_in1);
376                         _v0 = activation_ps(_v0, activation_type, activation_params);
377                         _v1 = activation_ps(_v1, activation_type, activation_params);
378                         _v0 = vmulq_f32(_v0, _scale_out0);
379                         _v1 = vmulq_f32(_v1, _scale_out1);
380                         vst1_s8(ptr, float2int8(_v0, _v1));
381 
382                         intptr += 8;
383                         ptr += 8;
384                     }
385                 }
386             }
387             else
388             {
389                 #pragma omp parallel for num_threads(opt.num_threads)
390                 for (int i = 0; i < h; i++)
391                 {
392                     const int* intptr = bottom_blob.row<const int>(i);
393                     signed char* ptr = top_blob.row<signed char>(i);
394 
395                     float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
396                     float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
397                     float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
398                     float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
399                     float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
400                     float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
401 
402                     for (int j = 0; j < w; j++)
403                     {
404                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
405                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
406                         _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
407                         _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
408                         _v0 = activation_ps(_v0, activation_type, activation_params);
409                         _v1 = activation_ps(_v1, activation_type, activation_params);
410                         _v0 = vmulq_f32(_v0, _scale_out0);
411                         _v1 = vmulq_f32(_v1, _scale_out1);
412                         vst1_s8(ptr, float2int8(_v0, _v1));
413 
414                         intptr += 8;
415                         ptr += 8;
416                     }
417                 }
418             }
419         }
420 
421         if (dims == 3)
422         {
423             int w = bottom_blob.w;
424             int h = bottom_blob.h;
425             int channels = bottom_blob.c;
426             int size = w * h;
427 
428             top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
429             if (top_blob.empty())
430                 return -100;
431 
432             if (activation_type == 1)
433             {
434                 requantize_relu_pack8_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt);
435                 return 0;
436             }
437 
438             if (activation_type == 2 && activation_params[0] > 0.f)
439             {
440                 requantize_leakyrelu_pack8_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt);
441                 return 0;
442             }
443 
444             if (bias_data_size == 0)
445             {
446                 #pragma omp parallel for num_threads(opt.num_threads)
447                 for (int q = 0; q < channels; q++)
448                 {
449                     const int* intptr = bottom_blob.channel(q);
450                     signed char* ptr = top_blob.channel(q);
451 
452                     float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
453                     float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
454                     float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
455                     float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);
456 
457                     for (int i = 0; i < size; i++)
458                     {
459                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
460                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
461                         _v0 = vmulq_f32(_v0, _scale_in0);
462                         _v1 = vmulq_f32(_v1, _scale_in1);
463                         _v0 = activation_ps(_v0, activation_type, activation_params);
464                         _v1 = activation_ps(_v1, activation_type, activation_params);
465                         _v0 = vmulq_f32(_v0, _scale_out0);
466                         _v1 = vmulq_f32(_v1, _scale_out1);
467                         vst1_s8(ptr, float2int8(_v0, _v1));
468 
469                         intptr += 8;
470                         ptr += 8;
471                     }
472                 }
473             }
474             else
475             {
476                 #pragma omp parallel for num_threads(opt.num_threads)
477                 for (int q = 0; q < channels; q++)
478                 {
479                     const int* intptr = bottom_blob.channel(q);
480                     signed char* ptr = top_blob.channel(q);
481 
482                     float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
483                     float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
484                     float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
485                     float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);
486                     float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8);
487                     float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4);
488 
489                     for (int i = 0; i < size; i++)
490                     {
491                         float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
492                         float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32((intptr + 4)));
493                         _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
494                         _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
495                         _v0 = activation_ps(_v0, activation_type, activation_params);
496                         _v1 = activation_ps(_v1, activation_type, activation_params);
497                         _v0 = vmulq_f32(_v0, _scale_out0);
498                         _v1 = vmulq_f32(_v1, _scale_out1);
499                         vst1_s8(ptr, float2int8(_v0, _v1));
500 
501                         intptr += 8;
502                         ptr += 8;
503                     }
504                 }
505             }
506         }
507 
508         return 0;
509     }
510 
511     if (elempack == 4)
512     {
        // dims == 1, elempack == 4: requantize a packed 1-d blob.
        // Each element computes  int8 = float2int8(activation(x * scale_in [+ bias]) * scale_out)
        // The code is specialized on (scale_in scalar vs per-element) x
        // (scale_out scalar vs per-element) x (no bias / scalar bias / per-element bias)
        // so that scalar parameters are broadcast once outside the parallel loop.
        if (dims == 1)
        {
            int w = bottom_blob.w;
            // Repack the output to elempack 8 when the total element count allows it.
            // The stores below write 4 consecutive bytes at a linear offset, which is
            // valid for both out_elempack == 8 and out_elempack == 1 layouts.
            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
            int outw = w * elempack / out_elempack;

            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            // Case 1: both scales are scalars — broadcast once.
            if (scale_in_data_size == 1 && scale_out_data_size == 1)
            {
                float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);
                float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        // int32 -> float, dequantize, activate, then requantize
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        // float2int8 (helper from arm_usability.h) narrows to int8 lanes;
                        // only the first 4 lanes of the duplicated vector are stored.
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        // fused bias + v * scale_in
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    // per-element bias (4 floats per pack)
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
            // Case 2: scalar scale_in, per-element scale_out.
            else if (scale_in_data_size == 1 && scale_out_data_size > 1)
            {
                float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
            // Case 3: per-element scale_in, scalar scale_out.
            else if (scale_in_data_size > 1 && scale_out_data_size == 1)
            {
                float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
            // Case 4: both scales per-element.
            else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmulq_f32(_v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else if (bias_data_size == 1)
                {
                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        float32x4_t _scale_in = vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                        _v = vmlaq_f32(_bias, _v, _scale_in);
                        _v = activation_ps(_v, activation_type, activation_params);
                        _v = vmulq_f32(_v, _scale_out);
                        int8x8_t v = float2int8(_v, _v);
                        ptr[0] = vget_lane_s8(v, 0);
                        ptr[1] = vget_lane_s8(v, 1);
                        ptr[2] = vget_lane_s8(v, 2);
                        ptr[3] = vget_lane_s8(v, 3);
                    }
                }
            }
        }
795 
        // dims == 2, elempack == 4: requantize row by row, repacking on the fly.
        // out_elempack == 8 fuses two consecutive pack4 input rows into one pack8
        // output row; out_elempack == 1 scatters each pack4 input row into 4 scalar
        // output rows. Per-row parameters are indexed by the output row (i * 8) or
        // the input row (i * 4) respectively.
        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
            int outh = h * elempack / out_elempack;

            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (out_elempack == 8)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outh; i++)
                    {
                        // two pack4 input rows feed one pack8 output row
                        const int* intptr0 = bottom_blob.row<const int>(i * 2);
                        const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
                        signed char* ptr = top_blob.row<signed char>(i);

                        // scalar params are splatted, per-row params are loaded at
                        // the pack8 row offset (i * 8 floats)
                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            _v0 = vmulq_f32(_v0, _scale_in0);
                            _v1 = vmulq_f32(_v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            // narrow both halves to one int8x8 and store 8 bytes
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outh; i++)
                    {
                        const int* intptr0 = bottom_blob.row<const int>(i * 2);
                        const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
                        signed char* ptr = top_blob.row<signed char>(i);

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            // fused bias + v * scale_in
                            _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                            _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
            }
            if (out_elempack == 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < h; i++)
                    {
                        // one pack4 input row scatters into 4 scalar output rows
                        const int* intptr = bottom_blob.row<const int>(i);
                        signed char* ptr0 = top_blob.row<signed char>(i * 4);
                        signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
                        signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
                        signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmulq_f32(_v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            // lane k of the pack goes to output row k
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < h; i++)
                    {
                        const int* intptr = bottom_blob.row<const int>(i);
                        signed char* ptr0 = top_blob.row<signed char>(i * 4);
                        signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
                        signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
                        signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + i * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + i * 4);
                        float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4);

                        for (int j = 0; j < w; j++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmlaq_f32(_bias, _v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
            }
        }
949 
        // dims == 3, elempack == 4: requantize channel by channel, repacking on
        // the fly exactly like the dims == 2 case but along the channel axis.
        // ReLU and positive-slope leaky-ReLU get dedicated fused kernels
        // (included at the top of this file) and return early.
        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;
            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
            int outc = channels * elempack / out_elempack;

            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            // fast path: fused requantize + relu
            if (activation_type == 1)
            {
                requantize_relu_pack4_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt);
                return 0;
            }

            // fast path: fused requantize + leakyrelu (positive slope only)
            if (activation_type == 2 && activation_params[0] > 0.f)
            {
                requantize_leakyrelu_pack4_neon(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt);
                return 0;
            }

            if (out_elempack == 8)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < outc; q++)
                    {
                        // two pack4 input channels feed one pack8 output channel
                        const int* intptr0 = bottom_blob.channel(q * 2);
                        const int* intptr1 = bottom_blob.channel(q * 2 + 1);
                        signed char* ptr = top_blob.channel(q);

                        // scalar params are splatted, per-channel params are loaded
                        // at the pack8 channel offset (q * 8 floats)
                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            _v0 = vmulq_f32(_v0, _scale_in0);
                            _v1 = vmulq_f32(_v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < outc; q++)
                    {
                        const int* intptr0 = bottom_blob.channel(q * 2);
                        const int* intptr1 = bottom_blob.channel(q * 2 + 1);
                        signed char* ptr = top_blob.channel(q);

                        float32x4_t _scale_in0 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8);
                        float32x4_t _scale_in1 = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 8 + 4);
                        float32x4_t _scale_out0 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8);
                        float32x4_t _scale_out1 = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 8 + 4);
                        float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8);
                        float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr0));
                            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr1));
                            // fused bias + v * scale_in
                            _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
                            _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
                            _v0 = activation_ps(_v0, activation_type, activation_params);
                            _v1 = activation_ps(_v1, activation_type, activation_params);
                            _v0 = vmulq_f32(_v0, _scale_out0);
                            _v1 = vmulq_f32(_v1, _scale_out1);
                            vst1_s8(ptr, float2int8(_v0, _v1));

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
            }
            if (out_elempack == 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < channels; q++)
                    {
                        // one pack4 input channel scatters into 4 scalar output channels
                        const int* intptr = bottom_blob.channel(q);
                        signed char* ptr0 = top_blob.channel(q * 4);
                        signed char* ptr1 = top_blob.channel(q * 4 + 1);
                        signed char* ptr2 = top_blob.channel(q * 4 + 2);
                        signed char* ptr3 = top_blob.channel(q * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmulq_f32(_v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            // lane k of the pack goes to output channel k
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < channels; q++)
                    {
                        const int* intptr = bottom_blob.channel(q);
                        signed char* ptr0 = top_blob.channel(q * 4);
                        signed char* ptr1 = top_blob.channel(q * 4 + 1);
                        signed char* ptr2 = top_blob.channel(q * 4 + 2);
                        signed char* ptr3 = top_blob.channel(q * 4 + 3);

                        float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4);
                        float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4);
                        float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4);

                        for (int i = 0; i < size; i++)
                        {
                            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
                            _v = vmlaq_f32(_bias, _v, _scale_in);
                            _v = activation_ps(_v, activation_type, activation_params);
                            _v = vmulq_f32(_v, _scale_out);
                            int8x8_t v = float2int8(_v, _v);
                            ptr0[0] = vget_lane_s8(v, 0);
                            ptr1[0] = vget_lane_s8(v, 1);
                            ptr2[0] = vget_lane_s8(v, 2);
                            ptr3[0] = vget_lane_s8(v, 3);

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
            }
        }
1117 
1118         return 0;
1119     }
1120 #endif // __ARM_NEON
1121 
1122     if (dims == 1)
1123     {
1124         int w = bottom_blob.w;
1125 
1126         top_blob.create(w, (size_t)1u, opt.blob_allocator);
1127         if (top_blob.empty())
1128             return -100;
1129 
1130         const int* intptr = bottom_blob;
1131         signed char* ptr = top_blob;
1132 
1133         if (scale_in_data_size == 1 && scale_out_data_size == 1)
1134         {
1135             const float scale_in = scale_in_data[0];
1136             const float scale_out = scale_out_data[0];
1137 
1138             if (bias_data_size == 0)
1139             {
1140                 #pragma omp parallel for num_threads(opt.num_threads)
1141                 for (int i = 0; i < w; i++)
1142                 {
1143                     float v = intptr[i] * scale_in;
1144                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1145                 }
1146             }
1147             else if (bias_data_size == 1)
1148             {
1149                 const float bias = bias_data[0];
1150 
1151                 #pragma omp parallel for num_threads(opt.num_threads)
1152                 for (int i = 0; i < w; i++)
1153                 {
1154                     float v = intptr[i] * scale_in + bias;
1155                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1156                 }
1157             }
1158             else
1159             {
1160                 #pragma omp parallel for num_threads(opt.num_threads)
1161                 for (int i = 0; i < w; i++)
1162                 {
1163                     float v = intptr[i] * scale_in + bias_data[i];
1164                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1165                 }
1166             }
1167         }
1168         else if (scale_in_data_size == 1 && scale_out_data_size > 1)
1169         {
1170             const float scale_in = scale_in_data[0];
1171 
1172             if (bias_data_size == 0)
1173             {
1174                 #pragma omp parallel for num_threads(opt.num_threads)
1175                 for (int i = 0; i < w; i++)
1176                 {
1177                     float v = intptr[i] * scale_in;
1178                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1179                 }
1180             }
1181             else if (bias_data_size == 1)
1182             {
1183                 const float bias = bias_data[0];
1184 
1185                 #pragma omp parallel for num_threads(opt.num_threads)
1186                 for (int i = 0; i < w; i++)
1187                 {
1188                     float v = intptr[i] * scale_in + bias;
1189                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1190                 }
1191             }
1192             else
1193             {
1194                 #pragma omp parallel for num_threads(opt.num_threads)
1195                 for (int i = 0; i < w; i++)
1196                 {
1197                     float v = intptr[i] * scale_in + bias_data[i];
1198                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1199                 }
1200             }
1201         }
1202         else if (scale_in_data_size > 1 && scale_out_data_size == 1)
1203         {
1204             const float scale_out = scale_out_data[0];
1205 
1206             if (bias_data_size == 0)
1207             {
1208                 #pragma omp parallel for num_threads(opt.num_threads)
1209                 for (int i = 0; i < w; i++)
1210                 {
1211                     float v = intptr[i] * scale_in_data[i];
1212                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1213                 }
1214             }
1215             else if (bias_data_size == 1)
1216             {
1217                 const float bias = bias_data[0];
1218 
1219                 #pragma omp parallel for num_threads(opt.num_threads)
1220                 for (int i = 0; i < w; i++)
1221                 {
1222                     float v = intptr[i] * scale_in_data[i] + bias;
1223                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1224                 }
1225             }
1226             else
1227             {
1228                 #pragma omp parallel for num_threads(opt.num_threads)
1229                 for (int i = 0; i < w; i++)
1230                 {
1231                     float v = intptr[i] * scale_in_data[i] + bias_data[i];
1232                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1233                 }
1234             }
1235         }
1236         else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
1237         {
1238             if (bias_data_size == 0)
1239             {
1240                 #pragma omp parallel for num_threads(opt.num_threads)
1241                 for (int i = 0; i < w; i++)
1242                 {
1243                     float v = intptr[i] * scale_in_data[i];
1244                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1245                 }
1246             }
1247             else if (bias_data_size == 1)
1248             {
1249                 const float bias = bias_data[0];
1250 
1251                 #pragma omp parallel for num_threads(opt.num_threads)
1252                 for (int i = 0; i < w; i++)
1253                 {
1254                     float v = intptr[i] * scale_in_data[i] + bias;
1255                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1256                 }
1257             }
1258             else
1259             {
1260                 #pragma omp parallel for num_threads(opt.num_threads)
1261                 for (int i = 0; i < w; i++)
1262                 {
1263                     float v = intptr[i] * scale_in_data[i] + bias_data[i];
1264                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1265                 }
1266             }
1267         }
1268     }
1269 
1270     if (dims == 2)
1271     {
1272         int w = bottom_blob.w;
1273         int h = bottom_blob.h;
1274 
1275         top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
1276         if (top_blob.empty())
1277             return -100;
1278 
1279         if (bias_data_size == 0)
1280         {
1281             #pragma omp parallel for num_threads(opt.num_threads)
1282             for (int i = 0; i < h; i++)
1283             {
1284                 const int* intptr = bottom_blob.row<const int>(i);
1285                 signed char* ptr = top_blob.row<signed char>(i);
1286 
1287                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
1288                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
1289 
1290                 for (int j = 0; j < w; j++)
1291                 {
1292                     float v = intptr[j] * scale_in;
1293                     ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1294                 }
1295             }
1296         }
1297         else
1298         {
1299             #pragma omp parallel for num_threads(opt.num_threads)
1300             for (int i = 0; i < h; i++)
1301             {
1302                 const int* intptr = bottom_blob.row<const int>(i);
1303                 signed char* ptr = top_blob.row<signed char>(i);
1304 
1305                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
1306                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
1307                 const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];
1308 
1309                 for (int j = 0; j < w; j++)
1310                 {
1311                     float v = intptr[j] * scale_in + bias;
1312                     ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1313                 }
1314             }
1315         }
1316     }
1317 
1318     if (dims == 3)
1319     {
1320         int w = bottom_blob.w;
1321         int h = bottom_blob.h;
1322         int channels = bottom_blob.c;
1323         int size = w * h;
1324 
1325         top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
1326         if (top_blob.empty())
1327             return -100;
1328 
1329         if (bias_data_size == 0)
1330         {
1331             #pragma omp parallel for num_threads(opt.num_threads)
1332             for (int q = 0; q < channels; q++)
1333             {
1334                 const int* intptr = bottom_blob.channel(q);
1335                 signed char* ptr = top_blob.channel(q);
1336 
1337                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
1338                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
1339 
1340                 for (int i = 0; i < size; i++)
1341                 {
1342                     float v = intptr[i] * scale_in;
1343                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1344                 }
1345             }
1346         }
1347         else
1348         {
1349             #pragma omp parallel for num_threads(opt.num_threads)
1350             for (int q = 0; q < channels; q++)
1351             {
1352                 const int* intptr = bottom_blob.channel(q);
1353                 signed char* ptr = top_blob.channel(q);
1354 
1355                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
1356                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
1357                 const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];
1358 
1359                 for (int i = 0; i < size; i++)
1360                 {
1361                     float v = intptr[i] * scale_in + bias;
1362                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1363                 }
1364             }
1365         }
1366     }
1367 
1368     return 0;
1369 }
1370 
1371 } // namespace ncnn
1372