1 // BUG1989 is pleased to support the open source community by supporting ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
convdw3x3s1_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)15 static void convdw3x3s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
16 {
17 int w = bottom_blob.w;
18 //int h = bottom_blob.h;
19 //int inch = bottom_blob.c;
20
21 int outw = top_blob.w;
22 int outh = top_blob.h;
23 int outch = top_blob.c;
24
25 const signed char* kernel = _kernel;
26
27 #pragma omp parallel for num_threads(opt.num_threads)
28 for (int p = 0; p < outch; p++)
29 {
30 Mat out = top_blob.channel(p);
31
32 out.fill(0);
33
34 const signed char* kernel0 = (const signed char*)kernel + p * 9;
35
36 int* outptr = out;
37
38 const signed char* img0 = bottom_blob.channel(p);
39
40 const signed char* r0 = img0;
41 const signed char* r1 = img0 + w;
42 const signed char* r2 = img0 + w * 2;
43
44 int i = 0;
45 for (; i < outh; i++)
46 {
47 int remain = outw;
48
49 for (; remain > 0; remain--)
50 {
51 int sum = 0;
52
53 sum += (int)r0[0] * (int)kernel0[0];
54 sum += (int)r0[1] * (int)kernel0[1];
55 sum += (int)r0[2] * (int)kernel0[2];
56 sum += (int)r1[0] * (int)kernel0[3];
57 sum += (int)r1[1] * (int)kernel0[4];
58 sum += (int)r1[2] * (int)kernel0[5];
59 sum += (int)r2[0] * (int)kernel0[6];
60 sum += (int)r2[1] * (int)kernel0[7];
61 sum += (int)r2[2] * (int)kernel0[8];
62
63 *outptr += sum;
64
65 r0++;
66 r1++;
67 r2++;
68 outptr++;
69 }
70
71 r0 += 2;
72 r1 += 2;
73 r2 += 2;
74 }
75 }
76 }
77
convdw3x3s2_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)78 static void convdw3x3s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
79 {
80 int w = bottom_blob.w;
81 //int h = bottom_blob.h;
82 //int inch = bottom_blob.c;
83
84 int outw = top_blob.w;
85 int outh = top_blob.h;
86 int outch = top_blob.c;
87
88 const int tailstep = w - 2 * outw + w;
89
90 const signed char* kernel = _kernel;
91
92 #pragma omp parallel for num_threads(opt.num_threads)
93 for (int p = 0; p < outch; p++)
94 {
95 Mat out = top_blob.channel(p);
96 out.fill(0);
97
98 const signed char* kernel0 = (const signed char*)kernel + p * 9;
99
100 int* outptr = out;
101
102 const signed char* img0 = bottom_blob.channel(p);
103
104 const signed char* r0 = img0;
105 const signed char* r1 = img0 + w;
106 const signed char* r2 = img0 + w * 2;
107
108 int i = 0;
109
110 for (; i < outh; i++)
111 {
112 int remain = outw;
113
114 for (; remain > 0; remain--)
115 {
116 int sum = 0;
117
118 sum += (int)r0[0] * (int)kernel0[0];
119 sum += (int)r0[1] * (int)kernel0[1];
120 sum += (int)r0[2] * (int)kernel0[2];
121 sum += (int)r1[0] * (int)kernel0[3];
122 sum += (int)r1[1] * (int)kernel0[4];
123 sum += (int)r1[2] * (int)kernel0[5];
124 sum += (int)r2[0] * (int)kernel0[6];
125 sum += (int)r2[1] * (int)kernel0[7];
126 sum += (int)r2[2] * (int)kernel0[8];
127
128 *outptr += sum;
129
130 r0 += 2;
131 r1 += 2;
132 r2 += 2;
133 outptr++;
134 }
135
136 r0 += tailstep;
137 r1 += tailstep;
138 r2 += tailstep;
139 }
140 }
141 }
142
convdw3x3s1_int8_dequant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,std::vector<float> scales_dequant,const Option & opt)143 static void convdw3x3s1_int8_dequant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, std::vector<float> scales_dequant, const Option& opt)
144 {
145 int w = bottom_blob.w;
146 //int h = bottom_blob.h;
147 //int inch = bottom_blob.c;
148
149 int outw = top_blob.w;
150 int outh = top_blob.h;
151 int outch = top_blob.c;
152
153 const signed char* kernel = _kernel;
154 const float* bias = _bias;
155
156 #pragma omp parallel for num_threads(opt.num_threads)
157 for (int p = 0; p < outch; p++)
158 {
159 Mat out = top_blob.channel(p);
160 float* outptr = out;
161
162 const float bias0 = bias ? bias[p] : 0.f;
163 const float scale_dequant = scales_dequant[p];
164
165 out.fill(bias0);
166
167 const signed char* kernel0 = (const signed char*)kernel + p * 9;
168
169 const signed char* img0 = bottom_blob.channel(p);
170 const signed char* r0 = img0;
171 const signed char* r1 = img0 + w;
172 const signed char* r2 = img0 + w * 2;
173
174 int i = 0;
175 for (; i < outh; i++)
176 {
177 int remain = outw;
178
179 for (; remain > 0; remain--)
180 {
181 int sum = 0;
182
183 sum += (int)r0[0] * (int)kernel0[0];
184 sum += (int)r0[1] * (int)kernel0[1];
185 sum += (int)r0[2] * (int)kernel0[2];
186 sum += (int)r1[0] * (int)kernel0[3];
187 sum += (int)r1[1] * (int)kernel0[4];
188 sum += (int)r1[2] * (int)kernel0[5];
189 sum += (int)r2[0] * (int)kernel0[6];
190 sum += (int)r2[1] * (int)kernel0[7];
191 sum += (int)r2[2] * (int)kernel0[8];
192
193 *outptr += (float)sum * scale_dequant;
194
195 r0++;
196 r1++;
197 r2++;
198 outptr++;
199 }
200
201 r0 += 2;
202 r1 += 2;
203 r2 += 2;
204 }
205 }
206 }
207
convdw3x3s2_int8_dequant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,std::vector<float> scales_dequant,const Option & opt)208 static void convdw3x3s2_int8_dequant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, std::vector<float> scales_dequant, const Option& opt)
209 {
210 int w = bottom_blob.w;
211 //int h = bottom_blob.h;
212 //int inch = bottom_blob.c;
213
214 int outw = top_blob.w;
215 int outh = top_blob.h;
216 int outch = top_blob.c;
217
218 const int tailstep = w - 2 * outw + w;
219
220 const signed char* kernel = _kernel;
221 const float* bias = _bias;
222
223 #pragma omp parallel for num_threads(opt.num_threads)
224 for (int p = 0; p < outch; p++)
225 {
226 Mat out = top_blob.channel(p);
227 float* outptr = out;
228
229 const float bias0 = bias ? bias[p] : 0.f;
230 const float scale_dequant = scales_dequant[p];
231
232 out.fill(bias0);
233
234 const signed char* kernel0 = (const signed char*)kernel + p * 9;
235
236 const signed char* img0 = bottom_blob.channel(p);
237 const signed char* r0 = img0;
238 const signed char* r1 = img0 + w;
239 const signed char* r2 = img0 + w * 2;
240
241 int i = 0;
242
243 for (; i < outh; i++)
244 {
245 int remain = outw;
246
247 for (; remain > 0; remain--)
248 {
249 int sum = 0;
250
251 sum += (int)r0[0] * (int)kernel0[0];
252 sum += (int)r0[1] * (int)kernel0[1];
253 sum += (int)r0[2] * (int)kernel0[2];
254 sum += (int)r1[0] * (int)kernel0[3];
255 sum += (int)r1[1] * (int)kernel0[4];
256 sum += (int)r1[2] * (int)kernel0[5];
257 sum += (int)r2[0] * (int)kernel0[6];
258 sum += (int)r2[1] * (int)kernel0[7];
259 sum += (int)r2[2] * (int)kernel0[8];
260
261 *outptr += (float)sum * scale_dequant;
262
263 r0 += 2;
264 r1 += 2;
265 r2 += 2;
266 outptr++;
267 }
268
269 r0 += tailstep;
270 r1 += tailstep;
271 r2 += tailstep;
272 }
273 }
274 }
275
convdw3x3s1_int8_requant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,std::vector<float> scales_requant,const Option & opt)276 static void convdw3x3s1_int8_requant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, std::vector<float> scales_requant, const Option& opt)
277 {
278 int w = bottom_blob.w;
279 //int h = bottom_blob.h;
280 //int inch = bottom_blob.c;
281
282 int outw = top_blob.w;
283 int outh = top_blob.h;
284 int outch = top_blob.c;
285
286 const signed char* kernel = _kernel;
287 const float* bias = _bias;
288
289 #pragma omp parallel for num_threads(opt.num_threads)
290 for (int p = 0; p < outch; p++)
291 {
292 Mat out = top_blob.channel(p);
293 signed char* outptr = out;
294
295 const float bias0 = bias ? bias[p] : 0.f;
296 const float scale_requant_in = scales_requant[2 * p];
297 const float scale_requant_out = scales_requant[2 * p + 1];
298
299 const signed char* kernel0 = (const signed char*)kernel + p * 9;
300
301 const signed char* img0 = bottom_blob.channel(p);
302 const signed char* r0 = img0;
303 const signed char* r1 = img0 + w;
304 const signed char* r2 = img0 + w * 2;
305
306 int i = 0;
307 for (; i < outh; i++)
308 {
309 int remain = outw;
310
311 for (; remain > 0; remain--)
312 {
313 int sum = 0;
314
315 sum += (int)r0[0] * (int)kernel0[0];
316 sum += (int)r0[1] * (int)kernel0[1];
317 sum += (int)r0[2] * (int)kernel0[2];
318 sum += (int)r1[0] * (int)kernel0[3];
319 sum += (int)r1[1] * (int)kernel0[4];
320 sum += (int)r1[2] * (int)kernel0[5];
321 sum += (int)r2[0] * (int)kernel0[6];
322 sum += (int)r2[1] * (int)kernel0[7];
323 sum += (int)r2[2] * (int)kernel0[8];
324
325 *outptr = float2int8(((float)sum * scale_requant_in + bias0) * scale_requant_out);
326
327 r0++;
328 r1++;
329 r2++;
330 outptr++;
331 }
332
333 r0 += 2;
334 r1 += 2;
335 r2 += 2;
336 }
337 }
338 }
339
convdw3x3s2_int8_requant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,std::vector<float> scales_requant,const Option & opt)340 static void convdw3x3s2_int8_requant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, std::vector<float> scales_requant, const Option& opt)
341 {
342 int w = bottom_blob.w;
343 //int h = bottom_blob.h;
344 //int inch = bottom_blob.c;
345
346 int outw = top_blob.w;
347 int outh = top_blob.h;
348 int outch = top_blob.c;
349
350 const int tailstep = w - 2 * outw + w;
351
352 const signed char* kernel = _kernel;
353 const float* bias = _bias;
354
355 #pragma omp parallel for num_threads(opt.num_threads)
356 for (int p = 0; p < outch; p++)
357 {
358 Mat out = top_blob.channel(p);
359 signed char* outptr = out;
360
361 const float bias0 = bias ? bias[p] : 0.f;
362 const float scale_requant_in = scales_requant[2 * p];
363 const float scale_requant_out = scales_requant[2 * p + 1];
364
365 const signed char* kernel0 = (const signed char*)kernel + p * 9;
366
367 const signed char* img0 = bottom_blob.channel(p);
368 const signed char* r0 = img0;
369 const signed char* r1 = img0 + w;
370 const signed char* r2 = img0 + w * 2;
371
372 int i = 0;
373
374 for (; i < outh; i++)
375 {
376 int remain = outw;
377
378 for (; remain > 0; remain--)
379 {
380 int sum = 0;
381
382 sum += (int)r0[0] * (int)kernel0[0];
383 sum += (int)r0[1] * (int)kernel0[1];
384 sum += (int)r0[2] * (int)kernel0[2];
385 sum += (int)r1[0] * (int)kernel0[3];
386 sum += (int)r1[1] * (int)kernel0[4];
387 sum += (int)r1[2] * (int)kernel0[5];
388 sum += (int)r2[0] * (int)kernel0[6];
389 sum += (int)r2[1] * (int)kernel0[7];
390 sum += (int)r2[2] * (int)kernel0[8];
391
392 *outptr = float2int8(((float)sum * scale_requant_in + bias0) * scale_requant_out);
393
394 r0 += 2;
395 r1 += 2;
396 r2 += 2;
397 outptr++;
398 }
399
400 r0 += tailstep;
401 r1 += tailstep;
402 r2 += tailstep;
403 }
404 }
405 }