// BUG1989 is pleased to support the open source community by supporting ncnn.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
conv1x1s1_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)15 static void conv1x1s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
16 {
17 int inch = bottom_blob.c;
18
19 int outw = top_blob.w;
20 int outh = top_blob.h;
21 int outch = top_blob.c;
22
23 const float* kernel = _kernel;
24
25 #pragma omp parallel for num_threads(opt.num_threads)
26 for (int p = 0; p < outch; p++)
27 {
28 Mat out0 = top_blob.channel(p);
29
30 out0.fill(0);
31
32 int q = 0;
33
34 for (; q + 7 < inch; q += 8)
35 {
36 int* outptr0 = out0;
37
38 const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
39
40 const signed char* r0 = bottom_blob.channel(q);
41 const signed char* r1 = bottom_blob.channel(q + 1);
42 const signed char* r2 = bottom_blob.channel(q + 2);
43 const signed char* r3 = bottom_blob.channel(q + 3);
44 const signed char* r4 = bottom_blob.channel(q + 4);
45 const signed char* r5 = bottom_blob.channel(q + 5);
46 const signed char* r6 = bottom_blob.channel(q + 6);
47 const signed char* r7 = bottom_blob.channel(q + 7);
48
49 int size = outw * outh;
50 int remain = size;
51
52 for (; remain > 0; remain--)
53 {
54 //ToDo Neon
55 int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
56
57 *outptr0 += sum0;
58
59 r0++;
60 r1++;
61 r2++;
62 r3++;
63 r4++;
64 r5++;
65 r6++;
66 r7++;
67 outptr0++;
68 }
69 }
70
71 for (; q < inch; q++)
72 {
73 int* outptr0 = out0;
74
75 const signed char* r0 = bottom_blob.channel(q);
76
77 const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
78 const signed char k0 = kernel0[0];
79
80 int size = outw * outh;
81 int remain = size;
82
83 for (; remain > 0; remain--)
84 {
85 int sum0 = (int)(*r0) * (int)k0;
86
87 *outptr0 += sum0;
88
89 r0++;
90 outptr0++;
91 }
92 }
93 }
94 }
95
conv1x1s2_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)96 static void conv1x1s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
97 {
98 int w = bottom_blob.w;
99 int inch = bottom_blob.c;
100
101 int outw = top_blob.w;
102 int outh = top_blob.h;
103 int outch = top_blob.c;
104
105 const int tailstep = w - 2 * outw + w;
106 const signed char* kernel = _kernel;
107
108 #pragma omp parallel for num_threads(opt.num_threads)
109 for (int p = 0; p < outch; p++)
110 {
111 Mat out0 = top_blob.channel(p);
112
113 out0.fill(0);
114
115 int q = 0;
116
117 for (; q + 7 < inch; q += 8)
118 {
119 int* outptr0 = out0;
120
121 const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
122
123 const signed char* r0 = bottom_blob.channel(q);
124 const signed char* r1 = bottom_blob.channel(q + 1);
125 const signed char* r2 = bottom_blob.channel(q + 2);
126 const signed char* r3 = bottom_blob.channel(q + 3);
127 const signed char* r4 = bottom_blob.channel(q + 4);
128 const signed char* r5 = bottom_blob.channel(q + 5);
129 const signed char* r6 = bottom_blob.channel(q + 6);
130 const signed char* r7 = bottom_blob.channel(q + 7);
131
132 for (int i = 0; i < outh; i++)
133 {
134 int remain = outw;
135
136 for (; remain > 0; remain--)
137 {
138 //ToDo Neon
139 int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
140
141 *outptr0 += sum0;
142
143 r0 += 2;
144 r1 += 2;
145 r2 += 2;
146 r3 += 2;
147 r4 += 2;
148 r5 += 2;
149 r6 += 2;
150 r7 += 2;
151 outptr0++;
152 }
153
154 r0 += tailstep;
155 r1 += tailstep;
156 r2 += tailstep;
157 r3 += tailstep;
158 r4 += tailstep;
159 r5 += tailstep;
160 r6 += tailstep;
161 r7 += tailstep;
162 }
163 }
164
165 for (; q < inch; q++)
166 {
167 int* outptr0 = out0;
168
169 const signed char* r0 = bottom_blob.channel(q);
170
171 const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
172
173 for (int i = 0; i < outh; i++)
174 {
175 int remain = outw;
176
177 for (; remain > 0; remain--)
178 {
179 //ToDo Neon
180 int sum0 = (int)*r0 * (int)kernel0[0];
181
182 *outptr0 += sum0;
183
184 r0 += 2;
185 outptr0++;
186 }
187
188 r0 += tailstep;
189 }
190 }
191 }
192 }
193