1 // BUG1989 is pleased to support the open source community by supporting ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv1x1s1_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)15 static void conv1x1s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
16 {
17     int inch = bottom_blob.c;
18 
19     int outw = top_blob.w;
20     int outh = top_blob.h;
21     int outch = top_blob.c;
22 
23     const float* kernel = _kernel;
24 
25     #pragma omp parallel for num_threads(opt.num_threads)
26     for (int p = 0; p < outch; p++)
27     {
28         Mat out0 = top_blob.channel(p);
29 
30         out0.fill(0);
31 
32         int q = 0;
33 
34         for (; q + 7 < inch; q += 8)
35         {
36             int* outptr0 = out0;
37 
38             const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
39 
40             const signed char* r0 = bottom_blob.channel(q);
41             const signed char* r1 = bottom_blob.channel(q + 1);
42             const signed char* r2 = bottom_blob.channel(q + 2);
43             const signed char* r3 = bottom_blob.channel(q + 3);
44             const signed char* r4 = bottom_blob.channel(q + 4);
45             const signed char* r5 = bottom_blob.channel(q + 5);
46             const signed char* r6 = bottom_blob.channel(q + 6);
47             const signed char* r7 = bottom_blob.channel(q + 7);
48 
49             int size = outw * outh;
50             int remain = size;
51 
52             for (; remain > 0; remain--)
53             {
54                 //ToDo Neon
55                 int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
56 
57                 *outptr0 += sum0;
58 
59                 r0++;
60                 r1++;
61                 r2++;
62                 r3++;
63                 r4++;
64                 r5++;
65                 r6++;
66                 r7++;
67                 outptr0++;
68             }
69         }
70 
71         for (; q < inch; q++)
72         {
73             int* outptr0 = out0;
74 
75             const signed char* r0 = bottom_blob.channel(q);
76 
77             const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
78             const signed char k0 = kernel0[0];
79 
80             int size = outw * outh;
81             int remain = size;
82 
83             for (; remain > 0; remain--)
84             {
85                 int sum0 = (int)(*r0) * (int)k0;
86 
87                 *outptr0 += sum0;
88 
89                 r0++;
90                 outptr0++;
91             }
92         }
93     }
94 }
95 
conv1x1s2_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)96 static void conv1x1s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
97 {
98     int w = bottom_blob.w;
99     int inch = bottom_blob.c;
100 
101     int outw = top_blob.w;
102     int outh = top_blob.h;
103     int outch = top_blob.c;
104 
105     const int tailstep = w - 2 * outw + w;
106     const signed char* kernel = _kernel;
107 
108     #pragma omp parallel for num_threads(opt.num_threads)
109     for (int p = 0; p < outch; p++)
110     {
111         Mat out0 = top_blob.channel(p);
112 
113         out0.fill(0);
114 
115         int q = 0;
116 
117         for (; q + 7 < inch; q += 8)
118         {
119             int* outptr0 = out0;
120 
121             const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
122 
123             const signed char* r0 = bottom_blob.channel(q);
124             const signed char* r1 = bottom_blob.channel(q + 1);
125             const signed char* r2 = bottom_blob.channel(q + 2);
126             const signed char* r3 = bottom_blob.channel(q + 3);
127             const signed char* r4 = bottom_blob.channel(q + 4);
128             const signed char* r5 = bottom_blob.channel(q + 5);
129             const signed char* r6 = bottom_blob.channel(q + 6);
130             const signed char* r7 = bottom_blob.channel(q + 7);
131 
132             for (int i = 0; i < outh; i++)
133             {
134                 int remain = outw;
135 
136                 for (; remain > 0; remain--)
137                 {
138                     //ToDo Neon
139                     int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
140 
141                     *outptr0 += sum0;
142 
143                     r0 += 2;
144                     r1 += 2;
145                     r2 += 2;
146                     r3 += 2;
147                     r4 += 2;
148                     r5 += 2;
149                     r6 += 2;
150                     r7 += 2;
151                     outptr0++;
152                 }
153 
154                 r0 += tailstep;
155                 r1 += tailstep;
156                 r2 += tailstep;
157                 r3 += tailstep;
158                 r4 += tailstep;
159                 r5 += tailstep;
160                 r6 += tailstep;
161                 r7 += tailstep;
162             }
163         }
164 
165         for (; q < inch; q++)
166         {
167             int* outptr0 = out0;
168 
169             const signed char* r0 = bottom_blob.channel(q);
170 
171             const signed char* kernel0 = (const signed char*)kernel + p * inch + q;
172 
173             for (int i = 0; i < outh; i++)
174             {
175                 int remain = outw;
176 
177                 for (; remain > 0; remain--)
178                 {
179                     //ToDo Neon
180                     int sum0 = (int)*r0 * (int)kernel0[0];
181 
182                     *outptr0 += sum0;
183 
184                     r0 += 2;
185                     outptr0++;
186                 }
187 
188                 r0 += tailstep;
189             }
190         }
191     }
192 }
193