1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv1x1s1_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)15 static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
16 {
17     int inch = bottom_blob.c;
18 
19     int outw = top_blob.w;
20     int outh = top_blob.h;
21     int outch = top_blob.c;
22 
23     const float* kernel = _kernel;
24     const float* bias = _bias;
25 
26     #pragma omp parallel for num_threads(opt.num_threads)
27     for (int p = 0; p < outch; p++)
28     {
29         Mat out = top_blob.channel(p);
30 
31         const float bias0 = bias ? bias[p] : 0.f;
32 
33         out.fill(bias0);
34 
35         int q = 0;
36 
37         for (; q + 3 < inch; q += 4)
38         {
39             float* outptr = out;
40 
41             const float* img0 = bottom_blob.channel(q);
42             const float* img1 = bottom_blob.channel(q + 1);
43             const float* img2 = bottom_blob.channel(q + 2);
44             const float* img3 = bottom_blob.channel(q + 3);
45 
46             const float* kernel0 = kernel + p * inch + q;
47             const float k0 = kernel0[0];
48             const float k1 = kernel0[1];
49             const float k2 = kernel0[2];
50             const float k3 = kernel0[3];
51 
52             const float* r0 = img0;
53             const float* r1 = img1;
54             const float* r2 = img2;
55             const float* r3 = img3;
56 
57             int size = outw * outh;
58 
59             int remain = size;
60 
61             for (; remain > 0; remain--)
62             {
63                 float sum = *r0 * k0;
64                 float sum1 = *r1 * k1;
65                 float sum2 = *r2 * k2;
66                 float sum3 = *r3 * k3;
67 
68                 *outptr += sum + sum1 + sum2 + sum3;
69 
70                 r0++;
71                 r1++;
72                 r2++;
73                 r3++;
74                 outptr++;
75             }
76         }
77 
78         for (; q < inch; q++)
79         {
80             float* outptr = out;
81 
82             const float* img0 = bottom_blob.channel(q);
83 
84             const float* kernel0 = kernel + p * inch + q;
85             const float k0 = kernel0[0];
86 
87             const float* r0 = img0;
88 
89             int size = outw * outh;
90 
91             int remain = size;
92 
93             for (; remain > 0; remain--)
94             {
95                 float sum = *r0 * k0;
96 
97                 *outptr += sum;
98 
99                 r0++;
100                 outptr++;
101             }
102         }
103     }
104 }
105 
conv1x1s2_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)106 static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
107 {
108     int w = bottom_blob.w;
109     int inch = bottom_blob.c;
110 
111     int outw = top_blob.w;
112     int outh = top_blob.h;
113     int outch = top_blob.c;
114 
115     const int tailstep = w - 2 * outw + w;
116 
117     const float* kernel = _kernel;
118     const float* bias = _bias;
119 
120     #pragma omp parallel for num_threads(opt.num_threads)
121     for (int p = 0; p < outch; p++)
122     {
123         Mat out = top_blob.channel(p);
124 
125         const float bias0 = bias ? bias[p] : 0.f;
126 
127         out.fill(bias0);
128 
129         int q = 0;
130 
131         for (; q + 3 < inch; q += 4)
132         {
133             float* outptr = out;
134 
135             const float* img0 = bottom_blob.channel(q);
136             const float* img1 = bottom_blob.channel(q + 1);
137             const float* img2 = bottom_blob.channel(q + 2);
138             const float* img3 = bottom_blob.channel(q + 3);
139 
140             const float* kernel0 = kernel + p * inch + q;
141             const float k0 = kernel0[0];
142             const float k1 = kernel0[1];
143             const float k2 = kernel0[2];
144             const float k3 = kernel0[3];
145 
146             const float* r0 = img0;
147             const float* r1 = img1;
148             const float* r2 = img2;
149             const float* r3 = img3;
150 
151             for (int i = 0; i < outh; i++)
152             {
153                 int remain = outw;
154 
155                 for (; remain > 0; remain--)
156                 {
157                     float sum = *r0 * k0;
158                     float sum1 = *r1 * k1;
159                     float sum2 = *r2 * k2;
160                     float sum3 = *r3 * k3;
161 
162                     *outptr += sum + sum1 + sum2 + sum3;
163 
164                     r0 += 2;
165                     r1 += 2;
166                     r2 += 2;
167                     r3 += 2;
168                     outptr++;
169                 }
170 
171                 r0 += tailstep;
172                 r1 += tailstep;
173                 r2 += tailstep;
174                 r3 += tailstep;
175             }
176         }
177 
178         for (; q < inch; q++)
179         {
180             float* outptr = out;
181 
182             const float* img0 = bottom_blob.channel(q);
183 
184             const float* kernel0 = kernel + p * inch + q;
185             const float k0 = kernel0[0];
186 
187             const float* r0 = img0;
188 
189             for (int i = 0; i < outh; i++)
190             {
191                 int remain = outw;
192 
193                 for (; remain > 0; remain--)
194                 {
195                     float sum = *r0 * k0;
196 
197                     *outptr += sum;
198 
199                     r0 += 2;
200                     outptr++;
201                 }
202 
203                 r0 += tailstep;
204             }
205         }
206     }
207 }
208