1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
conv1x1s1_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)15 static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
16 {
17 int inch = bottom_blob.c;
18
19 int outw = top_blob.w;
20 int outh = top_blob.h;
21 int outch = top_blob.c;
22
23 const float* kernel = _kernel;
24 const float* bias = _bias;
25
26 #pragma omp parallel for num_threads(opt.num_threads)
27 for (int p = 0; p < outch; p++)
28 {
29 Mat out = top_blob.channel(p);
30
31 const float bias0 = bias ? bias[p] : 0.f;
32
33 out.fill(bias0);
34
35 int q = 0;
36
37 for (; q + 3 < inch; q += 4)
38 {
39 float* outptr = out;
40
41 const float* img0 = bottom_blob.channel(q);
42 const float* img1 = bottom_blob.channel(q + 1);
43 const float* img2 = bottom_blob.channel(q + 2);
44 const float* img3 = bottom_blob.channel(q + 3);
45
46 const float* kernel0 = kernel + p * inch + q;
47 const float k0 = kernel0[0];
48 const float k1 = kernel0[1];
49 const float k2 = kernel0[2];
50 const float k3 = kernel0[3];
51
52 const float* r0 = img0;
53 const float* r1 = img1;
54 const float* r2 = img2;
55 const float* r3 = img3;
56
57 int size = outw * outh;
58
59 int remain = size;
60
61 for (; remain > 0; remain--)
62 {
63 float sum = *r0 * k0;
64 float sum1 = *r1 * k1;
65 float sum2 = *r2 * k2;
66 float sum3 = *r3 * k3;
67
68 *outptr += sum + sum1 + sum2 + sum3;
69
70 r0++;
71 r1++;
72 r2++;
73 r3++;
74 outptr++;
75 }
76 }
77
78 for (; q < inch; q++)
79 {
80 float* outptr = out;
81
82 const float* img0 = bottom_blob.channel(q);
83
84 const float* kernel0 = kernel + p * inch + q;
85 const float k0 = kernel0[0];
86
87 const float* r0 = img0;
88
89 int size = outw * outh;
90
91 int remain = size;
92
93 for (; remain > 0; remain--)
94 {
95 float sum = *r0 * k0;
96
97 *outptr += sum;
98
99 r0++;
100 outptr++;
101 }
102 }
103 }
104 }
105
conv1x1s2_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)106 static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
107 {
108 int w = bottom_blob.w;
109 int inch = bottom_blob.c;
110
111 int outw = top_blob.w;
112 int outh = top_blob.h;
113 int outch = top_blob.c;
114
115 const int tailstep = w - 2 * outw + w;
116
117 const float* kernel = _kernel;
118 const float* bias = _bias;
119
120 #pragma omp parallel for num_threads(opt.num_threads)
121 for (int p = 0; p < outch; p++)
122 {
123 Mat out = top_blob.channel(p);
124
125 const float bias0 = bias ? bias[p] : 0.f;
126
127 out.fill(bias0);
128
129 int q = 0;
130
131 for (; q + 3 < inch; q += 4)
132 {
133 float* outptr = out;
134
135 const float* img0 = bottom_blob.channel(q);
136 const float* img1 = bottom_blob.channel(q + 1);
137 const float* img2 = bottom_blob.channel(q + 2);
138 const float* img3 = bottom_blob.channel(q + 3);
139
140 const float* kernel0 = kernel + p * inch + q;
141 const float k0 = kernel0[0];
142 const float k1 = kernel0[1];
143 const float k2 = kernel0[2];
144 const float k3 = kernel0[3];
145
146 const float* r0 = img0;
147 const float* r1 = img1;
148 const float* r2 = img2;
149 const float* r3 = img3;
150
151 for (int i = 0; i < outh; i++)
152 {
153 int remain = outw;
154
155 for (; remain > 0; remain--)
156 {
157 float sum = *r0 * k0;
158 float sum1 = *r1 * k1;
159 float sum2 = *r2 * k2;
160 float sum3 = *r3 * k3;
161
162 *outptr += sum + sum1 + sum2 + sum3;
163
164 r0 += 2;
165 r1 += 2;
166 r2 += 2;
167 r3 += 2;
168 outptr++;
169 }
170
171 r0 += tailstep;
172 r1 += tailstep;
173 r2 += tailstep;
174 r3 += tailstep;
175 }
176 }
177
178 for (; q < inch; q++)
179 {
180 float* outptr = out;
181
182 const float* img0 = bottom_blob.channel(q);
183
184 const float* kernel0 = kernel + p * inch + q;
185 const float k0 = kernel0[0];
186
187 const float* r0 = img0;
188
189 for (int i = 0; i < outh; i++)
190 {
191 int remain = outw;
192
193 for (; remain > 0; remain--)
194 {
195 float sum = *r0 * k0;
196
197 *outptr += sum;
198
199 r0 += 2;
200 outptr++;
201 }
202
203 r0 += tailstep;
204 }
205 }
206 }
207 }
208