// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

conv3x3s1_pack1to8_avx(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void conv3x3s1_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17 int inch = bottom_blob.c;
18 int outw = top_blob.w;
19 int outh = top_blob.h;
20 int outch = top_blob.c;
21 const float* bias = _bias;
22
23 int nn_outch = outch >> 1;
24 int remain_outch_start = nn_outch << 1;
25
26 #pragma omp parallel for num_threads(opt.num_threads)
27 for (int pp = 0; pp < nn_outch; pp++)
28 {
29 int p = pp * 2;
30
31 Mat out0 = top_blob.channel(p);
32 Mat out1 = top_blob.channel(p + 1);
33
34 __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
35 __m256 _bias1 = bias ? _mm256_loadu_ps((const float*)bias + (p + 1) * 8) : _mm256_set1_ps(0.f);
36 out0.fill(_bias0);
37 out1.fill(_bias1);
38
39 const float* k0 = kernel.channel(p);
40 const float* k1 = kernel.channel(p + 1);
41
42 for (int q = 0; q < inch; q++)
43 {
44 float* outptr0 = out0;
45 float* outptr1 = out1;
46
47 const Mat img0 = bottom_blob.channel(q);
48
49 const float* r0 = img0.row(0);
50 const float* r1 = img0.row(1);
51 const float* r2 = img0.row(2);
52
53 __m256 _k00_0 = _mm256_loadu_ps(k0);
54 __m256 _k01_0 = _mm256_loadu_ps(k0 + 8);
55 __m256 _k02_0 = _mm256_loadu_ps(k0 + 16);
56 __m256 _k10_0 = _mm256_loadu_ps(k0 + 24);
57 __m256 _k11_0 = _mm256_loadu_ps(k0 + 32);
58 __m256 _k12_0 = _mm256_loadu_ps(k0 + 40);
59 __m256 _k20_0 = _mm256_loadu_ps(k0 + 48);
60 __m256 _k21_0 = _mm256_loadu_ps(k0 + 56);
61 __m256 _k22_0 = _mm256_loadu_ps(k0 + 64);
62
63 __m256 _k00_1 = _mm256_loadu_ps(k1);
64 __m256 _k01_1 = _mm256_loadu_ps(k1 + 8);
65 __m256 _k02_1 = _mm256_loadu_ps(k1 + 16);
66 __m256 _k10_1 = _mm256_loadu_ps(k1 + 24);
67 __m256 _k11_1 = _mm256_loadu_ps(k1 + 32);
68 __m256 _k12_1 = _mm256_loadu_ps(k1 + 40);
69 __m256 _k20_1 = _mm256_loadu_ps(k1 + 48);
70 __m256 _k21_1 = _mm256_loadu_ps(k1 + 56);
71 __m256 _k22_1 = _mm256_loadu_ps(k1 + 64);
72
73 int i = 0;
74
75 for (; i < outh; i++)
76 {
77 int j = 0;
78 for (; j + 3 < outw; j += 4)
79 {
80 __m256 _sum00 = _mm256_loadu_ps(outptr0);
81 __m256 _sum10 = _mm256_loadu_ps(outptr1);
82
83 __m256 _r01 = _mm256_broadcast_ss(r0);
84 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
85 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
86 __m256 _r11 = _mm256_broadcast_ss(r1);
87 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
88 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
89 __m256 _r21 = _mm256_broadcast_ss(r2);
90 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
91 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
92
93 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
94 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
95 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
96 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
97 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
98 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
99 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
100 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
101 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
102
103 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
104 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
105 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
106 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
107 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
108 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
109 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
110 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
111 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
112
113 _mm256_storeu_ps(outptr0, _sum00);
114 _mm256_storeu_ps(outptr1, _sum10);
115
116 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
117 __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
118
119 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
120 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
121 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
122
123 _sum01 = _mm256_fmadd_ps(_r02, _k00_0, _sum01);
124 _sum01 = _mm256_fmadd_ps(_r03, _k01_0, _sum01);
125 _sum01 = _mm256_fmadd_ps(_r04, _k02_0, _sum01);
126 _sum01 = _mm256_fmadd_ps(_r12, _k10_0, _sum01);
127 _sum01 = _mm256_fmadd_ps(_r13, _k11_0, _sum01);
128 _sum01 = _mm256_fmadd_ps(_r14, _k12_0, _sum01);
129 _sum01 = _mm256_fmadd_ps(_r22, _k20_0, _sum01);
130 _sum01 = _mm256_fmadd_ps(_r23, _k21_0, _sum01);
131 _sum01 = _mm256_fmadd_ps(_r24, _k22_0, _sum01);
132
133 _sum11 = _mm256_fmadd_ps(_r02, _k00_1, _sum11);
134 _sum11 = _mm256_fmadd_ps(_r03, _k01_1, _sum11);
135 _sum11 = _mm256_fmadd_ps(_r04, _k02_1, _sum11);
136 _sum11 = _mm256_fmadd_ps(_r12, _k10_1, _sum11);
137 _sum11 = _mm256_fmadd_ps(_r13, _k11_1, _sum11);
138 _sum11 = _mm256_fmadd_ps(_r14, _k12_1, _sum11);
139 _sum11 = _mm256_fmadd_ps(_r22, _k20_1, _sum11);
140 _sum11 = _mm256_fmadd_ps(_r23, _k21_1, _sum11);
141 _sum11 = _mm256_fmadd_ps(_r24, _k22_1, _sum11);
142
143 _mm256_storeu_ps(outptr0 + 8, _sum01);
144 _mm256_storeu_ps(outptr1 + 8, _sum11);
145
146 __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
147 __m256 _sum12 = _mm256_loadu_ps(outptr1 + 16);
148
149 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
150 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
151 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
152
153 _sum02 = _mm256_fmadd_ps(_r03, _k00_0, _sum02);
154 _sum02 = _mm256_fmadd_ps(_r04, _k01_0, _sum02);
155 _sum02 = _mm256_fmadd_ps(_r05, _k02_0, _sum02);
156 _sum02 = _mm256_fmadd_ps(_r13, _k10_0, _sum02);
157 _sum02 = _mm256_fmadd_ps(_r14, _k11_0, _sum02);
158 _sum02 = _mm256_fmadd_ps(_r15, _k12_0, _sum02);
159 _sum02 = _mm256_fmadd_ps(_r23, _k20_0, _sum02);
160 _sum02 = _mm256_fmadd_ps(_r24, _k21_0, _sum02);
161 _sum02 = _mm256_fmadd_ps(_r25, _k22_0, _sum02);
162
163 _sum12 = _mm256_fmadd_ps(_r03, _k00_1, _sum12);
164 _sum12 = _mm256_fmadd_ps(_r04, _k01_1, _sum12);
165 _sum12 = _mm256_fmadd_ps(_r05, _k02_1, _sum12);
166 _sum12 = _mm256_fmadd_ps(_r13, _k10_1, _sum12);
167 _sum12 = _mm256_fmadd_ps(_r14, _k11_1, _sum12);
168 _sum12 = _mm256_fmadd_ps(_r15, _k12_1, _sum12);
169 _sum12 = _mm256_fmadd_ps(_r23, _k20_1, _sum12);
170 _sum12 = _mm256_fmadd_ps(_r24, _k21_1, _sum12);
171 _sum12 = _mm256_fmadd_ps(_r25, _k22_1, _sum12);
172
173 _mm256_storeu_ps(outptr0 + 16, _sum02);
174 _mm256_storeu_ps(outptr1 + 16, _sum12);
175
176 __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
177 __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
178 __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
179
180 __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
181 __m256 _sum13 = _mm256_loadu_ps(outptr1 + 24);
182
183 _sum03 = _mm256_fmadd_ps(_r04, _k00_0, _sum03);
184 _sum03 = _mm256_fmadd_ps(_r05, _k01_0, _sum03);
185 _sum03 = _mm256_fmadd_ps(_r06, _k02_0, _sum03);
186 _sum03 = _mm256_fmadd_ps(_r14, _k10_0, _sum03);
187 _sum03 = _mm256_fmadd_ps(_r15, _k11_0, _sum03);
188 _sum03 = _mm256_fmadd_ps(_r16, _k12_0, _sum03);
189 _sum03 = _mm256_fmadd_ps(_r24, _k20_0, _sum03);
190 _sum03 = _mm256_fmadd_ps(_r25, _k21_0, _sum03);
191 _sum03 = _mm256_fmadd_ps(_r26, _k22_0, _sum03);
192
193 _sum13 = _mm256_fmadd_ps(_r04, _k00_1, _sum13);
194 _sum13 = _mm256_fmadd_ps(_r05, _k01_1, _sum13);
195 _sum13 = _mm256_fmadd_ps(_r06, _k02_1, _sum13);
196 _sum13 = _mm256_fmadd_ps(_r14, _k10_1, _sum13);
197 _sum13 = _mm256_fmadd_ps(_r15, _k11_1, _sum13);
198 _sum13 = _mm256_fmadd_ps(_r16, _k12_1, _sum13);
199 _sum13 = _mm256_fmadd_ps(_r24, _k20_1, _sum13);
200 _sum13 = _mm256_fmadd_ps(_r25, _k21_1, _sum13);
201 _sum13 = _mm256_fmadd_ps(_r26, _k22_1, _sum13);
202
203 _mm256_storeu_ps(outptr0 + 24, _sum03);
204 _mm256_storeu_ps(outptr1 + 24, _sum13);
205
206 r0 += 4;
207 r1 += 4;
208 r2 += 4;
209 outptr0 += 32;
210 outptr1 += 32;
211 }
212
213 for (; j + 1 < outw; j += 2)
214 {
215 __m256 _sum00 = _mm256_loadu_ps(outptr0);
216 __m256 _sum10 = _mm256_loadu_ps(outptr1);
217
218 __m256 _r01 = _mm256_broadcast_ss(r0);
219 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
220 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
221 __m256 _r11 = _mm256_broadcast_ss(r1);
222 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
223 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
224 __m256 _r21 = _mm256_broadcast_ss(r2);
225 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
226 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
227
228 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
229 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
230 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
231 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
232 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
233 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
234 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
235 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
236 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
237
238 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
239 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
240 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
241 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
242 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
243 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
244 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
245 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
246 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
247
248 _mm256_storeu_ps(outptr0, _sum00);
249 _mm256_storeu_ps(outptr1, _sum10);
250
251 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
252 __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
253
254 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
255 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
256 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
257
258 _sum01 = _mm256_fmadd_ps(_r02, _k00_0, _sum01);
259 _sum01 = _mm256_fmadd_ps(_r03, _k01_0, _sum01);
260 _sum01 = _mm256_fmadd_ps(_r04, _k02_0, _sum01);
261 _sum01 = _mm256_fmadd_ps(_r12, _k10_0, _sum01);
262 _sum01 = _mm256_fmadd_ps(_r13, _k11_0, _sum01);
263 _sum01 = _mm256_fmadd_ps(_r14, _k12_0, _sum01);
264 _sum01 = _mm256_fmadd_ps(_r22, _k20_0, _sum01);
265 _sum01 = _mm256_fmadd_ps(_r23, _k21_0, _sum01);
266 _sum01 = _mm256_fmadd_ps(_r24, _k22_0, _sum01);
267
268 _sum11 = _mm256_fmadd_ps(_r02, _k00_1, _sum11);
269 _sum11 = _mm256_fmadd_ps(_r03, _k01_1, _sum11);
270 _sum11 = _mm256_fmadd_ps(_r04, _k02_1, _sum11);
271 _sum11 = _mm256_fmadd_ps(_r12, _k10_1, _sum11);
272 _sum11 = _mm256_fmadd_ps(_r13, _k11_1, _sum11);
273 _sum11 = _mm256_fmadd_ps(_r14, _k12_1, _sum11);
274 _sum11 = _mm256_fmadd_ps(_r22, _k20_1, _sum11);
275 _sum11 = _mm256_fmadd_ps(_r23, _k21_1, _sum11);
276 _sum11 = _mm256_fmadd_ps(_r24, _k22_1, _sum11);
277
278 _mm256_storeu_ps(outptr0 + 8, _sum01);
279 _mm256_storeu_ps(outptr1 + 8, _sum11);
280
281 r0 += 2;
282 r1 += 2;
283 r2 += 2;
284 outptr0 += 16;
285 outptr1 += 16;
286 }
287
288 for (; j < outw; j++)
289 {
290 __m256 _sum00 = _mm256_loadu_ps(outptr0);
291 __m256 _sum10 = _mm256_loadu_ps(outptr1);
292
293 __m256 _r01 = _mm256_broadcast_ss(r0);
294 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
295 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
296 __m256 _r11 = _mm256_broadcast_ss(r1);
297 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
298 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
299 __m256 _r21 = _mm256_broadcast_ss(r2);
300 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
301 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
302
303 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
304 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
305 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
306 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
307 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
308 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
309 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
310 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
311 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
312
313 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
314 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
315 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
316 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
317 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
318 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
319 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
320 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
321 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
322
323 _mm256_storeu_ps(outptr0, _sum00);
324 _mm256_storeu_ps(outptr1, _sum10);
325
326 r0 += 1;
327 r1 += 1;
328 r2 += 1;
329 outptr0 += 8;
330 outptr1 += 8;
331 }
332
333 r0 += 2;
334 r1 += 2;
335 r2 += 2;
336 }
337
338 k0 += 9 * 8;
339 k1 += 9 * 8;
340 }
341 }
342
343 #pragma omp parallel for num_threads(opt.num_threads)
344 for (int p = remain_outch_start; p < outch; p++)
345 {
346 Mat out0 = top_blob.channel(p);
347
348 __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
349 out0.fill(_bias0);
350
351 const float* k0 = kernel.channel(p);
352
353 for (int q = 0; q < inch; q++)
354 {
355 float* outptr0 = out0.row(0);
356
357 const Mat img0 = bottom_blob.channel(q);
358
359 const float* r0 = img0.row(0);
360 const float* r1 = img0.row(1);
361 const float* r2 = img0.row(2);
362
363 __m256 _k00 = _mm256_loadu_ps(k0);
364 __m256 _k01 = _mm256_loadu_ps(k0 + 8);
365 __m256 _k02 = _mm256_loadu_ps(k0 + 16);
366 __m256 _k10 = _mm256_loadu_ps(k0 + 24);
367 __m256 _k11 = _mm256_loadu_ps(k0 + 32);
368 __m256 _k12 = _mm256_loadu_ps(k0 + 40);
369 __m256 _k20 = _mm256_loadu_ps(k0 + 48);
370 __m256 _k21 = _mm256_loadu_ps(k0 + 56);
371 __m256 _k22 = _mm256_loadu_ps(k0 + 64);
372
373 int i = 0;
374
375 for (; i < outh; i++)
376 {
377 int j = 0;
378 for (; j + 3 < outw; j += 4)
379 {
380 __m256 _sum0 = _mm256_loadu_ps(outptr0);
381
382 __m256 _r01 = _mm256_broadcast_ss(r0);
383 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
384 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
385 __m256 _r11 = _mm256_broadcast_ss(r1);
386 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
387 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
388 __m256 _r21 = _mm256_broadcast_ss(r2);
389 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
390 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
391
392 _sum0 = _mm256_fmadd_ps(_r01, _k00, _sum0);
393 _sum0 = _mm256_fmadd_ps(_r02, _k01, _sum0);
394 _sum0 = _mm256_fmadd_ps(_r03, _k02, _sum0);
395 _sum0 = _mm256_fmadd_ps(_r11, _k10, _sum0);
396 _sum0 = _mm256_fmadd_ps(_r12, _k11, _sum0);
397 _sum0 = _mm256_fmadd_ps(_r13, _k12, _sum0);
398 _sum0 = _mm256_fmadd_ps(_r21, _k20, _sum0);
399 _sum0 = _mm256_fmadd_ps(_r22, _k21, _sum0);
400 _sum0 = _mm256_fmadd_ps(_r23, _k22, _sum0);
401
402 __m256 _sum1 = _mm256_loadu_ps(outptr0 + 8);
403 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
404 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
405 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
406 _mm256_storeu_ps(outptr0, _sum0);
407
408 _sum1 = _mm256_fmadd_ps(_r02, _k00, _sum1);
409 _sum1 = _mm256_fmadd_ps(_r03, _k01, _sum1);
410 _sum1 = _mm256_fmadd_ps(_r04, _k02, _sum1);
411 _sum1 = _mm256_fmadd_ps(_r12, _k10, _sum1);
412 _sum1 = _mm256_fmadd_ps(_r13, _k11, _sum1);
413 _sum1 = _mm256_fmadd_ps(_r14, _k12, _sum1);
414 _sum1 = _mm256_fmadd_ps(_r22, _k20, _sum1);
415 _sum1 = _mm256_fmadd_ps(_r23, _k21, _sum1);
416 _sum1 = _mm256_fmadd_ps(_r24, _k22, _sum1);
417
418 __m256 _sum2 = _mm256_loadu_ps(outptr0 + 16);
419 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
420 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
421 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
422 _mm256_storeu_ps(outptr0 + 8, _sum1);
423
424 _sum2 = _mm256_fmadd_ps(_r03, _k00, _sum2);
425 _sum2 = _mm256_fmadd_ps(_r04, _k01, _sum2);
426 _sum2 = _mm256_fmadd_ps(_r05, _k02, _sum2);
427 _sum2 = _mm256_fmadd_ps(_r13, _k10, _sum2);
428 _sum2 = _mm256_fmadd_ps(_r14, _k11, _sum2);
429 _sum2 = _mm256_fmadd_ps(_r15, _k12, _sum2);
430 _sum2 = _mm256_fmadd_ps(_r23, _k20, _sum2);
431 _sum2 = _mm256_fmadd_ps(_r24, _k21, _sum2);
432 _sum2 = _mm256_fmadd_ps(_r25, _k22, _sum2);
433
434 __m256 _sum3 = _mm256_loadu_ps(outptr0 + 24);
435 __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
436 __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
437 __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
438 _mm256_storeu_ps(outptr0 + 16, _sum2);
439
440 _sum3 = _mm256_fmadd_ps(_r04, _k00, _sum3);
441 _sum3 = _mm256_fmadd_ps(_r05, _k01, _sum3);
442 _sum3 = _mm256_fmadd_ps(_r06, _k02, _sum3);
443 _sum3 = _mm256_fmadd_ps(_r14, _k10, _sum3);
444 _sum3 = _mm256_fmadd_ps(_r15, _k11, _sum3);
445 _sum3 = _mm256_fmadd_ps(_r16, _k12, _sum3);
446 _sum3 = _mm256_fmadd_ps(_r24, _k20, _sum3);
447 _sum3 = _mm256_fmadd_ps(_r25, _k21, _sum3);
448 _sum3 = _mm256_fmadd_ps(_r26, _k22, _sum3);
449
450 _mm256_storeu_ps(outptr0 + 24, _sum3);
451
452 r0 += 4;
453 r1 += 4;
454 r2 += 4;
455 outptr0 += 32;
456 }
457 for (; j + 1 < outw; j += 2)
458 {
459 __m256 _sum0 = _mm256_loadu_ps(outptr0);
460
461 __m256 _r01 = _mm256_broadcast_ss(r0);
462 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
463 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
464 __m256 _r11 = _mm256_broadcast_ss(r1);
465 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
466 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
467 __m256 _r21 = _mm256_broadcast_ss(r2);
468 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
469 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
470
471 _sum0 = _mm256_fmadd_ps(_r01, _k00, _sum0);
472 _sum0 = _mm256_fmadd_ps(_r02, _k01, _sum0);
473 _sum0 = _mm256_fmadd_ps(_r03, _k02, _sum0);
474 _sum0 = _mm256_fmadd_ps(_r11, _k10, _sum0);
475 _sum0 = _mm256_fmadd_ps(_r12, _k11, _sum0);
476 _sum0 = _mm256_fmadd_ps(_r13, _k12, _sum0);
477 _sum0 = _mm256_fmadd_ps(_r21, _k20, _sum0);
478 _sum0 = _mm256_fmadd_ps(_r22, _k21, _sum0);
479 _sum0 = _mm256_fmadd_ps(_r23, _k22, _sum0);
480
481 __m256 _sum1 = _mm256_loadu_ps(outptr0 + 8);
482 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
483 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
484 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
485 _mm256_storeu_ps(outptr0, _sum0);
486
487 _sum1 = _mm256_fmadd_ps(_r02, _k00, _sum1);
488 _sum1 = _mm256_fmadd_ps(_r03, _k01, _sum1);
489 _sum1 = _mm256_fmadd_ps(_r04, _k02, _sum1);
490 _sum1 = _mm256_fmadd_ps(_r12, _k10, _sum1);
491 _sum1 = _mm256_fmadd_ps(_r13, _k11, _sum1);
492 _sum1 = _mm256_fmadd_ps(_r14, _k12, _sum1);
493 _sum1 = _mm256_fmadd_ps(_r22, _k20, _sum1);
494 _sum1 = _mm256_fmadd_ps(_r23, _k21, _sum1);
495 _sum1 = _mm256_fmadd_ps(_r24, _k22, _sum1);
496
497 _mm256_storeu_ps(outptr0 + 8, _sum1);
498
499 r0 += 2;
500 r1 += 2;
501 r2 += 2;
502 outptr0 += 16;
503 }
504 for (; j < outw; j++)
505 {
506 __m256 _sum0 = _mm256_loadu_ps(outptr0);
507
508 __m256 _r01 = _mm256_broadcast_ss(r0);
509 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
510 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
511 __m256 _r11 = _mm256_broadcast_ss(r1);
512 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
513 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
514 __m256 _r21 = _mm256_broadcast_ss(r2);
515 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
516 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
517
518 _sum0 = _mm256_fmadd_ps(_r01, _k00, _sum0);
519 _sum0 = _mm256_fmadd_ps(_r02, _k01, _sum0);
520 _sum0 = _mm256_fmadd_ps(_r03, _k02, _sum0);
521 _sum0 = _mm256_fmadd_ps(_r11, _k10, _sum0);
522 _sum0 = _mm256_fmadd_ps(_r12, _k11, _sum0);
523 _sum0 = _mm256_fmadd_ps(_r13, _k12, _sum0);
524 _sum0 = _mm256_fmadd_ps(_r21, _k20, _sum0);
525 _sum0 = _mm256_fmadd_ps(_r22, _k21, _sum0);
526 _sum0 = _mm256_fmadd_ps(_r23, _k22, _sum0);
527
528 _mm256_storeu_ps(outptr0, _sum0);
529 r0 += 1;
530 r1 += 1;
531 r2 += 1;
532 outptr0 += 8;
533 }
534
535 r0 += 2;
536 r1 += 2;
537 r2 += 2;
538 }
539
540 k0 += 9 * 8;
541 }
542 }
543 }
544
conv3x3s2_pack1to8_avx(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)545 static void conv3x3s2_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
546 {
547 int w = bottom_blob.w;
548 int inch = bottom_blob.c;
549 int outw = top_blob.w;
550 int outh = top_blob.h;
551 int outch = top_blob.c;
552
553 const int tailstep = w - 2 * outw + w;
554
555 const float* bias = _bias;
556
557 int nn_outch = outch >> 1;
558 int remain_outch_start = nn_outch << 1;
559
560 #pragma omp parallel for num_threads(opt.num_threads)
561 for (int pp = 0; pp < nn_outch; pp++)
562 {
563 int p = pp * 2;
564
565 Mat out0 = top_blob.channel(p);
566 Mat out1 = top_blob.channel(p + 1);
567
568 __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
569 __m256 _bias1 = bias ? _mm256_loadu_ps((const float*)bias + (p + 1) * 8) : _mm256_set1_ps(0.f);
570 out0.fill(_bias0);
571 out1.fill(_bias1);
572
573 const float* k0 = kernel.channel(p);
574 const float* k1 = kernel.channel(p + 1);
575
576 for (int q = 0; q < inch; q++)
577 {
578 float* outptr0 = out0;
579 float* outptr1 = out1;
580
581 const Mat img0 = bottom_blob.channel(q);
582
583 const float* r0 = img0.row(0);
584 const float* r1 = img0.row(1);
585 const float* r2 = img0.row(2);
586
587 __m256 _k00_0 = _mm256_loadu_ps(k0);
588 __m256 _k01_0 = _mm256_loadu_ps(k0 + 8);
589 __m256 _k02_0 = _mm256_loadu_ps(k0 + 16);
590 __m256 _k10_0 = _mm256_loadu_ps(k0 + 24);
591 __m256 _k11_0 = _mm256_loadu_ps(k0 + 32);
592 __m256 _k12_0 = _mm256_loadu_ps(k0 + 40);
593 __m256 _k20_0 = _mm256_loadu_ps(k0 + 48);
594 __m256 _k21_0 = _mm256_loadu_ps(k0 + 56);
595 __m256 _k22_0 = _mm256_loadu_ps(k0 + 64);
596
597 __m256 _k00_1 = _mm256_loadu_ps(k1);
598 __m256 _k01_1 = _mm256_loadu_ps(k1 + 8);
599 __m256 _k02_1 = _mm256_loadu_ps(k1 + 16);
600 __m256 _k10_1 = _mm256_loadu_ps(k1 + 24);
601 __m256 _k11_1 = _mm256_loadu_ps(k1 + 32);
602 __m256 _k12_1 = _mm256_loadu_ps(k1 + 40);
603 __m256 _k20_1 = _mm256_loadu_ps(k1 + 48);
604 __m256 _k21_1 = _mm256_loadu_ps(k1 + 56);
605 __m256 _k22_1 = _mm256_loadu_ps(k1 + 64);
606
607 int i = 0;
608
609 for (; i < outh; i++)
610 {
611 int j = 0;
612 for (; j + 7 < outw; j += 8)
613 {
614 __m256 _sum00 = _mm256_loadu_ps(outptr0);
615 __m256 _sum10 = _mm256_loadu_ps(outptr1);
616
617 __m256 _r01 = _mm256_broadcast_ss(r0);
618 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
619 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
620 __m256 _r11 = _mm256_broadcast_ss(r1);
621 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
622 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
623 __m256 _r21 = _mm256_broadcast_ss(r2);
624 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
625 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
626
627 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
628 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
629 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
630 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
631 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
632 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
633 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
634 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
635 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
636
637 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
638 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
639 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
640 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
641 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
642 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
643 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
644 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
645 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
646
647 _mm256_storeu_ps(outptr0, _sum00);
648 _mm256_storeu_ps(outptr1, _sum10);
649
650 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
651 __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
652
653 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
654 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
655 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
656 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
657 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
658 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
659
660 _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
661 _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
662 _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
663 _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
664 _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
665 _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
666 _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
667 _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
668 _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
669
670 _sum11 = _mm256_fmadd_ps(_r03, _k00_1, _sum11);
671 _sum11 = _mm256_fmadd_ps(_r04, _k01_1, _sum11);
672 _sum11 = _mm256_fmadd_ps(_r05, _k02_1, _sum11);
673 _sum11 = _mm256_fmadd_ps(_r13, _k10_1, _sum11);
674 _sum11 = _mm256_fmadd_ps(_r14, _k11_1, _sum11);
675 _sum11 = _mm256_fmadd_ps(_r15, _k12_1, _sum11);
676 _sum11 = _mm256_fmadd_ps(_r23, _k20_1, _sum11);
677 _sum11 = _mm256_fmadd_ps(_r24, _k21_1, _sum11);
678 _sum11 = _mm256_fmadd_ps(_r25, _k22_1, _sum11);
679
680 _mm256_storeu_ps(outptr0 + 8, _sum01);
681 _mm256_storeu_ps(outptr1 + 8, _sum11);
682
683 __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
684 __m256 _sum12 = _mm256_loadu_ps(outptr1 + 16);
685
686 __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
687 __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
688 __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
689 __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
690 __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
691 __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
692
693 _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
694 _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
695 _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
696 _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
697 _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
698 _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
699 _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
700 _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
701 _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
702
703 _sum12 = _mm256_fmadd_ps(_r05, _k00_1, _sum12);
704 _sum12 = _mm256_fmadd_ps(_r06, _k01_1, _sum12);
705 _sum12 = _mm256_fmadd_ps(_r07, _k02_1, _sum12);
706 _sum12 = _mm256_fmadd_ps(_r15, _k10_1, _sum12);
707 _sum12 = _mm256_fmadd_ps(_r16, _k11_1, _sum12);
708 _sum12 = _mm256_fmadd_ps(_r17, _k12_1, _sum12);
709 _sum12 = _mm256_fmadd_ps(_r25, _k20_1, _sum12);
710 _sum12 = _mm256_fmadd_ps(_r26, _k21_1, _sum12);
711 _sum12 = _mm256_fmadd_ps(_r27, _k22_1, _sum12);
712
713 _mm256_storeu_ps(outptr0 + 16, _sum02);
714 _mm256_storeu_ps(outptr1 + 16, _sum12);
715
716 __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
717 __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
718 __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
719 __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
720 __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
721 __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
722
723 __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
724 __m256 _sum13 = _mm256_loadu_ps(outptr1 + 24);
725
726 _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
727 _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
728 _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
729 _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
730 _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
731 _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
732 _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
733 _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
734 _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
735
736 _sum13 = _mm256_fmadd_ps(_r07, _k00_1, _sum13);
737 _sum13 = _mm256_fmadd_ps(_r08, _k01_1, _sum13);
738 _sum13 = _mm256_fmadd_ps(_r09, _k02_1, _sum13);
739 _sum13 = _mm256_fmadd_ps(_r17, _k10_1, _sum13);
740 _sum13 = _mm256_fmadd_ps(_r18, _k11_1, _sum13);
741 _sum13 = _mm256_fmadd_ps(_r19, _k12_1, _sum13);
742 _sum13 = _mm256_fmadd_ps(_r27, _k20_1, _sum13);
743 _sum13 = _mm256_fmadd_ps(_r28, _k21_1, _sum13);
744 _sum13 = _mm256_fmadd_ps(_r29, _k22_1, _sum13);
745
746 _mm256_storeu_ps(outptr0 + 24, _sum03);
747 _mm256_storeu_ps(outptr1 + 24, _sum13);
748
749 __m256 _r010 = _mm256_broadcast_ss(r0 + 9);
750 __m256 _r110 = _mm256_broadcast_ss(r1 + 9);
751 __m256 _r210 = _mm256_broadcast_ss(r2 + 9);
752 __m256 _r011 = _mm256_broadcast_ss(r0 + 10);
753 __m256 _r111 = _mm256_broadcast_ss(r1 + 10);
754 __m256 _r211 = _mm256_broadcast_ss(r2 + 10);
755
756 __m256 _sum04 = _mm256_loadu_ps(outptr0 + 32);
757 __m256 _sum14 = _mm256_loadu_ps(outptr1 + 32);
758
759 _sum04 = _mm256_fmadd_ps(_r09, _k00_0, _sum04);
760 _sum04 = _mm256_fmadd_ps(_r010, _k01_0, _sum04);
761 _sum04 = _mm256_fmadd_ps(_r011, _k02_0, _sum04);
762 _sum04 = _mm256_fmadd_ps(_r19, _k10_0, _sum04);
763 _sum04 = _mm256_fmadd_ps(_r110, _k11_0, _sum04);
764 _sum04 = _mm256_fmadd_ps(_r111, _k12_0, _sum04);
765 _sum04 = _mm256_fmadd_ps(_r29, _k20_0, _sum04);
766 _sum04 = _mm256_fmadd_ps(_r210, _k21_0, _sum04);
767 _sum04 = _mm256_fmadd_ps(_r211, _k22_0, _sum04);
768
769 _sum14 = _mm256_fmadd_ps(_r09, _k00_1, _sum14);
770 _sum14 = _mm256_fmadd_ps(_r010, _k01_1, _sum14);
771 _sum14 = _mm256_fmadd_ps(_r011, _k02_1, _sum14);
772 _sum14 = _mm256_fmadd_ps(_r19, _k10_1, _sum14);
773 _sum14 = _mm256_fmadd_ps(_r110, _k11_1, _sum14);
774 _sum14 = _mm256_fmadd_ps(_r111, _k12_1, _sum14);
775 _sum14 = _mm256_fmadd_ps(_r29, _k20_1, _sum14);
776 _sum14 = _mm256_fmadd_ps(_r210, _k21_1, _sum14);
777 _sum14 = _mm256_fmadd_ps(_r211, _k22_1, _sum14);
778
779 _mm256_storeu_ps(outptr0 + 32, _sum04);
780 _mm256_storeu_ps(outptr1 + 32, _sum14);
781
782 __m256 _r012 = _mm256_broadcast_ss(r0 + 11);
783 __m256 _r112 = _mm256_broadcast_ss(r1 + 11);
784 __m256 _r212 = _mm256_broadcast_ss(r2 + 11);
785 __m256 _r013 = _mm256_broadcast_ss(r0 + 12);
786 __m256 _r113 = _mm256_broadcast_ss(r1 + 12);
787 __m256 _r213 = _mm256_broadcast_ss(r2 + 12);
788
789 __m256 _sum05 = _mm256_loadu_ps(outptr0 + 40);
790 __m256 _sum15 = _mm256_loadu_ps(outptr1 + 40);
791
792 _sum05 = _mm256_fmadd_ps(_r011, _k00_0, _sum05);
793 _sum05 = _mm256_fmadd_ps(_r012, _k01_0, _sum05);
794 _sum05 = _mm256_fmadd_ps(_r013, _k02_0, _sum05);
795 _sum05 = _mm256_fmadd_ps(_r111, _k10_0, _sum05);
796 _sum05 = _mm256_fmadd_ps(_r112, _k11_0, _sum05);
797 _sum05 = _mm256_fmadd_ps(_r113, _k12_0, _sum05);
798 _sum05 = _mm256_fmadd_ps(_r211, _k20_0, _sum05);
799 _sum05 = _mm256_fmadd_ps(_r212, _k21_0, _sum05);
800 _sum05 = _mm256_fmadd_ps(_r213, _k22_0, _sum05);
801 _sum15 = _mm256_fmadd_ps(_r011, _k00_1, _sum15);
802 _sum15 = _mm256_fmadd_ps(_r012, _k01_1, _sum15);
803 _sum15 = _mm256_fmadd_ps(_r013, _k02_1, _sum15);
804 _sum15 = _mm256_fmadd_ps(_r111, _k10_1, _sum15);
805 _sum15 = _mm256_fmadd_ps(_r112, _k11_1, _sum15);
806 _sum15 = _mm256_fmadd_ps(_r113, _k12_1, _sum15);
807 _sum15 = _mm256_fmadd_ps(_r211, _k20_1, _sum15);
808 _sum15 = _mm256_fmadd_ps(_r212, _k21_1, _sum15);
809 _sum15 = _mm256_fmadd_ps(_r213, _k22_1, _sum15);
810
811 _mm256_storeu_ps(outptr0 + 40, _sum05);
812 _mm256_storeu_ps(outptr1 + 40, _sum15);
813
814 __m256 _r014 = _mm256_broadcast_ss(r0 + 13);
815 __m256 _r114 = _mm256_broadcast_ss(r1 + 13);
816 __m256 _r214 = _mm256_broadcast_ss(r2 + 13);
817 __m256 _r015 = _mm256_broadcast_ss(r0 + 14);
818 __m256 _r115 = _mm256_broadcast_ss(r1 + 14);
819 __m256 _r215 = _mm256_broadcast_ss(r2 + 14);
820
821 __m256 _sum06 = _mm256_loadu_ps(outptr0 + 48);
822 __m256 _sum16 = _mm256_loadu_ps(outptr1 + 48);
823
824 _sum06 = _mm256_fmadd_ps(_r013, _k00_0, _sum06);
825 _sum06 = _mm256_fmadd_ps(_r014, _k01_0, _sum06);
826 _sum06 = _mm256_fmadd_ps(_r015, _k02_0, _sum06);
827 _sum06 = _mm256_fmadd_ps(_r113, _k10_0, _sum06);
828 _sum06 = _mm256_fmadd_ps(_r114, _k11_0, _sum06);
829 _sum06 = _mm256_fmadd_ps(_r115, _k12_0, _sum06);
830 _sum06 = _mm256_fmadd_ps(_r213, _k20_0, _sum06);
831 _sum06 = _mm256_fmadd_ps(_r214, _k21_0, _sum06);
832 _sum06 = _mm256_fmadd_ps(_r215, _k22_0, _sum06);
833 _sum16 = _mm256_fmadd_ps(_r013, _k00_1, _sum16);
834 _sum16 = _mm256_fmadd_ps(_r014, _k01_1, _sum16);
835 _sum16 = _mm256_fmadd_ps(_r015, _k02_1, _sum16);
836 _sum16 = _mm256_fmadd_ps(_r113, _k10_1, _sum16);
837 _sum16 = _mm256_fmadd_ps(_r114, _k11_1, _sum16);
838 _sum16 = _mm256_fmadd_ps(_r115, _k12_1, _sum16);
839 _sum16 = _mm256_fmadd_ps(_r213, _k20_1, _sum16);
840 _sum16 = _mm256_fmadd_ps(_r214, _k21_1, _sum16);
841 _sum16 = _mm256_fmadd_ps(_r215, _k22_1, _sum16);
842
843 _mm256_storeu_ps(outptr0 + 48, _sum06);
844 _mm256_storeu_ps(outptr1 + 48, _sum16);
845
846 __m256 _r016 = _mm256_broadcast_ss(r0 + 15);
847 __m256 _r116 = _mm256_broadcast_ss(r1 + 15);
848 __m256 _r216 = _mm256_broadcast_ss(r2 + 15);
849 __m256 _r017 = _mm256_broadcast_ss(r0 + 16);
850 __m256 _r117 = _mm256_broadcast_ss(r1 + 16);
851 __m256 _r217 = _mm256_broadcast_ss(r2 + 16);
852
853 __m256 _sum07 = _mm256_loadu_ps(outptr0 + 56);
854 __m256 _sum17 = _mm256_loadu_ps(outptr1 + 56);
855
856 _sum07 = _mm256_fmadd_ps(_r015, _k00_0, _sum07);
857 _sum07 = _mm256_fmadd_ps(_r016, _k01_0, _sum07);
858 _sum07 = _mm256_fmadd_ps(_r017, _k02_0, _sum07);
859 _sum07 = _mm256_fmadd_ps(_r115, _k10_0, _sum07);
860 _sum07 = _mm256_fmadd_ps(_r116, _k11_0, _sum07);
861 _sum07 = _mm256_fmadd_ps(_r117, _k12_0, _sum07);
862 _sum07 = _mm256_fmadd_ps(_r215, _k20_0, _sum07);
863 _sum07 = _mm256_fmadd_ps(_r216, _k21_0, _sum07);
864 _sum07 = _mm256_fmadd_ps(_r217, _k22_0, _sum07);
865 _sum17 = _mm256_fmadd_ps(_r015, _k00_1, _sum17);
866 _sum17 = _mm256_fmadd_ps(_r016, _k01_1, _sum17);
867 _sum17 = _mm256_fmadd_ps(_r017, _k02_1, _sum17);
868 _sum17 = _mm256_fmadd_ps(_r115, _k10_1, _sum17);
869 _sum17 = _mm256_fmadd_ps(_r116, _k11_1, _sum17);
870 _sum17 = _mm256_fmadd_ps(_r117, _k12_1, _sum17);
871 _sum17 = _mm256_fmadd_ps(_r215, _k20_1, _sum17);
872 _sum17 = _mm256_fmadd_ps(_r216, _k21_1, _sum17);
873 _sum17 = _mm256_fmadd_ps(_r217, _k22_1, _sum17);
874
875 _mm256_storeu_ps(outptr0 + 56, _sum07);
876 _mm256_storeu_ps(outptr1 + 56, _sum17);
877
878 r0 += 16;
879 r1 += 16;
880 r2 += 16;
881 outptr0 += 64;
882 outptr1 += 64;
883 }
884
885 for (; j + 3 < outw; j += 4)
886 {
887 __m256 _sum00 = _mm256_loadu_ps(outptr0);
888 __m256 _sum10 = _mm256_loadu_ps(outptr1);
889
890 __m256 _r01 = _mm256_broadcast_ss(r0);
891 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
892 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
893 __m256 _r11 = _mm256_broadcast_ss(r1);
894 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
895 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
896 __m256 _r21 = _mm256_broadcast_ss(r2);
897 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
898 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
899
900 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
901 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
902 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
903 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
904 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
905 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
906 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
907 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
908 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
909
910 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
911 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
912 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
913 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
914 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
915 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
916 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
917 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
918 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
919
920 _mm256_storeu_ps(outptr0, _sum00);
921 _mm256_storeu_ps(outptr1, _sum10);
922
923 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
924 __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
925
926 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
927 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
928 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
929 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
930 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
931 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
932
933 _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
934 _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
935 _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
936 _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
937 _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
938 _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
939 _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
940 _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
941 _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
942
943 _sum11 = _mm256_fmadd_ps(_r03, _k00_1, _sum11);
944 _sum11 = _mm256_fmadd_ps(_r04, _k01_1, _sum11);
945 _sum11 = _mm256_fmadd_ps(_r05, _k02_1, _sum11);
946 _sum11 = _mm256_fmadd_ps(_r13, _k10_1, _sum11);
947 _sum11 = _mm256_fmadd_ps(_r14, _k11_1, _sum11);
948 _sum11 = _mm256_fmadd_ps(_r15, _k12_1, _sum11);
949 _sum11 = _mm256_fmadd_ps(_r23, _k20_1, _sum11);
950 _sum11 = _mm256_fmadd_ps(_r24, _k21_1, _sum11);
951 _sum11 = _mm256_fmadd_ps(_r25, _k22_1, _sum11);
952
953 _mm256_storeu_ps(outptr0 + 8, _sum01);
954 _mm256_storeu_ps(outptr1 + 8, _sum11);
955
956 __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
957 __m256 _sum12 = _mm256_loadu_ps(outptr1 + 16);
958
959 __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
960 __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
961 __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
962 __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
963 __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
964 __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
965
966 _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
967 _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
968 _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
969 _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
970 _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
971 _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
972 _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
973 _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
974 _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
975
976 _sum12 = _mm256_fmadd_ps(_r05, _k00_1, _sum12);
977 _sum12 = _mm256_fmadd_ps(_r06, _k01_1, _sum12);
978 _sum12 = _mm256_fmadd_ps(_r07, _k02_1, _sum12);
979 _sum12 = _mm256_fmadd_ps(_r15, _k10_1, _sum12);
980 _sum12 = _mm256_fmadd_ps(_r16, _k11_1, _sum12);
981 _sum12 = _mm256_fmadd_ps(_r17, _k12_1, _sum12);
982 _sum12 = _mm256_fmadd_ps(_r25, _k20_1, _sum12);
983 _sum12 = _mm256_fmadd_ps(_r26, _k21_1, _sum12);
984 _sum12 = _mm256_fmadd_ps(_r27, _k22_1, _sum12);
985
986 _mm256_storeu_ps(outptr0 + 16, _sum02);
987 _mm256_storeu_ps(outptr1 + 16, _sum12);
988
989 __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
990 __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
991 __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
992 __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
993 __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
994 __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
995
996 __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
997 __m256 _sum13 = _mm256_loadu_ps(outptr1 + 24);
998
999 _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
1000 _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
1001 _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
1002 _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
1003 _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
1004 _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
1005 _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
1006 _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
1007 _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
1008
1009 _sum13 = _mm256_fmadd_ps(_r07, _k00_1, _sum13);
1010 _sum13 = _mm256_fmadd_ps(_r08, _k01_1, _sum13);
1011 _sum13 = _mm256_fmadd_ps(_r09, _k02_1, _sum13);
1012 _sum13 = _mm256_fmadd_ps(_r17, _k10_1, _sum13);
1013 _sum13 = _mm256_fmadd_ps(_r18, _k11_1, _sum13);
1014 _sum13 = _mm256_fmadd_ps(_r19, _k12_1, _sum13);
1015 _sum13 = _mm256_fmadd_ps(_r27, _k20_1, _sum13);
1016 _sum13 = _mm256_fmadd_ps(_r28, _k21_1, _sum13);
1017 _sum13 = _mm256_fmadd_ps(_r29, _k22_1, _sum13);
1018
1019 _mm256_storeu_ps(outptr0 + 24, _sum03);
1020 _mm256_storeu_ps(outptr1 + 24, _sum13);
1021 r0 += 8;
1022 r1 += 8;
1023 r2 += 8;
1024 outptr0 += 32;
1025 outptr1 += 32;
1026 }
1027
1028 for (; j + 1 < outw; j += 2)
1029 {
1030 __m256 _sum00 = _mm256_loadu_ps(outptr0);
1031 __m256 _sum10 = _mm256_loadu_ps(outptr1);
1032
1033 __m256 _r01 = _mm256_broadcast_ss(r0);
1034 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1035 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1036 __m256 _r11 = _mm256_broadcast_ss(r1);
1037 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1038 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1039 __m256 _r21 = _mm256_broadcast_ss(r2);
1040 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1041 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1042
1043 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1044 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1045 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1046 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1047 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1048 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1049 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1050 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1051 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1052
1053 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
1054 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
1055 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
1056 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
1057 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
1058 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
1059 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
1060 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
1061 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
1062
1063 _mm256_storeu_ps(outptr0, _sum00);
1064 _mm256_storeu_ps(outptr1, _sum10);
1065
1066 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1067 __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
1068
1069 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1070 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1071 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1072 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1073 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1074 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1075
1076 _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1077 _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1078 _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1079 _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1080 _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1081 _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1082 _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1083 _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1084 _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1085
1086 _sum11 = _mm256_fmadd_ps(_r03, _k00_1, _sum11);
1087 _sum11 = _mm256_fmadd_ps(_r04, _k01_1, _sum11);
1088 _sum11 = _mm256_fmadd_ps(_r05, _k02_1, _sum11);
1089 _sum11 = _mm256_fmadd_ps(_r13, _k10_1, _sum11);
1090 _sum11 = _mm256_fmadd_ps(_r14, _k11_1, _sum11);
1091 _sum11 = _mm256_fmadd_ps(_r15, _k12_1, _sum11);
1092 _sum11 = _mm256_fmadd_ps(_r23, _k20_1, _sum11);
1093 _sum11 = _mm256_fmadd_ps(_r24, _k21_1, _sum11);
1094 _sum11 = _mm256_fmadd_ps(_r25, _k22_1, _sum11);
1095
1096 _mm256_storeu_ps(outptr0 + 8, _sum01);
1097 _mm256_storeu_ps(outptr1 + 8, _sum11);
1098
1099 r0 += 4;
1100 r1 += 4;
1101 r2 += 4;
1102 outptr0 += 16;
1103 outptr1 += 16;
1104 }
1105 for (; j < outw; j++)
1106 {
1107 __m256 _sum00 = _mm256_loadu_ps(outptr0);
1108 __m256 _sum10 = _mm256_loadu_ps(outptr1);
1109
1110 __m256 _r01 = _mm256_broadcast_ss(r0);
1111 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1112 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1113 __m256 _r11 = _mm256_broadcast_ss(r1);
1114 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1115 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1116 __m256 _r21 = _mm256_broadcast_ss(r2);
1117 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1118 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1119
1120 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1121 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1122 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1123 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1124 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1125 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1126 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1127 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1128 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1129
1130 _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
1131 _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
1132 _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
1133 _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
1134 _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
1135 _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
1136 _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
1137 _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
1138 _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
1139
1140 _mm256_storeu_ps(outptr0, _sum00);
1141 _mm256_storeu_ps(outptr1, _sum10);
1142
1143 r0 += 2;
1144 r1 += 2;
1145 r2 += 2;
1146 outptr0 += 8;
1147 outptr1 += 8;
1148 }
1149 r0 += tailstep;
1150 r1 += tailstep;
1151 r2 += tailstep;
1152 }
1153
1154 k0 += 9 * 8;
1155 k1 += 9 * 8;
1156 }
1157 }
1158
1159 #pragma omp parallel for num_threads(opt.num_threads)
1160 for (int p = remain_outch_start; p < outch; p++)
1161 {
1162 Mat out0 = top_blob.channel(p);
1163
1164 __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
1165 out0.fill(_bias0);
1166
1167 const float* k0 = kernel.channel(p);
1168
1169 for (int q = 0; q < inch; q++)
1170 {
1171 float* outptr0 = out0.row(0);
1172
1173 const Mat img0 = bottom_blob.channel(q);
1174
1175 const float* r0 = img0.row(0);
1176 const float* r1 = img0.row(1);
1177 const float* r2 = img0.row(2);
1178
1179 __m256 _k00_0 = _mm256_loadu_ps(k0);
1180 __m256 _k01_0 = _mm256_loadu_ps(k0 + 8);
1181 __m256 _k02_0 = _mm256_loadu_ps(k0 + 16);
1182 __m256 _k10_0 = _mm256_loadu_ps(k0 + 24);
1183 __m256 _k11_0 = _mm256_loadu_ps(k0 + 32);
1184 __m256 _k12_0 = _mm256_loadu_ps(k0 + 40);
1185 __m256 _k20_0 = _mm256_loadu_ps(k0 + 48);
1186 __m256 _k21_0 = _mm256_loadu_ps(k0 + 56);
1187 __m256 _k22_0 = _mm256_loadu_ps(k0 + 64);
1188
1189 int i = 0;
1190
1191 for (; i < outh; i++)
1192 {
1193 int j = 0;
1194 for (; j + 7 < outw; j += 8)
1195 {
1196 __m256 _sum00 = _mm256_loadu_ps(outptr0);
1197
1198 __m256 _r01 = _mm256_broadcast_ss(r0);
1199 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1200 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1201 __m256 _r11 = _mm256_broadcast_ss(r1);
1202 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1203 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1204 __m256 _r21 = _mm256_broadcast_ss(r2);
1205 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1206 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1207
1208 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1209 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1210 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1211 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1212 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1213 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1214 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1215 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1216 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1217
1218 _mm256_storeu_ps(outptr0, _sum00);
1219
1220 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1221
1222 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1223 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1224 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1225 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1226 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1227 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1228
1229 _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1230 _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1231 _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1232 _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1233 _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1234 _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1235 _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1236 _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1237 _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1238
1239 _mm256_storeu_ps(outptr0 + 8, _sum01);
1240
1241 __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
1242
1243 __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
1244 __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
1245 __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
1246 __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
1247 __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
1248 __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
1249
1250 _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
1251 _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
1252 _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
1253 _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
1254 _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
1255 _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
1256 _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
1257 _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
1258 _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
1259
1260 _mm256_storeu_ps(outptr0 + 16, _sum02);
1261
1262 __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
1263 __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
1264 __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
1265 __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
1266 __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
1267 __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
1268
1269 __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
1270
1271 _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
1272 _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
1273 _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
1274 _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
1275 _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
1276 _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
1277 _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
1278 _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
1279 _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
1280
1281 _mm256_storeu_ps(outptr0 + 24, _sum03);
1282
1283 __m256 _r010 = _mm256_broadcast_ss(r0 + 9);
1284 __m256 _r110 = _mm256_broadcast_ss(r1 + 9);
1285 __m256 _r210 = _mm256_broadcast_ss(r2 + 9);
1286 __m256 _r011 = _mm256_broadcast_ss(r0 + 10);
1287 __m256 _r111 = _mm256_broadcast_ss(r1 + 10);
1288 __m256 _r211 = _mm256_broadcast_ss(r2 + 10);
1289
1290 __m256 _sum04 = _mm256_loadu_ps(outptr0 + 32);
1291
1292 _sum04 = _mm256_fmadd_ps(_r09, _k00_0, _sum04);
1293 _sum04 = _mm256_fmadd_ps(_r010, _k01_0, _sum04);
1294 _sum04 = _mm256_fmadd_ps(_r011, _k02_0, _sum04);
1295 _sum04 = _mm256_fmadd_ps(_r19, _k10_0, _sum04);
1296 _sum04 = _mm256_fmadd_ps(_r110, _k11_0, _sum04);
1297 _sum04 = _mm256_fmadd_ps(_r111, _k12_0, _sum04);
1298 _sum04 = _mm256_fmadd_ps(_r29, _k20_0, _sum04);
1299 _sum04 = _mm256_fmadd_ps(_r210, _k21_0, _sum04);
1300 _sum04 = _mm256_fmadd_ps(_r211, _k22_0, _sum04);
1301
1302 _mm256_storeu_ps(outptr0 + 32, _sum04);
1303
1304 __m256 _r012 = _mm256_broadcast_ss(r0 + 11);
1305 __m256 _r112 = _mm256_broadcast_ss(r1 + 11);
1306 __m256 _r212 = _mm256_broadcast_ss(r2 + 11);
1307 __m256 _r013 = _mm256_broadcast_ss(r0 + 12);
1308 __m256 _r113 = _mm256_broadcast_ss(r1 + 12);
1309 __m256 _r213 = _mm256_broadcast_ss(r2 + 12);
1310
1311 __m256 _sum05 = _mm256_loadu_ps(outptr0 + 40);
1312
1313 _sum05 = _mm256_fmadd_ps(_r011, _k00_0, _sum05);
1314 _sum05 = _mm256_fmadd_ps(_r012, _k01_0, _sum05);
1315 _sum05 = _mm256_fmadd_ps(_r013, _k02_0, _sum05);
1316 _sum05 = _mm256_fmadd_ps(_r111, _k10_0, _sum05);
1317 _sum05 = _mm256_fmadd_ps(_r112, _k11_0, _sum05);
1318 _sum05 = _mm256_fmadd_ps(_r113, _k12_0, _sum05);
1319 _sum05 = _mm256_fmadd_ps(_r211, _k20_0, _sum05);
1320 _sum05 = _mm256_fmadd_ps(_r212, _k21_0, _sum05);
1321 _sum05 = _mm256_fmadd_ps(_r213, _k22_0, _sum05);
1322
1323 _mm256_storeu_ps(outptr0 + 40, _sum05);
1324
1325 __m256 _r014 = _mm256_broadcast_ss(r0 + 13);
1326 __m256 _r114 = _mm256_broadcast_ss(r1 + 13);
1327 __m256 _r214 = _mm256_broadcast_ss(r2 + 13);
1328 __m256 _r015 = _mm256_broadcast_ss(r0 + 14);
1329 __m256 _r115 = _mm256_broadcast_ss(r1 + 14);
1330 __m256 _r215 = _mm256_broadcast_ss(r2 + 14);
1331
1332 __m256 _sum06 = _mm256_loadu_ps(outptr0 + 48);
1333
1334 _sum06 = _mm256_fmadd_ps(_r013, _k00_0, _sum06);
1335 _sum06 = _mm256_fmadd_ps(_r014, _k01_0, _sum06);
1336 _sum06 = _mm256_fmadd_ps(_r015, _k02_0, _sum06);
1337 _sum06 = _mm256_fmadd_ps(_r113, _k10_0, _sum06);
1338 _sum06 = _mm256_fmadd_ps(_r114, _k11_0, _sum06);
1339 _sum06 = _mm256_fmadd_ps(_r115, _k12_0, _sum06);
1340 _sum06 = _mm256_fmadd_ps(_r213, _k20_0, _sum06);
1341 _sum06 = _mm256_fmadd_ps(_r214, _k21_0, _sum06);
1342 _sum06 = _mm256_fmadd_ps(_r215, _k22_0, _sum06);
1343
1344 _mm256_storeu_ps(outptr0 + 48, _sum06);
1345
1346 __m256 _r016 = _mm256_broadcast_ss(r0 + 15);
1347 __m256 _r116 = _mm256_broadcast_ss(r1 + 15);
1348 __m256 _r216 = _mm256_broadcast_ss(r2 + 15);
1349 __m256 _r017 = _mm256_broadcast_ss(r0 + 16);
1350 __m256 _r117 = _mm256_broadcast_ss(r1 + 16);
1351 __m256 _r217 = _mm256_broadcast_ss(r2 + 16);
1352
1353 __m256 _sum07 = _mm256_loadu_ps(outptr0 + 56);
1354
1355 _sum07 = _mm256_fmadd_ps(_r015, _k00_0, _sum07);
1356 _sum07 = _mm256_fmadd_ps(_r016, _k01_0, _sum07);
1357 _sum07 = _mm256_fmadd_ps(_r017, _k02_0, _sum07);
1358 _sum07 = _mm256_fmadd_ps(_r115, _k10_0, _sum07);
1359 _sum07 = _mm256_fmadd_ps(_r116, _k11_0, _sum07);
1360 _sum07 = _mm256_fmadd_ps(_r117, _k12_0, _sum07);
1361 _sum07 = _mm256_fmadd_ps(_r215, _k20_0, _sum07);
1362 _sum07 = _mm256_fmadd_ps(_r216, _k21_0, _sum07);
1363 _sum07 = _mm256_fmadd_ps(_r217, _k22_0, _sum07);
1364
1365 _mm256_storeu_ps(outptr0 + 56, _sum07);
1366
1367 r0 += 16;
1368 r1 += 16;
1369 r2 += 16;
1370 outptr0 += 64;
1371 }
1372
1373 for (; j + 3 < outw; j += 4)
1374 {
1375 __m256 _sum00 = _mm256_loadu_ps(outptr0);
1376 __m256 _r01 = _mm256_broadcast_ss(r0);
1377 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1378 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1379 __m256 _r11 = _mm256_broadcast_ss(r1);
1380 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1381 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1382 __m256 _r21 = _mm256_broadcast_ss(r2);
1383 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1384 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1385
1386 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1387 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1388 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1389 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1390 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1391 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1392 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1393 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1394 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1395
1396 _mm256_storeu_ps(outptr0, _sum00);
1397
1398 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1399
1400 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1401 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1402 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1403 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1404 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1405 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1406
1407 _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1408 _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1409 _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1410 _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1411 _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1412 _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1413 _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1414 _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1415 _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1416
1417 _mm256_storeu_ps(outptr0 + 8, _sum01);
1418
1419 __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
1420
1421 __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
1422 __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
1423 __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
1424 __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
1425 __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
1426 __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
1427
1428 _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
1429 _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
1430 _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
1431 _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
1432 _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
1433 _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
1434 _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
1435 _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
1436 _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
1437
1438 _mm256_storeu_ps(outptr0 + 16, _sum02);
1439
1440 __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
1441 __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
1442 __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
1443 __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
1444 __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
1445 __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
1446
1447 __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
1448
1449 _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
1450 _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
1451 _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
1452 _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
1453 _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
1454 _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
1455 _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
1456 _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
1457 _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
1458
1459 _mm256_storeu_ps(outptr0 + 24, _sum03);
1460 r0 += 8;
1461 r1 += 8;
1462 r2 += 8;
1463 outptr0 += 32;
1464 }
1465
1466 for (; j + 1 < outw; j += 2)
1467 {
1468 __m256 _sum00 = _mm256_loadu_ps(outptr0);
1469
1470 __m256 _r01 = _mm256_broadcast_ss(r0);
1471 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1472 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1473 __m256 _r11 = _mm256_broadcast_ss(r1);
1474 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1475 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1476 __m256 _r21 = _mm256_broadcast_ss(r2);
1477 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1478 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1479
1480 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1481 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1482 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1483 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1484 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1485 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1486 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1487 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1488 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1489
1490 _mm256_storeu_ps(outptr0, _sum00);
1491
1492 __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1493
1494 __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1495 __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1496 __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1497 __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1498 __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1499 __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1500
1501 _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1502 _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1503 _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1504 _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1505 _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1506 _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1507 _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1508 _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1509 _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1510
1511 _mm256_storeu_ps(outptr0 + 8, _sum01);
1512
1513 r0 += 4;
1514 r1 += 4;
1515 r2 += 4;
1516 outptr0 += 16;
1517 }
1518 for (; j < outw; j++)
1519 {
1520 __m256 _sum00 = _mm256_loadu_ps(outptr0);
1521 __m256 _r01 = _mm256_broadcast_ss(r0);
1522 __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1523 __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1524 __m256 _r11 = _mm256_broadcast_ss(r1);
1525 __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1526 __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1527 __m256 _r21 = _mm256_broadcast_ss(r2);
1528 __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1529 __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1530
1531 _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1532 _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1533 _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1534 _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1535 _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1536 _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1537 _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1538 _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1539 _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1540 _mm256_storeu_ps(outptr0, _sum00);
1541
1542 r0 += 2;
1543 r1 += 2;
1544 r2 += 2;
1545 outptr0 += 8;
1546 }
1547 r0 += tailstep;
1548 r1 += tailstep;
1549 r2 += tailstep;
1550 }
1551
1552 k0 += 9 * 8;
1553 }
1554 }
1555 }
1556