1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv3x3s1_pack1to8_avx(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void conv3x3s1_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17     int inch = bottom_blob.c;
18     int outw = top_blob.w;
19     int outh = top_blob.h;
20     int outch = top_blob.c;
21     const float* bias = _bias;
22 
23     int nn_outch = outch >> 1;
24     int remain_outch_start = nn_outch << 1;
25 
26     #pragma omp parallel for num_threads(opt.num_threads)
27     for (int pp = 0; pp < nn_outch; pp++)
28     {
29         int p = pp * 2;
30 
31         Mat out0 = top_blob.channel(p);
32         Mat out1 = top_blob.channel(p + 1);
33 
34         __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
35         __m256 _bias1 = bias ? _mm256_loadu_ps((const float*)bias + (p + 1) * 8) : _mm256_set1_ps(0.f);
36         out0.fill(_bias0);
37         out1.fill(_bias1);
38 
39         const float* k0 = kernel.channel(p);
40         const float* k1 = kernel.channel(p + 1);
41 
42         for (int q = 0; q < inch; q++)
43         {
44             float* outptr0 = out0;
45             float* outptr1 = out1;
46 
47             const Mat img0 = bottom_blob.channel(q);
48 
49             const float* r0 = img0.row(0);
50             const float* r1 = img0.row(1);
51             const float* r2 = img0.row(2);
52 
53             __m256 _k00_0 = _mm256_loadu_ps(k0);
54             __m256 _k01_0 = _mm256_loadu_ps(k0 + 8);
55             __m256 _k02_0 = _mm256_loadu_ps(k0 + 16);
56             __m256 _k10_0 = _mm256_loadu_ps(k0 + 24);
57             __m256 _k11_0 = _mm256_loadu_ps(k0 + 32);
58             __m256 _k12_0 = _mm256_loadu_ps(k0 + 40);
59             __m256 _k20_0 = _mm256_loadu_ps(k0 + 48);
60             __m256 _k21_0 = _mm256_loadu_ps(k0 + 56);
61             __m256 _k22_0 = _mm256_loadu_ps(k0 + 64);
62 
63             __m256 _k00_1 = _mm256_loadu_ps(k1);
64             __m256 _k01_1 = _mm256_loadu_ps(k1 + 8);
65             __m256 _k02_1 = _mm256_loadu_ps(k1 + 16);
66             __m256 _k10_1 = _mm256_loadu_ps(k1 + 24);
67             __m256 _k11_1 = _mm256_loadu_ps(k1 + 32);
68             __m256 _k12_1 = _mm256_loadu_ps(k1 + 40);
69             __m256 _k20_1 = _mm256_loadu_ps(k1 + 48);
70             __m256 _k21_1 = _mm256_loadu_ps(k1 + 56);
71             __m256 _k22_1 = _mm256_loadu_ps(k1 + 64);
72 
73             int i = 0;
74 
75             for (; i < outh; i++)
76             {
77                 int j = 0;
78                 for (; j + 3 < outw; j += 4)
79                 {
80                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
81                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
82 
83                     __m256 _r01 = _mm256_broadcast_ss(r0);
84                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
85                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
86                     __m256 _r11 = _mm256_broadcast_ss(r1);
87                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
88                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
89                     __m256 _r21 = _mm256_broadcast_ss(r2);
90                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
91                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
92 
93                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
94                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
95                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
96                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
97                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
98                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
99                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
100                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
101                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
102 
103                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
104                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
105                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
106                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
107                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
108                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
109                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
110                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
111                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
112 
113                     _mm256_storeu_ps(outptr0, _sum00);
114                     _mm256_storeu_ps(outptr1, _sum10);
115 
116                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
117                     __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
118 
119                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
120                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
121                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
122 
123                     _sum01 = _mm256_fmadd_ps(_r02, _k00_0, _sum01);
124                     _sum01 = _mm256_fmadd_ps(_r03, _k01_0, _sum01);
125                     _sum01 = _mm256_fmadd_ps(_r04, _k02_0, _sum01);
126                     _sum01 = _mm256_fmadd_ps(_r12, _k10_0, _sum01);
127                     _sum01 = _mm256_fmadd_ps(_r13, _k11_0, _sum01);
128                     _sum01 = _mm256_fmadd_ps(_r14, _k12_0, _sum01);
129                     _sum01 = _mm256_fmadd_ps(_r22, _k20_0, _sum01);
130                     _sum01 = _mm256_fmadd_ps(_r23, _k21_0, _sum01);
131                     _sum01 = _mm256_fmadd_ps(_r24, _k22_0, _sum01);
132 
133                     _sum11 = _mm256_fmadd_ps(_r02, _k00_1, _sum11);
134                     _sum11 = _mm256_fmadd_ps(_r03, _k01_1, _sum11);
135                     _sum11 = _mm256_fmadd_ps(_r04, _k02_1, _sum11);
136                     _sum11 = _mm256_fmadd_ps(_r12, _k10_1, _sum11);
137                     _sum11 = _mm256_fmadd_ps(_r13, _k11_1, _sum11);
138                     _sum11 = _mm256_fmadd_ps(_r14, _k12_1, _sum11);
139                     _sum11 = _mm256_fmadd_ps(_r22, _k20_1, _sum11);
140                     _sum11 = _mm256_fmadd_ps(_r23, _k21_1, _sum11);
141                     _sum11 = _mm256_fmadd_ps(_r24, _k22_1, _sum11);
142 
143                     _mm256_storeu_ps(outptr0 + 8, _sum01);
144                     _mm256_storeu_ps(outptr1 + 8, _sum11);
145 
146                     __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
147                     __m256 _sum12 = _mm256_loadu_ps(outptr1 + 16);
148 
149                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
150                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
151                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
152 
153                     _sum02 = _mm256_fmadd_ps(_r03, _k00_0, _sum02);
154                     _sum02 = _mm256_fmadd_ps(_r04, _k01_0, _sum02);
155                     _sum02 = _mm256_fmadd_ps(_r05, _k02_0, _sum02);
156                     _sum02 = _mm256_fmadd_ps(_r13, _k10_0, _sum02);
157                     _sum02 = _mm256_fmadd_ps(_r14, _k11_0, _sum02);
158                     _sum02 = _mm256_fmadd_ps(_r15, _k12_0, _sum02);
159                     _sum02 = _mm256_fmadd_ps(_r23, _k20_0, _sum02);
160                     _sum02 = _mm256_fmadd_ps(_r24, _k21_0, _sum02);
161                     _sum02 = _mm256_fmadd_ps(_r25, _k22_0, _sum02);
162 
163                     _sum12 = _mm256_fmadd_ps(_r03, _k00_1, _sum12);
164                     _sum12 = _mm256_fmadd_ps(_r04, _k01_1, _sum12);
165                     _sum12 = _mm256_fmadd_ps(_r05, _k02_1, _sum12);
166                     _sum12 = _mm256_fmadd_ps(_r13, _k10_1, _sum12);
167                     _sum12 = _mm256_fmadd_ps(_r14, _k11_1, _sum12);
168                     _sum12 = _mm256_fmadd_ps(_r15, _k12_1, _sum12);
169                     _sum12 = _mm256_fmadd_ps(_r23, _k20_1, _sum12);
170                     _sum12 = _mm256_fmadd_ps(_r24, _k21_1, _sum12);
171                     _sum12 = _mm256_fmadd_ps(_r25, _k22_1, _sum12);
172 
173                     _mm256_storeu_ps(outptr0 + 16, _sum02);
174                     _mm256_storeu_ps(outptr1 + 16, _sum12);
175 
176                     __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
177                     __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
178                     __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
179 
180                     __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
181                     __m256 _sum13 = _mm256_loadu_ps(outptr1 + 24);
182 
183                     _sum03 = _mm256_fmadd_ps(_r04, _k00_0, _sum03);
184                     _sum03 = _mm256_fmadd_ps(_r05, _k01_0, _sum03);
185                     _sum03 = _mm256_fmadd_ps(_r06, _k02_0, _sum03);
186                     _sum03 = _mm256_fmadd_ps(_r14, _k10_0, _sum03);
187                     _sum03 = _mm256_fmadd_ps(_r15, _k11_0, _sum03);
188                     _sum03 = _mm256_fmadd_ps(_r16, _k12_0, _sum03);
189                     _sum03 = _mm256_fmadd_ps(_r24, _k20_0, _sum03);
190                     _sum03 = _mm256_fmadd_ps(_r25, _k21_0, _sum03);
191                     _sum03 = _mm256_fmadd_ps(_r26, _k22_0, _sum03);
192 
193                     _sum13 = _mm256_fmadd_ps(_r04, _k00_1, _sum13);
194                     _sum13 = _mm256_fmadd_ps(_r05, _k01_1, _sum13);
195                     _sum13 = _mm256_fmadd_ps(_r06, _k02_1, _sum13);
196                     _sum13 = _mm256_fmadd_ps(_r14, _k10_1, _sum13);
197                     _sum13 = _mm256_fmadd_ps(_r15, _k11_1, _sum13);
198                     _sum13 = _mm256_fmadd_ps(_r16, _k12_1, _sum13);
199                     _sum13 = _mm256_fmadd_ps(_r24, _k20_1, _sum13);
200                     _sum13 = _mm256_fmadd_ps(_r25, _k21_1, _sum13);
201                     _sum13 = _mm256_fmadd_ps(_r26, _k22_1, _sum13);
202 
203                     _mm256_storeu_ps(outptr0 + 24, _sum03);
204                     _mm256_storeu_ps(outptr1 + 24, _sum13);
205 
206                     r0 += 4;
207                     r1 += 4;
208                     r2 += 4;
209                     outptr0 += 32;
210                     outptr1 += 32;
211                 }
212 
213                 for (; j + 1 < outw; j += 2)
214                 {
215                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
216                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
217 
218                     __m256 _r01 = _mm256_broadcast_ss(r0);
219                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
220                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
221                     __m256 _r11 = _mm256_broadcast_ss(r1);
222                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
223                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
224                     __m256 _r21 = _mm256_broadcast_ss(r2);
225                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
226                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
227 
228                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
229                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
230                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
231                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
232                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
233                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
234                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
235                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
236                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
237 
238                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
239                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
240                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
241                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
242                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
243                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
244                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
245                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
246                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
247 
248                     _mm256_storeu_ps(outptr0, _sum00);
249                     _mm256_storeu_ps(outptr1, _sum10);
250 
251                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
252                     __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
253 
254                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
255                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
256                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
257 
258                     _sum01 = _mm256_fmadd_ps(_r02, _k00_0, _sum01);
259                     _sum01 = _mm256_fmadd_ps(_r03, _k01_0, _sum01);
260                     _sum01 = _mm256_fmadd_ps(_r04, _k02_0, _sum01);
261                     _sum01 = _mm256_fmadd_ps(_r12, _k10_0, _sum01);
262                     _sum01 = _mm256_fmadd_ps(_r13, _k11_0, _sum01);
263                     _sum01 = _mm256_fmadd_ps(_r14, _k12_0, _sum01);
264                     _sum01 = _mm256_fmadd_ps(_r22, _k20_0, _sum01);
265                     _sum01 = _mm256_fmadd_ps(_r23, _k21_0, _sum01);
266                     _sum01 = _mm256_fmadd_ps(_r24, _k22_0, _sum01);
267 
268                     _sum11 = _mm256_fmadd_ps(_r02, _k00_1, _sum11);
269                     _sum11 = _mm256_fmadd_ps(_r03, _k01_1, _sum11);
270                     _sum11 = _mm256_fmadd_ps(_r04, _k02_1, _sum11);
271                     _sum11 = _mm256_fmadd_ps(_r12, _k10_1, _sum11);
272                     _sum11 = _mm256_fmadd_ps(_r13, _k11_1, _sum11);
273                     _sum11 = _mm256_fmadd_ps(_r14, _k12_1, _sum11);
274                     _sum11 = _mm256_fmadd_ps(_r22, _k20_1, _sum11);
275                     _sum11 = _mm256_fmadd_ps(_r23, _k21_1, _sum11);
276                     _sum11 = _mm256_fmadd_ps(_r24, _k22_1, _sum11);
277 
278                     _mm256_storeu_ps(outptr0 + 8, _sum01);
279                     _mm256_storeu_ps(outptr1 + 8, _sum11);
280 
281                     r0 += 2;
282                     r1 += 2;
283                     r2 += 2;
284                     outptr0 += 16;
285                     outptr1 += 16;
286                 }
287 
288                 for (; j < outw; j++)
289                 {
290                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
291                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
292 
293                     __m256 _r01 = _mm256_broadcast_ss(r0);
294                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
295                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
296                     __m256 _r11 = _mm256_broadcast_ss(r1);
297                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
298                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
299                     __m256 _r21 = _mm256_broadcast_ss(r2);
300                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
301                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
302 
303                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
304                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
305                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
306                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
307                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
308                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
309                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
310                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
311                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
312 
313                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
314                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
315                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
316                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
317                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
318                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
319                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
320                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
321                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
322 
323                     _mm256_storeu_ps(outptr0, _sum00);
324                     _mm256_storeu_ps(outptr1, _sum10);
325 
326                     r0 += 1;
327                     r1 += 1;
328                     r2 += 1;
329                     outptr0 += 8;
330                     outptr1 += 8;
331                 }
332 
333                 r0 += 2;
334                 r1 += 2;
335                 r2 += 2;
336             }
337 
338             k0 += 9 * 8;
339             k1 += 9 * 8;
340         }
341     }
342 
343     #pragma omp parallel for num_threads(opt.num_threads)
344     for (int p = remain_outch_start; p < outch; p++)
345     {
346         Mat out0 = top_blob.channel(p);
347 
348         __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
349         out0.fill(_bias0);
350 
351         const float* k0 = kernel.channel(p);
352 
353         for (int q = 0; q < inch; q++)
354         {
355             float* outptr0 = out0.row(0);
356 
357             const Mat img0 = bottom_blob.channel(q);
358 
359             const float* r0 = img0.row(0);
360             const float* r1 = img0.row(1);
361             const float* r2 = img0.row(2);
362 
363             __m256 _k00 = _mm256_loadu_ps(k0);
364             __m256 _k01 = _mm256_loadu_ps(k0 + 8);
365             __m256 _k02 = _mm256_loadu_ps(k0 + 16);
366             __m256 _k10 = _mm256_loadu_ps(k0 + 24);
367             __m256 _k11 = _mm256_loadu_ps(k0 + 32);
368             __m256 _k12 = _mm256_loadu_ps(k0 + 40);
369             __m256 _k20 = _mm256_loadu_ps(k0 + 48);
370             __m256 _k21 = _mm256_loadu_ps(k0 + 56);
371             __m256 _k22 = _mm256_loadu_ps(k0 + 64);
372 
373             int i = 0;
374 
375             for (; i < outh; i++)
376             {
377                 int j = 0;
378                 for (; j + 3 < outw; j += 4)
379                 {
380                     __m256 _sum0 = _mm256_loadu_ps(outptr0);
381 
382                     __m256 _r01 = _mm256_broadcast_ss(r0);
383                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
384                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
385                     __m256 _r11 = _mm256_broadcast_ss(r1);
386                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
387                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
388                     __m256 _r21 = _mm256_broadcast_ss(r2);
389                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
390                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
391 
392                     _sum0 = _mm256_fmadd_ps(_r01, _k00, _sum0);
393                     _sum0 = _mm256_fmadd_ps(_r02, _k01, _sum0);
394                     _sum0 = _mm256_fmadd_ps(_r03, _k02, _sum0);
395                     _sum0 = _mm256_fmadd_ps(_r11, _k10, _sum0);
396                     _sum0 = _mm256_fmadd_ps(_r12, _k11, _sum0);
397                     _sum0 = _mm256_fmadd_ps(_r13, _k12, _sum0);
398                     _sum0 = _mm256_fmadd_ps(_r21, _k20, _sum0);
399                     _sum0 = _mm256_fmadd_ps(_r22, _k21, _sum0);
400                     _sum0 = _mm256_fmadd_ps(_r23, _k22, _sum0);
401 
402                     __m256 _sum1 = _mm256_loadu_ps(outptr0 + 8);
403                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
404                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
405                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
406                     _mm256_storeu_ps(outptr0, _sum0);
407 
408                     _sum1 = _mm256_fmadd_ps(_r02, _k00, _sum1);
409                     _sum1 = _mm256_fmadd_ps(_r03, _k01, _sum1);
410                     _sum1 = _mm256_fmadd_ps(_r04, _k02, _sum1);
411                     _sum1 = _mm256_fmadd_ps(_r12, _k10, _sum1);
412                     _sum1 = _mm256_fmadd_ps(_r13, _k11, _sum1);
413                     _sum1 = _mm256_fmadd_ps(_r14, _k12, _sum1);
414                     _sum1 = _mm256_fmadd_ps(_r22, _k20, _sum1);
415                     _sum1 = _mm256_fmadd_ps(_r23, _k21, _sum1);
416                     _sum1 = _mm256_fmadd_ps(_r24, _k22, _sum1);
417 
418                     __m256 _sum2 = _mm256_loadu_ps(outptr0 + 16);
419                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
420                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
421                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
422                     _mm256_storeu_ps(outptr0 + 8, _sum1);
423 
424                     _sum2 = _mm256_fmadd_ps(_r03, _k00, _sum2);
425                     _sum2 = _mm256_fmadd_ps(_r04, _k01, _sum2);
426                     _sum2 = _mm256_fmadd_ps(_r05, _k02, _sum2);
427                     _sum2 = _mm256_fmadd_ps(_r13, _k10, _sum2);
428                     _sum2 = _mm256_fmadd_ps(_r14, _k11, _sum2);
429                     _sum2 = _mm256_fmadd_ps(_r15, _k12, _sum2);
430                     _sum2 = _mm256_fmadd_ps(_r23, _k20, _sum2);
431                     _sum2 = _mm256_fmadd_ps(_r24, _k21, _sum2);
432                     _sum2 = _mm256_fmadd_ps(_r25, _k22, _sum2);
433 
434                     __m256 _sum3 = _mm256_loadu_ps(outptr0 + 24);
435                     __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
436                     __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
437                     __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
438                     _mm256_storeu_ps(outptr0 + 16, _sum2);
439 
440                     _sum3 = _mm256_fmadd_ps(_r04, _k00, _sum3);
441                     _sum3 = _mm256_fmadd_ps(_r05, _k01, _sum3);
442                     _sum3 = _mm256_fmadd_ps(_r06, _k02, _sum3);
443                     _sum3 = _mm256_fmadd_ps(_r14, _k10, _sum3);
444                     _sum3 = _mm256_fmadd_ps(_r15, _k11, _sum3);
445                     _sum3 = _mm256_fmadd_ps(_r16, _k12, _sum3);
446                     _sum3 = _mm256_fmadd_ps(_r24, _k20, _sum3);
447                     _sum3 = _mm256_fmadd_ps(_r25, _k21, _sum3);
448                     _sum3 = _mm256_fmadd_ps(_r26, _k22, _sum3);
449 
450                     _mm256_storeu_ps(outptr0 + 24, _sum3);
451 
452                     r0 += 4;
453                     r1 += 4;
454                     r2 += 4;
455                     outptr0 += 32;
456                 }
457                 for (; j + 1 < outw; j += 2)
458                 {
459                     __m256 _sum0 = _mm256_loadu_ps(outptr0);
460 
461                     __m256 _r01 = _mm256_broadcast_ss(r0);
462                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
463                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
464                     __m256 _r11 = _mm256_broadcast_ss(r1);
465                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
466                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
467                     __m256 _r21 = _mm256_broadcast_ss(r2);
468                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
469                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
470 
471                     _sum0 = _mm256_fmadd_ps(_r01, _k00, _sum0);
472                     _sum0 = _mm256_fmadd_ps(_r02, _k01, _sum0);
473                     _sum0 = _mm256_fmadd_ps(_r03, _k02, _sum0);
474                     _sum0 = _mm256_fmadd_ps(_r11, _k10, _sum0);
475                     _sum0 = _mm256_fmadd_ps(_r12, _k11, _sum0);
476                     _sum0 = _mm256_fmadd_ps(_r13, _k12, _sum0);
477                     _sum0 = _mm256_fmadd_ps(_r21, _k20, _sum0);
478                     _sum0 = _mm256_fmadd_ps(_r22, _k21, _sum0);
479                     _sum0 = _mm256_fmadd_ps(_r23, _k22, _sum0);
480 
481                     __m256 _sum1 = _mm256_loadu_ps(outptr0 + 8);
482                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
483                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
484                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
485                     _mm256_storeu_ps(outptr0, _sum0);
486 
487                     _sum1 = _mm256_fmadd_ps(_r02, _k00, _sum1);
488                     _sum1 = _mm256_fmadd_ps(_r03, _k01, _sum1);
489                     _sum1 = _mm256_fmadd_ps(_r04, _k02, _sum1);
490                     _sum1 = _mm256_fmadd_ps(_r12, _k10, _sum1);
491                     _sum1 = _mm256_fmadd_ps(_r13, _k11, _sum1);
492                     _sum1 = _mm256_fmadd_ps(_r14, _k12, _sum1);
493                     _sum1 = _mm256_fmadd_ps(_r22, _k20, _sum1);
494                     _sum1 = _mm256_fmadd_ps(_r23, _k21, _sum1);
495                     _sum1 = _mm256_fmadd_ps(_r24, _k22, _sum1);
496 
497                     _mm256_storeu_ps(outptr0 + 8, _sum1);
498 
499                     r0 += 2;
500                     r1 += 2;
501                     r2 += 2;
502                     outptr0 += 16;
503                 }
504                 for (; j < outw; j++)
505                 {
506                     __m256 _sum0 = _mm256_loadu_ps(outptr0);
507 
508                     __m256 _r01 = _mm256_broadcast_ss(r0);
509                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
510                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
511                     __m256 _r11 = _mm256_broadcast_ss(r1);
512                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
513                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
514                     __m256 _r21 = _mm256_broadcast_ss(r2);
515                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
516                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
517 
518                     _sum0 = _mm256_fmadd_ps(_r01, _k00, _sum0);
519                     _sum0 = _mm256_fmadd_ps(_r02, _k01, _sum0);
520                     _sum0 = _mm256_fmadd_ps(_r03, _k02, _sum0);
521                     _sum0 = _mm256_fmadd_ps(_r11, _k10, _sum0);
522                     _sum0 = _mm256_fmadd_ps(_r12, _k11, _sum0);
523                     _sum0 = _mm256_fmadd_ps(_r13, _k12, _sum0);
524                     _sum0 = _mm256_fmadd_ps(_r21, _k20, _sum0);
525                     _sum0 = _mm256_fmadd_ps(_r22, _k21, _sum0);
526                     _sum0 = _mm256_fmadd_ps(_r23, _k22, _sum0);
527 
528                     _mm256_storeu_ps(outptr0, _sum0);
529                     r0 += 1;
530                     r1 += 1;
531                     r2 += 1;
532                     outptr0 += 8;
533                 }
534 
535                 r0 += 2;
536                 r1 += 2;
537                 r2 += 2;
538             }
539 
540             k0 += 9 * 8;
541         }
542     }
543 }
544 
conv3x3s2_pack1to8_avx(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)545 static void conv3x3s2_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
546 {
547     int w = bottom_blob.w;
548     int inch = bottom_blob.c;
549     int outw = top_blob.w;
550     int outh = top_blob.h;
551     int outch = top_blob.c;
552 
553     const int tailstep = w - 2 * outw + w;
554 
555     const float* bias = _bias;
556 
557     int nn_outch = outch >> 1;
558     int remain_outch_start = nn_outch << 1;
559 
560     #pragma omp parallel for num_threads(opt.num_threads)
561     for (int pp = 0; pp < nn_outch; pp++)
562     {
563         int p = pp * 2;
564 
565         Mat out0 = top_blob.channel(p);
566         Mat out1 = top_blob.channel(p + 1);
567 
568         __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
569         __m256 _bias1 = bias ? _mm256_loadu_ps((const float*)bias + (p + 1) * 8) : _mm256_set1_ps(0.f);
570         out0.fill(_bias0);
571         out1.fill(_bias1);
572 
573         const float* k0 = kernel.channel(p);
574         const float* k1 = kernel.channel(p + 1);
575 
576         for (int q = 0; q < inch; q++)
577         {
578             float* outptr0 = out0;
579             float* outptr1 = out1;
580 
581             const Mat img0 = bottom_blob.channel(q);
582 
583             const float* r0 = img0.row(0);
584             const float* r1 = img0.row(1);
585             const float* r2 = img0.row(2);
586 
587             __m256 _k00_0 = _mm256_loadu_ps(k0);
588             __m256 _k01_0 = _mm256_loadu_ps(k0 + 8);
589             __m256 _k02_0 = _mm256_loadu_ps(k0 + 16);
590             __m256 _k10_0 = _mm256_loadu_ps(k0 + 24);
591             __m256 _k11_0 = _mm256_loadu_ps(k0 + 32);
592             __m256 _k12_0 = _mm256_loadu_ps(k0 + 40);
593             __m256 _k20_0 = _mm256_loadu_ps(k0 + 48);
594             __m256 _k21_0 = _mm256_loadu_ps(k0 + 56);
595             __m256 _k22_0 = _mm256_loadu_ps(k0 + 64);
596 
597             __m256 _k00_1 = _mm256_loadu_ps(k1);
598             __m256 _k01_1 = _mm256_loadu_ps(k1 + 8);
599             __m256 _k02_1 = _mm256_loadu_ps(k1 + 16);
600             __m256 _k10_1 = _mm256_loadu_ps(k1 + 24);
601             __m256 _k11_1 = _mm256_loadu_ps(k1 + 32);
602             __m256 _k12_1 = _mm256_loadu_ps(k1 + 40);
603             __m256 _k20_1 = _mm256_loadu_ps(k1 + 48);
604             __m256 _k21_1 = _mm256_loadu_ps(k1 + 56);
605             __m256 _k22_1 = _mm256_loadu_ps(k1 + 64);
606 
607             int i = 0;
608 
609             for (; i < outh; i++)
610             {
611                 int j = 0;
612                 for (; j + 7 < outw; j += 8)
613                 {
614                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
615                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
616 
617                     __m256 _r01 = _mm256_broadcast_ss(r0);
618                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
619                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
620                     __m256 _r11 = _mm256_broadcast_ss(r1);
621                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
622                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
623                     __m256 _r21 = _mm256_broadcast_ss(r2);
624                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
625                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
626 
627                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
628                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
629                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
630                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
631                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
632                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
633                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
634                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
635                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
636 
637                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
638                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
639                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
640                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
641                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
642                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
643                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
644                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
645                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
646 
647                     _mm256_storeu_ps(outptr0, _sum00);
648                     _mm256_storeu_ps(outptr1, _sum10);
649 
650                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
651                     __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
652 
653                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
654                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
655                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
656                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
657                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
658                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
659 
660                     _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
661                     _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
662                     _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
663                     _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
664                     _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
665                     _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
666                     _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
667                     _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
668                     _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
669 
670                     _sum11 = _mm256_fmadd_ps(_r03, _k00_1, _sum11);
671                     _sum11 = _mm256_fmadd_ps(_r04, _k01_1, _sum11);
672                     _sum11 = _mm256_fmadd_ps(_r05, _k02_1, _sum11);
673                     _sum11 = _mm256_fmadd_ps(_r13, _k10_1, _sum11);
674                     _sum11 = _mm256_fmadd_ps(_r14, _k11_1, _sum11);
675                     _sum11 = _mm256_fmadd_ps(_r15, _k12_1, _sum11);
676                     _sum11 = _mm256_fmadd_ps(_r23, _k20_1, _sum11);
677                     _sum11 = _mm256_fmadd_ps(_r24, _k21_1, _sum11);
678                     _sum11 = _mm256_fmadd_ps(_r25, _k22_1, _sum11);
679 
680                     _mm256_storeu_ps(outptr0 + 8, _sum01);
681                     _mm256_storeu_ps(outptr1 + 8, _sum11);
682 
683                     __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
684                     __m256 _sum12 = _mm256_loadu_ps(outptr1 + 16);
685 
686                     __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
687                     __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
688                     __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
689                     __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
690                     __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
691                     __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
692 
693                     _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
694                     _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
695                     _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
696                     _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
697                     _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
698                     _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
699                     _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
700                     _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
701                     _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
702 
703                     _sum12 = _mm256_fmadd_ps(_r05, _k00_1, _sum12);
704                     _sum12 = _mm256_fmadd_ps(_r06, _k01_1, _sum12);
705                     _sum12 = _mm256_fmadd_ps(_r07, _k02_1, _sum12);
706                     _sum12 = _mm256_fmadd_ps(_r15, _k10_1, _sum12);
707                     _sum12 = _mm256_fmadd_ps(_r16, _k11_1, _sum12);
708                     _sum12 = _mm256_fmadd_ps(_r17, _k12_1, _sum12);
709                     _sum12 = _mm256_fmadd_ps(_r25, _k20_1, _sum12);
710                     _sum12 = _mm256_fmadd_ps(_r26, _k21_1, _sum12);
711                     _sum12 = _mm256_fmadd_ps(_r27, _k22_1, _sum12);
712 
713                     _mm256_storeu_ps(outptr0 + 16, _sum02);
714                     _mm256_storeu_ps(outptr1 + 16, _sum12);
715 
716                     __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
717                     __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
718                     __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
719                     __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
720                     __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
721                     __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
722 
723                     __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
724                     __m256 _sum13 = _mm256_loadu_ps(outptr1 + 24);
725 
726                     _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
727                     _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
728                     _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
729                     _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
730                     _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
731                     _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
732                     _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
733                     _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
734                     _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
735 
736                     _sum13 = _mm256_fmadd_ps(_r07, _k00_1, _sum13);
737                     _sum13 = _mm256_fmadd_ps(_r08, _k01_1, _sum13);
738                     _sum13 = _mm256_fmadd_ps(_r09, _k02_1, _sum13);
739                     _sum13 = _mm256_fmadd_ps(_r17, _k10_1, _sum13);
740                     _sum13 = _mm256_fmadd_ps(_r18, _k11_1, _sum13);
741                     _sum13 = _mm256_fmadd_ps(_r19, _k12_1, _sum13);
742                     _sum13 = _mm256_fmadd_ps(_r27, _k20_1, _sum13);
743                     _sum13 = _mm256_fmadd_ps(_r28, _k21_1, _sum13);
744                     _sum13 = _mm256_fmadd_ps(_r29, _k22_1, _sum13);
745 
746                     _mm256_storeu_ps(outptr0 + 24, _sum03);
747                     _mm256_storeu_ps(outptr1 + 24, _sum13);
748 
749                     __m256 _r010 = _mm256_broadcast_ss(r0 + 9);
750                     __m256 _r110 = _mm256_broadcast_ss(r1 + 9);
751                     __m256 _r210 = _mm256_broadcast_ss(r2 + 9);
752                     __m256 _r011 = _mm256_broadcast_ss(r0 + 10);
753                     __m256 _r111 = _mm256_broadcast_ss(r1 + 10);
754                     __m256 _r211 = _mm256_broadcast_ss(r2 + 10);
755 
756                     __m256 _sum04 = _mm256_loadu_ps(outptr0 + 32);
757                     __m256 _sum14 = _mm256_loadu_ps(outptr1 + 32);
758 
759                     _sum04 = _mm256_fmadd_ps(_r09, _k00_0, _sum04);
760                     _sum04 = _mm256_fmadd_ps(_r010, _k01_0, _sum04);
761                     _sum04 = _mm256_fmadd_ps(_r011, _k02_0, _sum04);
762                     _sum04 = _mm256_fmadd_ps(_r19, _k10_0, _sum04);
763                     _sum04 = _mm256_fmadd_ps(_r110, _k11_0, _sum04);
764                     _sum04 = _mm256_fmadd_ps(_r111, _k12_0, _sum04);
765                     _sum04 = _mm256_fmadd_ps(_r29, _k20_0, _sum04);
766                     _sum04 = _mm256_fmadd_ps(_r210, _k21_0, _sum04);
767                     _sum04 = _mm256_fmadd_ps(_r211, _k22_0, _sum04);
768 
769                     _sum14 = _mm256_fmadd_ps(_r09, _k00_1, _sum14);
770                     _sum14 = _mm256_fmadd_ps(_r010, _k01_1, _sum14);
771                     _sum14 = _mm256_fmadd_ps(_r011, _k02_1, _sum14);
772                     _sum14 = _mm256_fmadd_ps(_r19, _k10_1, _sum14);
773                     _sum14 = _mm256_fmadd_ps(_r110, _k11_1, _sum14);
774                     _sum14 = _mm256_fmadd_ps(_r111, _k12_1, _sum14);
775                     _sum14 = _mm256_fmadd_ps(_r29, _k20_1, _sum14);
776                     _sum14 = _mm256_fmadd_ps(_r210, _k21_1, _sum14);
777                     _sum14 = _mm256_fmadd_ps(_r211, _k22_1, _sum14);
778 
779                     _mm256_storeu_ps(outptr0 + 32, _sum04);
780                     _mm256_storeu_ps(outptr1 + 32, _sum14);
781 
782                     __m256 _r012 = _mm256_broadcast_ss(r0 + 11);
783                     __m256 _r112 = _mm256_broadcast_ss(r1 + 11);
784                     __m256 _r212 = _mm256_broadcast_ss(r2 + 11);
785                     __m256 _r013 = _mm256_broadcast_ss(r0 + 12);
786                     __m256 _r113 = _mm256_broadcast_ss(r1 + 12);
787                     __m256 _r213 = _mm256_broadcast_ss(r2 + 12);
788 
789                     __m256 _sum05 = _mm256_loadu_ps(outptr0 + 40);
790                     __m256 _sum15 = _mm256_loadu_ps(outptr1 + 40);
791 
792                     _sum05 = _mm256_fmadd_ps(_r011, _k00_0, _sum05);
793                     _sum05 = _mm256_fmadd_ps(_r012, _k01_0, _sum05);
794                     _sum05 = _mm256_fmadd_ps(_r013, _k02_0, _sum05);
795                     _sum05 = _mm256_fmadd_ps(_r111, _k10_0, _sum05);
796                     _sum05 = _mm256_fmadd_ps(_r112, _k11_0, _sum05);
797                     _sum05 = _mm256_fmadd_ps(_r113, _k12_0, _sum05);
798                     _sum05 = _mm256_fmadd_ps(_r211, _k20_0, _sum05);
799                     _sum05 = _mm256_fmadd_ps(_r212, _k21_0, _sum05);
800                     _sum05 = _mm256_fmadd_ps(_r213, _k22_0, _sum05);
801                     _sum15 = _mm256_fmadd_ps(_r011, _k00_1, _sum15);
802                     _sum15 = _mm256_fmadd_ps(_r012, _k01_1, _sum15);
803                     _sum15 = _mm256_fmadd_ps(_r013, _k02_1, _sum15);
804                     _sum15 = _mm256_fmadd_ps(_r111, _k10_1, _sum15);
805                     _sum15 = _mm256_fmadd_ps(_r112, _k11_1, _sum15);
806                     _sum15 = _mm256_fmadd_ps(_r113, _k12_1, _sum15);
807                     _sum15 = _mm256_fmadd_ps(_r211, _k20_1, _sum15);
808                     _sum15 = _mm256_fmadd_ps(_r212, _k21_1, _sum15);
809                     _sum15 = _mm256_fmadd_ps(_r213, _k22_1, _sum15);
810 
811                     _mm256_storeu_ps(outptr0 + 40, _sum05);
812                     _mm256_storeu_ps(outptr1 + 40, _sum15);
813 
814                     __m256 _r014 = _mm256_broadcast_ss(r0 + 13);
815                     __m256 _r114 = _mm256_broadcast_ss(r1 + 13);
816                     __m256 _r214 = _mm256_broadcast_ss(r2 + 13);
817                     __m256 _r015 = _mm256_broadcast_ss(r0 + 14);
818                     __m256 _r115 = _mm256_broadcast_ss(r1 + 14);
819                     __m256 _r215 = _mm256_broadcast_ss(r2 + 14);
820 
821                     __m256 _sum06 = _mm256_loadu_ps(outptr0 + 48);
822                     __m256 _sum16 = _mm256_loadu_ps(outptr1 + 48);
823 
824                     _sum06 = _mm256_fmadd_ps(_r013, _k00_0, _sum06);
825                     _sum06 = _mm256_fmadd_ps(_r014, _k01_0, _sum06);
826                     _sum06 = _mm256_fmadd_ps(_r015, _k02_0, _sum06);
827                     _sum06 = _mm256_fmadd_ps(_r113, _k10_0, _sum06);
828                     _sum06 = _mm256_fmadd_ps(_r114, _k11_0, _sum06);
829                     _sum06 = _mm256_fmadd_ps(_r115, _k12_0, _sum06);
830                     _sum06 = _mm256_fmadd_ps(_r213, _k20_0, _sum06);
831                     _sum06 = _mm256_fmadd_ps(_r214, _k21_0, _sum06);
832                     _sum06 = _mm256_fmadd_ps(_r215, _k22_0, _sum06);
833                     _sum16 = _mm256_fmadd_ps(_r013, _k00_1, _sum16);
834                     _sum16 = _mm256_fmadd_ps(_r014, _k01_1, _sum16);
835                     _sum16 = _mm256_fmadd_ps(_r015, _k02_1, _sum16);
836                     _sum16 = _mm256_fmadd_ps(_r113, _k10_1, _sum16);
837                     _sum16 = _mm256_fmadd_ps(_r114, _k11_1, _sum16);
838                     _sum16 = _mm256_fmadd_ps(_r115, _k12_1, _sum16);
839                     _sum16 = _mm256_fmadd_ps(_r213, _k20_1, _sum16);
840                     _sum16 = _mm256_fmadd_ps(_r214, _k21_1, _sum16);
841                     _sum16 = _mm256_fmadd_ps(_r215, _k22_1, _sum16);
842 
843                     _mm256_storeu_ps(outptr0 + 48, _sum06);
844                     _mm256_storeu_ps(outptr1 + 48, _sum16);
845 
846                     __m256 _r016 = _mm256_broadcast_ss(r0 + 15);
847                     __m256 _r116 = _mm256_broadcast_ss(r1 + 15);
848                     __m256 _r216 = _mm256_broadcast_ss(r2 + 15);
849                     __m256 _r017 = _mm256_broadcast_ss(r0 + 16);
850                     __m256 _r117 = _mm256_broadcast_ss(r1 + 16);
851                     __m256 _r217 = _mm256_broadcast_ss(r2 + 16);
852 
853                     __m256 _sum07 = _mm256_loadu_ps(outptr0 + 56);
854                     __m256 _sum17 = _mm256_loadu_ps(outptr1 + 56);
855 
856                     _sum07 = _mm256_fmadd_ps(_r015, _k00_0, _sum07);
857                     _sum07 = _mm256_fmadd_ps(_r016, _k01_0, _sum07);
858                     _sum07 = _mm256_fmadd_ps(_r017, _k02_0, _sum07);
859                     _sum07 = _mm256_fmadd_ps(_r115, _k10_0, _sum07);
860                     _sum07 = _mm256_fmadd_ps(_r116, _k11_0, _sum07);
861                     _sum07 = _mm256_fmadd_ps(_r117, _k12_0, _sum07);
862                     _sum07 = _mm256_fmadd_ps(_r215, _k20_0, _sum07);
863                     _sum07 = _mm256_fmadd_ps(_r216, _k21_0, _sum07);
864                     _sum07 = _mm256_fmadd_ps(_r217, _k22_0, _sum07);
865                     _sum17 = _mm256_fmadd_ps(_r015, _k00_1, _sum17);
866                     _sum17 = _mm256_fmadd_ps(_r016, _k01_1, _sum17);
867                     _sum17 = _mm256_fmadd_ps(_r017, _k02_1, _sum17);
868                     _sum17 = _mm256_fmadd_ps(_r115, _k10_1, _sum17);
869                     _sum17 = _mm256_fmadd_ps(_r116, _k11_1, _sum17);
870                     _sum17 = _mm256_fmadd_ps(_r117, _k12_1, _sum17);
871                     _sum17 = _mm256_fmadd_ps(_r215, _k20_1, _sum17);
872                     _sum17 = _mm256_fmadd_ps(_r216, _k21_1, _sum17);
873                     _sum17 = _mm256_fmadd_ps(_r217, _k22_1, _sum17);
874 
875                     _mm256_storeu_ps(outptr0 + 56, _sum07);
876                     _mm256_storeu_ps(outptr1 + 56, _sum17);
877 
878                     r0 += 16;
879                     r1 += 16;
880                     r2 += 16;
881                     outptr0 += 64;
882                     outptr1 += 64;
883                 }
884 
885                 for (; j + 3 < outw; j += 4)
886                 {
887                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
888                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
889 
890                     __m256 _r01 = _mm256_broadcast_ss(r0);
891                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
892                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
893                     __m256 _r11 = _mm256_broadcast_ss(r1);
894                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
895                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
896                     __m256 _r21 = _mm256_broadcast_ss(r2);
897                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
898                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
899 
900                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
901                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
902                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
903                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
904                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
905                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
906                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
907                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
908                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
909 
910                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
911                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
912                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
913                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
914                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
915                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
916                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
917                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
918                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
919 
920                     _mm256_storeu_ps(outptr0, _sum00);
921                     _mm256_storeu_ps(outptr1, _sum10);
922 
923                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
924                     __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
925 
926                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
927                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
928                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
929                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
930                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
931                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
932 
933                     _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
934                     _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
935                     _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
936                     _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
937                     _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
938                     _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
939                     _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
940                     _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
941                     _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
942 
943                     _sum11 = _mm256_fmadd_ps(_r03, _k00_1, _sum11);
944                     _sum11 = _mm256_fmadd_ps(_r04, _k01_1, _sum11);
945                     _sum11 = _mm256_fmadd_ps(_r05, _k02_1, _sum11);
946                     _sum11 = _mm256_fmadd_ps(_r13, _k10_1, _sum11);
947                     _sum11 = _mm256_fmadd_ps(_r14, _k11_1, _sum11);
948                     _sum11 = _mm256_fmadd_ps(_r15, _k12_1, _sum11);
949                     _sum11 = _mm256_fmadd_ps(_r23, _k20_1, _sum11);
950                     _sum11 = _mm256_fmadd_ps(_r24, _k21_1, _sum11);
951                     _sum11 = _mm256_fmadd_ps(_r25, _k22_1, _sum11);
952 
953                     _mm256_storeu_ps(outptr0 + 8, _sum01);
954                     _mm256_storeu_ps(outptr1 + 8, _sum11);
955 
956                     __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
957                     __m256 _sum12 = _mm256_loadu_ps(outptr1 + 16);
958 
959                     __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
960                     __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
961                     __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
962                     __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
963                     __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
964                     __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
965 
966                     _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
967                     _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
968                     _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
969                     _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
970                     _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
971                     _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
972                     _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
973                     _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
974                     _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
975 
976                     _sum12 = _mm256_fmadd_ps(_r05, _k00_1, _sum12);
977                     _sum12 = _mm256_fmadd_ps(_r06, _k01_1, _sum12);
978                     _sum12 = _mm256_fmadd_ps(_r07, _k02_1, _sum12);
979                     _sum12 = _mm256_fmadd_ps(_r15, _k10_1, _sum12);
980                     _sum12 = _mm256_fmadd_ps(_r16, _k11_1, _sum12);
981                     _sum12 = _mm256_fmadd_ps(_r17, _k12_1, _sum12);
982                     _sum12 = _mm256_fmadd_ps(_r25, _k20_1, _sum12);
983                     _sum12 = _mm256_fmadd_ps(_r26, _k21_1, _sum12);
984                     _sum12 = _mm256_fmadd_ps(_r27, _k22_1, _sum12);
985 
986                     _mm256_storeu_ps(outptr0 + 16, _sum02);
987                     _mm256_storeu_ps(outptr1 + 16, _sum12);
988 
989                     __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
990                     __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
991                     __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
992                     __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
993                     __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
994                     __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
995 
996                     __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
997                     __m256 _sum13 = _mm256_loadu_ps(outptr1 + 24);
998 
999                     _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
1000                     _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
1001                     _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
1002                     _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
1003                     _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
1004                     _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
1005                     _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
1006                     _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
1007                     _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
1008 
1009                     _sum13 = _mm256_fmadd_ps(_r07, _k00_1, _sum13);
1010                     _sum13 = _mm256_fmadd_ps(_r08, _k01_1, _sum13);
1011                     _sum13 = _mm256_fmadd_ps(_r09, _k02_1, _sum13);
1012                     _sum13 = _mm256_fmadd_ps(_r17, _k10_1, _sum13);
1013                     _sum13 = _mm256_fmadd_ps(_r18, _k11_1, _sum13);
1014                     _sum13 = _mm256_fmadd_ps(_r19, _k12_1, _sum13);
1015                     _sum13 = _mm256_fmadd_ps(_r27, _k20_1, _sum13);
1016                     _sum13 = _mm256_fmadd_ps(_r28, _k21_1, _sum13);
1017                     _sum13 = _mm256_fmadd_ps(_r29, _k22_1, _sum13);
1018 
1019                     _mm256_storeu_ps(outptr0 + 24, _sum03);
1020                     _mm256_storeu_ps(outptr1 + 24, _sum13);
1021                     r0 += 8;
1022                     r1 += 8;
1023                     r2 += 8;
1024                     outptr0 += 32;
1025                     outptr1 += 32;
1026                 }
1027 
1028                 for (; j + 1 < outw; j += 2)
1029                 {
1030                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
1031                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
1032 
1033                     __m256 _r01 = _mm256_broadcast_ss(r0);
1034                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1035                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1036                     __m256 _r11 = _mm256_broadcast_ss(r1);
1037                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1038                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1039                     __m256 _r21 = _mm256_broadcast_ss(r2);
1040                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1041                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1042 
1043                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1044                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1045                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1046                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1047                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1048                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1049                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1050                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1051                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1052 
1053                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
1054                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
1055                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
1056                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
1057                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
1058                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
1059                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
1060                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
1061                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
1062 
1063                     _mm256_storeu_ps(outptr0, _sum00);
1064                     _mm256_storeu_ps(outptr1, _sum10);
1065 
1066                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1067                     __m256 _sum11 = _mm256_loadu_ps(outptr1 + 8);
1068 
1069                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1070                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1071                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1072                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1073                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1074                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1075 
1076                     _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1077                     _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1078                     _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1079                     _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1080                     _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1081                     _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1082                     _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1083                     _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1084                     _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1085 
1086                     _sum11 = _mm256_fmadd_ps(_r03, _k00_1, _sum11);
1087                     _sum11 = _mm256_fmadd_ps(_r04, _k01_1, _sum11);
1088                     _sum11 = _mm256_fmadd_ps(_r05, _k02_1, _sum11);
1089                     _sum11 = _mm256_fmadd_ps(_r13, _k10_1, _sum11);
1090                     _sum11 = _mm256_fmadd_ps(_r14, _k11_1, _sum11);
1091                     _sum11 = _mm256_fmadd_ps(_r15, _k12_1, _sum11);
1092                     _sum11 = _mm256_fmadd_ps(_r23, _k20_1, _sum11);
1093                     _sum11 = _mm256_fmadd_ps(_r24, _k21_1, _sum11);
1094                     _sum11 = _mm256_fmadd_ps(_r25, _k22_1, _sum11);
1095 
1096                     _mm256_storeu_ps(outptr0 + 8, _sum01);
1097                     _mm256_storeu_ps(outptr1 + 8, _sum11);
1098 
1099                     r0 += 4;
1100                     r1 += 4;
1101                     r2 += 4;
1102                     outptr0 += 16;
1103                     outptr1 += 16;
1104                 }
1105                 for (; j < outw; j++)
1106                 {
1107                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
1108                     __m256 _sum10 = _mm256_loadu_ps(outptr1);
1109 
1110                     __m256 _r01 = _mm256_broadcast_ss(r0);
1111                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1112                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1113                     __m256 _r11 = _mm256_broadcast_ss(r1);
1114                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1115                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1116                     __m256 _r21 = _mm256_broadcast_ss(r2);
1117                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1118                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1119 
1120                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1121                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1122                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1123                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1124                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1125                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1126                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1127                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1128                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1129 
1130                     _sum10 = _mm256_fmadd_ps(_r01, _k00_1, _sum10);
1131                     _sum10 = _mm256_fmadd_ps(_r02, _k01_1, _sum10);
1132                     _sum10 = _mm256_fmadd_ps(_r03, _k02_1, _sum10);
1133                     _sum10 = _mm256_fmadd_ps(_r11, _k10_1, _sum10);
1134                     _sum10 = _mm256_fmadd_ps(_r12, _k11_1, _sum10);
1135                     _sum10 = _mm256_fmadd_ps(_r13, _k12_1, _sum10);
1136                     _sum10 = _mm256_fmadd_ps(_r21, _k20_1, _sum10);
1137                     _sum10 = _mm256_fmadd_ps(_r22, _k21_1, _sum10);
1138                     _sum10 = _mm256_fmadd_ps(_r23, _k22_1, _sum10);
1139 
1140                     _mm256_storeu_ps(outptr0, _sum00);
1141                     _mm256_storeu_ps(outptr1, _sum10);
1142 
1143                     r0 += 2;
1144                     r1 += 2;
1145                     r2 += 2;
1146                     outptr0 += 8;
1147                     outptr1 += 8;
1148                 }
1149                 r0 += tailstep;
1150                 r1 += tailstep;
1151                 r2 += tailstep;
1152             }
1153 
1154             k0 += 9 * 8;
1155             k1 += 9 * 8;
1156         }
1157     }
1158 
1159     #pragma omp parallel for num_threads(opt.num_threads)
1160     for (int p = remain_outch_start; p < outch; p++)
1161     {
1162         Mat out0 = top_blob.channel(p);
1163 
1164         __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + p * 8) : _mm256_set1_ps(0.f);
1165         out0.fill(_bias0);
1166 
1167         const float* k0 = kernel.channel(p);
1168 
1169         for (int q = 0; q < inch; q++)
1170         {
1171             float* outptr0 = out0.row(0);
1172 
1173             const Mat img0 = bottom_blob.channel(q);
1174 
1175             const float* r0 = img0.row(0);
1176             const float* r1 = img0.row(1);
1177             const float* r2 = img0.row(2);
1178 
1179             __m256 _k00_0 = _mm256_loadu_ps(k0);
1180             __m256 _k01_0 = _mm256_loadu_ps(k0 + 8);
1181             __m256 _k02_0 = _mm256_loadu_ps(k0 + 16);
1182             __m256 _k10_0 = _mm256_loadu_ps(k0 + 24);
1183             __m256 _k11_0 = _mm256_loadu_ps(k0 + 32);
1184             __m256 _k12_0 = _mm256_loadu_ps(k0 + 40);
1185             __m256 _k20_0 = _mm256_loadu_ps(k0 + 48);
1186             __m256 _k21_0 = _mm256_loadu_ps(k0 + 56);
1187             __m256 _k22_0 = _mm256_loadu_ps(k0 + 64);
1188 
1189             int i = 0;
1190 
1191             for (; i < outh; i++)
1192             {
1193                 int j = 0;
1194                 for (; j + 7 < outw; j += 8)
1195                 {
1196                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
1197 
1198                     __m256 _r01 = _mm256_broadcast_ss(r0);
1199                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1200                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1201                     __m256 _r11 = _mm256_broadcast_ss(r1);
1202                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1203                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1204                     __m256 _r21 = _mm256_broadcast_ss(r2);
1205                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1206                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1207 
1208                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1209                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1210                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1211                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1212                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1213                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1214                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1215                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1216                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1217 
1218                     _mm256_storeu_ps(outptr0, _sum00);
1219 
1220                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1221 
1222                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1223                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1224                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1225                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1226                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1227                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1228 
1229                     _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1230                     _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1231                     _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1232                     _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1233                     _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1234                     _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1235                     _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1236                     _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1237                     _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1238 
1239                     _mm256_storeu_ps(outptr0 + 8, _sum01);
1240 
1241                     __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
1242 
1243                     __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
1244                     __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
1245                     __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
1246                     __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
1247                     __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
1248                     __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
1249 
1250                     _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
1251                     _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
1252                     _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
1253                     _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
1254                     _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
1255                     _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
1256                     _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
1257                     _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
1258                     _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
1259 
1260                     _mm256_storeu_ps(outptr0 + 16, _sum02);
1261 
1262                     __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
1263                     __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
1264                     __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
1265                     __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
1266                     __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
1267                     __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
1268 
1269                     __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
1270 
1271                     _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
1272                     _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
1273                     _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
1274                     _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
1275                     _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
1276                     _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
1277                     _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
1278                     _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
1279                     _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
1280 
1281                     _mm256_storeu_ps(outptr0 + 24, _sum03);
1282 
1283                     __m256 _r010 = _mm256_broadcast_ss(r0 + 9);
1284                     __m256 _r110 = _mm256_broadcast_ss(r1 + 9);
1285                     __m256 _r210 = _mm256_broadcast_ss(r2 + 9);
1286                     __m256 _r011 = _mm256_broadcast_ss(r0 + 10);
1287                     __m256 _r111 = _mm256_broadcast_ss(r1 + 10);
1288                     __m256 _r211 = _mm256_broadcast_ss(r2 + 10);
1289 
1290                     __m256 _sum04 = _mm256_loadu_ps(outptr0 + 32);
1291 
1292                     _sum04 = _mm256_fmadd_ps(_r09, _k00_0, _sum04);
1293                     _sum04 = _mm256_fmadd_ps(_r010, _k01_0, _sum04);
1294                     _sum04 = _mm256_fmadd_ps(_r011, _k02_0, _sum04);
1295                     _sum04 = _mm256_fmadd_ps(_r19, _k10_0, _sum04);
1296                     _sum04 = _mm256_fmadd_ps(_r110, _k11_0, _sum04);
1297                     _sum04 = _mm256_fmadd_ps(_r111, _k12_0, _sum04);
1298                     _sum04 = _mm256_fmadd_ps(_r29, _k20_0, _sum04);
1299                     _sum04 = _mm256_fmadd_ps(_r210, _k21_0, _sum04);
1300                     _sum04 = _mm256_fmadd_ps(_r211, _k22_0, _sum04);
1301 
1302                     _mm256_storeu_ps(outptr0 + 32, _sum04);
1303 
1304                     __m256 _r012 = _mm256_broadcast_ss(r0 + 11);
1305                     __m256 _r112 = _mm256_broadcast_ss(r1 + 11);
1306                     __m256 _r212 = _mm256_broadcast_ss(r2 + 11);
1307                     __m256 _r013 = _mm256_broadcast_ss(r0 + 12);
1308                     __m256 _r113 = _mm256_broadcast_ss(r1 + 12);
1309                     __m256 _r213 = _mm256_broadcast_ss(r2 + 12);
1310 
1311                     __m256 _sum05 = _mm256_loadu_ps(outptr0 + 40);
1312 
1313                     _sum05 = _mm256_fmadd_ps(_r011, _k00_0, _sum05);
1314                     _sum05 = _mm256_fmadd_ps(_r012, _k01_0, _sum05);
1315                     _sum05 = _mm256_fmadd_ps(_r013, _k02_0, _sum05);
1316                     _sum05 = _mm256_fmadd_ps(_r111, _k10_0, _sum05);
1317                     _sum05 = _mm256_fmadd_ps(_r112, _k11_0, _sum05);
1318                     _sum05 = _mm256_fmadd_ps(_r113, _k12_0, _sum05);
1319                     _sum05 = _mm256_fmadd_ps(_r211, _k20_0, _sum05);
1320                     _sum05 = _mm256_fmadd_ps(_r212, _k21_0, _sum05);
1321                     _sum05 = _mm256_fmadd_ps(_r213, _k22_0, _sum05);
1322 
1323                     _mm256_storeu_ps(outptr0 + 40, _sum05);
1324 
1325                     __m256 _r014 = _mm256_broadcast_ss(r0 + 13);
1326                     __m256 _r114 = _mm256_broadcast_ss(r1 + 13);
1327                     __m256 _r214 = _mm256_broadcast_ss(r2 + 13);
1328                     __m256 _r015 = _mm256_broadcast_ss(r0 + 14);
1329                     __m256 _r115 = _mm256_broadcast_ss(r1 + 14);
1330                     __m256 _r215 = _mm256_broadcast_ss(r2 + 14);
1331 
1332                     __m256 _sum06 = _mm256_loadu_ps(outptr0 + 48);
1333 
1334                     _sum06 = _mm256_fmadd_ps(_r013, _k00_0, _sum06);
1335                     _sum06 = _mm256_fmadd_ps(_r014, _k01_0, _sum06);
1336                     _sum06 = _mm256_fmadd_ps(_r015, _k02_0, _sum06);
1337                     _sum06 = _mm256_fmadd_ps(_r113, _k10_0, _sum06);
1338                     _sum06 = _mm256_fmadd_ps(_r114, _k11_0, _sum06);
1339                     _sum06 = _mm256_fmadd_ps(_r115, _k12_0, _sum06);
1340                     _sum06 = _mm256_fmadd_ps(_r213, _k20_0, _sum06);
1341                     _sum06 = _mm256_fmadd_ps(_r214, _k21_0, _sum06);
1342                     _sum06 = _mm256_fmadd_ps(_r215, _k22_0, _sum06);
1343 
1344                     _mm256_storeu_ps(outptr0 + 48, _sum06);
1345 
1346                     __m256 _r016 = _mm256_broadcast_ss(r0 + 15);
1347                     __m256 _r116 = _mm256_broadcast_ss(r1 + 15);
1348                     __m256 _r216 = _mm256_broadcast_ss(r2 + 15);
1349                     __m256 _r017 = _mm256_broadcast_ss(r0 + 16);
1350                     __m256 _r117 = _mm256_broadcast_ss(r1 + 16);
1351                     __m256 _r217 = _mm256_broadcast_ss(r2 + 16);
1352 
1353                     __m256 _sum07 = _mm256_loadu_ps(outptr0 + 56);
1354 
1355                     _sum07 = _mm256_fmadd_ps(_r015, _k00_0, _sum07);
1356                     _sum07 = _mm256_fmadd_ps(_r016, _k01_0, _sum07);
1357                     _sum07 = _mm256_fmadd_ps(_r017, _k02_0, _sum07);
1358                     _sum07 = _mm256_fmadd_ps(_r115, _k10_0, _sum07);
1359                     _sum07 = _mm256_fmadd_ps(_r116, _k11_0, _sum07);
1360                     _sum07 = _mm256_fmadd_ps(_r117, _k12_0, _sum07);
1361                     _sum07 = _mm256_fmadd_ps(_r215, _k20_0, _sum07);
1362                     _sum07 = _mm256_fmadd_ps(_r216, _k21_0, _sum07);
1363                     _sum07 = _mm256_fmadd_ps(_r217, _k22_0, _sum07);
1364 
1365                     _mm256_storeu_ps(outptr0 + 56, _sum07);
1366 
1367                     r0 += 16;
1368                     r1 += 16;
1369                     r2 += 16;
1370                     outptr0 += 64;
1371                 }
1372 
1373                 for (; j + 3 < outw; j += 4)
1374                 {
1375                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
1376                     __m256 _r01 = _mm256_broadcast_ss(r0);
1377                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1378                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1379                     __m256 _r11 = _mm256_broadcast_ss(r1);
1380                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1381                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1382                     __m256 _r21 = _mm256_broadcast_ss(r2);
1383                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1384                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1385 
1386                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1387                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1388                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1389                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1390                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1391                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1392                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1393                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1394                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1395 
1396                     _mm256_storeu_ps(outptr0, _sum00);
1397 
1398                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1399 
1400                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1401                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1402                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1403                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1404                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1405                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1406 
1407                     _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1408                     _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1409                     _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1410                     _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1411                     _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1412                     _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1413                     _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1414                     _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1415                     _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1416 
1417                     _mm256_storeu_ps(outptr0 + 8, _sum01);
1418 
1419                     __m256 _sum02 = _mm256_loadu_ps(outptr0 + 16);
1420 
1421                     __m256 _r06 = _mm256_broadcast_ss(r0 + 5);
1422                     __m256 _r16 = _mm256_broadcast_ss(r1 + 5);
1423                     __m256 _r26 = _mm256_broadcast_ss(r2 + 5);
1424                     __m256 _r07 = _mm256_broadcast_ss(r0 + 6);
1425                     __m256 _r17 = _mm256_broadcast_ss(r1 + 6);
1426                     __m256 _r27 = _mm256_broadcast_ss(r2 + 6);
1427 
1428                     _sum02 = _mm256_fmadd_ps(_r05, _k00_0, _sum02);
1429                     _sum02 = _mm256_fmadd_ps(_r06, _k01_0, _sum02);
1430                     _sum02 = _mm256_fmadd_ps(_r07, _k02_0, _sum02);
1431                     _sum02 = _mm256_fmadd_ps(_r15, _k10_0, _sum02);
1432                     _sum02 = _mm256_fmadd_ps(_r16, _k11_0, _sum02);
1433                     _sum02 = _mm256_fmadd_ps(_r17, _k12_0, _sum02);
1434                     _sum02 = _mm256_fmadd_ps(_r25, _k20_0, _sum02);
1435                     _sum02 = _mm256_fmadd_ps(_r26, _k21_0, _sum02);
1436                     _sum02 = _mm256_fmadd_ps(_r27, _k22_0, _sum02);
1437 
1438                     _mm256_storeu_ps(outptr0 + 16, _sum02);
1439 
1440                     __m256 _r08 = _mm256_broadcast_ss(r0 + 7);
1441                     __m256 _r18 = _mm256_broadcast_ss(r1 + 7);
1442                     __m256 _r28 = _mm256_broadcast_ss(r2 + 7);
1443                     __m256 _r09 = _mm256_broadcast_ss(r0 + 8);
1444                     __m256 _r19 = _mm256_broadcast_ss(r1 + 8);
1445                     __m256 _r29 = _mm256_broadcast_ss(r2 + 8);
1446 
1447                     __m256 _sum03 = _mm256_loadu_ps(outptr0 + 24);
1448 
1449                     _sum03 = _mm256_fmadd_ps(_r07, _k00_0, _sum03);
1450                     _sum03 = _mm256_fmadd_ps(_r08, _k01_0, _sum03);
1451                     _sum03 = _mm256_fmadd_ps(_r09, _k02_0, _sum03);
1452                     _sum03 = _mm256_fmadd_ps(_r17, _k10_0, _sum03);
1453                     _sum03 = _mm256_fmadd_ps(_r18, _k11_0, _sum03);
1454                     _sum03 = _mm256_fmadd_ps(_r19, _k12_0, _sum03);
1455                     _sum03 = _mm256_fmadd_ps(_r27, _k20_0, _sum03);
1456                     _sum03 = _mm256_fmadd_ps(_r28, _k21_0, _sum03);
1457                     _sum03 = _mm256_fmadd_ps(_r29, _k22_0, _sum03);
1458 
1459                     _mm256_storeu_ps(outptr0 + 24, _sum03);
1460                     r0 += 8;
1461                     r1 += 8;
1462                     r2 += 8;
1463                     outptr0 += 32;
1464                 }
1465 
1466                 for (; j + 1 < outw; j += 2)
1467                 {
1468                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
1469 
1470                     __m256 _r01 = _mm256_broadcast_ss(r0);
1471                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1472                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1473                     __m256 _r11 = _mm256_broadcast_ss(r1);
1474                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1475                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1476                     __m256 _r21 = _mm256_broadcast_ss(r2);
1477                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1478                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1479 
1480                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1481                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1482                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1483                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1484                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1485                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1486                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1487                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1488                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1489 
1490                     _mm256_storeu_ps(outptr0, _sum00);
1491 
1492                     __m256 _sum01 = _mm256_loadu_ps(outptr0 + 8);
1493 
1494                     __m256 _r04 = _mm256_broadcast_ss(r0 + 3);
1495                     __m256 _r14 = _mm256_broadcast_ss(r1 + 3);
1496                     __m256 _r24 = _mm256_broadcast_ss(r2 + 3);
1497                     __m256 _r05 = _mm256_broadcast_ss(r0 + 4);
1498                     __m256 _r15 = _mm256_broadcast_ss(r1 + 4);
1499                     __m256 _r25 = _mm256_broadcast_ss(r2 + 4);
1500 
1501                     _sum01 = _mm256_fmadd_ps(_r03, _k00_0, _sum01);
1502                     _sum01 = _mm256_fmadd_ps(_r04, _k01_0, _sum01);
1503                     _sum01 = _mm256_fmadd_ps(_r05, _k02_0, _sum01);
1504                     _sum01 = _mm256_fmadd_ps(_r13, _k10_0, _sum01);
1505                     _sum01 = _mm256_fmadd_ps(_r14, _k11_0, _sum01);
1506                     _sum01 = _mm256_fmadd_ps(_r15, _k12_0, _sum01);
1507                     _sum01 = _mm256_fmadd_ps(_r23, _k20_0, _sum01);
1508                     _sum01 = _mm256_fmadd_ps(_r24, _k21_0, _sum01);
1509                     _sum01 = _mm256_fmadd_ps(_r25, _k22_0, _sum01);
1510 
1511                     _mm256_storeu_ps(outptr0 + 8, _sum01);
1512 
1513                     r0 += 4;
1514                     r1 += 4;
1515                     r2 += 4;
1516                     outptr0 += 16;
1517                 }
1518                 for (; j < outw; j++)
1519                 {
1520                     __m256 _sum00 = _mm256_loadu_ps(outptr0);
1521                     __m256 _r01 = _mm256_broadcast_ss(r0);
1522                     __m256 _r02 = _mm256_broadcast_ss(r0 + 1);
1523                     __m256 _r03 = _mm256_broadcast_ss(r0 + 2);
1524                     __m256 _r11 = _mm256_broadcast_ss(r1);
1525                     __m256 _r12 = _mm256_broadcast_ss(r1 + 1);
1526                     __m256 _r13 = _mm256_broadcast_ss(r1 + 2);
1527                     __m256 _r21 = _mm256_broadcast_ss(r2);
1528                     __m256 _r22 = _mm256_broadcast_ss(r2 + 1);
1529                     __m256 _r23 = _mm256_broadcast_ss(r2 + 2);
1530 
1531                     _sum00 = _mm256_fmadd_ps(_r01, _k00_0, _sum00);
1532                     _sum00 = _mm256_fmadd_ps(_r02, _k01_0, _sum00);
1533                     _sum00 = _mm256_fmadd_ps(_r03, _k02_0, _sum00);
1534                     _sum00 = _mm256_fmadd_ps(_r11, _k10_0, _sum00);
1535                     _sum00 = _mm256_fmadd_ps(_r12, _k11_0, _sum00);
1536                     _sum00 = _mm256_fmadd_ps(_r13, _k12_0, _sum00);
1537                     _sum00 = _mm256_fmadd_ps(_r21, _k20_0, _sum00);
1538                     _sum00 = _mm256_fmadd_ps(_r22, _k21_0, _sum00);
1539                     _sum00 = _mm256_fmadd_ps(_r23, _k22_0, _sum00);
1540                     _mm256_storeu_ps(outptr0, _sum00);
1541 
1542                     r0 += 2;
1543                     r1 += 2;
1544                     r2 += 2;
1545                     outptr0 += 8;
1546                 }
1547                 r0 += tailstep;
1548                 r1 += tailstep;
1549                 r2 += tailstep;
1550             }
1551 
1552             k0 += 9 * 8;
1553         }
1554     }
1555 }
1556