Lines Matching refs:CC

54 	float CC[16] = {0};  in kernel_sgemm_nt_4x4_lib4()  local
56 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgemm_nt_4x4_lib4()
76 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nt_4x4_lib4()
77 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nt_4x4_lib4()
78 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nt_4x4_lib4()
79 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nt_4x4_lib4()
81 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nt_4x4_lib4()
82 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nt_4x4_lib4()
83 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nt_4x4_lib4()
84 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nt_4x4_lib4()
86 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nt_4x4_lib4()
87 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nt_4x4_lib4()
88 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nt_4x4_lib4()
89 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nt_4x4_lib4()
91 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nt_4x4_lib4()
92 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nt_4x4_lib4()
93 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nt_4x4_lib4()
94 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nt_4x4_lib4()
109 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nt_4x4_lib4()
110 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nt_4x4_lib4()
111 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nt_4x4_lib4()
112 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nt_4x4_lib4()
114 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nt_4x4_lib4()
115 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nt_4x4_lib4()
116 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nt_4x4_lib4()
117 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nt_4x4_lib4()
119 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nt_4x4_lib4()
120 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nt_4x4_lib4()
121 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nt_4x4_lib4()
122 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nt_4x4_lib4()
124 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nt_4x4_lib4()
125 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nt_4x4_lib4()
126 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nt_4x4_lib4()
127 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nt_4x4_lib4()
142 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nt_4x4_lib4()
143 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nt_4x4_lib4()
144 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nt_4x4_lib4()
145 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nt_4x4_lib4()
147 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nt_4x4_lib4()
148 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nt_4x4_lib4()
149 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nt_4x4_lib4()
150 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nt_4x4_lib4()
152 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nt_4x4_lib4()
153 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nt_4x4_lib4()
154 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nt_4x4_lib4()
155 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nt_4x4_lib4()
157 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nt_4x4_lib4()
158 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nt_4x4_lib4()
159 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nt_4x4_lib4()
160 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nt_4x4_lib4()
175 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nt_4x4_lib4()
176 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nt_4x4_lib4()
177 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nt_4x4_lib4()
178 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nt_4x4_lib4()
180 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nt_4x4_lib4()
181 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nt_4x4_lib4()
182 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nt_4x4_lib4()
183 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nt_4x4_lib4()
185 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nt_4x4_lib4()
186 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nt_4x4_lib4()
187 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nt_4x4_lib4()
188 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nt_4x4_lib4()
190 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nt_4x4_lib4()
191 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nt_4x4_lib4()
192 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nt_4x4_lib4()
193 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nt_4x4_lib4()
215 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nt_4x4_lib4()
216 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nt_4x4_lib4()
217 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nt_4x4_lib4()
218 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nt_4x4_lib4()
220 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nt_4x4_lib4()
221 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nt_4x4_lib4()
222 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nt_4x4_lib4()
223 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nt_4x4_lib4()
225 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nt_4x4_lib4()
226 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nt_4x4_lib4()
227 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nt_4x4_lib4()
228 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nt_4x4_lib4()
230 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nt_4x4_lib4()
231 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nt_4x4_lib4()
232 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nt_4x4_lib4()
233 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nt_4x4_lib4()
240 D[0+bs*0] = beta[0]*C[0+bs*0] + alpha[0]*CC[0+bs*0]; in kernel_sgemm_nt_4x4_lib4()
241 D[1+bs*0] = beta[0]*C[1+bs*0] + alpha[0]*CC[1+bs*0]; in kernel_sgemm_nt_4x4_lib4()
242 D[2+bs*0] = beta[0]*C[2+bs*0] + alpha[0]*CC[2+bs*0]; in kernel_sgemm_nt_4x4_lib4()
243 D[3+bs*0] = beta[0]*C[3+bs*0] + alpha[0]*CC[3+bs*0]; in kernel_sgemm_nt_4x4_lib4()
245 D[0+bs*1] = beta[0]*C[0+bs*1] + alpha[0]*CC[0+bs*1]; in kernel_sgemm_nt_4x4_lib4()
246 D[1+bs*1] = beta[0]*C[1+bs*1] + alpha[0]*CC[1+bs*1]; in kernel_sgemm_nt_4x4_lib4()
247 D[2+bs*1] = beta[0]*C[2+bs*1] + alpha[0]*CC[2+bs*1]; in kernel_sgemm_nt_4x4_lib4()
248 D[3+bs*1] = beta[0]*C[3+bs*1] + alpha[0]*CC[3+bs*1]; in kernel_sgemm_nt_4x4_lib4()
250 D[0+bs*2] = beta[0]*C[0+bs*2] + alpha[0]*CC[0+bs*2]; in kernel_sgemm_nt_4x4_lib4()
251 D[1+bs*2] = beta[0]*C[1+bs*2] + alpha[0]*CC[1+bs*2]; in kernel_sgemm_nt_4x4_lib4()
252 D[2+bs*2] = beta[0]*C[2+bs*2] + alpha[0]*CC[2+bs*2]; in kernel_sgemm_nt_4x4_lib4()
253 D[3+bs*2] = beta[0]*C[3+bs*2] + alpha[0]*CC[3+bs*2]; in kernel_sgemm_nt_4x4_lib4()
255 D[0+bs*3] = beta[0]*C[0+bs*3] + alpha[0]*CC[0+bs*3]; in kernel_sgemm_nt_4x4_lib4()
256 D[1+bs*3] = beta[0]*C[1+bs*3] + alpha[0]*CC[1+bs*3]; in kernel_sgemm_nt_4x4_lib4()
257 D[2+bs*3] = beta[0]*C[2+bs*3] + alpha[0]*CC[2+bs*3]; in kernel_sgemm_nt_4x4_lib4()
258 D[3+bs*3] = beta[0]*C[3+bs*3] + alpha[0]*CC[3+bs*3]; in kernel_sgemm_nt_4x4_lib4()
274 float CC[16] = {0}; in kernel_sgemm_nt_4x4_vs_lib4() local
276 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgemm_nt_4x4_vs_lib4()
279 kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_sgemm_nt_4x4_vs_lib4()
283 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
284 D[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
285 D[2+bs*0] = CC[2+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
286 D[3+bs*0] = CC[3+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
291 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
292 D[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
293 D[2+bs*1] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
294 D[3+bs*1] = CC[3+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
299 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
300 D[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
301 D[2+bs*2] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
302 D[3+bs*2] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
307 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
308 D[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
309 D[2+bs*3] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
310 D[3+bs*3] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
314 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
315 D[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
316 D[2+bs*0] = CC[2+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
321 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
322 D[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
323 D[2+bs*1] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
328 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
329 D[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
330 D[2+bs*2] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
335 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
336 D[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
337 D[2+bs*3] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
341 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
342 D[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
347 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
348 D[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
353 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
354 D[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
359 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
360 D[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
364 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_vs_lib4()
369 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_vs_lib4()
374 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_vs_lib4()
379 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_vs_lib4()
396 float CC[16] = {0}; in kernel_sgemm_nt_4x4_gen_lib4() local
398 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgemm_nt_4x4_gen_lib4()
408 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
409 CC[1+bs*0] = beta[0]*C0[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
410 CC[2+bs*0] = beta[0]*C0[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
411 CC[3+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
413 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
414 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
415 CC[2+bs*1] = beta[0]*C0[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
416 CC[3+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
418 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
419 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
420 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
421 CC[3+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
423 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
424 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
425 CC[2+bs*3] = beta[0]*C0[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
426 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
432 CC[0+bs*0] = beta[0]*C0[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
433 CC[1+bs*0] = beta[0]*C0[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
434 CC[2+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
435 CC[3+bs*0] = beta[0]*C1[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
437 CC[0+bs*1] = beta[0]*C0[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
438 CC[1+bs*1] = beta[0]*C0[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
439 CC[2+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
440 CC[3+bs*1] = beta[0]*C1[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
442 CC[0+bs*2] = beta[0]*C0[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
443 CC[1+bs*2] = beta[0]*C0[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
444 CC[2+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
445 CC[3+bs*2] = beta[0]*C1[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
447 CC[0+bs*3] = beta[0]*C0[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
448 CC[1+bs*3] = beta[0]*C0[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
449 CC[2+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
450 CC[3+bs*3] = beta[0]*C1[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
456 CC[0+bs*0] = beta[0]*C0[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
457 CC[1+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
458 CC[2+bs*0] = beta[0]*C1[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
459 CC[3+bs*0] = beta[0]*C1[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
461 CC[0+bs*1] = beta[0]*C0[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
462 CC[1+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
463 CC[2+bs*1] = beta[0]*C1[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
464 CC[3+bs*1] = beta[0]*C1[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
466 CC[0+bs*2] = beta[0]*C0[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
467 CC[1+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
468 CC[2+bs*2] = beta[0]*C1[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
469 CC[3+bs*2] = beta[0]*C1[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
471 CC[0+bs*3] = beta[0]*C0[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
472 CC[1+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
473 CC[2+bs*3] = beta[0]*C1[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
474 CC[3+bs*3] = beta[0]*C1[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
480 CC[0+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
481 CC[1+bs*0] = beta[0]*C1[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
482 CC[2+bs*0] = beta[0]*C1[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
483 CC[3+bs*0] = beta[0]*C1[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
485 CC[0+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
486 CC[1+bs*1] = beta[0]*C1[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
487 CC[2+bs*1] = beta[0]*C1[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
488 CC[3+bs*1] = beta[0]*C1[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
490 CC[0+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
491 CC[1+bs*2] = beta[0]*C1[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
492 CC[2+bs*2] = beta[0]*C1[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
493 CC[3+bs*2] = beta[0]*C1[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
495 CC[0+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
496 CC[1+bs*3] = beta[0]*C1[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
497 CC[2+bs*3] = beta[0]*C1[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
498 CC[3+bs*3] = beta[0]*C1[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
503 kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC); in kernel_sgemm_nt_4x4_gen_lib4()
510 CC[0+bs*0] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
511 CC[1+bs*0] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
512 CC[2+bs*0] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
513 CC[3+bs*0] = CC[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
515 CC[0+bs*1] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
516 CC[1+bs*1] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
517 CC[2+bs*1] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
518 CC[3+bs*1] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
520 CC[0+bs*2] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
521 CC[1+bs*2] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
522 CC[2+bs*2] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
523 CC[3+bs*2] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
529 CC[0+bs*0] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
530 CC[1+bs*0] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
531 CC[2+bs*0] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
532 CC[3+bs*0] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
534 CC[0+bs*1] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
535 CC[1+bs*1] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
536 CC[2+bs*1] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
537 CC[3+bs*1] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
543 CC[0+bs*0] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
544 CC[1+bs*0] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
545 CC[2+bs*0] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
546 CC[3+bs*0] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
560 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
561 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
562 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
563 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
568 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
569 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
570 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
571 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
576 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
577 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
578 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
579 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
584 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
585 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
586 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
587 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
596 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
597 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
598 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
599 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
604 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
605 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
606 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
607 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
612 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
613 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
614 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
615 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
620 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
621 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
622 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
623 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
632 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
633 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
634 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
635 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
640 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
641 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
642 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
643 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
648 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
649 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
650 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
651 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
656 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
657 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
658 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
659 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
668 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
669 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
670 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
671 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_sgemm_nt_4x4_gen_lib4()
676 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
677 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
678 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
679 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_sgemm_nt_4x4_gen_lib4()
684 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
685 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
686 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
687 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_sgemm_nt_4x4_gen_lib4()
692 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
693 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
694 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
695 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_sgemm_nt_4x4_gen_lib4()
716 float CC[16] = {0}; in kernel_sgemm_nn_4x4_lib4() local
718 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgemm_nn_4x4_lib4()
744 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
745 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
746 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
747 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
749 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
750 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
751 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
752 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
754 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
755 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
756 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
757 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
759 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
760 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
761 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
762 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
781 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
782 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
783 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
784 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
786 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
787 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
788 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
789 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
791 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
792 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
793 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
794 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
796 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
797 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
798 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
799 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
818 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
819 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
820 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
821 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
823 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
824 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
825 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
826 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
828 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
829 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
830 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
831 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
833 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
834 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
835 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
836 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
859 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
860 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
861 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
862 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
864 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
865 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
866 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
867 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
869 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
870 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
871 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
872 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
874 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
875 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
876 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
877 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
896 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
897 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
898 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
899 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
901 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
902 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
903 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
904 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
906 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
907 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
908 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
909 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
911 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
912 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
913 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
914 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
937 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
938 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
939 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
940 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
942 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
943 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
944 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
945 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
947 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
948 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
949 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
950 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
952 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
953 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
954 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
955 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
979 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
980 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
981 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
982 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
984 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
985 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
986 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
987 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
989 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
990 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
991 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
992 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
994 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
995 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
996 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
997 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
1012 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
1013 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
1014 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
1015 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
1017 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
1018 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
1019 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
1020 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
1022 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
1023 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
1024 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
1025 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
1027 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
1028 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
1029 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
1030 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
1045 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
1046 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
1047 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
1048 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
1050 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
1051 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
1052 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
1053 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
1055 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
1056 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
1057 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
1058 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
1060 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
1061 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
1062 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
1063 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
1078 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
1079 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
1080 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
1081 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
1083 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
1084 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
1085 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
1086 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
1088 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
1089 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
1090 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
1091 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
1093 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
1094 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
1095 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
1096 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
1117 CC[0+bs*0] += a_0 * b_0; in kernel_sgemm_nn_4x4_lib4()
1118 CC[1+bs*0] += a_1 * b_0; in kernel_sgemm_nn_4x4_lib4()
1119 CC[2+bs*0] += a_2 * b_0; in kernel_sgemm_nn_4x4_lib4()
1120 CC[3+bs*0] += a_3 * b_0; in kernel_sgemm_nn_4x4_lib4()
1122 CC[0+bs*1] += a_0 * b_1; in kernel_sgemm_nn_4x4_lib4()
1123 CC[1+bs*1] += a_1 * b_1; in kernel_sgemm_nn_4x4_lib4()
1124 CC[2+bs*1] += a_2 * b_1; in kernel_sgemm_nn_4x4_lib4()
1125 CC[3+bs*1] += a_3 * b_1; in kernel_sgemm_nn_4x4_lib4()
1127 CC[0+bs*2] += a_0 * b_2; in kernel_sgemm_nn_4x4_lib4()
1128 CC[1+bs*2] += a_1 * b_2; in kernel_sgemm_nn_4x4_lib4()
1129 CC[2+bs*2] += a_2 * b_2; in kernel_sgemm_nn_4x4_lib4()
1130 CC[3+bs*2] += a_3 * b_2; in kernel_sgemm_nn_4x4_lib4()
1132 CC[0+bs*3] += a_0 * b_3; in kernel_sgemm_nn_4x4_lib4()
1133 CC[1+bs*3] += a_1 * b_3; in kernel_sgemm_nn_4x4_lib4()
1134 CC[2+bs*3] += a_2 * b_3; in kernel_sgemm_nn_4x4_lib4()
1135 CC[3+bs*3] += a_3 * b_3; in kernel_sgemm_nn_4x4_lib4()
1144 D[0+bs*0] = beta[0]*C[0+bs*0] + alpha[0]*CC[0+bs*0]; in kernel_sgemm_nn_4x4_lib4()
1145 D[1+bs*0] = beta[0]*C[1+bs*0] + alpha[0]*CC[1+bs*0]; in kernel_sgemm_nn_4x4_lib4()
1146 D[2+bs*0] = beta[0]*C[2+bs*0] + alpha[0]*CC[2+bs*0]; in kernel_sgemm_nn_4x4_lib4()
1147 D[3+bs*0] = beta[0]*C[3+bs*0] + alpha[0]*CC[3+bs*0]; in kernel_sgemm_nn_4x4_lib4()
1149 D[0+bs*1] = beta[0]*C[0+bs*1] + alpha[0]*CC[0+bs*1]; in kernel_sgemm_nn_4x4_lib4()
1150 D[1+bs*1] = beta[0]*C[1+bs*1] + alpha[0]*CC[1+bs*1]; in kernel_sgemm_nn_4x4_lib4()
1151 D[2+bs*1] = beta[0]*C[2+bs*1] + alpha[0]*CC[2+bs*1]; in kernel_sgemm_nn_4x4_lib4()
1152 D[3+bs*1] = beta[0]*C[3+bs*1] + alpha[0]*CC[3+bs*1]; in kernel_sgemm_nn_4x4_lib4()
1154 D[0+bs*2] = beta[0]*C[0+bs*2] + alpha[0]*CC[0+bs*2]; in kernel_sgemm_nn_4x4_lib4()
1155 D[1+bs*2] = beta[0]*C[1+bs*2] + alpha[0]*CC[1+bs*2]; in kernel_sgemm_nn_4x4_lib4()
1156 D[2+bs*2] = beta[0]*C[2+bs*2] + alpha[0]*CC[2+bs*2]; in kernel_sgemm_nn_4x4_lib4()
1157 D[3+bs*2] = beta[0]*C[3+bs*2] + alpha[0]*CC[3+bs*2]; in kernel_sgemm_nn_4x4_lib4()
1159 D[0+bs*3] = beta[0]*C[0+bs*3] + alpha[0]*CC[0+bs*3]; in kernel_sgemm_nn_4x4_lib4()
1160 D[1+bs*3] = beta[0]*C[1+bs*3] + alpha[0]*CC[1+bs*3]; in kernel_sgemm_nn_4x4_lib4()
1161 D[2+bs*3] = beta[0]*C[2+bs*3] + alpha[0]*CC[2+bs*3]; in kernel_sgemm_nn_4x4_lib4()
1162 D[3+bs*3] = beta[0]*C[3+bs*3] + alpha[0]*CC[3+bs*3]; in kernel_sgemm_nn_4x4_lib4()
1178 float CC[16] = {0}; in kernel_sgemm_nn_4x4_vs_lib4() local
1180 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgemm_nn_4x4_vs_lib4()
1183 kernel_sgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC); in kernel_sgemm_nn_4x4_vs_lib4()
1187 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1188 D[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1189 D[2+bs*0] = CC[2+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1190 D[3+bs*0] = CC[3+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1195 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1196 D[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1197 D[2+bs*1] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1198 D[3+bs*1] = CC[3+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1203 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1204 D[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1205 D[2+bs*2] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1206 D[3+bs*2] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1211 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1212 D[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1213 D[2+bs*3] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1214 D[3+bs*3] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1218 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1219 D[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1220 D[2+bs*0] = CC[2+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1225 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1226 D[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1227 D[2+bs*1] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1232 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1233 D[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1234 D[2+bs*2] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1239 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1240 D[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1241 D[2+bs*3] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1245 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1246 D[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1251 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1252 D[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1257 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1258 D[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1263 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1264 D[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1268 D[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_vs_lib4()
1273 D[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_vs_lib4()
1278 D[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_vs_lib4()
1283 D[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_vs_lib4()
1300 float CC[16] = {0}; in kernel_sgemm_nn_4x4_gen_lib4() local
1302 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgemm_nn_4x4_gen_lib4()
1310 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1311 CC[1+bs*0] = beta[0]*C0[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1312 CC[2+bs*0] = beta[0]*C0[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1313 CC[3+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1315 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1316 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1317 CC[2+bs*1] = beta[0]*C0[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1318 CC[3+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1320 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1321 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1322 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1323 CC[3+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1325 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1326 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1327 CC[2+bs*3] = beta[0]*C0[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1328 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1334 CC[0+bs*0] = beta[0]*C0[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1335 CC[1+bs*0] = beta[0]*C0[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1336 CC[2+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1337 CC[3+bs*0] = beta[0]*C1[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1339 CC[0+bs*1] = beta[0]*C0[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1340 CC[1+bs*1] = beta[0]*C0[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1341 CC[2+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1342 CC[3+bs*1] = beta[0]*C1[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1344 CC[0+bs*2] = beta[0]*C0[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1345 CC[1+bs*2] = beta[0]*C0[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1346 CC[2+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1347 CC[3+bs*2] = beta[0]*C1[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1349 CC[0+bs*3] = beta[0]*C0[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1350 CC[1+bs*3] = beta[0]*C0[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1351 CC[2+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1352 CC[3+bs*3] = beta[0]*C1[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1358 CC[0+bs*0] = beta[0]*C0[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1359 CC[1+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1360 CC[2+bs*0] = beta[0]*C1[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1361 CC[3+bs*0] = beta[0]*C1[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1363 CC[0+bs*1] = beta[0]*C0[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1364 CC[1+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1365 CC[2+bs*1] = beta[0]*C1[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1366 CC[3+bs*1] = beta[0]*C1[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1368 CC[0+bs*2] = beta[0]*C0[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1369 CC[1+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1370 CC[2+bs*2] = beta[0]*C1[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1371 CC[3+bs*2] = beta[0]*C1[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1373 CC[0+bs*3] = beta[0]*C0[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1374 CC[1+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1375 CC[2+bs*3] = beta[0]*C1[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1376 CC[3+bs*3] = beta[0]*C1[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1382 CC[0+bs*0] = beta[0]*C0[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1383 CC[1+bs*0] = beta[0]*C1[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1384 CC[2+bs*0] = beta[0]*C1[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1385 CC[3+bs*0] = beta[0]*C1[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1387 CC[0+bs*1] = beta[0]*C0[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1388 CC[1+bs*1] = beta[0]*C1[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1389 CC[2+bs*1] = beta[0]*C1[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1390 CC[3+bs*1] = beta[0]*C1[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1392 CC[0+bs*2] = beta[0]*C0[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1393 CC[1+bs*2] = beta[0]*C1[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1394 CC[2+bs*2] = beta[0]*C1[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1395 CC[3+bs*2] = beta[0]*C1[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1397 CC[0+bs*3] = beta[0]*C0[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1398 CC[1+bs*3] = beta[0]*C1[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1399 CC[2+bs*3] = beta[0]*C1[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1400 CC[3+bs*3] = beta[0]*C1[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1405 kernel_sgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, &beta1, CC, CC); in kernel_sgemm_nn_4x4_gen_lib4()
1412 CC[0+bs*0] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1413 CC[1+bs*0] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1414 CC[2+bs*0] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1415 CC[3+bs*0] = CC[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1417 CC[0+bs*1] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1418 CC[1+bs*1] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1419 CC[2+bs*1] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1420 CC[3+bs*1] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1422 CC[0+bs*2] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1423 CC[1+bs*2] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1424 CC[2+bs*2] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1425 CC[3+bs*2] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1431 CC[0+bs*0] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1432 CC[1+bs*0] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1433 CC[2+bs*0] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1434 CC[3+bs*0] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1436 CC[0+bs*1] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1437 CC[1+bs*1] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1438 CC[2+bs*1] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1439 CC[3+bs*1] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1445 CC[0+bs*0] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1446 CC[1+bs*0] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1447 CC[2+bs*0] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1448 CC[3+bs*0] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1462 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1463 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1464 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1465 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1470 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1471 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1472 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1473 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1478 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1479 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1480 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1481 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1486 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1487 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1488 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1489 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1498 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1499 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1500 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1501 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1506 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1507 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1508 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1509 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1514 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1515 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1516 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1517 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1522 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1523 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1524 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1525 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1534 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1535 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1536 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1537 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1542 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1543 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1544 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1545 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1550 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1551 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1552 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1553 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1558 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1559 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1560 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1561 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1570 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1571 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1572 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1573 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_sgemm_nn_4x4_gen_lib4()
1578 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1579 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1580 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1581 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_sgemm_nn_4x4_gen_lib4()
1586 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1587 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1588 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1589 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_sgemm_nn_4x4_gen_lib4()
1594 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1595 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1596 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1597 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_sgemm_nn_4x4_gen_lib4()
1614 float CC[16] = {0}; in kernel_ssyrk_nt_l_4x4_lib4() local
1616 ALIGNED( float CC[16], 64 ) = {0}; in kernel_ssyrk_nt_l_4x4_lib4()
1619 kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_ssyrk_nt_l_4x4_lib4()
1621 D[0+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_lib4()
1622 D[1+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_lib4()
1623 D[2+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_lib4()
1624 D[3+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_lib4()
1626 D[1+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_lib4()
1627 D[2+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_lib4()
1628 D[3+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_lib4()
1630 D[2+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_lib4()
1631 D[3+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_lib4()
1633 D[3+bs*3] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_lib4()
1649 float CC[16] = {0}; in kernel_ssyrk_nt_l_4x4_vs_lib4() local
1651 ALIGNED( float CC[16], 64 ) = {0}; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1654 kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_ssyrk_nt_l_4x4_vs_lib4()
1658 D[0+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1659 D[1+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1660 D[2+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1661 D[3+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1666 D[1+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1667 D[2+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1668 D[3+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1673 D[2+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1674 D[3+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1679 D[3+bs*3] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1683 D[0+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1684 D[1+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1685 D[2+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1690 D[1+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1691 D[2+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1696 D[2+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1700 D[0+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1701 D[1+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1706 D[1+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1710 D[0+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_vs_lib4()
1727 float CC[16] = {0}; in kernel_ssyrk_nt_l_4x4_gen_lib4() local
1729 ALIGNED( float CC[16], 64 ) = {0}; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1737 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1738 CC[1+bs*0] = beta[0]*C0[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1739 CC[2+bs*0] = beta[0]*C0[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1740 CC[3+bs*0] = beta[0]*C0[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1742 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1743 CC[2+bs*1] = beta[0]*C0[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1744 CC[3+bs*1] = beta[0]*C0[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1746 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1747 CC[3+bs*2] = beta[0]*C0[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1749 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1755 CC[0+bs*0] = beta[0]*C0[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1756 CC[1+bs*0] = beta[0]*C0[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1757 CC[2+bs*0] = beta[0]*C0[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1758 CC[3+bs*0] = beta[0]*C1[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1760 CC[1+bs*1] = beta[0]*C0[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1761 CC[2+bs*1] = beta[0]*C0[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1762 CC[3+bs*1] = beta[0]*C1[0+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1764 CC[2+bs*2] = beta[0]*C0[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1765 CC[3+bs*2] = beta[0]*C1[0+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1767 CC[3+bs*3] = beta[0]*C1[0+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1773 CC[0+bs*0] = beta[0]*C0[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1774 CC[1+bs*0] = beta[0]*C0[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1775 CC[2+bs*0] = beta[0]*C1[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1776 CC[3+bs*0] = beta[0]*C1[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1778 CC[1+bs*1] = beta[0]*C0[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1779 CC[2+bs*1] = beta[0]*C1[0+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1780 CC[3+bs*1] = beta[0]*C1[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1782 CC[2+bs*2] = beta[0]*C1[0+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1783 CC[3+bs*2] = beta[0]*C1[1+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1785 CC[3+bs*3] = beta[0]*C1[1+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1791 CC[0+bs*0] = beta[0]*C0[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1792 CC[1+bs*0] = beta[0]*C1[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1793 CC[2+bs*0] = beta[0]*C1[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1794 CC[3+bs*0] = beta[0]*C1[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1796 CC[1+bs*1] = beta[0]*C1[0+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1797 CC[2+bs*1] = beta[0]*C1[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1798 CC[3+bs*1] = beta[0]*C1[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1800 CC[2+bs*2] = beta[0]*C1[1+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1801 CC[3+bs*2] = beta[0]*C1[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1803 CC[3+bs*3] = beta[0]*C1[2+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1808 kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC); in kernel_ssyrk_nt_l_4x4_gen_lib4()
1815 CC[0+bs*0] = CC[0+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1816 CC[1+bs*0] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1817 CC[2+bs*0] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1818 CC[3+bs*0] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1820 CC[0+bs*1] = CC[0+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1821 CC[1+bs*1] = CC[1+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1822 CC[2+bs*1] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1823 CC[3+bs*1] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1825 CC[0+bs*2] = CC[0+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1826 CC[1+bs*2] = CC[1+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1827 CC[2+bs*2] = CC[2+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1828 CC[3+bs*2] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1834 CC[0+bs*0] = CC[0+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1835 CC[1+bs*0] = CC[1+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1836 CC[2+bs*0] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1837 CC[3+bs*0] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1839 CC[0+bs*1] = CC[0+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1840 CC[1+bs*1] = CC[1+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1841 CC[2+bs*1] = CC[2+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1842 CC[3+bs*1] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1848 CC[0+bs*0] = CC[0+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1849 CC[1+bs*0] = CC[1+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1850 CC[2+bs*0] = CC[2+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1851 CC[3+bs*0] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1867 if(m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1868 if(m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1869 if(m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1870 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1875 if(m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1876 if(m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1877 if(m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1882 if(m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1883 if(m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1888 if(m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1895 if(m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1896 if(m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1897 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1902 if(m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1903 if(m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1908 if(m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1915 if(m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1916 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1921 if(m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1928 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1939 if(m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1940 if(m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1941 if(m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1942 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1947 if(m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1948 if(m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1949 if(m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1954 if(m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1955 if(m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1960 if(m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1967 if(m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1968 if(m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1969 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1974 if(m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1975 if(m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1980 if(m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1987 if(m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1988 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
1993 if(m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2000 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2011 if(m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2012 if(m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2013 if(m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2014 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2019 if(m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2020 if(m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2021 if(m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2026 if(m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2027 if(m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2032 if(m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2039 if(m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2040 if(m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2041 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2046 if(m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2047 if(m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2052 if(m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2059 if(m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2060 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2065 if(m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2072 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2083 if(m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2084 if(m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2085 if(m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2086 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2091 if(m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2092 if(m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2093 if(m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2098 if(m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2099 if(m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2104 if(m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2111 if(m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2112 if(m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2113 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2118 if(m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2119 if(m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2124 if(m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2131 if(m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2132 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2137 if(m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2144 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_ssyrk_nt_l_4x4_gen_lib4()
2166 float CC[16] = {0}; in kernel_strmm_nt_ru_4x4_lib4() local
2168 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strmm_nt_ru_4x4_lib4()
2185 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2186 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2187 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2188 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2206 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2207 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2208 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2209 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2211 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2212 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2213 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2214 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2233 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2234 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2235 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2236 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nt_ru_4x4_lib4()
2238 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2239 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2240 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2241 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nt_ru_4x4_lib4()
2243 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nt_ru_4x4_lib4()
2244 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nt_ru_4x4_lib4()
2245 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nt_ru_4x4_lib4()
2246 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nt_ru_4x4_lib4()
2253 kernel_sgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, D); in kernel_strmm_nt_ru_4x4_lib4()
2273 float CC[16] = {0}; in kernel_strmm_nt_ru_4x4_vs_lib4() local
2275 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strmm_nt_ru_4x4_vs_lib4()
2292 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2293 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2294 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2295 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2313 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2314 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2315 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2316 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2318 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2319 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2320 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2321 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2340 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2341 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2342 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2343 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nt_ru_4x4_vs_lib4()
2345 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2346 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2347 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2348 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nt_ru_4x4_vs_lib4()
2350 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nt_ru_4x4_vs_lib4()
2351 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nt_ru_4x4_vs_lib4()
2352 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nt_ru_4x4_vs_lib4()
2353 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nt_ru_4x4_vs_lib4()
2360 kernel_sgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, CC); in kernel_strmm_nt_ru_4x4_vs_lib4()
2364 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2365 D[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2366 D[2+bs*0] = CC[2+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2367 D[3+bs*0] = CC[3+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2372 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2373 D[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2374 D[2+bs*1] = CC[2+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2375 D[3+bs*1] = CC[3+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2380 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2381 D[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2382 D[2+bs*2] = CC[2+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2383 D[3+bs*2] = CC[3+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2388 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2389 D[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2390 D[2+bs*3] = CC[2+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2391 D[3+bs*3] = CC[3+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2395 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2396 D[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2397 D[2+bs*0] = CC[2+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2402 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2403 D[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2404 D[2+bs*1] = CC[2+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2409 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2410 D[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2411 D[2+bs*2] = CC[2+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2416 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2417 D[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2418 D[2+bs*3] = CC[2+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2422 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2423 D[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2428 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2429 D[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2434 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2435 D[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2440 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2441 D[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2445 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2450 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2455 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2460 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nt_ru_4x4_vs_lib4()
2482 float CC[16] = {0}; in kernel_strmm_nn_rl_4x4_lib4() local
2484 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strmm_nn_rl_4x4_lib4()
2506 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2507 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2508 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2509 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2526 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2527 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2528 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2529 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2532 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2533 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2534 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2535 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2552 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2553 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2554 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2555 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2558 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2559 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2560 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2561 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2564 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2565 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2566 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2567 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2584 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2585 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2586 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2587 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2590 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2591 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2592 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2593 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2596 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2597 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2598 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2599 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2602 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2603 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2604 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2605 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2623 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2624 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2625 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2626 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2643 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2644 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2645 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2646 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2649 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2650 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2651 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2652 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2669 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2670 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2671 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2672 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2675 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2676 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2677 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2678 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2681 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2682 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2683 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2684 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2702 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2703 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2704 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2705 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2722 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2723 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2724 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2725 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2728 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2729 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2730 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2731 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2748 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2749 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2750 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2751 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2754 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2755 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2756 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2757 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2760 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2761 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2762 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2763 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2780 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2781 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2782 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2783 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2786 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2787 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2788 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2789 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2792 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2793 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2794 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2795 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2798 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2799 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2800 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2801 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2818 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2819 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2820 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2821 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2824 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2825 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2826 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2827 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2830 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2831 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2832 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2833 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2836 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2837 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2838 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2839 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2856 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2857 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2858 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2859 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2862 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2863 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2864 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2865 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2868 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2869 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2870 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2871 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2874 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2875 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2876 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2877 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2895 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2896 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2897 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2898 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2915 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2916 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2917 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2918 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2921 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2922 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2923 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2924 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2941 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2942 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2943 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2944 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2947 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2948 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2949 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2950 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2953 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2954 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2955 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2956 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2973 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2974 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2975 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2976 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
2979 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2980 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2981 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2982 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
2985 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2986 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2987 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2988 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
2991 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2992 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2993 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
2994 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
3011 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
3012 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
3013 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
3014 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_lib4()
3017 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
3018 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
3019 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
3020 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_lib4()
3023 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
3024 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
3025 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
3026 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_lib4()
3029 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
3030 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
3031 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
3032 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_lib4()
3042 CC[0+bs*0] = alpha[0]*CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_lib4()
3043 CC[1+bs*0] = alpha[0]*CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_lib4()
3044 CC[2+bs*0] = alpha[0]*CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_lib4()
3045 CC[3+bs*0] = alpha[0]*CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_lib4()
3047 CC[0+bs*1] = alpha[0]*CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_lib4()
3048 CC[1+bs*1] = alpha[0]*CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_lib4()
3049 CC[2+bs*1] = alpha[0]*CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_lib4()
3050 CC[3+bs*1] = alpha[0]*CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_lib4()
3052 CC[0+bs*2] = alpha[0]*CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_lib4()
3053 CC[1+bs*2] = alpha[0]*CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_lib4()
3054 CC[2+bs*2] = alpha[0]*CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_lib4()
3055 CC[3+bs*2] = alpha[0]*CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_lib4()
3057 CC[0+bs*3] = alpha[0]*CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_lib4()
3058 CC[1+bs*3] = alpha[0]*CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_lib4()
3059 CC[2+bs*3] = alpha[0]*CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_lib4()
3060 CC[3+bs*3] = alpha[0]*CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_lib4()
3064 kernel_sgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, D); in kernel_strmm_nn_rl_4x4_lib4()
3084 float CC[16] = {0}; in kernel_strmm_nn_rl_4x4_vs_lib4() local
3086 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strmm_nn_rl_4x4_vs_lib4()
3108 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3109 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3110 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3111 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3128 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3129 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3130 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3131 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3134 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3135 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3136 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3137 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3154 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3155 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3156 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3157 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3160 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3161 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3162 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3163 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3166 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3167 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3168 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3169 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3186 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3187 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3188 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3189 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3192 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3193 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3194 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3195 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3198 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3199 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3200 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3201 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3204 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3205 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3206 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3207 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3225 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3226 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3227 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3228 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3245 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3246 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3247 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3248 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3251 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3252 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3253 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3254 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3271 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3272 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3273 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3274 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3277 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3278 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3279 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3280 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3283 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3284 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3285 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3286 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3304 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3305 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3306 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3307 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3324 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3325 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3326 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3327 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3330 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3331 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3332 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3333 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3350 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3351 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3352 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3353 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3356 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3357 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3358 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3359 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3362 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3363 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3364 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3365 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3382 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3383 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3384 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3385 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3388 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3389 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3390 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3391 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3394 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3395 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3396 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3397 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3400 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3401 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3402 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3403 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3420 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3421 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3422 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3423 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3426 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3427 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3428 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3429 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3432 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3433 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3434 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3435 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3438 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3439 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3440 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3441 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3458 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3459 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3460 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3461 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3464 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3465 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3466 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3467 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3470 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3471 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3472 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3473 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3476 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3477 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3478 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3479 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3497 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3498 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3499 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3500 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3517 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3518 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3519 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3520 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3523 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3524 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3525 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3526 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3543 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3544 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3545 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3546 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3549 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3550 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3551 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3552 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3555 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3556 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3557 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3558 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3575 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3576 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3577 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3578 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3581 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3582 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3583 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3584 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3587 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3588 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3589 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3590 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3593 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3594 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3595 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3596 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3613 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3614 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3615 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3616 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_vs_lib4()
3619 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3620 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3621 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3622 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_vs_lib4()
3625 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3626 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3627 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3628 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_vs_lib4()
3631 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3632 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3633 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3634 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_vs_lib4()
3644 CC[0+bs*0] = alpha[0]*CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3645 CC[1+bs*0] = alpha[0]*CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3646 CC[2+bs*0] = alpha[0]*CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3647 CC[3+bs*0] = alpha[0]*CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3649 CC[0+bs*1] = alpha[0]*CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3650 CC[1+bs*1] = alpha[0]*CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3651 CC[2+bs*1] = alpha[0]*CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3652 CC[3+bs*1] = alpha[0]*CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3654 CC[0+bs*2] = alpha[0]*CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3655 CC[1+bs*2] = alpha[0]*CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3656 CC[2+bs*2] = alpha[0]*CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3657 CC[3+bs*2] = alpha[0]*CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3659 CC[0+bs*3] = alpha[0]*CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3660 CC[1+bs*3] = alpha[0]*CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3661 CC[2+bs*3] = alpha[0]*CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3662 CC[3+bs*3] = alpha[0]*CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3666 kernel_sgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC); in kernel_strmm_nn_rl_4x4_vs_lib4()
3670 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3671 D[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3672 D[2+bs*0] = CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3673 D[3+bs*0] = CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3678 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3679 D[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3680 D[2+bs*1] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3681 D[3+bs*1] = CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3686 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3687 D[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3688 D[2+bs*2] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3689 D[3+bs*2] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3694 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3695 D[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3696 D[2+bs*3] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3697 D[3+bs*3] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3701 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3702 D[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3703 D[2+bs*0] = CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3708 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3709 D[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3710 D[2+bs*1] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3715 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3716 D[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3717 D[2+bs*2] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3722 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3723 D[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3724 D[2+bs*3] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3728 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3729 D[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3734 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3735 D[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3740 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3741 D[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3746 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3747 D[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3751 D[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3756 D[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3761 D[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3766 D[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_vs_lib4()
3787 float CC[16] = {0}; in kernel_strmm_nn_rl_4x4_gen_lib4() local
3789 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strmm_nn_rl_4x4_gen_lib4()
3811 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3812 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3813 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3814 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3831 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3832 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3833 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3834 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3837 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3838 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3839 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3840 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3857 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3858 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3859 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3860 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3863 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3864 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3865 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3866 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3869 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3870 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3871 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3872 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3889 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3890 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3891 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3892 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3895 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3896 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3897 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3898 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3901 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3902 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3903 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3904 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3907 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
3908 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
3909 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
3910 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
3928 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3929 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3930 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3931 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3948 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3949 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3950 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3951 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3954 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3955 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3956 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3957 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3974 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3975 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3976 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3977 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
3980 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3981 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3982 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3983 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
3986 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3987 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3988 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
3989 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4007 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4008 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4009 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4010 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4027 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4028 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4029 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4030 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4033 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4034 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4035 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4036 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4053 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4054 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4055 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4056 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4059 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4060 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4061 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4062 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4065 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4066 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4067 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4068 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4085 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4086 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4087 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4088 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4091 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4092 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4093 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4094 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4097 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4098 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4099 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4100 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4103 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4104 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4105 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4106 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4123 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4124 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4125 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4126 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4129 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4130 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4131 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4132 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4135 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4136 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4137 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4138 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4141 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4142 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4143 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4144 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4161 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4162 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4163 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4164 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4167 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4168 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4169 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4170 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4173 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4174 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4175 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4176 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4179 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4180 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4181 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4182 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4200 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4201 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4202 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4203 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4220 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4221 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4222 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4223 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4226 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4227 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4228 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4229 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4246 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4247 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4248 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4249 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4252 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4253 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4254 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4255 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4258 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4259 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4260 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4261 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4278 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4279 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4280 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4281 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4284 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4285 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4286 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4287 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4290 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4291 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4292 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4293 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4296 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4297 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4298 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4299 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4316 CC[0+bs*0] += a_0 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4317 CC[1+bs*0] += a_1 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4318 CC[2+bs*0] += a_2 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4319 CC[3+bs*0] += a_3 * b_0; in kernel_strmm_nn_rl_4x4_gen_lib4()
4322 CC[0+bs*1] += a_0 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4323 CC[1+bs*1] += a_1 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4324 CC[2+bs*1] += a_2 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4325 CC[3+bs*1] += a_3 * b_1; in kernel_strmm_nn_rl_4x4_gen_lib4()
4328 CC[0+bs*2] += a_0 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4329 CC[1+bs*2] += a_1 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4330 CC[2+bs*2] += a_2 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4331 CC[3+bs*2] += a_3 * b_2; in kernel_strmm_nn_rl_4x4_gen_lib4()
4334 CC[0+bs*3] += a_0 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4335 CC[1+bs*3] += a_1 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4336 CC[2+bs*3] += a_2 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4337 CC[3+bs*3] += a_3 * b_3; in kernel_strmm_nn_rl_4x4_gen_lib4()
4347 CC[0+bs*0] = alpha[0]*CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4348 CC[1+bs*0] = alpha[0]*CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4349 CC[2+bs*0] = alpha[0]*CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4350 CC[3+bs*0] = alpha[0]*CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4352 CC[0+bs*1] = alpha[0]*CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4353 CC[1+bs*1] = alpha[0]*CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4354 CC[2+bs*1] = alpha[0]*CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4355 CC[3+bs*1] = alpha[0]*CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4357 CC[0+bs*2] = alpha[0]*CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4358 CC[1+bs*2] = alpha[0]*CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4359 CC[2+bs*2] = alpha[0]*CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4360 CC[3+bs*2] = alpha[0]*CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4362 CC[0+bs*3] = alpha[0]*CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4363 CC[1+bs*3] = alpha[0]*CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4364 CC[2+bs*3] = alpha[0]*CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4365 CC[3+bs*3] = alpha[0]*CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4369 kernel_sgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC); in kernel_strmm_nn_rl_4x4_gen_lib4()
4376 CC[0+bs*0] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4377 CC[1+bs*0] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4378 CC[2+bs*0] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4379 CC[3+bs*0] = CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4381 CC[0+bs*1] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4382 CC[1+bs*1] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4383 CC[2+bs*1] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4384 CC[3+bs*1] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4386 CC[0+bs*2] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4387 CC[1+bs*2] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4388 CC[2+bs*2] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4389 CC[3+bs*2] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4395 CC[0+bs*0] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4396 CC[1+bs*0] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4397 CC[2+bs*0] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4398 CC[3+bs*0] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4400 CC[0+bs*1] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4401 CC[1+bs*1] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4402 CC[2+bs*1] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4403 CC[3+bs*1] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4409 CC[0+bs*0] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4410 CC[1+bs*0] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4411 CC[2+bs*0] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4412 CC[3+bs*0] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4426 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4427 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4428 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4429 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4434 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4435 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4436 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4437 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4442 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4443 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4444 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4445 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4450 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4451 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4452 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4453 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4462 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4463 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4464 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4465 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4470 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4471 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4472 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4473 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4478 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4479 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4480 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4481 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4486 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4487 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4488 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4489 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4498 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4499 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4500 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4501 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4506 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4507 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4508 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4509 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4514 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4515 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4516 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4517 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4522 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4523 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4524 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4525 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4534 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4535 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4536 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4537 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4542 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4543 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4544 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4545 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4550 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4551 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4552 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4553 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4558 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4559 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4560 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4561 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_strmm_nn_rl_4x4_gen_lib4()
4583 float CC[16] = {0}; in kernel_spotrf_nt_l_4x4_lib4() local
4585 ALIGNED( float CC[16], 64 ) = {0}; in kernel_spotrf_nt_l_4x4_lib4()
4593 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC); in kernel_spotrf_nt_l_4x4_lib4()
4595 if(CC[0+bs*0]>0) in kernel_spotrf_nt_l_4x4_lib4()
4597 CC[0+bs*0] = sqrt(CC[0+bs*0]); in kernel_spotrf_nt_l_4x4_lib4()
4598 tmp = 1.0/CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4602 CC[0+bs*0] = 0.0; in kernel_spotrf_nt_l_4x4_lib4()
4605 CC[1+bs*0] *= tmp; in kernel_spotrf_nt_l_4x4_lib4()
4606 CC[2+bs*0] *= tmp; in kernel_spotrf_nt_l_4x4_lib4()
4607 CC[3+bs*0] *= tmp; in kernel_spotrf_nt_l_4x4_lib4()
4610 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4611 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4612 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4613 if(CC[1+bs*1]>0) in kernel_spotrf_nt_l_4x4_lib4()
4615 CC[1+bs*1] = sqrt(CC[1+bs*1]); in kernel_spotrf_nt_l_4x4_lib4()
4616 tmp = 1.0/CC[1+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4620 CC[1+bs*1] = 0.0; in kernel_spotrf_nt_l_4x4_lib4()
4623 CC[2+bs*1] *= tmp; in kernel_spotrf_nt_l_4x4_lib4()
4624 CC[3+bs*1] *= tmp; in kernel_spotrf_nt_l_4x4_lib4()
4627 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4628 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4629 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4630 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4631 if(CC[2+bs*2]>0) in kernel_spotrf_nt_l_4x4_lib4()
4633 CC[2+bs*2] = sqrt(CC[2+bs*2]); in kernel_spotrf_nt_l_4x4_lib4()
4634 tmp = 1.0/CC[2+bs*2]; in kernel_spotrf_nt_l_4x4_lib4()
4638 CC[2+bs*2] = 0.0; in kernel_spotrf_nt_l_4x4_lib4()
4641 CC[3+bs*2] *= tmp; in kernel_spotrf_nt_l_4x4_lib4()
4644 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4645 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4646 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2]; in kernel_spotrf_nt_l_4x4_lib4()
4647 if(CC[3+bs*3]>0) in kernel_spotrf_nt_l_4x4_lib4()
4649 CC[3+bs*3] = sqrt(CC[3+bs*3]); in kernel_spotrf_nt_l_4x4_lib4()
4650 tmp = 1.0/CC[3+bs*3]; in kernel_spotrf_nt_l_4x4_lib4()
4654 CC[3+bs*3] = 0.0; in kernel_spotrf_nt_l_4x4_lib4()
4659 D[0+bs*0] = CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4660 D[1+bs*0] = CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4661 D[2+bs*0] = CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4662 D[3+bs*0] = CC[3+bs*0]; in kernel_spotrf_nt_l_4x4_lib4()
4664 D[1+bs*1] = CC[1+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4665 D[2+bs*1] = CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4666 D[3+bs*1] = CC[3+bs*1]; in kernel_spotrf_nt_l_4x4_lib4()
4668 D[2+bs*2] = CC[2+bs*2]; in kernel_spotrf_nt_l_4x4_lib4()
4669 D[3+bs*2] = CC[3+bs*2]; in kernel_spotrf_nt_l_4x4_lib4()
4671 D[3+bs*3] = CC[3+bs*3]; in kernel_spotrf_nt_l_4x4_lib4()
4689 float CC[16] = {0}; in kernel_spotrf_nt_l_4x4_vs_lib4() local
4691 ALIGNED( float CC[16], 64 ) = {0}; in kernel_spotrf_nt_l_4x4_vs_lib4()
4697 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC); in kernel_spotrf_nt_l_4x4_vs_lib4()
4699 if(CC[0+bs*0]>0) in kernel_spotrf_nt_l_4x4_vs_lib4()
4701 CC[0+bs*0] = sqrt(CC[0+bs*0]); in kernel_spotrf_nt_l_4x4_vs_lib4()
4702 tmp = 1.0/CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4706 CC[0+bs*0] = 0.0; in kernel_spotrf_nt_l_4x4_vs_lib4()
4709 CC[1+bs*0] *= tmp; in kernel_spotrf_nt_l_4x4_vs_lib4()
4710 CC[2+bs*0] *= tmp; in kernel_spotrf_nt_l_4x4_vs_lib4()
4711 CC[3+bs*0] *= tmp; in kernel_spotrf_nt_l_4x4_vs_lib4()
4717 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4718 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4719 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4720 if(CC[1+bs*1]>0) in kernel_spotrf_nt_l_4x4_vs_lib4()
4722 CC[1+bs*1] = sqrt(CC[1+bs*1]); in kernel_spotrf_nt_l_4x4_vs_lib4()
4723 tmp = 1.0/CC[1+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4727 CC[1+bs*1] = 0.0; in kernel_spotrf_nt_l_4x4_vs_lib4()
4730 CC[2+bs*1] *= tmp; in kernel_spotrf_nt_l_4x4_vs_lib4()
4731 CC[3+bs*1] *= tmp; in kernel_spotrf_nt_l_4x4_vs_lib4()
4737 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4738 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4739 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4740 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4741 if(CC[2+bs*2]>0) in kernel_spotrf_nt_l_4x4_vs_lib4()
4743 CC[2+bs*2] = sqrt(CC[2+bs*2]); in kernel_spotrf_nt_l_4x4_vs_lib4()
4744 tmp = 1.0/CC[2+bs*2]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4748 CC[2+bs*2] = 0.0; in kernel_spotrf_nt_l_4x4_vs_lib4()
4751 CC[3+bs*2] *= tmp; in kernel_spotrf_nt_l_4x4_vs_lib4()
4757 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4758 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4759 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4760 if(CC[3+bs*3]>0) in kernel_spotrf_nt_l_4x4_vs_lib4()
4762 CC[3+bs*3] = sqrt(CC[3+bs*3]); in kernel_spotrf_nt_l_4x4_vs_lib4()
4763 tmp = 1.0/CC[3+bs*3]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4767 CC[3+bs*3] = 0.0; in kernel_spotrf_nt_l_4x4_vs_lib4()
4777 D[0+bs*0] = CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4778 D[1+bs*0] = CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4779 D[2+bs*0] = CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4780 D[3+bs*0] = CC[3+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4785 D[1+bs*1] = CC[1+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4786 D[2+bs*1] = CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4787 D[3+bs*1] = CC[3+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4792 D[2+bs*2] = CC[2+bs*2]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4793 D[3+bs*2] = CC[3+bs*2]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4798 D[3+bs*3] = CC[3+bs*3]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4802 D[0+bs*0] = CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4803 D[1+bs*0] = CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4804 D[2+bs*0] = CC[2+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4809 D[1+bs*1] = CC[1+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4810 D[2+bs*1] = CC[2+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4815 D[2+bs*2] = CC[2+bs*2]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4819 D[0+bs*0] = CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4820 D[1+bs*0] = CC[1+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4825 D[1+bs*1] = CC[1+bs*1]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4829 D[0+bs*0] = CC[0+bs*0]; in kernel_spotrf_nt_l_4x4_vs_lib4()
4872 float CC[16] = {0}; in kernel_strsm_nt_rl_inv_4x4_lib4() local
4874 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_rl_inv_4x4_lib4()
4879 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_rl_inv_4x4_lib4()
4882 CC[0+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4883 CC[1+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4884 CC[2+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4885 CC[3+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4888 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4889 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4890 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4891 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4893 CC[0+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4894 CC[1+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4895 CC[2+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4896 CC[3+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4899 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4900 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4901 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4902 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4904 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4905 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4906 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4907 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4909 CC[0+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4910 CC[1+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4911 CC[2+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4912 CC[3+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4915 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4916 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4917 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4918 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4920 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4921 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4922 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4923 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4925 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4926 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4927 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4928 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4930 CC[0+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4931 CC[1+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4932 CC[2+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4933 CC[3+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_lib4()
4935 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4936 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4937 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4938 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4940 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4941 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4942 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4943 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4945 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4946 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4947 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4948 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4950 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4951 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4952 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4953 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_rl_inv_4x4_lib4()
4971 float CC[16] = {0}; in kernel_strsm_nt_rl_inv_4x4_vs_lib4() local
4973 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4978 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4981 CC[0+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4982 CC[1+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4983 CC[2+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4984 CC[3+bs*0] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4990 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4991 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4992 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4993 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4995 CC[0+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4996 CC[1+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4997 CC[2+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
4998 CC[3+bs*1] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5004 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5005 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5006 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5007 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5009 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5010 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5011 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5012 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5014 CC[0+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5015 CC[1+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5016 CC[2+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5017 CC[3+bs*2] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5023 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5024 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5025 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5026 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5028 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5029 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5030 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5031 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5033 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5034 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5035 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5036 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5038 CC[0+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5039 CC[1+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5040 CC[2+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5041 CC[3+bs*3] *= tmp; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5047 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5048 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5049 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5050 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5055 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5056 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5057 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5058 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5063 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5064 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5065 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5066 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5071 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5072 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5073 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5074 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5078 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5079 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5080 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5085 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5086 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5087 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5092 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5093 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5094 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5099 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5100 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5101 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5105 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5106 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5111 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5112 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5117 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5118 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5123 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5124 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5128 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5133 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5138 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5143 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_inv_4x4_vs_lib4()
5186 float CC[16] = {0}; in kernel_strsm_nt_rl_one_4x4_lib4() local
5188 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_rl_one_4x4_lib4()
5193 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_rl_one_4x4_lib4()
5196 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5197 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5198 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5199 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5202 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5203 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5204 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5205 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5207 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5208 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5209 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5210 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5213 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5214 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5215 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5216 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5218 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5219 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5220 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5221 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5223 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5224 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5225 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5226 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_lib4()
5228 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_one_4x4_lib4()
5229 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_one_4x4_lib4()
5230 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_rl_one_4x4_lib4()
5231 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_rl_one_4x4_lib4()
5233 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_one_4x4_lib4()
5234 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_one_4x4_lib4()
5235 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_rl_one_4x4_lib4()
5236 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_rl_one_4x4_lib4()
5238 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_one_4x4_lib4()
5239 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_one_4x4_lib4()
5240 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_rl_one_4x4_lib4()
5241 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_rl_one_4x4_lib4()
5243 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_one_4x4_lib4()
5244 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_one_4x4_lib4()
5245 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_rl_one_4x4_lib4()
5246 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_rl_one_4x4_lib4()
5264 float CC[16] = {0}; in kernel_strsm_nt_rl_one_4x4_vs_lib4() local
5266 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5271 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5277 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5278 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5279 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5280 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5286 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5287 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5288 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5289 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5291 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5292 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5293 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5294 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5300 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5301 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5302 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5303 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5305 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5306 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5307 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5308 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5310 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5311 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5312 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5313 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5319 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5320 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5321 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5322 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5327 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5328 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5329 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5330 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5335 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5336 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5337 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5338 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5343 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5344 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5345 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5346 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5350 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5351 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5352 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5357 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5358 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5359 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5364 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5365 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5366 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5371 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5372 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5373 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5377 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5378 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5383 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5384 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5389 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5390 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5395 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5396 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5400 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5405 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5410 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5415 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_rl_one_4x4_vs_lib4()
5434 float CC[16] = {0}; in kernel_strsm_nt_ru_inv_4x4_lib4() local
5436 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_ru_inv_4x4_lib4()
5441 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_ru_inv_4x4_lib4()
5444 CC[0+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5445 CC[1+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5446 CC[2+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5447 CC[3+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5449 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5450 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5451 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5452 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5454 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5455 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5456 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5457 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5459 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5460 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5461 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5462 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5465 CC[0+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5466 CC[1+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5467 CC[2+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5468 CC[3+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5470 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5471 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5472 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5473 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5475 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5476 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5477 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5478 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5481 CC[0+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5482 CC[1+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5483 CC[2+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5484 CC[3+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5486 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5487 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5488 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5489 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5492 CC[0+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5493 CC[1+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5494 CC[2+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5495 CC[3+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_lib4()
5497 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5498 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5499 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5500 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5502 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5503 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5504 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5505 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5507 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5508 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5509 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5510 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5512 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5513 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5514 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5515 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_ru_inv_4x4_lib4()
5533 float CC[16] = {0}; in kernel_strsm_nt_ru_inv_4x4_vs_lib4() local
5535 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5540 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5545 CC[0+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5546 CC[1+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5547 CC[2+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5548 CC[3+bs*3] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5550 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5551 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5552 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5553 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5555 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5556 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5557 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5558 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5560 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5561 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5562 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5563 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5569 CC[0+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5570 CC[1+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5571 CC[2+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5572 CC[3+bs*2] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5574 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5575 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5576 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5577 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5579 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5580 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5581 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5582 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5588 CC[0+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5589 CC[1+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5590 CC[2+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5591 CC[3+bs*1] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5593 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5594 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5595 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5596 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5600 CC[0+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5601 CC[1+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5602 CC[2+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5603 CC[3+bs*0] *= tmp; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5610 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5611 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5612 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5613 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5618 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5619 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5620 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5621 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5626 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5627 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5628 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5629 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5634 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5635 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5636 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5637 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5641 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5642 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5643 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5648 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5649 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5650 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5655 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5656 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5657 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5662 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5663 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5664 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5668 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5669 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5674 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5675 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5680 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5681 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5686 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5687 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5691 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5696 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5701 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5706 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_inv_4x4_vs_lib4()
5727 float CC[16] = {0}; in kernel_sgetrf_nn_4x4_lib4() local
5729 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgetrf_nn_4x4_lib4()
5735 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_sgetrf_nn_4x4_lib4()
5740 tmp = 1.0 / CC[0+bs*0]; in kernel_sgetrf_nn_4x4_lib4()
5741 CC[1+bs*0] *= tmp; in kernel_sgetrf_nn_4x4_lib4()
5742 CC[2+bs*0] *= tmp; in kernel_sgetrf_nn_4x4_lib4()
5743 CC[3+bs*0] *= tmp; in kernel_sgetrf_nn_4x4_lib4()
5748 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5749 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5750 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5752 tmp = 1.0 / CC[1+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5753 CC[2+bs*1] *= tmp; in kernel_sgetrf_nn_4x4_lib4()
5754 CC[3+bs*1] *= tmp; in kernel_sgetrf_nn_4x4_lib4()
5759 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5760 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5761 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5763 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5764 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5766 tmp = 1.0 / CC[2+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5767 CC[3+bs*2] *= tmp; in kernel_sgetrf_nn_4x4_lib4()
5772 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5773 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5774 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5776 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5777 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5779 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5781 tmp = 1.0 / CC[3+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5785 D[0+bs*0] = CC[0+bs*0]; in kernel_sgetrf_nn_4x4_lib4()
5786 D[1+bs*0] = CC[1+bs*0]; in kernel_sgetrf_nn_4x4_lib4()
5787 D[2+bs*0] = CC[2+bs*0]; in kernel_sgetrf_nn_4x4_lib4()
5788 D[3+bs*0] = CC[3+bs*0]; in kernel_sgetrf_nn_4x4_lib4()
5790 D[0+bs*1] = CC[0+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5791 D[1+bs*1] = CC[1+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5792 D[2+bs*1] = CC[2+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5793 D[3+bs*1] = CC[3+bs*1]; in kernel_sgetrf_nn_4x4_lib4()
5795 D[0+bs*2] = CC[0+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5796 D[1+bs*2] = CC[1+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5797 D[2+bs*2] = CC[2+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5798 D[3+bs*2] = CC[3+bs*2]; in kernel_sgetrf_nn_4x4_lib4()
5800 D[0+bs*3] = CC[0+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5801 D[1+bs*3] = CC[1+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5802 D[2+bs*3] = CC[2+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5803 D[3+bs*3] = CC[3+bs*3]; in kernel_sgetrf_nn_4x4_lib4()
5823 float CC[16] = {0}; in kernel_sgetrf_nn_4x4_vs_lib4() local
5825 ALIGNED( float CC[16], 64 ) = {0}; in kernel_sgetrf_nn_4x4_vs_lib4()
5831 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_sgetrf_nn_4x4_vs_lib4()
5836 tmp = 1.0 / CC[0+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5837 CC[1+bs*0] *= tmp; in kernel_sgetrf_nn_4x4_vs_lib4()
5838 CC[2+bs*0] *= tmp; in kernel_sgetrf_nn_4x4_vs_lib4()
5839 CC[3+bs*0] *= tmp; in kernel_sgetrf_nn_4x4_vs_lib4()
5847 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5848 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5849 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5851 tmp = 1.0 / CC[1+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5852 CC[2+bs*1] *= tmp; in kernel_sgetrf_nn_4x4_vs_lib4()
5853 CC[3+bs*1] *= tmp; in kernel_sgetrf_nn_4x4_vs_lib4()
5861 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5862 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5863 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5865 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5866 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5868 tmp = 1.0 / CC[2+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5869 CC[3+bs*2] *= tmp; in kernel_sgetrf_nn_4x4_vs_lib4()
5877 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5878 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5879 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5881 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5882 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5884 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5886 tmp = 1.0 / CC[3+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5894 D[0+bs*0] = CC[0+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5895 D[1+bs*0] = CC[1+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5896 D[2+bs*0] = CC[2+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5897 D[3+bs*0] = CC[3+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5902 D[0+bs*1] = CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5903 D[1+bs*1] = CC[1+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5904 D[2+bs*1] = CC[2+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5905 D[3+bs*1] = CC[3+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5910 D[0+bs*2] = CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5911 D[1+bs*2] = CC[1+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5912 D[2+bs*2] = CC[2+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5913 D[3+bs*2] = CC[3+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5918 D[0+bs*3] = CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5919 D[1+bs*3] = CC[1+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5920 D[2+bs*3] = CC[2+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5921 D[3+bs*3] = CC[3+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5925 D[0+bs*0] = CC[0+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5926 D[1+bs*0] = CC[1+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5927 D[2+bs*0] = CC[2+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5932 D[0+bs*1] = CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5933 D[1+bs*1] = CC[1+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5934 D[2+bs*1] = CC[2+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5939 D[0+bs*2] = CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5940 D[1+bs*2] = CC[1+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5941 D[2+bs*2] = CC[2+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5946 D[0+bs*3] = CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5947 D[1+bs*3] = CC[1+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5948 D[2+bs*3] = CC[2+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5952 D[0+bs*0] = CC[0+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5953 D[1+bs*0] = CC[1+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5958 D[0+bs*1] = CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5959 D[1+bs*1] = CC[1+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5964 D[0+bs*2] = CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5965 D[1+bs*2] = CC[1+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5970 D[0+bs*3] = CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5971 D[1+bs*3] = CC[1+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
5975 D[0+bs*0] = CC[0+bs*0]; in kernel_sgetrf_nn_4x4_vs_lib4()
5980 D[0+bs*1] = CC[0+bs*1]; in kernel_sgetrf_nn_4x4_vs_lib4()
5985 D[0+bs*2] = CC[0+bs*2]; in kernel_sgetrf_nn_4x4_vs_lib4()
5990 D[0+bs*3] = CC[0+bs*3]; in kernel_sgetrf_nn_4x4_vs_lib4()
6009 float CC[16] = {0}; in kernel_strsm_nt_ru_one_4x4_lib4() local
6011 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_ru_one_4x4_lib4()
6016 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_ru_one_4x4_lib4()
6019 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6020 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6021 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6022 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6024 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6025 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6026 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6027 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6029 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6030 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6031 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6032 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6035 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6036 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6037 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6038 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6040 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6041 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6042 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6043 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6046 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6047 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6048 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6049 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_lib4()
6052 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_one_4x4_lib4()
6053 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_one_4x4_lib4()
6054 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_ru_one_4x4_lib4()
6055 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_ru_one_4x4_lib4()
6057 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_one_4x4_lib4()
6058 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_one_4x4_lib4()
6059 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_ru_one_4x4_lib4()
6060 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_ru_one_4x4_lib4()
6062 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_one_4x4_lib4()
6063 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_one_4x4_lib4()
6064 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_ru_one_4x4_lib4()
6065 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_ru_one_4x4_lib4()
6067 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_one_4x4_lib4()
6068 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_one_4x4_lib4()
6069 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_ru_one_4x4_lib4()
6070 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_ru_one_4x4_lib4()
6088 float CC[16] = {0}; in kernel_strsm_nt_ru_one_4x4_vs_lib4() local
6090 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6095 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6100 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6101 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6102 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6103 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6105 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6106 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6107 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6108 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6110 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6111 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6112 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6113 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6119 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6120 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6121 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6122 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6124 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6125 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6126 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6127 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6133 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6134 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6135 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6136 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6144 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6145 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6146 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6147 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6152 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6153 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6154 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6155 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6160 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6161 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6162 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6163 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6168 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6169 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6170 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6171 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6175 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6176 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6177 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6182 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6183 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6184 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6189 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6190 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6191 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6196 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6197 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6198 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6202 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6203 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6208 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6209 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6214 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6215 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6220 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6221 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6225 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6230 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6235 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6240 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nt_ru_one_4x4_vs_lib4()
6263 float CC[16] = {0}; in kernel_strsm_nn_ll_one_4x4_lib4() local
6265 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nn_ll_one_4x4_lib4()
6271 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_strsm_nn_ll_one_4x4_lib4()
6278 CC[1+bs*0] -= e_1 * CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6279 CC[2+bs*0] -= e_2 * CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6280 CC[3+bs*0] -= e_3 * CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6281 CC[1+bs*1] -= e_1 * CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6282 CC[2+bs*1] -= e_2 * CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6283 CC[3+bs*1] -= e_3 * CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6284 CC[1+bs*2] -= e_1 * CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6285 CC[2+bs*2] -= e_2 * CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6286 CC[3+bs*2] -= e_3 * CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6287 CC[1+bs*3] -= e_1 * CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6288 CC[2+bs*3] -= e_2 * CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6289 CC[3+bs*3] -= e_3 * CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6293 CC[2+bs*0] -= e_2 * CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6294 CC[3+bs*0] -= e_3 * CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6295 CC[2+bs*1] -= e_2 * CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6296 CC[3+bs*1] -= e_3 * CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6297 CC[2+bs*2] -= e_2 * CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6298 CC[3+bs*2] -= e_3 * CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6299 CC[2+bs*3] -= e_2 * CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6300 CC[3+bs*3] -= e_3 * CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6303 CC[3+bs*0] -= e_3 * CC[2+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6304 CC[3+bs*1] -= e_3 * CC[2+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6305 CC[3+bs*2] -= e_3 * CC[2+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6306 CC[3+bs*3] -= e_3 * CC[2+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6308 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6309 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6310 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6311 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nn_ll_one_4x4_lib4()
6313 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6314 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6315 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6316 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nn_ll_one_4x4_lib4()
6318 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6319 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6320 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6321 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nn_ll_one_4x4_lib4()
6323 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6324 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6325 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6326 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nn_ll_one_4x4_lib4()
6348 float CC[16] = {0}; in kernel_strsm_nn_ll_one_4x4_vs_lib4() local
6350 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6356 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6366 CC[1+bs*0] -= e_1 * CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6367 CC[2+bs*0] -= e_2 * CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6368 CC[3+bs*0] -= e_3 * CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6369 CC[1+bs*1] -= e_1 * CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6370 CC[2+bs*1] -= e_2 * CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6371 CC[3+bs*1] -= e_3 * CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6372 CC[1+bs*2] -= e_1 * CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6373 CC[2+bs*2] -= e_2 * CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6374 CC[3+bs*2] -= e_3 * CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6375 CC[1+bs*3] -= e_1 * CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6376 CC[2+bs*3] -= e_2 * CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6377 CC[3+bs*3] -= e_3 * CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6384 CC[2+bs*0] -= e_2 * CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6385 CC[3+bs*0] -= e_3 * CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6386 CC[2+bs*1] -= e_2 * CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6387 CC[3+bs*1] -= e_3 * CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6388 CC[2+bs*2] -= e_2 * CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6389 CC[3+bs*2] -= e_3 * CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6390 CC[2+bs*3] -= e_2 * CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6391 CC[3+bs*3] -= e_3 * CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6397 CC[3+bs*0] -= e_3 * CC[2+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6398 CC[3+bs*1] -= e_3 * CC[2+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6399 CC[3+bs*2] -= e_3 * CC[2+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6400 CC[3+bs*3] -= e_3 * CC[2+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6406 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6407 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6408 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6409 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6414 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6415 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6416 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6417 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6422 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6423 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6424 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6425 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6430 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6431 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6432 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6433 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6437 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6438 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6439 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6444 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6445 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6446 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6451 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6452 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6453 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6458 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6459 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6460 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6464 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6465 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6470 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6471 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6476 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6477 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6482 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6483 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6487 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6492 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6497 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6502 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ll_one_4x4_vs_lib4()
6528 float CC[16] = {0}; in kernel_strsm_nn_ru_inv_4x4_lib4() local
6530 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nn_ru_inv_4x4_lib4()
6535 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_strsm_nn_ru_inv_4x4_lib4()
6540 CC[0+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_lib4()
6541 CC[1+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_lib4()
6542 CC[2+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_lib4()
6543 CC[3+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_lib4()
6547 CC[0+bs*1] -= CC[0+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_lib4()
6548 CC[1+bs*1] -= CC[1+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_lib4()
6549 CC[2+bs*1] -= CC[2+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_lib4()
6550 CC[3+bs*1] -= CC[3+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_lib4()
6551 CC[0+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_lib4()
6552 CC[1+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_lib4()
6553 CC[2+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_lib4()
6554 CC[3+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_lib4()
6559 CC[0+bs*2] -= CC[0+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_lib4()
6560 CC[1+bs*2] -= CC[1+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_lib4()
6561 CC[2+bs*2] -= CC[2+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_lib4()
6562 CC[3+bs*2] -= CC[3+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_lib4()
6563 CC[0+bs*2] -= CC[0+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_lib4()
6564 CC[1+bs*2] -= CC[1+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_lib4()
6565 CC[2+bs*2] -= CC[2+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_lib4()
6566 CC[3+bs*2] -= CC[3+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_lib4()
6567 CC[0+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_lib4()
6568 CC[1+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_lib4()
6569 CC[2+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_lib4()
6570 CC[3+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_lib4()
6576 CC[0+bs*3] -= CC[0+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_lib4()
6577 CC[1+bs*3] -= CC[1+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_lib4()
6578 CC[2+bs*3] -= CC[2+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_lib4()
6579 CC[3+bs*3] -= CC[3+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_lib4()
6580 CC[0+bs*3] -= CC[0+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_lib4()
6581 CC[1+bs*3] -= CC[1+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_lib4()
6582 CC[2+bs*3] -= CC[2+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_lib4()
6583 CC[3+bs*3] -= CC[3+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_lib4()
6584 CC[0+bs*3] -= CC[0+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_lib4()
6585 CC[1+bs*3] -= CC[1+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_lib4()
6586 CC[2+bs*3] -= CC[2+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_lib4()
6587 CC[3+bs*3] -= CC[3+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_lib4()
6588 CC[0+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_lib4()
6589 CC[1+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_lib4()
6590 CC[2+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_lib4()
6591 CC[3+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_lib4()
6593 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6594 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6595 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6596 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6598 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6599 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6600 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6601 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6603 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6604 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6605 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6606 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6608 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6609 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6610 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6611 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nn_ru_inv_4x4_lib4()
6636 float CC[16] = {0}; in kernel_strsm_nn_ru_inv_4x4_vs_lib4() local
6638 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6643 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6648 CC[0+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6649 CC[1+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6650 CC[2+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6651 CC[3+bs*0] *= e_00; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6658 CC[0+bs*1] -= CC[0+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6659 CC[1+bs*1] -= CC[1+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6660 CC[2+bs*1] -= CC[2+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6661 CC[3+bs*1] -= CC[3+bs*0] * e_01; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6662 CC[0+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6663 CC[1+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6664 CC[2+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6665 CC[3+bs*1] *= e_11; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6673 CC[0+bs*2] -= CC[0+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6674 CC[1+bs*2] -= CC[1+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6675 CC[2+bs*2] -= CC[2+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6676 CC[3+bs*2] -= CC[3+bs*0] * e_02; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6677 CC[0+bs*2] -= CC[0+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6678 CC[1+bs*2] -= CC[1+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6679 CC[2+bs*2] -= CC[2+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6680 CC[3+bs*2] -= CC[3+bs*1] * e_12; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6681 CC[0+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6682 CC[1+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6683 CC[2+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6684 CC[3+bs*2] *= e_22; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6693 CC[0+bs*3] -= CC[0+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6694 CC[1+bs*3] -= CC[1+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6695 CC[2+bs*3] -= CC[2+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6696 CC[3+bs*3] -= CC[3+bs*0] * e_03; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6697 CC[0+bs*3] -= CC[0+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6698 CC[1+bs*3] -= CC[1+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6699 CC[2+bs*3] -= CC[2+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6700 CC[3+bs*3] -= CC[3+bs*1] * e_13; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6701 CC[0+bs*3] -= CC[0+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6702 CC[1+bs*3] -= CC[1+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6703 CC[2+bs*3] -= CC[2+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6704 CC[3+bs*3] -= CC[3+bs*2] * e_23; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6705 CC[0+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6706 CC[1+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6707 CC[2+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6708 CC[3+bs*3] *= e_33; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6714 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6715 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6716 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6717 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6722 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6723 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6724 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6725 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6730 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6731 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6732 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6733 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6738 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6739 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6740 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6741 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6745 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6746 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6747 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6752 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6753 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6754 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6759 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6760 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6761 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6766 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6767 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6768 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6772 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6773 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6778 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6779 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6784 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6785 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6790 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6791 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6795 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6800 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6805 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6810 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_ru_inv_4x4_vs_lib4()
6838 float CC[16] = {0}; in kernel_strsm_nn_lu_inv_4x4_lib4() local
6840 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nn_lu_inv_4x4_lib4()
6846 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_strsm_nn_lu_inv_4x4_lib4()
6854 CC[3+bs*0] *= e_33; in kernel_strsm_nn_lu_inv_4x4_lib4()
6855 CC[3+bs*1] *= e_33; in kernel_strsm_nn_lu_inv_4x4_lib4()
6856 CC[3+bs*2] *= e_33; in kernel_strsm_nn_lu_inv_4x4_lib4()
6857 CC[3+bs*3] *= e_33; in kernel_strsm_nn_lu_inv_4x4_lib4()
6858 CC[0+bs*0] -= e_03 * CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6859 CC[0+bs*1] -= e_03 * CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6860 CC[0+bs*2] -= e_03 * CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6861 CC[0+bs*3] -= e_03 * CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6862 CC[1+bs*0] -= e_13 * CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6863 CC[1+bs*1] -= e_13 * CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6864 CC[1+bs*2] -= e_13 * CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6865 CC[1+bs*3] -= e_13 * CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6866 CC[2+bs*0] -= e_23 * CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6867 CC[2+bs*1] -= e_23 * CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6868 CC[2+bs*2] -= e_23 * CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6869 CC[2+bs*3] -= e_23 * CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6874 CC[2+bs*0] *= e_22; in kernel_strsm_nn_lu_inv_4x4_lib4()
6875 CC[2+bs*1] *= e_22; in kernel_strsm_nn_lu_inv_4x4_lib4()
6876 CC[2+bs*2] *= e_22; in kernel_strsm_nn_lu_inv_4x4_lib4()
6877 CC[2+bs*3] *= e_22; in kernel_strsm_nn_lu_inv_4x4_lib4()
6878 CC[0+bs*0] -= e_02 * CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6879 CC[0+bs*1] -= e_02 * CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6880 CC[0+bs*2] -= e_02 * CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6881 CC[0+bs*3] -= e_02 * CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6882 CC[1+bs*0] -= e_12 * CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6883 CC[1+bs*1] -= e_12 * CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6884 CC[1+bs*2] -= e_12 * CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6885 CC[1+bs*3] -= e_12 * CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6889 CC[1+bs*0] *= e_11; in kernel_strsm_nn_lu_inv_4x4_lib4()
6890 CC[1+bs*1] *= e_11; in kernel_strsm_nn_lu_inv_4x4_lib4()
6891 CC[1+bs*2] *= e_11; in kernel_strsm_nn_lu_inv_4x4_lib4()
6892 CC[1+bs*3] *= e_11; in kernel_strsm_nn_lu_inv_4x4_lib4()
6893 CC[0+bs*0] -= e_01 * CC[1+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6894 CC[0+bs*1] -= e_01 * CC[1+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6895 CC[0+bs*2] -= e_01 * CC[1+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6896 CC[0+bs*3] -= e_01 * CC[1+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6899 CC[0+bs*0] *= e_00; in kernel_strsm_nn_lu_inv_4x4_lib4()
6900 CC[0+bs*1] *= e_00; in kernel_strsm_nn_lu_inv_4x4_lib4()
6901 CC[0+bs*2] *= e_00; in kernel_strsm_nn_lu_inv_4x4_lib4()
6902 CC[0+bs*3] *= e_00; in kernel_strsm_nn_lu_inv_4x4_lib4()
6904 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6905 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6906 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6907 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6909 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6910 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6911 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6912 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6914 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6915 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6916 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6917 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6919 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6920 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6921 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6922 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_lib4()
6949 float CC[16] = {0}; in kernel_strsm_nn_lu_inv_4x4_vs_lib4() local
6951 ALIGNED( float CC[16], 64 ) = {0}; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6957 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6967 CC[3+bs*0] *= e_33; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6968 CC[3+bs*1] *= e_33; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6969 CC[3+bs*2] *= e_33; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6970 CC[3+bs*3] *= e_33; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6971 CC[0+bs*0] -= e_03 * CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6972 CC[0+bs*1] -= e_03 * CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6973 CC[0+bs*2] -= e_03 * CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6974 CC[0+bs*3] -= e_03 * CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6975 CC[1+bs*0] -= e_13 * CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6976 CC[1+bs*1] -= e_13 * CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6977 CC[1+bs*2] -= e_13 * CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6978 CC[1+bs*3] -= e_13 * CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6979 CC[2+bs*0] -= e_23 * CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6980 CC[2+bs*1] -= e_23 * CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6981 CC[2+bs*2] -= e_23 * CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6982 CC[2+bs*3] -= e_23 * CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6990 CC[2+bs*0] *= e_22; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6991 CC[2+bs*1] *= e_22; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6992 CC[2+bs*2] *= e_22; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6993 CC[2+bs*3] *= e_22; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6994 CC[0+bs*0] -= e_02 * CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6995 CC[0+bs*1] -= e_02 * CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6996 CC[0+bs*2] -= e_02 * CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6997 CC[0+bs*3] -= e_02 * CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6998 CC[1+bs*0] -= e_12 * CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
6999 CC[1+bs*1] -= e_12 * CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7000 CC[1+bs*2] -= e_12 * CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7001 CC[1+bs*3] -= e_12 * CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7008 CC[1+bs*0] *= e_11; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7009 CC[1+bs*1] *= e_11; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7010 CC[1+bs*2] *= e_11; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7011 CC[1+bs*3] *= e_11; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7012 CC[0+bs*0] -= e_01 * CC[1+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7013 CC[0+bs*1] -= e_01 * CC[1+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7014 CC[0+bs*2] -= e_01 * CC[1+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7015 CC[0+bs*3] -= e_01 * CC[1+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7019 CC[0+bs*0] *= e_00; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7020 CC[0+bs*1] *= e_00; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7021 CC[0+bs*2] *= e_00; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7022 CC[0+bs*3] *= e_00; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7028 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7029 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7030 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7031 D[3+bs*0] = CC[3+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7036 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7037 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7038 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7039 D[3+bs*1] = CC[3+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7044 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7045 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7046 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7047 D[3+bs*2] = CC[3+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7052 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7053 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7054 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7055 D[3+bs*3] = CC[3+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7059 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7060 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7061 D[2+bs*0] = CC[2+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7066 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7067 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7068 D[2+bs*1] = CC[2+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7073 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7074 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7075 D[2+bs*2] = CC[2+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7080 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7081 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7082 D[2+bs*3] = CC[2+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7086 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7087 D[1+bs*0] = CC[1+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7092 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7093 D[1+bs*1] = CC[1+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7098 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7099 D[1+bs*2] = CC[1+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7104 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7105 D[1+bs*3] = CC[1+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7109 D[0+bs*0] = CC[0+bs*0]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7114 D[0+bs*1] = CC[0+bs*1]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7119 D[0+bs*2] = CC[0+bs*2]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()
7124 D[0+bs*3] = CC[0+bs*3]; in kernel_strsm_nn_lu_inv_4x4_vs_lib4()