Lines Matching refs:CC

65 	double CC[16] = {0};  in kernel_dgemm_nt_4x4_lib4()  local
67 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_nt_4x4_lib4()
87 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nt_4x4_lib4()
88 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nt_4x4_lib4()
89 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nt_4x4_lib4()
90 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nt_4x4_lib4()
92 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nt_4x4_lib4()
93 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nt_4x4_lib4()
94 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nt_4x4_lib4()
95 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nt_4x4_lib4()
97 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nt_4x4_lib4()
98 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nt_4x4_lib4()
99 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nt_4x4_lib4()
100 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nt_4x4_lib4()
102 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nt_4x4_lib4()
103 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nt_4x4_lib4()
104 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nt_4x4_lib4()
105 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nt_4x4_lib4()
120 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nt_4x4_lib4()
121 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nt_4x4_lib4()
122 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nt_4x4_lib4()
123 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nt_4x4_lib4()
125 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nt_4x4_lib4()
126 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nt_4x4_lib4()
127 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nt_4x4_lib4()
128 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nt_4x4_lib4()
130 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nt_4x4_lib4()
131 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nt_4x4_lib4()
132 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nt_4x4_lib4()
133 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nt_4x4_lib4()
135 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nt_4x4_lib4()
136 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nt_4x4_lib4()
137 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nt_4x4_lib4()
138 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nt_4x4_lib4()
153 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nt_4x4_lib4()
154 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nt_4x4_lib4()
155 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nt_4x4_lib4()
156 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nt_4x4_lib4()
158 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nt_4x4_lib4()
159 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nt_4x4_lib4()
160 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nt_4x4_lib4()
161 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nt_4x4_lib4()
163 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nt_4x4_lib4()
164 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nt_4x4_lib4()
165 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nt_4x4_lib4()
166 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nt_4x4_lib4()
168 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nt_4x4_lib4()
169 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nt_4x4_lib4()
170 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nt_4x4_lib4()
171 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nt_4x4_lib4()
186 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nt_4x4_lib4()
187 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nt_4x4_lib4()
188 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nt_4x4_lib4()
189 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nt_4x4_lib4()
191 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nt_4x4_lib4()
192 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nt_4x4_lib4()
193 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nt_4x4_lib4()
194 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nt_4x4_lib4()
196 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nt_4x4_lib4()
197 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nt_4x4_lib4()
198 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nt_4x4_lib4()
199 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nt_4x4_lib4()
201 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nt_4x4_lib4()
202 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nt_4x4_lib4()
203 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nt_4x4_lib4()
204 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nt_4x4_lib4()
226 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nt_4x4_lib4()
227 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nt_4x4_lib4()
228 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nt_4x4_lib4()
229 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nt_4x4_lib4()
231 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nt_4x4_lib4()
232 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nt_4x4_lib4()
233 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nt_4x4_lib4()
234 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nt_4x4_lib4()
236 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nt_4x4_lib4()
237 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nt_4x4_lib4()
238 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nt_4x4_lib4()
239 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nt_4x4_lib4()
241 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nt_4x4_lib4()
242 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nt_4x4_lib4()
243 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nt_4x4_lib4()
244 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nt_4x4_lib4()
251 D[0+bs*0] = beta[0]*C[0+bs*0] + alpha[0]*CC[0+bs*0]; in kernel_dgemm_nt_4x4_lib4()
252 D[1+bs*0] = beta[0]*C[1+bs*0] + alpha[0]*CC[1+bs*0]; in kernel_dgemm_nt_4x4_lib4()
253 D[2+bs*0] = beta[0]*C[2+bs*0] + alpha[0]*CC[2+bs*0]; in kernel_dgemm_nt_4x4_lib4()
254 D[3+bs*0] = beta[0]*C[3+bs*0] + alpha[0]*CC[3+bs*0]; in kernel_dgemm_nt_4x4_lib4()
256 D[0+bs*1] = beta[0]*C[0+bs*1] + alpha[0]*CC[0+bs*1]; in kernel_dgemm_nt_4x4_lib4()
257 D[1+bs*1] = beta[0]*C[1+bs*1] + alpha[0]*CC[1+bs*1]; in kernel_dgemm_nt_4x4_lib4()
258 D[2+bs*1] = beta[0]*C[2+bs*1] + alpha[0]*CC[2+bs*1]; in kernel_dgemm_nt_4x4_lib4()
259 D[3+bs*1] = beta[0]*C[3+bs*1] + alpha[0]*CC[3+bs*1]; in kernel_dgemm_nt_4x4_lib4()
261 D[0+bs*2] = beta[0]*C[0+bs*2] + alpha[0]*CC[0+bs*2]; in kernel_dgemm_nt_4x4_lib4()
262 D[1+bs*2] = beta[0]*C[1+bs*2] + alpha[0]*CC[1+bs*2]; in kernel_dgemm_nt_4x4_lib4()
263 D[2+bs*2] = beta[0]*C[2+bs*2] + alpha[0]*CC[2+bs*2]; in kernel_dgemm_nt_4x4_lib4()
264 D[3+bs*2] = beta[0]*C[3+bs*2] + alpha[0]*CC[3+bs*2]; in kernel_dgemm_nt_4x4_lib4()
266 D[0+bs*3] = beta[0]*C[0+bs*3] + alpha[0]*CC[0+bs*3]; in kernel_dgemm_nt_4x4_lib4()
267 D[1+bs*3] = beta[0]*C[1+bs*3] + alpha[0]*CC[1+bs*3]; in kernel_dgemm_nt_4x4_lib4()
268 D[2+bs*3] = beta[0]*C[2+bs*3] + alpha[0]*CC[2+bs*3]; in kernel_dgemm_nt_4x4_lib4()
269 D[3+bs*3] = beta[0]*C[3+bs*3] + alpha[0]*CC[3+bs*3]; in kernel_dgemm_nt_4x4_lib4()
285 double CC[16] = {0}; in kernel_dgemm_nt_4x4_vs_lib4() local
287 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_nt_4x4_vs_lib4()
290 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_dgemm_nt_4x4_vs_lib4()
294 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
295 D[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
296 D[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
297 D[3+bs*0] = CC[3+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
302 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
303 D[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
304 D[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
305 D[3+bs*1] = CC[3+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
310 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
311 D[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
312 D[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
313 D[3+bs*2] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
318 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
319 D[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
320 D[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
321 D[3+bs*3] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
325 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
326 D[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
327 D[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
332 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
333 D[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
334 D[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
339 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
340 D[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
341 D[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
346 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
347 D[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
348 D[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
352 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
353 D[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
358 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
359 D[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
364 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
365 D[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
370 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
371 D[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
375 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_vs_lib4()
380 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_vs_lib4()
385 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_vs_lib4()
390 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_vs_lib4()
407 double CC[16] = {0}; in kernel_dgemm_nt_4x4_gen_lib4() local
409 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_nt_4x4_gen_lib4()
417 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
418 CC[1+bs*0] = beta[0]*C0[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
419 CC[2+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
420 CC[3+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
422 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
423 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
424 CC[2+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
425 CC[3+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
427 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
428 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
429 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
430 CC[3+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
432 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
433 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
434 CC[2+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
435 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
441 CC[0+bs*0] = beta[0]*C0[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
442 CC[1+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
443 CC[2+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
444 CC[3+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
446 CC[0+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
447 CC[1+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
448 CC[2+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
449 CC[3+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
451 CC[0+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
452 CC[1+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
453 CC[2+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
454 CC[3+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
456 CC[0+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
457 CC[1+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
458 CC[2+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
459 CC[3+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
465 CC[0+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
466 CC[1+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
467 CC[2+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
468 CC[3+bs*0] = beta[0]*C1[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
470 CC[0+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
471 CC[1+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
472 CC[2+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
473 CC[3+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
475 CC[0+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
476 CC[1+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
477 CC[2+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
478 CC[3+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
480 CC[0+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
481 CC[1+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
482 CC[2+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
483 CC[3+bs*3] = beta[0]*C1[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
489 CC[0+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
490 CC[1+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
491 CC[2+bs*0] = beta[0]*C1[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
492 CC[3+bs*0] = beta[0]*C1[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
494 CC[0+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
495 CC[1+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
496 CC[2+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
497 CC[3+bs*1] = beta[0]*C1[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
499 CC[0+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
500 CC[1+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
501 CC[2+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
502 CC[3+bs*2] = beta[0]*C1[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
504 CC[0+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
505 CC[1+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
506 CC[2+bs*3] = beta[0]*C1[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
507 CC[3+bs*3] = beta[0]*C1[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
512 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC); in kernel_dgemm_nt_4x4_gen_lib4()
519 CC[0+bs*0] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
520 CC[1+bs*0] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
521 CC[2+bs*0] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
522 CC[3+bs*0] = CC[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
524 CC[0+bs*1] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
525 CC[1+bs*1] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
526 CC[2+bs*1] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
527 CC[3+bs*1] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
529 CC[0+bs*2] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
530 CC[1+bs*2] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
531 CC[2+bs*2] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
532 CC[3+bs*2] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
538 CC[0+bs*0] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
539 CC[1+bs*0] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
540 CC[2+bs*0] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
541 CC[3+bs*0] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
543 CC[0+bs*1] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
544 CC[1+bs*1] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
545 CC[2+bs*1] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
546 CC[3+bs*1] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
552 CC[0+bs*0] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
553 CC[1+bs*0] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
554 CC[2+bs*0] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
555 CC[3+bs*0] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
569 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
570 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
571 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
572 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
577 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
578 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
579 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
580 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
585 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
586 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
587 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
588 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
593 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
594 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
595 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
596 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
605 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
606 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
607 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
608 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
613 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
614 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
615 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
616 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
621 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
622 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
623 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
624 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
629 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
630 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
631 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
632 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
641 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
642 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
643 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
644 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
649 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
650 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
651 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
652 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
657 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
658 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
659 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
660 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
665 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
666 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
667 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
668 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
677 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
678 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
679 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
680 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dgemm_nt_4x4_gen_lib4()
685 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
686 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
687 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
688 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dgemm_nt_4x4_gen_lib4()
693 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
694 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
695 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
696 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_dgemm_nt_4x4_gen_lib4()
701 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
702 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
703 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
704 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_dgemm_nt_4x4_gen_lib4()
734 double CC[16] = {0}; in kernel_dgemm_nn_4x4_lib4() local
736 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_nn_4x4_lib4()
762 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
763 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
764 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
765 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
767 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
768 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
769 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
770 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
772 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
773 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
774 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
775 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
777 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
778 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
779 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
780 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
799 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
800 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
801 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
802 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
804 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
805 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
806 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
807 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
809 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
810 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
811 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
812 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
814 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
815 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
816 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
817 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
836 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
837 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
838 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
839 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
841 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
842 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
843 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
844 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
846 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
847 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
848 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
849 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
851 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
852 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
853 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
854 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
877 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
878 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
879 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
880 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
882 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
883 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
884 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
885 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
887 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
888 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
889 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
890 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
892 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
893 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
894 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
895 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
914 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
915 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
916 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
917 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
919 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
920 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
921 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
922 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
924 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
925 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
926 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
927 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
929 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
930 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
931 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
932 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
955 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
956 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
957 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
958 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
960 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
961 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
962 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
963 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
965 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
966 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
967 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
968 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
970 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
971 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
972 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
973 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
997 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
998 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
999 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
1000 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
1002 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
1003 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
1004 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
1005 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
1007 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
1008 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
1009 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
1010 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
1012 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
1013 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
1014 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
1015 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
1030 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
1031 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
1032 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
1033 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
1035 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
1036 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
1037 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
1038 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
1040 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
1041 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
1042 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
1043 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
1045 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
1046 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
1047 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
1048 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
1063 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
1064 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
1065 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
1066 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
1068 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
1069 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
1070 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
1071 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
1073 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
1074 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
1075 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
1076 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
1078 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
1079 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
1080 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
1081 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
1096 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
1097 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
1098 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
1099 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
1101 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
1102 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
1103 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
1104 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
1106 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
1107 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
1108 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
1109 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
1111 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
1112 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
1113 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
1114 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
1135 CC[0+bs*0] += a_0 * b_0; in kernel_dgemm_nn_4x4_lib4()
1136 CC[1+bs*0] += a_1 * b_0; in kernel_dgemm_nn_4x4_lib4()
1137 CC[2+bs*0] += a_2 * b_0; in kernel_dgemm_nn_4x4_lib4()
1138 CC[3+bs*0] += a_3 * b_0; in kernel_dgemm_nn_4x4_lib4()
1140 CC[0+bs*1] += a_0 * b_1; in kernel_dgemm_nn_4x4_lib4()
1141 CC[1+bs*1] += a_1 * b_1; in kernel_dgemm_nn_4x4_lib4()
1142 CC[2+bs*1] += a_2 * b_1; in kernel_dgemm_nn_4x4_lib4()
1143 CC[3+bs*1] += a_3 * b_1; in kernel_dgemm_nn_4x4_lib4()
1145 CC[0+bs*2] += a_0 * b_2; in kernel_dgemm_nn_4x4_lib4()
1146 CC[1+bs*2] += a_1 * b_2; in kernel_dgemm_nn_4x4_lib4()
1147 CC[2+bs*2] += a_2 * b_2; in kernel_dgemm_nn_4x4_lib4()
1148 CC[3+bs*2] += a_3 * b_2; in kernel_dgemm_nn_4x4_lib4()
1150 CC[0+bs*3] += a_0 * b_3; in kernel_dgemm_nn_4x4_lib4()
1151 CC[1+bs*3] += a_1 * b_3; in kernel_dgemm_nn_4x4_lib4()
1152 CC[2+bs*3] += a_2 * b_3; in kernel_dgemm_nn_4x4_lib4()
1153 CC[3+bs*3] += a_3 * b_3; in kernel_dgemm_nn_4x4_lib4()
1162 D[0+bs*0] = beta[0]*C[0+bs*0] + alpha[0]*CC[0+bs*0]; in kernel_dgemm_nn_4x4_lib4()
1163 D[1+bs*0] = beta[0]*C[1+bs*0] + alpha[0]*CC[1+bs*0]; in kernel_dgemm_nn_4x4_lib4()
1164 D[2+bs*0] = beta[0]*C[2+bs*0] + alpha[0]*CC[2+bs*0]; in kernel_dgemm_nn_4x4_lib4()
1165 D[3+bs*0] = beta[0]*C[3+bs*0] + alpha[0]*CC[3+bs*0]; in kernel_dgemm_nn_4x4_lib4()
1167 D[0+bs*1] = beta[0]*C[0+bs*1] + alpha[0]*CC[0+bs*1]; in kernel_dgemm_nn_4x4_lib4()
1168 D[1+bs*1] = beta[0]*C[1+bs*1] + alpha[0]*CC[1+bs*1]; in kernel_dgemm_nn_4x4_lib4()
1169 D[2+bs*1] = beta[0]*C[2+bs*1] + alpha[0]*CC[2+bs*1]; in kernel_dgemm_nn_4x4_lib4()
1170 D[3+bs*1] = beta[0]*C[3+bs*1] + alpha[0]*CC[3+bs*1]; in kernel_dgemm_nn_4x4_lib4()
1172 D[0+bs*2] = beta[0]*C[0+bs*2] + alpha[0]*CC[0+bs*2]; in kernel_dgemm_nn_4x4_lib4()
1173 D[1+bs*2] = beta[0]*C[1+bs*2] + alpha[0]*CC[1+bs*2]; in kernel_dgemm_nn_4x4_lib4()
1174 D[2+bs*2] = beta[0]*C[2+bs*2] + alpha[0]*CC[2+bs*2]; in kernel_dgemm_nn_4x4_lib4()
1175 D[3+bs*2] = beta[0]*C[3+bs*2] + alpha[0]*CC[3+bs*2]; in kernel_dgemm_nn_4x4_lib4()
1177 D[0+bs*3] = beta[0]*C[0+bs*3] + alpha[0]*CC[0+bs*3]; in kernel_dgemm_nn_4x4_lib4()
1178 D[1+bs*3] = beta[0]*C[1+bs*3] + alpha[0]*CC[1+bs*3]; in kernel_dgemm_nn_4x4_lib4()
1179 D[2+bs*3] = beta[0]*C[2+bs*3] + alpha[0]*CC[2+bs*3]; in kernel_dgemm_nn_4x4_lib4()
1180 D[3+bs*3] = beta[0]*C[3+bs*3] + alpha[0]*CC[3+bs*3]; in kernel_dgemm_nn_4x4_lib4()
1196 double CC[16] = {0}; in kernel_dgemm_nn_4x4_vs_lib4() local
1198 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_nn_4x4_vs_lib4()
1201 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC); in kernel_dgemm_nn_4x4_vs_lib4()
1205 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1206 D[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1207 D[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1208 D[3+bs*0] = CC[3+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1213 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1214 D[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1215 D[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1216 D[3+bs*1] = CC[3+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1221 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1222 D[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1223 D[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1224 D[3+bs*2] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1229 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1230 D[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1231 D[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1232 D[3+bs*3] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1236 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1237 D[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1238 D[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1243 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1244 D[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1245 D[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1250 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1251 D[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1252 D[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1257 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1258 D[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1259 D[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1263 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1264 D[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1269 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1270 D[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1275 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1276 D[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1281 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1282 D[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1286 D[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_vs_lib4()
1291 D[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_vs_lib4()
1296 D[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_vs_lib4()
1301 D[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_vs_lib4()
1318 double CC[16] = {0}; in kernel_dgemm_nn_4x4_gen_lib4() local
1320 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_nn_4x4_gen_lib4()
1328 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1329 CC[1+bs*0] = beta[0]*C0[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1330 CC[2+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1331 CC[3+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1333 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1334 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1335 CC[2+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1336 CC[3+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1338 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1339 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1340 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1341 CC[3+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1343 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1344 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1345 CC[2+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1346 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1352 CC[0+bs*0] = beta[0]*C0[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1353 CC[1+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1354 CC[2+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1355 CC[3+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1357 CC[0+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1358 CC[1+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1359 CC[2+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1360 CC[3+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1362 CC[0+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1363 CC[1+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1364 CC[2+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1365 CC[3+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1367 CC[0+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1368 CC[1+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1369 CC[2+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1370 CC[3+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1376 CC[0+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1377 CC[1+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1378 CC[2+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1379 CC[3+bs*0] = beta[0]*C1[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1381 CC[0+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1382 CC[1+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1383 CC[2+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1384 CC[3+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1386 CC[0+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1387 CC[1+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1388 CC[2+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1389 CC[3+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1391 CC[0+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1392 CC[1+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1393 CC[2+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1394 CC[3+bs*3] = beta[0]*C1[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1400 CC[0+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1401 CC[1+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1402 CC[2+bs*0] = beta[0]*C1[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1403 CC[3+bs*0] = beta[0]*C1[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1405 CC[0+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1406 CC[1+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1407 CC[2+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1408 CC[3+bs*1] = beta[0]*C1[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1410 CC[0+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1411 CC[1+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1412 CC[2+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1413 CC[3+bs*2] = beta[0]*C1[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1415 CC[0+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1416 CC[1+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1417 CC[2+bs*3] = beta[0]*C1[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1418 CC[3+bs*3] = beta[0]*C1[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1423 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, &beta1, CC, CC); in kernel_dgemm_nn_4x4_gen_lib4()
1430 CC[0+bs*0] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1431 CC[1+bs*0] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1432 CC[2+bs*0] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1433 CC[3+bs*0] = CC[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1435 CC[0+bs*1] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1436 CC[1+bs*1] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1437 CC[2+bs*1] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1438 CC[3+bs*1] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1440 CC[0+bs*2] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1441 CC[1+bs*2] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1442 CC[2+bs*2] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1443 CC[3+bs*2] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1449 CC[0+bs*0] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1450 CC[1+bs*0] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1451 CC[2+bs*0] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1452 CC[3+bs*0] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1454 CC[0+bs*1] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1455 CC[1+bs*1] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1456 CC[2+bs*1] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1457 CC[3+bs*1] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1463 CC[0+bs*0] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1464 CC[1+bs*0] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1465 CC[2+bs*0] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1466 CC[3+bs*0] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1480 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1481 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1482 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1483 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1488 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1489 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1490 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1491 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1496 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1497 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1498 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1499 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1504 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1505 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1506 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1507 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1516 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1517 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1518 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1519 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1524 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1525 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1526 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1527 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1532 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1533 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1534 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1535 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1540 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1541 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1542 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1543 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1552 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1553 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1554 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1555 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1560 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1561 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1562 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1563 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1568 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1569 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1570 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1571 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1576 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1577 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1578 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1579 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1588 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1589 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1590 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1591 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dgemm_nn_4x4_gen_lib4()
1596 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1597 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1598 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1599 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dgemm_nn_4x4_gen_lib4()
1604 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1605 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1606 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1607 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_dgemm_nn_4x4_gen_lib4()
1612 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1613 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1614 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1615 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_dgemm_nn_4x4_gen_lib4()
1632 double CC[16] = {0}; in kernel_dgemm_tt_4x4_lib4() local
1634 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_tt_4x4_lib4()
1639 kernel_dgemm_nn_4x4_lib4(kmax, alpha, B, offsetA, A, sda, &beta1, CC, CC); in kernel_dgemm_tt_4x4_lib4()
1644 tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp; in kernel_dgemm_tt_4x4_lib4()
1645 tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp; in kernel_dgemm_tt_4x4_lib4()
1646 tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp; in kernel_dgemm_tt_4x4_lib4()
1647 tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp; in kernel_dgemm_tt_4x4_lib4()
1648 tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp; in kernel_dgemm_tt_4x4_lib4()
1649 tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp; in kernel_dgemm_tt_4x4_lib4()
1652 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0]; in kernel_dgemm_tt_4x4_lib4()
1653 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0]; in kernel_dgemm_tt_4x4_lib4()
1654 D[2+bs*0] = beta[0]*C[2+bs*0] + CC[2+bs*0]; in kernel_dgemm_tt_4x4_lib4()
1655 D[3+bs*0] = beta[0]*C[3+bs*0] + CC[3+bs*0]; in kernel_dgemm_tt_4x4_lib4()
1657 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1]; in kernel_dgemm_tt_4x4_lib4()
1658 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1]; in kernel_dgemm_tt_4x4_lib4()
1659 D[2+bs*1] = beta[0]*C[2+bs*1] + CC[2+bs*1]; in kernel_dgemm_tt_4x4_lib4()
1660 D[3+bs*1] = beta[0]*C[3+bs*1] + CC[3+bs*1]; in kernel_dgemm_tt_4x4_lib4()
1662 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2]; in kernel_dgemm_tt_4x4_lib4()
1663 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2]; in kernel_dgemm_tt_4x4_lib4()
1664 D[2+bs*2] = beta[0]*C[2+bs*2] + CC[2+bs*2]; in kernel_dgemm_tt_4x4_lib4()
1665 D[3+bs*2] = beta[0]*C[3+bs*2] + CC[3+bs*2]; in kernel_dgemm_tt_4x4_lib4()
1667 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3]; in kernel_dgemm_tt_4x4_lib4()
1668 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3]; in kernel_dgemm_tt_4x4_lib4()
1669 D[2+bs*3] = beta[0]*C[2+bs*3] + CC[2+bs*3]; in kernel_dgemm_tt_4x4_lib4()
1670 D[3+bs*3] = beta[0]*C[3+bs*3] + CC[3+bs*3]; in kernel_dgemm_tt_4x4_lib4()
1686 double CC[16] = {0}; in kernel_dgemm_tt_4x4_vs_lib4() local
1688 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_tt_4x4_vs_lib4()
1693 kernel_dgemm_nn_4x4_lib4(kmax, alpha, B, offsetA, A, sda, &beta1, CC, CC); in kernel_dgemm_tt_4x4_vs_lib4()
1698 tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp; in kernel_dgemm_tt_4x4_vs_lib4()
1699 tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp; in kernel_dgemm_tt_4x4_vs_lib4()
1700 tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp; in kernel_dgemm_tt_4x4_vs_lib4()
1701 tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp; in kernel_dgemm_tt_4x4_vs_lib4()
1702 tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp; in kernel_dgemm_tt_4x4_vs_lib4()
1703 tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp; in kernel_dgemm_tt_4x4_vs_lib4()
1708 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1709 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1710 D[2+bs*0] = beta[0]*C[2+bs*0] + CC[2+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1711 D[3+bs*0] = beta[0]*C[3+bs*0] + CC[3+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1716 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1717 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1718 D[2+bs*1] = beta[0]*C[2+bs*1] + CC[2+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1719 D[3+bs*1] = beta[0]*C[3+bs*1] + CC[3+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1724 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1725 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1726 D[2+bs*2] = beta[0]*C[2+bs*2] + CC[2+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1727 D[3+bs*2] = beta[0]*C[3+bs*2] + CC[3+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1732 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1733 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1734 D[2+bs*3] = beta[0]*C[2+bs*3] + CC[2+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1735 D[3+bs*3] = beta[0]*C[3+bs*3] + CC[3+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1739 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1740 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1741 D[2+bs*0] = beta[0]*C[2+bs*0] + CC[2+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1746 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1747 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1748 D[2+bs*1] = beta[0]*C[2+bs*1] + CC[2+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1753 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1754 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1755 D[2+bs*2] = beta[0]*C[2+bs*2] + CC[2+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1760 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1761 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1762 D[2+bs*3] = beta[0]*C[2+bs*3] + CC[2+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1766 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1767 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1772 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1773 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1778 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1779 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1784 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1785 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1789 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0]; in kernel_dgemm_tt_4x4_vs_lib4()
1794 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1]; in kernel_dgemm_tt_4x4_vs_lib4()
1799 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2]; in kernel_dgemm_tt_4x4_vs_lib4()
1804 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3]; in kernel_dgemm_tt_4x4_vs_lib4()
1821 double CC[16] = {0}; in kernel_dgemm_tt_4x4_gen_lib4() local
1823 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgemm_tt_4x4_gen_lib4()
1828 kernel_dgemm_nn_4x4_lib4(kmax, alpha, B, offsetA, A, sda, &beta1, CC, CC); in kernel_dgemm_tt_4x4_gen_lib4()
1833 tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp; in kernel_dgemm_tt_4x4_gen_lib4()
1834 tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp; in kernel_dgemm_tt_4x4_gen_lib4()
1835 tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp; in kernel_dgemm_tt_4x4_gen_lib4()
1836 tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp; in kernel_dgemm_tt_4x4_gen_lib4()
1837 tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp; in kernel_dgemm_tt_4x4_gen_lib4()
1838 tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp; in kernel_dgemm_tt_4x4_gen_lib4()
1846 CC[0+bs*0] += beta[0]*C0[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1847 CC[1+bs*0] += beta[0]*C0[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1848 CC[2+bs*0] += beta[0]*C0[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1849 CC[3+bs*0] += beta[0]*C0[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1851 CC[0+bs*1] += beta[0]*C0[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1852 CC[1+bs*1] += beta[0]*C0[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1853 CC[2+bs*1] += beta[0]*C0[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1854 CC[3+bs*1] += beta[0]*C0[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1856 CC[0+bs*2] += beta[0]*C0[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1857 CC[1+bs*2] += beta[0]*C0[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1858 CC[2+bs*2] += beta[0]*C0[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1859 CC[3+bs*2] += beta[0]*C0[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1861 CC[0+bs*3] += beta[0]*C0[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1862 CC[1+bs*3] += beta[0]*C0[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1863 CC[2+bs*3] += beta[0]*C0[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1864 CC[3+bs*3] += beta[0]*C0[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1870 CC[0+bs*0] += beta[0]*C0[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1871 CC[1+bs*0] += beta[0]*C0[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1872 CC[2+bs*0] += beta[0]*C0[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1873 CC[3+bs*0] += beta[0]*C1[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1875 CC[0+bs*1] += beta[0]*C0[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1876 CC[1+bs*1] += beta[0]*C0[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1877 CC[2+bs*1] += beta[0]*C0[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1878 CC[3+bs*1] += beta[0]*C1[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1880 CC[0+bs*2] += beta[0]*C0[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1881 CC[1+bs*2] += beta[0]*C0[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1882 CC[2+bs*2] += beta[0]*C0[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1883 CC[3+bs*2] += beta[0]*C1[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1885 CC[0+bs*3] += beta[0]*C0[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1886 CC[1+bs*3] += beta[0]*C0[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1887 CC[2+bs*3] += beta[0]*C0[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1888 CC[3+bs*3] += beta[0]*C1[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1894 CC[0+bs*0] += beta[0]*C0[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1895 CC[1+bs*0] += beta[0]*C0[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1896 CC[2+bs*0] += beta[0]*C1[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1897 CC[3+bs*0] += beta[0]*C1[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1899 CC[0+bs*1] += beta[0]*C0[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1900 CC[1+bs*1] += beta[0]*C0[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1901 CC[2+bs*1] += beta[0]*C1[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1902 CC[3+bs*1] += beta[0]*C1[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1904 CC[0+bs*2] += beta[0]*C0[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1905 CC[1+bs*2] += beta[0]*C0[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1906 CC[2+bs*2] += beta[0]*C1[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1907 CC[3+bs*2] += beta[0]*C1[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1909 CC[0+bs*3] += beta[0]*C0[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1910 CC[1+bs*3] += beta[0]*C0[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1911 CC[2+bs*3] += beta[0]*C1[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1912 CC[3+bs*3] += beta[0]*C1[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1918 CC[0+bs*0] += beta[0]*C0[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1919 CC[1+bs*0] += beta[0]*C1[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1920 CC[2+bs*0] += beta[0]*C1[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1921 CC[3+bs*0] += beta[0]*C1[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1923 CC[0+bs*1] += beta[0]*C0[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1924 CC[1+bs*1] += beta[0]*C1[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1925 CC[2+bs*1] += beta[0]*C1[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1926 CC[3+bs*1] += beta[0]*C1[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1928 CC[0+bs*2] += beta[0]*C0[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1929 CC[1+bs*2] += beta[0]*C1[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1930 CC[2+bs*2] += beta[0]*C1[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1931 CC[3+bs*2] += beta[0]*C1[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1933 CC[0+bs*3] += beta[0]*C0[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1934 CC[1+bs*3] += beta[0]*C1[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1935 CC[2+bs*3] += beta[0]*C1[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1936 CC[3+bs*3] += beta[0]*C1[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1946 CC[0+bs*0] = CC[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1947 CC[1+bs*0] = CC[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1948 CC[2+bs*0] = CC[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1949 CC[3+bs*0] = CC[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
1951 CC[0+bs*1] = CC[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1952 CC[1+bs*1] = CC[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1953 CC[2+bs*1] = CC[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1954 CC[3+bs*1] = CC[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1956 CC[0+bs*2] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1957 CC[1+bs*2] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1958 CC[2+bs*2] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1959 CC[3+bs*2] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1965 CC[0+bs*0] = CC[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1966 CC[1+bs*0] = CC[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1967 CC[2+bs*0] = CC[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1968 CC[3+bs*0] = CC[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
1970 CC[0+bs*1] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1971 CC[1+bs*1] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1972 CC[2+bs*1] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1973 CC[3+bs*1] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1979 CC[0+bs*0] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1980 CC[1+bs*0] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1981 CC[2+bs*0] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1982 CC[3+bs*0] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
1996 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1997 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1998 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
1999 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2004 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2005 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2006 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2007 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2012 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2013 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2014 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2015 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2020 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2021 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2022 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2023 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2032 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2033 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2034 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2035 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2040 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2041 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2042 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2043 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2048 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2049 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2050 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2051 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2056 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2057 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2058 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2059 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2068 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2069 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2070 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2071 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2076 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2077 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2078 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2079 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2084 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2085 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2086 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2087 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2092 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2093 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2094 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2095 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2104 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2105 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2106 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2107 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dgemm_tt_4x4_gen_lib4()
2112 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2113 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2114 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2115 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dgemm_tt_4x4_gen_lib4()
2120 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2121 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2122 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2123 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_dgemm_tt_4x4_gen_lib4()
2128 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2129 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2130 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2131 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_dgemm_tt_4x4_gen_lib4()
2148 double CC[16] = {0}; in kernel_dsyrk_nn_u_4x4_lib4() local
2150 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nn_u_4x4_lib4()
2153 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC); in kernel_dsyrk_nn_u_4x4_lib4()
2155 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nn_u_4x4_lib4()
2157 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nn_u_4x4_lib4()
2158 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nn_u_4x4_lib4()
2160 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nn_u_4x4_lib4()
2161 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nn_u_4x4_lib4()
2162 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nn_u_4x4_lib4()
2164 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nn_u_4x4_lib4()
2165 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nn_u_4x4_lib4()
2166 D[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nn_u_4x4_lib4()
2167 D[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nn_u_4x4_lib4()
2183 double CC[16] = {0}; in kernel_dsyrk_nn_u_4x4_vs_lib4() local
2185 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2188 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC); in kernel_dsyrk_nn_u_4x4_vs_lib4()
2192 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2197 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2198 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2203 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2204 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2205 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2210 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2211 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2212 D[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2213 D[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2217 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2222 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2223 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2228 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2229 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2230 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2235 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2236 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2237 D[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2241 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2246 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2247 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2252 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2253 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2258 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2259 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2263 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2268 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2273 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2278 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nn_u_4x4_vs_lib4()
2295 double CC[16] = {0}; in kernel_dsyrk_nt_l_4x4_lib4() local
2297 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nt_l_4x4_lib4()
2300 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_dsyrk_nt_l_4x4_lib4()
2302 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_lib4()
2303 D[1+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_lib4()
2304 D[2+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_lib4()
2305 D[3+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_lib4()
2307 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_lib4()
2308 D[2+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_lib4()
2309 D[3+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_lib4()
2311 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_lib4()
2312 D[3+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_lib4()
2314 D[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_lib4()
2330 double CC[16] = {0}; in kernel_dsyrk_nt_l_4x4_vs_lib4() local
2332 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2335 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_dsyrk_nt_l_4x4_vs_lib4()
2339 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2340 D[1+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2341 D[2+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2342 D[3+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2347 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2348 D[2+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2349 D[3+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2354 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2355 D[3+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2360 D[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2364 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2365 D[1+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2366 D[2+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2371 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2372 D[2+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2377 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2381 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2382 D[1+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2387 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2391 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_vs_lib4()
2408 double CC[16] = {0}; in kernel_dsyrk_nt_l_4x4_gen_lib4() local
2410 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2418 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2419 CC[1+bs*0] = beta[0]*C0[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2420 CC[2+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2421 CC[3+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2423 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2424 CC[2+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2425 CC[3+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2427 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2428 CC[3+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2430 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2436 CC[0+bs*0] = beta[0]*C0[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2437 CC[1+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2438 CC[2+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2439 CC[3+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2441 CC[1+bs*1] = beta[0]*C0[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2442 CC[2+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2443 CC[3+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2445 CC[2+bs*2] = beta[0]*C0[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2446 CC[3+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2448 CC[3+bs*3] = beta[0]*C1[0+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2454 CC[0+bs*0] = beta[0]*C0[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2455 CC[1+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2456 CC[2+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2457 CC[3+bs*0] = beta[0]*C1[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2459 CC[1+bs*1] = beta[0]*C0[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2460 CC[2+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2461 CC[3+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2463 CC[2+bs*2] = beta[0]*C1[0+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2464 CC[3+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2466 CC[3+bs*3] = beta[0]*C1[1+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2472 CC[0+bs*0] = beta[0]*C0[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2473 CC[1+bs*0] = beta[0]*C1[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2474 CC[2+bs*0] = beta[0]*C1[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2475 CC[3+bs*0] = beta[0]*C1[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2477 CC[1+bs*1] = beta[0]*C1[0+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2478 CC[2+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2479 CC[3+bs*1] = beta[0]*C1[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2481 CC[2+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2482 CC[3+bs*2] = beta[0]*C1[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2484 CC[3+bs*3] = beta[0]*C1[2+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2489 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC); in kernel_dsyrk_nt_l_4x4_gen_lib4()
2496 CC[0+bs*0] = CC[0+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2497 CC[1+bs*0] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2498 CC[2+bs*0] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2499 CC[3+bs*0] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2501 CC[0+bs*1] = CC[0+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2502 CC[1+bs*1] = CC[1+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2503 CC[2+bs*1] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2504 CC[3+bs*1] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2506 CC[0+bs*2] = CC[0+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2507 CC[1+bs*2] = CC[1+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2508 CC[2+bs*2] = CC[2+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2509 CC[3+bs*2] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2515 CC[0+bs*0] = CC[0+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2516 CC[1+bs*0] = CC[1+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2517 CC[2+bs*0] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2518 CC[3+bs*0] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2520 CC[0+bs*1] = CC[0+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2521 CC[1+bs*1] = CC[1+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2522 CC[2+bs*1] = CC[2+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2523 CC[3+bs*1] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2529 CC[0+bs*0] = CC[0+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2530 CC[1+bs*0] = CC[1+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2531 CC[2+bs*0] = CC[2+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2532 CC[3+bs*0] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2548 if(m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2549 if(m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2550 if(m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2551 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2556 if(m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2557 if(m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2558 if(m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2563 if(m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2564 if(m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2569 if(m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2576 if(m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2577 if(m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2578 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2583 if(m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2584 if(m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2589 if(m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2596 if(m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2597 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2602 if(m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2609 if(m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2620 if(m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2621 if(m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2622 if(m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2623 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2628 if(m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2629 if(m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2630 if(m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2635 if(m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2636 if(m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2641 if(m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2648 if(m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2649 if(m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2650 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2655 if(m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2656 if(m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2661 if(m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2668 if(m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2669 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2674 if(m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2681 if(m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2692 if(m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2693 if(m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2694 if(m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2695 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2700 if(m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2701 if(m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2702 if(m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2707 if(m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2708 if(m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2713 if(m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2720 if(m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2721 if(m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2722 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2727 if(m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2728 if(m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2733 if(m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2740 if(m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2741 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2746 if(m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2753 if(m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2764 if(m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2765 if(m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2766 if(m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2767 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2772 if(m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2773 if(m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2774 if(m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2779 if(m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2780 if(m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2785 if(m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2792 if(m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2793 if(m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2794 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2799 if(m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2800 if(m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2805 if(m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2812 if(m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2813 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2818 if(m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2825 if(m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dsyrk_nt_l_4x4_gen_lib4()
2843 double CC[16] = {0}; in kernel_dsyrk_nt_u_4x4_lib4() local
2845 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nt_u_4x4_lib4()
2848 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_dsyrk_nt_u_4x4_lib4()
2850 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_lib4()
2852 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_lib4()
2853 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_lib4()
2855 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_lib4()
2856 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_lib4()
2857 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_lib4()
2859 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_lib4()
2860 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_lib4()
2861 D[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_lib4()
2862 D[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_u_4x4_lib4()
2878 double CC[16] = {0}; in kernel_dsyrk_nt_u_4x4_vs_lib4() local
2880 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2883 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC); in kernel_dsyrk_nt_u_4x4_vs_lib4()
2887 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2892 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2893 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2898 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2899 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2900 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2905 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2906 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2907 D[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2908 D[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2912 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2917 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2918 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2923 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2924 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2925 D[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2930 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2931 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2932 D[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2936 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2941 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2942 D[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2947 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2948 D[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2953 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2954 D[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2958 D[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2963 D[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2968 D[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2973 D[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_vs_lib4()
2990 double CC[16] = {0}; in kernel_dsyrk_nt_u_4x4_gen_lib4() local
2992 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3000 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3002 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3003 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3005 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3006 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3007 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3009 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3010 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3011 CC[2+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3012 CC[3+bs*3] = beta[0]*C0[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3018 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3020 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3021 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3023 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3024 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3025 CC[2+bs*2] = beta[0]*C0[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3027 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3028 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3029 CC[2+bs*3] = beta[0]*C0[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3030 CC[3+bs*3] = beta[0]*C1[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3036 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3038 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3039 CC[1+bs*1] = beta[0]*C0[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3041 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3042 CC[1+bs*2] = beta[0]*C0[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3043 CC[2+bs*2] = beta[0]*C1[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3045 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3046 CC[1+bs*3] = beta[0]*C0[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3047 CC[2+bs*3] = beta[0]*C1[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3048 CC[3+bs*3] = beta[0]*C1[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3054 CC[0+bs*0] = beta[0]*C0[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3056 CC[0+bs*1] = beta[0]*C0[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3057 CC[1+bs*1] = beta[0]*C1[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3059 CC[0+bs*2] = beta[0]*C0[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3060 CC[1+bs*2] = beta[0]*C1[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3061 CC[2+bs*2] = beta[0]*C1[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3063 CC[0+bs*3] = beta[0]*C0[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3064 CC[1+bs*3] = beta[0]*C1[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3065 CC[2+bs*3] = beta[0]*C1[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3066 CC[3+bs*3] = beta[0]*C1[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3071 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC); in kernel_dsyrk_nt_u_4x4_gen_lib4()
3092 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3098 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3099 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3105 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3106 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3107 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3113 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3114 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3115 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3116 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3137 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3143 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3144 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3150 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3151 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3152 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3158 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3159 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3160 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3161 if(m0<=3 & m1>3) D1[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3182 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3188 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3189 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3195 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3196 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3197 if(m0<=2 & m1>2) D1[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3203 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3204 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3205 if(m0<=2 & m1>2) D1[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3206 if(m0<=3 & m1>3) D1[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3227 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3233 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3234 if(m0<=1 & m1>1) D1[1+bs*1] = CC[1+bs*1]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3240 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3241 if(m0<=1 & m1>1) D1[1+bs*2] = CC[1+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3242 if(m0<=2 & m1>2) D1[2+bs*2] = CC[2+bs*2]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3248 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3249 if(m0<=1 & m1>1) D1[1+bs*3] = CC[1+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3250 if(m0<=2 & m1>2) D1[2+bs*3] = CC[2+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3251 if(m0<=3 & m1>3) D1[3+bs*3] = CC[3+bs*3]; in kernel_dsyrk_nt_u_4x4_gen_lib4()
3272 double CC[16] = {0}; in kernel_dtrmm_nt_ru_4x4_lib4() local
3274 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrmm_nt_ru_4x4_lib4()
3291 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3292 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3293 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3294 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3312 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3313 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3314 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3315 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3317 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3318 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3319 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3320 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3339 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3340 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3341 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3342 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nt_ru_4x4_lib4()
3344 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3345 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3346 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3347 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nt_ru_4x4_lib4()
3349 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nt_ru_4x4_lib4()
3350 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nt_ru_4x4_lib4()
3351 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nt_ru_4x4_lib4()
3352 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nt_ru_4x4_lib4()
3359 kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, D); in kernel_dtrmm_nt_ru_4x4_lib4()
3379 double CC[16] = {0}; in kernel_dtrmm_nt_ru_4x4_vs_lib4() local
3381 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3398 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3399 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3400 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3401 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3419 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3420 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3421 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3422 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3424 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3425 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3426 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3427 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3446 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3447 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3448 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3449 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3451 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3452 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3453 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3454 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3456 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3457 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3458 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3459 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3466 kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, CC); in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3470 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3471 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3472 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3473 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3478 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3479 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3480 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3481 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3486 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3487 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3488 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3489 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3494 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3495 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3496 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3497 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3501 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3502 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3503 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3508 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3509 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3510 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3515 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3516 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3517 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3522 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3523 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3524 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3528 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3529 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3534 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3535 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3540 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3541 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3546 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3547 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3551 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3556 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3561 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3566 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nt_ru_4x4_vs_lib4()
3588 double CC[16] = {0}; in kernel_dtrmm_nn_rl_4x4_lib4() local
3590 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrmm_nn_rl_4x4_lib4()
3612 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3613 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3614 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3615 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3632 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3633 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3634 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3635 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3638 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3639 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3640 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3641 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3658 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3659 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3660 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3661 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3664 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3665 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3666 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3667 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3670 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3671 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3672 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3673 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3690 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3691 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3692 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3693 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3696 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3697 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3698 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3699 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3702 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3703 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3704 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3705 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3708 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3709 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3710 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3711 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3729 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3730 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3731 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3732 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3749 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3750 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3751 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3752 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3755 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3756 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3757 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3758 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3775 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3776 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3777 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3778 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3781 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3782 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3783 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3784 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3787 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3788 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3789 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3790 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3808 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3809 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3810 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3811 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3828 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3829 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3830 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3831 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3834 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3835 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3836 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3837 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3854 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3855 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3856 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3857 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3860 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3861 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3862 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3863 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3866 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3867 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3868 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3869 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3886 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3887 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3888 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3889 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3892 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3893 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3894 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3895 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3898 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3899 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3900 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3901 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3904 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3905 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3906 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3907 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3924 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3925 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3926 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3927 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3930 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3931 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3932 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3933 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3936 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3937 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3938 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3939 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3942 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3943 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3944 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3945 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3962 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3963 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3964 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3965 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
3968 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3969 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3970 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3971 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
3974 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3975 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3976 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3977 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
3980 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3981 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3982 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
3983 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4001 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4002 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4003 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4004 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4021 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4022 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4023 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4024 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4027 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4028 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4029 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4030 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4047 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4048 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4049 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4050 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4053 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4054 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4055 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4056 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4059 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4060 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4061 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4062 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4079 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4080 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4081 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4082 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4085 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4086 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4087 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4088 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4091 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4092 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4093 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4094 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4097 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4098 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4099 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4100 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4117 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4118 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4119 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4120 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_lib4()
4123 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4124 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4125 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4126 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_lib4()
4129 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4130 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4131 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4132 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_lib4()
4135 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4136 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4137 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4138 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_lib4()
4148 CC[0+bs*0] = alpha[0]*CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_lib4()
4149 CC[1+bs*0] = alpha[0]*CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_lib4()
4150 CC[2+bs*0] = alpha[0]*CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_lib4()
4151 CC[3+bs*0] = alpha[0]*CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_lib4()
4153 CC[0+bs*1] = alpha[0]*CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_lib4()
4154 CC[1+bs*1] = alpha[0]*CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_lib4()
4155 CC[2+bs*1] = alpha[0]*CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_lib4()
4156 CC[3+bs*1] = alpha[0]*CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_lib4()
4158 CC[0+bs*2] = alpha[0]*CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_lib4()
4159 CC[1+bs*2] = alpha[0]*CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_lib4()
4160 CC[2+bs*2] = alpha[0]*CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_lib4()
4161 CC[3+bs*2] = alpha[0]*CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_lib4()
4163 CC[0+bs*3] = alpha[0]*CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_lib4()
4164 CC[1+bs*3] = alpha[0]*CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_lib4()
4165 CC[2+bs*3] = alpha[0]*CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_lib4()
4166 CC[3+bs*3] = alpha[0]*CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_lib4()
4170 kernel_dgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, D); in kernel_dtrmm_nn_rl_4x4_lib4()
4190 double CC[16] = {0}; in kernel_dtrmm_nn_rl_4x4_vs_lib4() local
4192 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4214 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4215 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4216 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4217 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4234 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4235 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4236 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4237 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4240 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4241 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4242 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4243 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4260 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4261 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4262 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4263 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4266 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4267 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4268 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4269 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4272 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4273 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4274 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4275 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4292 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4293 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4294 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4295 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4298 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4299 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4300 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4301 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4304 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4305 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4306 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4307 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4310 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4311 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4312 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4313 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4331 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4332 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4333 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4334 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4351 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4352 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4353 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4354 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4357 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4358 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4359 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4360 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4377 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4378 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4379 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4380 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4383 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4384 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4385 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4386 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4389 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4390 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4391 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4392 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4410 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4411 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4412 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4413 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4430 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4431 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4432 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4433 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4436 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4437 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4438 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4439 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4456 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4457 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4458 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4459 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4462 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4463 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4464 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4465 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4468 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4469 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4470 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4471 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4488 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4489 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4490 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4491 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4494 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4495 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4496 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4497 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4500 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4501 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4502 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4503 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4506 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4507 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4508 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4509 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4526 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4527 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4528 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4529 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4532 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4533 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4534 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4535 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4538 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4539 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4540 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4541 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4544 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4545 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4546 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4547 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4564 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4565 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4566 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4567 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4570 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4571 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4572 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4573 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4576 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4577 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4578 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4579 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4582 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4583 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4584 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4585 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4603 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4604 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4605 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4606 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4623 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4624 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4625 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4626 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4629 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4630 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4631 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4632 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4649 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4650 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4651 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4652 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4655 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4656 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4657 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4658 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4661 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4662 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4663 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4664 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4681 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4682 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4683 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4684 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4687 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4688 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4689 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4690 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4693 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4694 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4695 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4696 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4699 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4700 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4701 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4702 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4719 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4720 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4721 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4722 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4725 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4726 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4727 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4728 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4731 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4732 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4733 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4734 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4737 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4738 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4739 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4740 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4750 CC[0+bs*0] = alpha[0]*CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4751 CC[1+bs*0] = alpha[0]*CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4752 CC[2+bs*0] = alpha[0]*CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4753 CC[3+bs*0] = alpha[0]*CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4755 CC[0+bs*1] = alpha[0]*CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4756 CC[1+bs*1] = alpha[0]*CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4757 CC[2+bs*1] = alpha[0]*CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4758 CC[3+bs*1] = alpha[0]*CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4760 CC[0+bs*2] = alpha[0]*CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4761 CC[1+bs*2] = alpha[0]*CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4762 CC[2+bs*2] = alpha[0]*CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4763 CC[3+bs*2] = alpha[0]*CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4765 CC[0+bs*3] = alpha[0]*CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4766 CC[1+bs*3] = alpha[0]*CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4767 CC[2+bs*3] = alpha[0]*CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4768 CC[3+bs*3] = alpha[0]*CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4772 kernel_dgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC); in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4776 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4777 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4778 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4779 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4784 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4785 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4786 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4787 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4792 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4793 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4794 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4795 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4800 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4801 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4802 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4803 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4807 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4808 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4809 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4814 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4815 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4816 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4821 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4822 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4823 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4828 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4829 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4830 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4834 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4835 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4840 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4841 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4846 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4847 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4852 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4853 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4857 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4862 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4867 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4872 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_vs_lib4()
4893 double CC[16] = {0}; in kernel_dtrmm_nn_rl_4x4_gen_lib4() local
4895 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4917 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4918 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4919 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4920 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4937 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4938 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4939 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4940 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4943 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4944 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4945 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4946 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4963 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4964 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4965 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4966 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4969 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4970 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4971 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4972 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4975 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4976 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4977 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4978 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4995 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4996 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4997 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
4998 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5001 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5002 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5003 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5004 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5007 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5008 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5009 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5010 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5013 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5014 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5015 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5016 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5034 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5035 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5036 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5037 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5054 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5055 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5056 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5057 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5060 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5061 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5062 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5063 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5080 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5081 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5082 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5083 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5086 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5087 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5088 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5089 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5092 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5093 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5094 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5095 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5113 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5114 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5115 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5116 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5133 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5134 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5135 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5136 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5139 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5140 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5141 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5142 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5159 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5160 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5161 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5162 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5165 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5166 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5167 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5168 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5171 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5172 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5173 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5174 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5191 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5192 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5193 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5194 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5197 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5198 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5199 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5200 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5203 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5204 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5205 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5206 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5209 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5210 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5211 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5212 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5229 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5230 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5231 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5232 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5235 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5236 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5237 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5238 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5241 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5242 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5243 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5244 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5247 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5248 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5249 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5250 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5267 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5268 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5269 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5270 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5273 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5274 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5275 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5276 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5279 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5280 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5281 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5282 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5285 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5286 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5287 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5288 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5306 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5307 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5308 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5309 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5326 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5327 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5328 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5329 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5332 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5333 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5334 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5335 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5352 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5353 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5354 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5355 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5358 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5359 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5360 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5361 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5364 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5365 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5366 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5367 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5384 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5385 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5386 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5387 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5390 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5391 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5392 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5393 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5396 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5397 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5398 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5399 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5402 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5403 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5404 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5405 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5422 CC[0+bs*0] += a_0 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5423 CC[1+bs*0] += a_1 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5424 CC[2+bs*0] += a_2 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5425 CC[3+bs*0] += a_3 * b_0; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5428 CC[0+bs*1] += a_0 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5429 CC[1+bs*1] += a_1 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5430 CC[2+bs*1] += a_2 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5431 CC[3+bs*1] += a_3 * b_1; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5434 CC[0+bs*2] += a_0 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5435 CC[1+bs*2] += a_1 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5436 CC[2+bs*2] += a_2 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5437 CC[3+bs*2] += a_3 * b_2; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5440 CC[0+bs*3] += a_0 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5441 CC[1+bs*3] += a_1 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5442 CC[2+bs*3] += a_2 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5443 CC[3+bs*3] += a_3 * b_3; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5453 CC[0+bs*0] = alpha[0]*CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5454 CC[1+bs*0] = alpha[0]*CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5455 CC[2+bs*0] = alpha[0]*CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5456 CC[3+bs*0] = alpha[0]*CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5458 CC[0+bs*1] = alpha[0]*CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5459 CC[1+bs*1] = alpha[0]*CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5460 CC[2+bs*1] = alpha[0]*CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5461 CC[3+bs*1] = alpha[0]*CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5463 CC[0+bs*2] = alpha[0]*CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5464 CC[1+bs*2] = alpha[0]*CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5465 CC[2+bs*2] = alpha[0]*CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5466 CC[3+bs*2] = alpha[0]*CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5468 CC[0+bs*3] = alpha[0]*CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5469 CC[1+bs*3] = alpha[0]*CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5470 CC[2+bs*3] = alpha[0]*CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5471 CC[3+bs*3] = alpha[0]*CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5475 kernel_dgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC); in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5482 CC[0+bs*0] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5483 CC[1+bs*0] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5484 CC[2+bs*0] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5485 CC[3+bs*0] = CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5487 CC[0+bs*1] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5488 CC[1+bs*1] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5489 CC[2+bs*1] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5490 CC[3+bs*1] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5492 CC[0+bs*2] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5493 CC[1+bs*2] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5494 CC[2+bs*2] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5495 CC[3+bs*2] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5501 CC[0+bs*0] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5502 CC[1+bs*0] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5503 CC[2+bs*0] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5504 CC[3+bs*0] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5506 CC[0+bs*1] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5507 CC[1+bs*1] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5508 CC[2+bs*1] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5509 CC[3+bs*1] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5515 CC[0+bs*0] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5516 CC[1+bs*0] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5517 CC[2+bs*0] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5518 CC[3+bs*0] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5532 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5533 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5534 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5535 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5540 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5541 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5542 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5543 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5548 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5549 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5550 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5551 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5556 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5557 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5558 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5559 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5568 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5569 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5570 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5571 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5576 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5577 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5578 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5579 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5584 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5585 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5586 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5587 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5592 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5593 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5594 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5595 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5604 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5605 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5606 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5607 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5612 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5613 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5614 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5615 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5620 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5621 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5622 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5623 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5628 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5629 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5630 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5631 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5640 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5641 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5642 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5643 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5648 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5649 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5650 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5651 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5656 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5657 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5658 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5659 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5664 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5665 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5666 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5667 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3]; in kernel_dtrmm_nn_rl_4x4_gen_lib4()
5689 double CC[16] = {0}; in kernel_dpotrf_nt_l_4x4_lib4() local
5691 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dpotrf_nt_l_4x4_lib4()
5699 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC); in kernel_dpotrf_nt_l_4x4_lib4()
5701 if(CC[0+bs*0]>0) in kernel_dpotrf_nt_l_4x4_lib4()
5703 CC[0+bs*0] = sqrt(CC[0+bs*0]); in kernel_dpotrf_nt_l_4x4_lib4()
5704 tmp = 1.0/CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5708 CC[0+bs*0] = 0.0; in kernel_dpotrf_nt_l_4x4_lib4()
5711 CC[1+bs*0] *= tmp; in kernel_dpotrf_nt_l_4x4_lib4()
5712 CC[2+bs*0] *= tmp; in kernel_dpotrf_nt_l_4x4_lib4()
5713 CC[3+bs*0] *= tmp; in kernel_dpotrf_nt_l_4x4_lib4()
5716 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5717 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5718 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5719 if(CC[1+bs*1]>0) in kernel_dpotrf_nt_l_4x4_lib4()
5721 CC[1+bs*1] = sqrt(CC[1+bs*1]); in kernel_dpotrf_nt_l_4x4_lib4()
5722 tmp = 1.0/CC[1+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5726 CC[1+bs*1] = 0.0; in kernel_dpotrf_nt_l_4x4_lib4()
5729 CC[2+bs*1] *= tmp; in kernel_dpotrf_nt_l_4x4_lib4()
5730 CC[3+bs*1] *= tmp; in kernel_dpotrf_nt_l_4x4_lib4()
5733 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5734 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5735 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5736 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5737 if(CC[2+bs*2]>0) in kernel_dpotrf_nt_l_4x4_lib4()
5739 CC[2+bs*2] = sqrt(CC[2+bs*2]); in kernel_dpotrf_nt_l_4x4_lib4()
5740 tmp = 1.0/CC[2+bs*2]; in kernel_dpotrf_nt_l_4x4_lib4()
5744 CC[2+bs*2] = 0.0; in kernel_dpotrf_nt_l_4x4_lib4()
5747 CC[3+bs*2] *= tmp; in kernel_dpotrf_nt_l_4x4_lib4()
5750 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5751 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5752 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2]; in kernel_dpotrf_nt_l_4x4_lib4()
5753 if(CC[3+bs*3]>0) in kernel_dpotrf_nt_l_4x4_lib4()
5755 CC[3+bs*3] = sqrt(CC[3+bs*3]); in kernel_dpotrf_nt_l_4x4_lib4()
5756 tmp = 1.0/CC[3+bs*3]; in kernel_dpotrf_nt_l_4x4_lib4()
5760 CC[3+bs*3] = 0.0; in kernel_dpotrf_nt_l_4x4_lib4()
5765 D[0+bs*0] = CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5766 D[1+bs*0] = CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5767 D[2+bs*0] = CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5768 D[3+bs*0] = CC[3+bs*0]; in kernel_dpotrf_nt_l_4x4_lib4()
5770 D[1+bs*1] = CC[1+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5771 D[2+bs*1] = CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5772 D[3+bs*1] = CC[3+bs*1]; in kernel_dpotrf_nt_l_4x4_lib4()
5774 D[2+bs*2] = CC[2+bs*2]; in kernel_dpotrf_nt_l_4x4_lib4()
5775 D[3+bs*2] = CC[3+bs*2]; in kernel_dpotrf_nt_l_4x4_lib4()
5777 D[3+bs*3] = CC[3+bs*3]; in kernel_dpotrf_nt_l_4x4_lib4()
5795 double CC[16] = {0}; in kernel_dpotrf_nt_l_4x4_vs_lib4() local
5797 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5803 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC); in kernel_dpotrf_nt_l_4x4_vs_lib4()
5805 if(CC[0+bs*0]>0) in kernel_dpotrf_nt_l_4x4_vs_lib4()
5807 CC[0+bs*0] = sqrt(CC[0+bs*0]); in kernel_dpotrf_nt_l_4x4_vs_lib4()
5808 tmp = 1.0/CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5812 CC[0+bs*0] = 0.0; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5815 CC[1+bs*0] *= tmp; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5816 CC[2+bs*0] *= tmp; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5817 CC[3+bs*0] *= tmp; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5823 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5824 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5825 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5826 if(CC[1+bs*1]>0) in kernel_dpotrf_nt_l_4x4_vs_lib4()
5828 CC[1+bs*1] = sqrt(CC[1+bs*1]); in kernel_dpotrf_nt_l_4x4_vs_lib4()
5829 tmp = 1.0/CC[1+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5833 CC[1+bs*1] = 0.0; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5836 CC[2+bs*1] *= tmp; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5837 CC[3+bs*1] *= tmp; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5843 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5844 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5845 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5846 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5847 if(CC[2+bs*2]>0) in kernel_dpotrf_nt_l_4x4_vs_lib4()
5849 CC[2+bs*2] = sqrt(CC[2+bs*2]); in kernel_dpotrf_nt_l_4x4_vs_lib4()
5850 tmp = 1.0/CC[2+bs*2]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5854 CC[2+bs*2] = 0.0; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5857 CC[3+bs*2] *= tmp; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5863 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5864 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5865 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5866 if(CC[3+bs*3]>0) in kernel_dpotrf_nt_l_4x4_vs_lib4()
5868 CC[3+bs*3] = sqrt(CC[3+bs*3]); in kernel_dpotrf_nt_l_4x4_vs_lib4()
5869 tmp = 1.0/CC[3+bs*3]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5873 CC[3+bs*3] = 0.0; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5883 D[0+bs*0] = CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5884 D[1+bs*0] = CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5885 D[2+bs*0] = CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5886 D[3+bs*0] = CC[3+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5891 D[1+bs*1] = CC[1+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5892 D[2+bs*1] = CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5893 D[3+bs*1] = CC[3+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5898 D[2+bs*2] = CC[2+bs*2]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5899 D[3+bs*2] = CC[3+bs*2]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5904 D[3+bs*3] = CC[3+bs*3]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5908 D[0+bs*0] = CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5909 D[1+bs*0] = CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5910 D[2+bs*0] = CC[2+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5915 D[1+bs*1] = CC[1+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5916 D[2+bs*1] = CC[2+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5921 D[2+bs*2] = CC[2+bs*2]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5925 D[0+bs*0] = CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5926 D[1+bs*0] = CC[1+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5931 D[1+bs*1] = CC[1+bs*1]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5935 D[0+bs*0] = CC[0+bs*0]; in kernel_dpotrf_nt_l_4x4_vs_lib4()
5978 double CC[16] = {0}; in kernel_dtrsm_nt_rl_inv_4x4_lib4() local
5980 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5985 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5988 CC[0+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5989 CC[1+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5990 CC[2+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5991 CC[3+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5994 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5995 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5996 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5997 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
5999 CC[0+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6000 CC[1+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6001 CC[2+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6002 CC[3+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6005 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6006 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6007 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6008 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6010 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6011 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6012 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6013 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6015 CC[0+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6016 CC[1+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6017 CC[2+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6018 CC[3+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6021 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6022 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6023 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6024 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6026 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6027 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6028 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6029 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6031 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6032 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6033 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6034 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6036 CC[0+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6037 CC[1+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6038 CC[2+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6039 CC[3+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6041 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6042 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6043 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6044 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6046 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6047 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6048 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6049 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6051 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6052 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6053 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6054 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6056 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6057 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6058 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6059 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_lib4()
6077 double CC[16] = {0}; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4() local
6079 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6084 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6087 CC[0+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6088 CC[1+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6089 CC[2+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6090 CC[3+bs*0] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6096 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6097 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6098 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6099 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6101 CC[0+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6102 CC[1+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6103 CC[2+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6104 CC[3+bs*1] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6110 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6111 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6112 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6113 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6115 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6116 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6117 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6118 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6120 CC[0+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6121 CC[1+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6122 CC[2+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6123 CC[3+bs*2] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6129 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6130 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6131 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6132 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6134 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6135 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6136 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6137 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6139 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6140 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6141 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6142 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6144 CC[0+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6145 CC[1+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6146 CC[2+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6147 CC[3+bs*3] *= tmp; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6153 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6154 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6155 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6156 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6161 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6162 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6163 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6164 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6169 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6170 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6171 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6172 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6177 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6178 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6179 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6180 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6184 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6185 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6186 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6191 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6192 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6193 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6198 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6199 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6200 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6205 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6206 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6207 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6211 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6212 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6217 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6218 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6223 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6224 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6229 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6230 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6234 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6239 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6244 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6249 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_inv_4x4_vs_lib4()
6292 double CC[16] = {0}; in kernel_dtrsm_nt_rl_one_4x4_lib4() local
6294 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6299 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_rl_one_4x4_lib4()
6302 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6303 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6304 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6305 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6308 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6309 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6310 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6311 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6313 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6314 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6315 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6316 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6319 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6320 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6321 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6322 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6324 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6325 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6326 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6327 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6329 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6330 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6331 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6332 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6334 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6335 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6336 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6337 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6339 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6340 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6341 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6342 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6344 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6345 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6346 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6347 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6349 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6350 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6351 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6352 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_lib4()
6370 double CC[16] = {0}; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4() local
6372 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6377 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6383 CC[0+bs*1] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6384 CC[1+bs*1] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6385 CC[2+bs*1] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6386 CC[3+bs*1] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6392 CC[0+bs*2] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6393 CC[1+bs*2] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6394 CC[2+bs*2] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6395 CC[3+bs*2] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6397 CC[0+bs*2] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6398 CC[1+bs*2] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6399 CC[2+bs*2] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6400 CC[3+bs*2] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6406 CC[0+bs*3] -= CC[0+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6407 CC[1+bs*3] -= CC[1+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6408 CC[2+bs*3] -= CC[2+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6409 CC[3+bs*3] -= CC[3+bs*0] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6411 CC[0+bs*3] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6412 CC[1+bs*3] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6413 CC[2+bs*3] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6414 CC[3+bs*3] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6416 CC[0+bs*3] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6417 CC[1+bs*3] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6418 CC[2+bs*3] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6419 CC[3+bs*3] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6425 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6426 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6427 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6428 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6433 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6434 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6435 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6436 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6441 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6442 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6443 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6444 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6449 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6450 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6451 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6452 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6456 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6457 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6458 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6463 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6464 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6465 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6470 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6471 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6472 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6477 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6478 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6479 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6483 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6484 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6489 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6490 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6495 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6496 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6501 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6502 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6506 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6511 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6516 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6521 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_rl_one_4x4_vs_lib4()
6540 double CC[16] = {0}; in kernel_dtrsm_nt_ru_inv_4x4_lib4() local
6542 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6547 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6550 CC[0+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6551 CC[1+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6552 CC[2+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6553 CC[3+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6555 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6556 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6557 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6558 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6560 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6561 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6562 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6563 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6565 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6566 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6567 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6568 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6571 CC[0+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6572 CC[1+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6573 CC[2+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6574 CC[3+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6576 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6577 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6578 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6579 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6581 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6582 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6583 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6584 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6587 CC[0+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6588 CC[1+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6589 CC[2+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6590 CC[3+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6592 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6593 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6594 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6595 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6598 CC[0+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6599 CC[1+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6600 CC[2+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6601 CC[3+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6603 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6604 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6605 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6606 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6608 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6609 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6610 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6611 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6613 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6614 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6615 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6616 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6618 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6619 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6620 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6621 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_lib4()
6639 double CC[16] = {0}; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4() local
6641 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6646 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6651 CC[0+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6652 CC[1+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6653 CC[2+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6654 CC[3+bs*3] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6656 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6657 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6658 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6659 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6661 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6662 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6663 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6664 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6666 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6667 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6668 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6669 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6675 CC[0+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6676 CC[1+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6677 CC[2+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6678 CC[3+bs*2] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6680 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6681 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6682 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6683 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6685 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6686 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6687 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6688 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6694 CC[0+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6695 CC[1+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6696 CC[2+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6697 CC[3+bs*1] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6699 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6700 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6701 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6702 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6706 CC[0+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6707 CC[1+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6708 CC[2+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6709 CC[3+bs*0] *= tmp; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6716 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6717 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6718 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6719 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6724 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6725 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6726 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6727 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6732 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6733 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6734 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6735 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6740 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6741 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6742 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6743 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6747 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6748 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6749 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6754 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6755 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6756 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6761 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6762 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6763 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6768 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6769 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6770 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6774 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6775 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6780 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6781 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6786 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6787 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6792 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6793 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6797 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6802 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6807 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6812 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_inv_4x4_vs_lib4()
6831 double CC[16] = {0}; in kernel_dtrsm_nt_ru_one_4x4_lib4() local
6833 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6838 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_ru_one_4x4_lib4()
6841 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6842 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6843 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6844 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6846 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6847 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6848 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6849 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6851 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6852 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6853 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6854 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6857 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6858 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6859 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6860 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6862 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6863 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6864 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6865 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6868 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6869 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6870 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6871 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6874 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6875 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6876 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6877 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6879 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6880 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6881 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6882 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6884 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6885 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6886 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6887 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6889 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6890 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6891 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6892 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_lib4()
6910 double CC[16] = {0}; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4() local
6912 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6917 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC); in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6922 CC[0+bs*2] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6923 CC[1+bs*2] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6924 CC[2+bs*2] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6925 CC[3+bs*2] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6927 CC[0+bs*1] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6928 CC[1+bs*1] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6929 CC[2+bs*1] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6930 CC[3+bs*1] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6932 CC[0+bs*0] -= CC[0+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6933 CC[1+bs*0] -= CC[1+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6934 CC[2+bs*0] -= CC[2+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6935 CC[3+bs*0] -= CC[3+bs*3] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6941 CC[0+bs*1] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6942 CC[1+bs*1] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6943 CC[2+bs*1] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6944 CC[3+bs*1] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6946 CC[0+bs*0] -= CC[0+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6947 CC[1+bs*0] -= CC[1+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6948 CC[2+bs*0] -= CC[2+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6949 CC[3+bs*0] -= CC[3+bs*2] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6955 CC[0+bs*0] -= CC[0+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6956 CC[1+bs*0] -= CC[1+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6957 CC[2+bs*0] -= CC[2+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6958 CC[3+bs*0] -= CC[3+bs*1] * tmp; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6966 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6967 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6968 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6969 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6974 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6975 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6976 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6977 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6982 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6983 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6984 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6985 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6990 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6991 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6992 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6993 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6997 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6998 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
6999 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7004 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7005 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7006 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7011 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7012 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7013 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7018 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7019 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7020 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7024 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7025 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7030 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7031 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7036 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7037 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7042 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7043 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7047 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7052 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7057 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7062 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nt_ru_one_4x4_vs_lib4()
7083 double CC[16] = {0}; in kernel_dgetrf_nn_4x4_lib4() local
7085 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgetrf_nn_4x4_lib4()
7091 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_dgetrf_nn_4x4_lib4()
7096 tmp = 1.0 / CC[0+bs*0]; in kernel_dgetrf_nn_4x4_lib4()
7097 CC[1+bs*0] *= tmp; in kernel_dgetrf_nn_4x4_lib4()
7098 CC[2+bs*0] *= tmp; in kernel_dgetrf_nn_4x4_lib4()
7099 CC[3+bs*0] *= tmp; in kernel_dgetrf_nn_4x4_lib4()
7104 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7105 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7106 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7108 tmp = 1.0 / CC[1+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7109 CC[2+bs*1] *= tmp; in kernel_dgetrf_nn_4x4_lib4()
7110 CC[3+bs*1] *= tmp; in kernel_dgetrf_nn_4x4_lib4()
7115 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7116 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7117 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7119 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7120 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7122 tmp = 1.0 / CC[2+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7123 CC[3+bs*2] *= tmp; in kernel_dgetrf_nn_4x4_lib4()
7128 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7129 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7130 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7132 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7133 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7135 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7137 tmp = 1.0 / CC[3+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7141 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nn_4x4_lib4()
7142 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nn_4x4_lib4()
7143 D[2+bs*0] = CC[2+bs*0]; in kernel_dgetrf_nn_4x4_lib4()
7144 D[3+bs*0] = CC[3+bs*0]; in kernel_dgetrf_nn_4x4_lib4()
7146 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7147 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7148 D[2+bs*1] = CC[2+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7149 D[3+bs*1] = CC[3+bs*1]; in kernel_dgetrf_nn_4x4_lib4()
7151 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7152 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7153 D[2+bs*2] = CC[2+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7154 D[3+bs*2] = CC[3+bs*2]; in kernel_dgetrf_nn_4x4_lib4()
7156 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7157 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7158 D[2+bs*3] = CC[2+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7159 D[3+bs*3] = CC[3+bs*3]; in kernel_dgetrf_nn_4x4_lib4()
7179 double CC[16] = {0}; in kernel_dgetrf_nn_4x4_vs_lib4() local
7181 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgetrf_nn_4x4_vs_lib4()
7187 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_dgetrf_nn_4x4_vs_lib4()
7192 tmp = 1.0 / CC[0+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7193 CC[1+bs*0] *= tmp; in kernel_dgetrf_nn_4x4_vs_lib4()
7194 CC[2+bs*0] *= tmp; in kernel_dgetrf_nn_4x4_vs_lib4()
7195 CC[3+bs*0] *= tmp; in kernel_dgetrf_nn_4x4_vs_lib4()
7203 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7204 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7205 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7207 tmp = 1.0 / CC[1+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7208 CC[2+bs*1] *= tmp; in kernel_dgetrf_nn_4x4_vs_lib4()
7209 CC[3+bs*1] *= tmp; in kernel_dgetrf_nn_4x4_vs_lib4()
7217 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7218 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7219 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7221 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7222 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7224 tmp = 1.0 / CC[2+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7225 CC[3+bs*2] *= tmp; in kernel_dgetrf_nn_4x4_vs_lib4()
7233 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7234 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7235 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7237 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7238 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7240 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7242 tmp = 1.0 / CC[3+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7250 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7251 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7252 D[2+bs*0] = CC[2+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7253 D[3+bs*0] = CC[3+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7258 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7259 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7260 D[2+bs*1] = CC[2+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7261 D[3+bs*1] = CC[3+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7266 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7267 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7268 D[2+bs*2] = CC[2+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7269 D[3+bs*2] = CC[3+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7274 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7275 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7276 D[2+bs*3] = CC[2+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7277 D[3+bs*3] = CC[3+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7281 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7282 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7283 D[2+bs*0] = CC[2+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7288 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7289 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7290 D[2+bs*1] = CC[2+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7295 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7296 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7297 D[2+bs*2] = CC[2+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7302 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7303 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7304 D[2+bs*3] = CC[2+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7308 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7309 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7314 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7315 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7320 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7321 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7326 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7327 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7331 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nn_4x4_vs_lib4()
7336 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nn_4x4_vs_lib4()
7341 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nn_4x4_vs_lib4()
7346 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nn_4x4_vs_lib4()
7367 double CC[16] = {0}; in kernel_dgetrf_nt_4x4_lib4() local
7369 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgetrf_nt_4x4_lib4()
7375 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC); in kernel_dgetrf_nt_4x4_lib4()
7380 tmp = 1.0 / CC[0+bs*0]; in kernel_dgetrf_nt_4x4_lib4()
7381 CC[1+bs*0] *= tmp; in kernel_dgetrf_nt_4x4_lib4()
7382 CC[2+bs*0] *= tmp; in kernel_dgetrf_nt_4x4_lib4()
7383 CC[3+bs*0] *= tmp; in kernel_dgetrf_nt_4x4_lib4()
7388 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7389 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7390 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7392 tmp = 1.0 / CC[1+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7393 CC[2+bs*1] *= tmp; in kernel_dgetrf_nt_4x4_lib4()
7394 CC[3+bs*1] *= tmp; in kernel_dgetrf_nt_4x4_lib4()
7399 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7400 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7401 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7403 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7404 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7406 tmp = 1.0 / CC[2+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7407 CC[3+bs*2] *= tmp; in kernel_dgetrf_nt_4x4_lib4()
7412 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7413 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7414 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7416 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7417 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7419 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7421 tmp = 1.0 / CC[3+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7425 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nt_4x4_lib4()
7426 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nt_4x4_lib4()
7427 D[2+bs*0] = CC[2+bs*0]; in kernel_dgetrf_nt_4x4_lib4()
7428 D[3+bs*0] = CC[3+bs*0]; in kernel_dgetrf_nt_4x4_lib4()
7430 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7431 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7432 D[2+bs*1] = CC[2+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7433 D[3+bs*1] = CC[3+bs*1]; in kernel_dgetrf_nt_4x4_lib4()
7435 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7436 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7437 D[2+bs*2] = CC[2+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7438 D[3+bs*2] = CC[3+bs*2]; in kernel_dgetrf_nt_4x4_lib4()
7440 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7441 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7442 D[2+bs*3] = CC[2+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7443 D[3+bs*3] = CC[3+bs*3]; in kernel_dgetrf_nt_4x4_lib4()
7463 double CC[16] = {0}; in kernel_dgetrf_nt_4x4_vs_lib4() local
7465 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dgetrf_nt_4x4_vs_lib4()
7471 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC); in kernel_dgetrf_nt_4x4_vs_lib4()
7476 tmp = 1.0 / CC[0+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7477 CC[1+bs*0] *= tmp; in kernel_dgetrf_nt_4x4_vs_lib4()
7478 CC[2+bs*0] *= tmp; in kernel_dgetrf_nt_4x4_vs_lib4()
7479 CC[3+bs*0] *= tmp; in kernel_dgetrf_nt_4x4_vs_lib4()
7487 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7488 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7489 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7491 tmp = 1.0 / CC[1+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7492 CC[2+bs*1] *= tmp; in kernel_dgetrf_nt_4x4_vs_lib4()
7493 CC[3+bs*1] *= tmp; in kernel_dgetrf_nt_4x4_vs_lib4()
7501 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7502 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7503 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7505 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7506 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7508 tmp = 1.0 / CC[2+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7509 CC[3+bs*2] *= tmp; in kernel_dgetrf_nt_4x4_vs_lib4()
7517 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7518 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7519 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7521 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7522 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7524 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7526 tmp = 1.0 / CC[3+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7534 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7535 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7536 D[2+bs*0] = CC[2+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7537 D[3+bs*0] = CC[3+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7542 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7543 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7544 D[2+bs*1] = CC[2+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7545 D[3+bs*1] = CC[3+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7550 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7551 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7552 D[2+bs*2] = CC[2+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7553 D[3+bs*2] = CC[3+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7558 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7559 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7560 D[2+bs*3] = CC[2+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7561 D[3+bs*3] = CC[3+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7565 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7566 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7567 D[2+bs*0] = CC[2+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7572 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7573 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7574 D[2+bs*1] = CC[2+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7579 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7580 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7581 D[2+bs*2] = CC[2+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7586 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7587 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7588 D[2+bs*3] = CC[2+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7592 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7593 D[1+bs*0] = CC[1+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7598 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7599 D[1+bs*1] = CC[1+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7604 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7605 D[1+bs*2] = CC[1+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7610 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7611 D[1+bs*3] = CC[1+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7615 D[0+bs*0] = CC[0+bs*0]; in kernel_dgetrf_nt_4x4_vs_lib4()
7620 D[0+bs*1] = CC[0+bs*1]; in kernel_dgetrf_nt_4x4_vs_lib4()
7625 D[0+bs*2] = CC[0+bs*2]; in kernel_dgetrf_nt_4x4_vs_lib4()
7630 D[0+bs*3] = CC[0+bs*3]; in kernel_dgetrf_nt_4x4_vs_lib4()
7653 double CC[16] = {0}; in kernel_dtrsm_nn_ll_inv_4x4_lib4() local
7655 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7660 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7668 CC[0+bs*0] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7669 CC[1+bs*0] -= e_1 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7670 CC[2+bs*0] -= e_2 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7671 CC[3+bs*0] -= e_3 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7672 CC[0+bs*1] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7673 CC[1+bs*1] -= e_1 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7674 CC[2+bs*1] -= e_2 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7675 CC[3+bs*1] -= e_3 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7676 CC[0+bs*2] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7677 CC[1+bs*2] -= e_1 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7678 CC[2+bs*2] -= e_2 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7679 CC[3+bs*2] -= e_3 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7680 CC[0+bs*3] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7681 CC[1+bs*3] -= e_1 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7682 CC[2+bs*3] -= e_2 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7683 CC[3+bs*3] -= e_3 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7688 CC[1+bs*0] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7689 CC[2+bs*0] -= e_2 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7690 CC[3+bs*0] -= e_3 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7691 CC[1+bs*1] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7692 CC[2+bs*1] -= e_2 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7693 CC[3+bs*1] -= e_3 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7694 CC[1+bs*2] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7695 CC[2+bs*2] -= e_2 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7696 CC[3+bs*2] -= e_3 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7697 CC[1+bs*3] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7698 CC[2+bs*3] -= e_2 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7699 CC[3+bs*3] -= e_3 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7703 CC[2+bs*0] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7704 CC[3+bs*0] -= e_3 * CC[2+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7705 CC[2+bs*1] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7706 CC[3+bs*1] -= e_3 * CC[2+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7707 CC[2+bs*2] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7708 CC[3+bs*2] -= e_3 * CC[2+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7709 CC[2+bs*3] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7710 CC[3+bs*3] -= e_3 * CC[2+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7713 CC[3+bs*0] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7714 CC[3+bs*1] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7715 CC[3+bs*2] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7716 CC[3+bs*3] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7718 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7719 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7720 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7721 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7723 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7724 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7725 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7726 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7728 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7729 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7730 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7731 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7733 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7734 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7735 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7736 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_lib4()
7758 double CC[16] = {0}; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4() local
7760 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7765 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7770 CC[0+bs*0] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7771 CC[0+bs*1] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7772 CC[0+bs*2] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7773 CC[0+bs*3] *= e_0; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7779 CC[1+bs*0] -= e_0 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7780 CC[1+bs*1] -= e_0 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7781 CC[1+bs*2] -= e_0 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7782 CC[1+bs*3] -= e_0 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7784 CC[1+bs*0] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7785 CC[1+bs*1] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7786 CC[1+bs*2] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7787 CC[1+bs*3] *= e_1; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7793 CC[2+bs*0] -= e_0 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7794 CC[2+bs*1] -= e_0 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7795 CC[2+bs*2] -= e_0 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7796 CC[2+bs*3] -= e_0 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7798 CC[2+bs*0] -= e_1 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7799 CC[2+bs*1] -= e_1 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7800 CC[2+bs*2] -= e_1 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7801 CC[2+bs*3] -= e_1 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7803 CC[2+bs*0] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7804 CC[2+bs*1] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7805 CC[2+bs*2] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7806 CC[2+bs*3] *= e_2; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7812 CC[3+bs*0] -= e_0 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7813 CC[3+bs*1] -= e_0 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7814 CC[3+bs*2] -= e_0 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7815 CC[3+bs*3] -= e_0 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7817 CC[3+bs*0] -= e_1 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7818 CC[3+bs*1] -= e_1 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7819 CC[3+bs*2] -= e_1 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7820 CC[3+bs*3] -= e_1 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7822 CC[3+bs*0] -= e_2 * CC[2+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7823 CC[3+bs*1] -= e_2 * CC[2+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7824 CC[3+bs*2] -= e_2 * CC[2+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7825 CC[3+bs*3] -= e_2 * CC[2+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7827 CC[3+bs*0] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7828 CC[3+bs*1] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7829 CC[3+bs*2] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7830 CC[3+bs*3] *= e_3; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7836 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7837 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7838 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7839 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7844 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7845 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7846 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7847 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7852 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7853 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7854 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7855 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7860 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7861 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7862 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7863 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7867 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7868 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7869 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7874 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7875 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7876 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7881 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7882 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7883 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7888 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7889 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7890 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7894 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7895 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7900 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7901 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7906 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7907 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7912 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7913 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7917 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7922 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7927 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7932 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_inv_4x4_vs_lib4()
7955 double CC[16] = {0}; in kernel_dtrsm_nn_ll_one_4x4_lib4() local
7957 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7962 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_dtrsm_nn_ll_one_4x4_lib4()
7969 CC[1+bs*0] -= e_1 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7970 CC[2+bs*0] -= e_2 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7971 CC[3+bs*0] -= e_3 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7972 CC[1+bs*1] -= e_1 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7973 CC[2+bs*1] -= e_2 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7974 CC[3+bs*1] -= e_3 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7975 CC[1+bs*2] -= e_1 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7976 CC[2+bs*2] -= e_2 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7977 CC[3+bs*2] -= e_3 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7978 CC[1+bs*3] -= e_1 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7979 CC[2+bs*3] -= e_2 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7980 CC[3+bs*3] -= e_3 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7984 CC[2+bs*0] -= e_2 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7985 CC[3+bs*0] -= e_3 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7986 CC[2+bs*1] -= e_2 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7987 CC[3+bs*1] -= e_3 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7988 CC[2+bs*2] -= e_2 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7989 CC[3+bs*2] -= e_3 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7990 CC[2+bs*3] -= e_2 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7991 CC[3+bs*3] -= e_3 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7994 CC[3+bs*0] -= e_3 * CC[2+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7995 CC[3+bs*1] -= e_3 * CC[2+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7996 CC[3+bs*2] -= e_3 * CC[2+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7997 CC[3+bs*3] -= e_3 * CC[2+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
7999 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8000 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8001 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8002 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8004 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8005 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8006 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8007 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8009 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8010 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8011 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8012 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8014 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8015 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8016 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8017 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_lib4()
8039 double CC[16] = {0}; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4() local
8041 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8046 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8054 CC[1+bs*0] -= e_0 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8055 CC[1+bs*1] -= e_0 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8056 CC[1+bs*2] -= e_0 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8057 CC[1+bs*3] -= e_0 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8063 CC[2+bs*0] -= e_0 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8064 CC[2+bs*1] -= e_0 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8065 CC[2+bs*2] -= e_0 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8066 CC[2+bs*3] -= e_0 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8068 CC[2+bs*0] -= e_1 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8069 CC[2+bs*1] -= e_1 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8070 CC[2+bs*2] -= e_1 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8071 CC[2+bs*3] -= e_1 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8077 CC[3+bs*0] -= e_0 * CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8078 CC[3+bs*1] -= e_0 * CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8079 CC[3+bs*2] -= e_0 * CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8080 CC[3+bs*3] -= e_0 * CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8082 CC[3+bs*0] -= e_1 * CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8083 CC[3+bs*1] -= e_1 * CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8084 CC[3+bs*2] -= e_1 * CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8085 CC[3+bs*3] -= e_1 * CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8087 CC[3+bs*0] -= e_2 * CC[2+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8088 CC[3+bs*1] -= e_2 * CC[2+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8089 CC[3+bs*2] -= e_2 * CC[2+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8090 CC[3+bs*3] -= e_2 * CC[2+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8096 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8097 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8098 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8099 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8104 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8105 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8106 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8107 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8112 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8113 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8114 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8115 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8120 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8121 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8122 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8123 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8127 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8128 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8129 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8134 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8135 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8136 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8141 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8142 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8143 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8148 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8149 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8150 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8154 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8155 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8160 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8161 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8166 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8167 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8172 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8173 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8177 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8182 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8187 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8192 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ll_one_4x4_vs_lib4()
8218 double CC[16] = {0}; in kernel_dtrsm_nn_ru_inv_4x4_lib4() local
8220 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8225 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8230 CC[0+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8231 CC[1+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8232 CC[2+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8233 CC[3+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8237 CC[0+bs*1] -= CC[0+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8238 CC[1+bs*1] -= CC[1+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8239 CC[2+bs*1] -= CC[2+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8240 CC[3+bs*1] -= CC[3+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8241 CC[0+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8242 CC[1+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8243 CC[2+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8244 CC[3+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8249 CC[0+bs*2] -= CC[0+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8250 CC[1+bs*2] -= CC[1+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8251 CC[2+bs*2] -= CC[2+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8252 CC[3+bs*2] -= CC[3+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8253 CC[0+bs*2] -= CC[0+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8254 CC[1+bs*2] -= CC[1+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8255 CC[2+bs*2] -= CC[2+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8256 CC[3+bs*2] -= CC[3+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8257 CC[0+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8258 CC[1+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8259 CC[2+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8260 CC[3+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8266 CC[0+bs*3] -= CC[0+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8267 CC[1+bs*3] -= CC[1+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8268 CC[2+bs*3] -= CC[2+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8269 CC[3+bs*3] -= CC[3+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8270 CC[0+bs*3] -= CC[0+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8271 CC[1+bs*3] -= CC[1+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8272 CC[2+bs*3] -= CC[2+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8273 CC[3+bs*3] -= CC[3+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8274 CC[0+bs*3] -= CC[0+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8275 CC[1+bs*3] -= CC[1+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8276 CC[2+bs*3] -= CC[2+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8277 CC[3+bs*3] -= CC[3+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8278 CC[0+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8279 CC[1+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8280 CC[2+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8281 CC[3+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8283 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8284 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8285 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8286 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8288 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8289 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8290 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8291 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8293 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8294 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8295 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8296 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8298 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8299 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8300 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8301 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_lib4()
8326 double CC[16] = {0}; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4() local
8328 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8333 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC); in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8338 CC[0+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8339 CC[1+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8340 CC[2+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8341 CC[3+bs*0] *= e_00; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8348 CC[0+bs*1] -= CC[0+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8349 CC[1+bs*1] -= CC[1+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8350 CC[2+bs*1] -= CC[2+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8351 CC[3+bs*1] -= CC[3+bs*0] * e_01; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8352 CC[0+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8353 CC[1+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8354 CC[2+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8355 CC[3+bs*1] *= e_11; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8363 CC[0+bs*2] -= CC[0+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8364 CC[1+bs*2] -= CC[1+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8365 CC[2+bs*2] -= CC[2+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8366 CC[3+bs*2] -= CC[3+bs*0] * e_02; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8367 CC[0+bs*2] -= CC[0+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8368 CC[1+bs*2] -= CC[1+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8369 CC[2+bs*2] -= CC[2+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8370 CC[3+bs*2] -= CC[3+bs*1] * e_12; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8371 CC[0+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8372 CC[1+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8373 CC[2+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8374 CC[3+bs*2] *= e_22; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8383 CC[0+bs*3] -= CC[0+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8384 CC[1+bs*3] -= CC[1+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8385 CC[2+bs*3] -= CC[2+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8386 CC[3+bs*3] -= CC[3+bs*0] * e_03; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8387 CC[0+bs*3] -= CC[0+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8388 CC[1+bs*3] -= CC[1+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8389 CC[2+bs*3] -= CC[2+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8390 CC[3+bs*3] -= CC[3+bs*1] * e_13; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8391 CC[0+bs*3] -= CC[0+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8392 CC[1+bs*3] -= CC[1+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8393 CC[2+bs*3] -= CC[2+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8394 CC[3+bs*3] -= CC[3+bs*2] * e_23; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8395 CC[0+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8396 CC[1+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8397 CC[2+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8398 CC[3+bs*3] *= e_33; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8404 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8405 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8406 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8407 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8412 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8413 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8414 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8415 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8420 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8421 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8422 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8423 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8428 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8429 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8430 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8431 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8435 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8436 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8437 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8442 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8443 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8444 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8449 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8450 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8451 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8456 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8457 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8458 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8462 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8463 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8468 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8469 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8474 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8475 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8480 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8481 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8485 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8490 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8495 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8500 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_ru_inv_4x4_vs_lib4()
8528 double CC[16] = {0}; in kernel_dtrsm_nn_lu_inv_4x4_lib4() local
8530 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8536 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8544 CC[3+bs*0] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8545 CC[3+bs*1] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8546 CC[3+bs*2] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8547 CC[3+bs*3] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8548 CC[0+bs*0] -= e_03 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8549 CC[0+bs*1] -= e_03 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8550 CC[0+bs*2] -= e_03 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8551 CC[0+bs*3] -= e_03 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8552 CC[1+bs*0] -= e_13 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8553 CC[1+bs*1] -= e_13 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8554 CC[1+bs*2] -= e_13 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8555 CC[1+bs*3] -= e_13 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8556 CC[2+bs*0] -= e_23 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8557 CC[2+bs*1] -= e_23 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8558 CC[2+bs*2] -= e_23 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8559 CC[2+bs*3] -= e_23 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8564 CC[2+bs*0] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8565 CC[2+bs*1] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8566 CC[2+bs*2] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8567 CC[2+bs*3] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8568 CC[0+bs*0] -= e_02 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8569 CC[0+bs*1] -= e_02 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8570 CC[0+bs*2] -= e_02 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8571 CC[0+bs*3] -= e_02 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8572 CC[1+bs*0] -= e_12 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8573 CC[1+bs*1] -= e_12 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8574 CC[1+bs*2] -= e_12 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8575 CC[1+bs*3] -= e_12 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8579 CC[1+bs*0] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8580 CC[1+bs*1] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8581 CC[1+bs*2] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8582 CC[1+bs*3] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8583 CC[0+bs*0] -= e_01 * CC[1+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8584 CC[0+bs*1] -= e_01 * CC[1+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8585 CC[0+bs*2] -= e_01 * CC[1+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8586 CC[0+bs*3] -= e_01 * CC[1+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8589 CC[0+bs*0] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8590 CC[0+bs*1] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8591 CC[0+bs*2] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8592 CC[0+bs*3] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8594 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8595 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8596 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8597 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8599 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8600 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8601 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8602 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8604 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8605 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8606 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8607 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8609 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8610 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8611 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8612 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_lib4()
8639 double CC[16] = {0}; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4() local
8641 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8647 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8657 CC[3+bs*0] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8658 CC[3+bs*1] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8659 CC[3+bs*2] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8660 CC[3+bs*3] *= e_33; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8661 CC[0+bs*0] -= e_03 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8662 CC[0+bs*1] -= e_03 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8663 CC[0+bs*2] -= e_03 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8664 CC[0+bs*3] -= e_03 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8665 CC[1+bs*0] -= e_13 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8666 CC[1+bs*1] -= e_13 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8667 CC[1+bs*2] -= e_13 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8668 CC[1+bs*3] -= e_13 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8669 CC[2+bs*0] -= e_23 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8670 CC[2+bs*1] -= e_23 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8671 CC[2+bs*2] -= e_23 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8672 CC[2+bs*3] -= e_23 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8680 CC[2+bs*0] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8681 CC[2+bs*1] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8682 CC[2+bs*2] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8683 CC[2+bs*3] *= e_22; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8684 CC[0+bs*0] -= e_02 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8685 CC[0+bs*1] -= e_02 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8686 CC[0+bs*2] -= e_02 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8687 CC[0+bs*3] -= e_02 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8688 CC[1+bs*0] -= e_12 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8689 CC[1+bs*1] -= e_12 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8690 CC[1+bs*2] -= e_12 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8691 CC[1+bs*3] -= e_12 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8698 CC[1+bs*0] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8699 CC[1+bs*1] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8700 CC[1+bs*2] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8701 CC[1+bs*3] *= e_11; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8702 CC[0+bs*0] -= e_01 * CC[1+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8703 CC[0+bs*1] -= e_01 * CC[1+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8704 CC[0+bs*2] -= e_01 * CC[1+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8705 CC[0+bs*3] -= e_01 * CC[1+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8709 CC[0+bs*0] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8710 CC[0+bs*1] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8711 CC[0+bs*2] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8712 CC[0+bs*3] *= e_00; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8718 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8719 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8720 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8721 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8726 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8727 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8728 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8729 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8734 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8735 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8736 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8737 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8742 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8743 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8744 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8745 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8749 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8750 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8751 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8756 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8757 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8758 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8763 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8764 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8765 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8770 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8771 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8772 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8776 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8777 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8782 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8783 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8788 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8789 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8794 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8795 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8799 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8804 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8809 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8814 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_inv_4x4_vs_lib4()
8843 double CC[16] = {0}; in kernel_dtrsm_nn_lu_one_4x4_lib4() local
8845 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8851 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_dtrsm_nn_lu_one_4x4_lib4()
8858 CC[0+bs*0] -= e_03 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8859 CC[0+bs*1] -= e_03 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8860 CC[0+bs*2] -= e_03 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8861 CC[0+bs*3] -= e_03 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8862 CC[1+bs*0] -= e_13 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8863 CC[1+bs*1] -= e_13 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8864 CC[1+bs*2] -= e_13 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8865 CC[1+bs*3] -= e_13 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8866 CC[2+bs*0] -= e_23 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8867 CC[2+bs*1] -= e_23 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8868 CC[2+bs*2] -= e_23 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8869 CC[2+bs*3] -= e_23 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8873 CC[0+bs*0] -= e_02 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8874 CC[0+bs*1] -= e_02 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8875 CC[0+bs*2] -= e_02 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8876 CC[0+bs*3] -= e_02 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8877 CC[1+bs*0] -= e_12 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8878 CC[1+bs*1] -= e_12 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8879 CC[1+bs*2] -= e_12 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8880 CC[1+bs*3] -= e_12 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8883 CC[0+bs*0] -= e_01 * CC[1+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8884 CC[0+bs*1] -= e_01 * CC[1+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8885 CC[0+bs*2] -= e_01 * CC[1+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8886 CC[0+bs*3] -= e_01 * CC[1+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8888 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8889 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8890 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8891 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8893 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8894 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8895 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8896 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8898 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8899 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8900 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8901 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8903 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8904 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8905 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8906 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_lib4()
8932 double CC[16] = {0}; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4() local
8934 ALIGNED( double CC[16], 64 ) = {0}; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8940 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC); in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8949 CC[0+bs*0] -= e_03 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8950 CC[0+bs*1] -= e_03 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8951 CC[0+bs*2] -= e_03 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8952 CC[0+bs*3] -= e_03 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8953 CC[1+bs*0] -= e_13 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8954 CC[1+bs*1] -= e_13 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8955 CC[1+bs*2] -= e_13 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8956 CC[1+bs*3] -= e_13 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8957 CC[2+bs*0] -= e_23 * CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8958 CC[2+bs*1] -= e_23 * CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8959 CC[2+bs*2] -= e_23 * CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8960 CC[2+bs*3] -= e_23 * CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8967 CC[0+bs*0] -= e_02 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8968 CC[0+bs*1] -= e_02 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8969 CC[0+bs*2] -= e_02 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8970 CC[0+bs*3] -= e_02 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8971 CC[1+bs*0] -= e_12 * CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8972 CC[1+bs*1] -= e_12 * CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8973 CC[1+bs*2] -= e_12 * CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8974 CC[1+bs*3] -= e_12 * CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8980 CC[0+bs*0] -= e_01 * CC[1+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8981 CC[0+bs*1] -= e_01 * CC[1+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8982 CC[0+bs*2] -= e_01 * CC[1+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8983 CC[0+bs*3] -= e_01 * CC[1+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8990 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8991 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8992 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8993 D[3+bs*0] = CC[3+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8998 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
8999 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9000 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9001 D[3+bs*1] = CC[3+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9006 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9007 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9008 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9009 D[3+bs*2] = CC[3+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9014 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9015 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9016 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9017 D[3+bs*3] = CC[3+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9021 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9022 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9023 D[2+bs*0] = CC[2+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9028 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9029 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9030 D[2+bs*1] = CC[2+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9035 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9036 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9037 D[2+bs*2] = CC[2+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9042 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9043 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9044 D[2+bs*3] = CC[2+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9048 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9049 D[1+bs*0] = CC[1+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9054 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9055 D[1+bs*1] = CC[1+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9060 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9061 D[1+bs*2] = CC[1+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9066 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9067 D[1+bs*3] = CC[1+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9071 D[0+bs*0] = CC[0+bs*0]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9076 D[0+bs*1] = CC[0+bs*1]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9081 D[0+bs*2] = CC[0+bs*2]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()
9086 D[0+bs*3] = CC[0+bs*3]; in kernel_dtrsm_nn_lu_one_4x4_vs_lib4()