1/**************************************************************************************************
2*                                                                                                 *
3* This file is part of BLASFEO.                                                                   *
4*                                                                                                 *
5* BLASFEO -- BLAS For Embedded Optimization.                                                      *
6* Copyright (C) 2019 by Gianluca Frison.                                                          *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8* All rights reserved.                                                                            *
9*                                                                                                 *
10* The 2-Clause BSD License                                                                        *
11*                                                                                                 *
12* Redistribution and use in source and binary forms, with or without                              *
13* modification, are permitted provided that the following conditions are met:                     *
14*                                                                                                 *
15* 1. Redistributions of source code must retain the above copyright notice, this                  *
16*    list of conditions and the following disclaimer.                                             *
17* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18*    this list of conditions and the following disclaimer in the documentation                    *
19*    and/or other materials provided with the distribution.                                       *
20*                                                                                                 *
21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31*                                                                                                 *
32* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33*                                                                                                 *
34**************************************************************************************************/
35
36// common inner routine with file scope
37//
38// input arguments:
39// r10d  <- k
40// r11   <- A
41// r12   <- B
42// r13   <- ldb
43// ymm0  <- [d00 d10 d20 d30]
44// ymm1  <- [d01 d11 d21 d31]
45// ymm2  <- [d02 d12 d22 d32]
46// ymm3  <- [d03 d13 d23 d33]
47//
48// output arguments:
49
50#if MACRO_LEVEL>=2
51	.macro INNER_KERNEL_DGEMM_NT_4X4_LIB4C
52#else
53	.p2align 4,,15
54	FUN_START(inner_kernel_dgemm_nt_4x4_lib4c)
55#endif
56
57	cmpl	$0, %r10d
58	jle		5f // return
59
60	// preload
61
62	vxorpd			%ymm4, %ymm4, %ymm4
63	vmovapd			%ymm4, %ymm5
64	vmovapd			%ymm4, %ymm6
65	vmovapd			%ymm4, %ymm7
66
67	cmpl	$4, %r10d
68	jle		0f // consider clean-up loop
69
70	// main loop
71	.p2align 3
721: // main loop
73
74//	prefetcht0	0(%r12, %r13, 2) // software prefetch
75//	prefetcht0	64(%r12, %r13, 2) // software prefetch
76
77	// unroll 0
78	vmovupd			0(%r11), %ymm13 // A
79	vbroadcastsd	0(%r12), %ymm12 // B
80	vmulpd			%ymm13, %ymm12, %ymm15
81	vaddpd			%ymm0, %ymm15, %ymm0
82	vbroadcastsd	8(%r12), %ymm12 // B
83	vmulpd			%ymm13, %ymm12, %ymm15
84	vaddpd			%ymm1, %ymm15, %ymm1
85	vbroadcastsd	16(%r12), %ymm12 // B
86	vmulpd			%ymm13, %ymm12, %ymm15
87	vaddpd			%ymm2, %ymm15, %ymm2
88	vbroadcastsd	24(%r12), %ymm12 // B
89	vmulpd			%ymm13, %ymm12, %ymm15
90	vaddpd			%ymm3, %ymm15, %ymm3
91	addq	%r13, %r12
92
93	// unroll 1
94	vmovupd			32(%r11), %ymm13 // A
95	vbroadcastsd	0(%r12), %ymm12 // B
96	vmulpd			%ymm13, %ymm12, %ymm15
97	vaddpd			%ymm4, %ymm15, %ymm4
98	vbroadcastsd	8(%r12), %ymm12 // B
99	vmulpd			%ymm13, %ymm12, %ymm15
100	vaddpd			%ymm5, %ymm15, %ymm5
101	vbroadcastsd	16(%r12), %ymm12 // B
102	vmulpd			%ymm13, %ymm12, %ymm15
103	vaddpd			%ymm6, %ymm15, %ymm6
104	vbroadcastsd	24(%r12), %ymm12 // B
105	vmulpd			%ymm13, %ymm12, %ymm15
106	vaddpd			%ymm7, %ymm15, %ymm7
107	addq	%r13, %r12
108
109	// unroll 2
110	vmovupd			64(%r11), %ymm13 // A
111	vbroadcastsd	0(%r12), %ymm12 // B
112	vmulpd			%ymm13, %ymm12, %ymm15
113	vaddpd			%ymm0, %ymm15, %ymm0
114	vbroadcastsd	8(%r12), %ymm12 // B
115	vmulpd			%ymm13, %ymm12, %ymm15
116	vaddpd			%ymm1, %ymm15, %ymm1
117	vbroadcastsd	16(%r12), %ymm12 // B
118	vmulpd			%ymm13, %ymm12, %ymm15
119	vaddpd			%ymm2, %ymm15, %ymm2
120	vbroadcastsd	24(%r12), %ymm12 // B
121	vmulpd			%ymm13, %ymm12, %ymm15
122	vaddpd			%ymm3, %ymm15, %ymm3
123	addq	%r13, %r12
124
125	// unroll 3
126	vmovupd			96(%r11), %ymm13 // A
127	vbroadcastsd	0(%r12), %ymm12 // B
128	vmulpd			%ymm13, %ymm12, %ymm15
129	vaddpd			%ymm4, %ymm15, %ymm4
130	vbroadcastsd	8(%r12), %ymm12 // B
131	vmulpd			%ymm13, %ymm12, %ymm15
132	vaddpd			%ymm5, %ymm15, %ymm5
133	vbroadcastsd	16(%r12), %ymm12 // B
134	vmulpd			%ymm13, %ymm12, %ymm15
135	vaddpd			%ymm6, %ymm15, %ymm6
136	vbroadcastsd	24(%r12), %ymm12 // B
137	vmulpd			%ymm13, %ymm12, %ymm15
138	vaddpd			%ymm7, %ymm15, %ymm7
139	addq	%r13, %r12
140
141	subl	$4, %r10d
142	addq	$128, %r11
143
144	cmpl	$4, %r10d
145	jg		1b // main loop
146
147
1480: // consider clean4-up
149
150	cmpl	$3, %r10d
151	jle		4f // clean1
152
153	// unroll 0
154	vmovupd			0(%r11), %ymm13 // A
155	vbroadcastsd	0(%r12), %ymm12 // B
156	vmulpd			%ymm13, %ymm12, %ymm15
157	vaddpd			%ymm0, %ymm15, %ymm0
158	vbroadcastsd	8(%r12), %ymm12 // B
159	vmulpd			%ymm13, %ymm12, %ymm15
160	vaddpd			%ymm1, %ymm15, %ymm1
161	vbroadcastsd	16(%r12), %ymm12 // B
162	vmulpd			%ymm13, %ymm12, %ymm15
163	vaddpd			%ymm2, %ymm15, %ymm2
164	vbroadcastsd	24(%r12), %ymm12 // B
165	vmulpd			%ymm13, %ymm12, %ymm15
166	vaddpd			%ymm3, %ymm15, %ymm3
167	addq	%r13, %r12
168
169	// unroll 1
170	vmovupd			32(%r11), %ymm13 // A
171	vbroadcastsd	0(%r12), %ymm12 // B
172	vmulpd			%ymm13, %ymm12, %ymm15
173	vaddpd			%ymm4, %ymm15, %ymm4
174	vbroadcastsd	8(%r12), %ymm12 // B
175	vmulpd			%ymm13, %ymm12, %ymm15
176	vaddpd			%ymm5, %ymm15, %ymm5
177	vbroadcastsd	16(%r12), %ymm12 // B
178	vmulpd			%ymm13, %ymm12, %ymm15
179	vaddpd			%ymm6, %ymm15, %ymm6
180	vbroadcastsd	24(%r12), %ymm12 // B
181	vmulpd			%ymm13, %ymm12, %ymm15
182	vaddpd			%ymm7, %ymm15, %ymm7
183	addq	%r13, %r12
184
185	// unroll 2
186	vmovupd			64(%r11), %ymm13 // A
187	vbroadcastsd	0(%r12), %ymm12 // B
188	vmulpd			%ymm13, %ymm12, %ymm15
189	vaddpd			%ymm0, %ymm15, %ymm0
190	vbroadcastsd	8(%r12), %ymm12 // B
191	vmulpd			%ymm13, %ymm12, %ymm15
192	vaddpd			%ymm1, %ymm15, %ymm1
193	vbroadcastsd	16(%r12), %ymm12 // B
194	vmulpd			%ymm13, %ymm12, %ymm15
195	vaddpd			%ymm2, %ymm15, %ymm2
196	vbroadcastsd	24(%r12), %ymm12 // B
197	vmulpd			%ymm13, %ymm12, %ymm15
198	vaddpd			%ymm3, %ymm15, %ymm3
199	addq	%r13, %r12
200
201	// unroll 3
202	vmovupd			96(%r11), %ymm13 // A
203	vbroadcastsd	0(%r12), %ymm12 // B
204	vmulpd			%ymm13, %ymm12, %ymm15
205	vaddpd			%ymm4, %ymm15, %ymm4
206	vbroadcastsd	8(%r12), %ymm12 // B
207	vmulpd			%ymm13, %ymm12, %ymm15
208	vaddpd			%ymm5, %ymm15, %ymm5
209	vbroadcastsd	16(%r12), %ymm12 // B
210	vmulpd			%ymm13, %ymm12, %ymm15
211	vaddpd			%ymm6, %ymm15, %ymm6
212	vbroadcastsd	24(%r12), %ymm12 // B
213	vmulpd			%ymm13, %ymm12, %ymm15
214	vaddpd			%ymm7, %ymm15, %ymm7
215	addq	%r13, %r12
216
217	subl	$4, %r10d
218	addq	$128, %r11
219
220	jmp		2f // return
221
222
2234: // consider clean1-up loop
224
225	cmpl	$0, %r10d
226	jle		2f // return
227
228	// clean-up loop
2293: // clean up loop
230
231	// unroll 0
232	vmovupd			0(%r11), %ymm13 // A
233	vbroadcastsd	0(%r12), %ymm12 // B
234	vmulpd			%ymm13, %ymm12, %ymm15
235	vaddpd			%ymm0, %ymm15, %ymm0
236	vbroadcastsd	8(%r12), %ymm12 // B
237	vmulpd			%ymm13, %ymm12, %ymm15
238	vaddpd			%ymm1, %ymm15, %ymm1
239	vbroadcastsd	16(%r12), %ymm12 // B
240	vmulpd			%ymm13, %ymm12, %ymm15
241	vaddpd			%ymm2, %ymm15, %ymm2
242	vbroadcastsd	24(%r12), %ymm12 // B
243	vmulpd			%ymm13, %ymm12, %ymm15
244	vaddpd			%ymm3, %ymm15, %ymm3
245	addq	%r13, %r12
246
247	subl	$1, %r10d
248	addq	$32, %r11
249
250	cmpl	$0, %r10d
251	jg		3b // clean up loop
252
253
2542: // return
255
256	vaddpd			%ymm4, %ymm0, %ymm0
257	vaddpd			%ymm5, %ymm1, %ymm1
258	vaddpd			%ymm6, %ymm2, %ymm2
259	vaddpd			%ymm7, %ymm3, %ymm3
260
2615: // return
262
263#if MACRO_LEVEL>=2
264	.endm
265#else
266	ret
267
268	FUN_END(inner_kernel_dgemm_nt_4x4_lib4c)
269#endif
270
271
272
273
274
275// common inner routine with file scope
276//
277// input arguments:
278// r10d  <- k
279// r11   <- A
280// r12   <- B
281// r13   <- ldb
282// ymm0  <- [d00 d10 d20 d30]
283// ymm1  <- [d01 d11 d21 d31]
284// ymm2  <- [d02 d12 d22 d32]
285// ymm3  <- [d03 d13 d23 d33]
286//
287// output arguments:
288
289#if MACRO_LEVEL>=2
290	.macro INNER_KERNEL_DGEMM_NT_4X3_LIB4C
291#else
292	.p2align 4,,15
293	FUN_START(inner_kernel_dgemm_nt_4x3_lib4c)
294#endif
295
296	cmpl	$0, %r10d
297	jle		5f // return
298
299	// preload
300
301	vxorpd			%ymm4, %ymm4, %ymm4
302	vmovapd			%ymm4, %ymm5
303	vmovapd			%ymm4, %ymm6
304
305	cmpl	$4, %r10d
306	jle		0f // consider clean-up loop
307
308	// main loop
309	.p2align 3
3101: // main loop
311
312//	prefetcht0	0(%r12, %r13, 2) // software prefetch
313//	prefetcht0	64(%r12, %r13, 2) // software prefetch
314
315	// unroll 0
316	vmovupd			0(%r11), %ymm13 // A
317	vbroadcastsd	0(%r12), %ymm12 // B
318	vmulpd			%ymm13, %ymm12, %ymm15
319	vaddpd			%ymm0, %ymm15, %ymm0
320	vbroadcastsd	8(%r12), %ymm12 // B
321	vmulpd			%ymm13, %ymm12, %ymm15
322	vaddpd			%ymm1, %ymm15, %ymm1
323	vbroadcastsd	16(%r12), %ymm12 // B
324	vmulpd			%ymm13, %ymm12, %ymm15
325	vaddpd			%ymm2, %ymm15, %ymm2
326	addq	%r13, %r12
327
328	// unroll 1
329	vmovupd			32(%r11), %ymm13 // A
330	vbroadcastsd	0(%r12), %ymm12 // B
331	vmulpd			%ymm13, %ymm12, %ymm15
332	vaddpd			%ymm4, %ymm15, %ymm4
333	vbroadcastsd	8(%r12), %ymm12 // B
334	vmulpd			%ymm13, %ymm12, %ymm15
335	vaddpd			%ymm5, %ymm15, %ymm5
336	vbroadcastsd	16(%r12), %ymm12 // B
337	vmulpd			%ymm13, %ymm12, %ymm15
338	vaddpd			%ymm6, %ymm15, %ymm6
339	addq	%r13, %r12
340
341	// unroll 2
342	vmovupd			64(%r11), %ymm13 // A
343	vbroadcastsd	0(%r12), %ymm12 // B
344	vmulpd			%ymm13, %ymm12, %ymm15
345	vaddpd			%ymm0, %ymm15, %ymm0
346	vbroadcastsd	8(%r12), %ymm12 // B
347	vmulpd			%ymm13, %ymm12, %ymm15
348	vaddpd			%ymm1, %ymm15, %ymm1
349	vbroadcastsd	16(%r12), %ymm12 // B
350	vmulpd			%ymm13, %ymm12, %ymm15
351	vaddpd			%ymm2, %ymm15, %ymm2
352	addq	%r13, %r12
353
354	// unroll 3
355	vmovupd			96(%r11), %ymm13 // A
356	vbroadcastsd	0(%r12), %ymm12 // B
357	vmulpd			%ymm13, %ymm12, %ymm15
358	vaddpd			%ymm4, %ymm15, %ymm4
359	vbroadcastsd	8(%r12), %ymm12 // B
360	vmulpd			%ymm13, %ymm12, %ymm15
361	vaddpd			%ymm5, %ymm15, %ymm5
362	vbroadcastsd	16(%r12), %ymm12 // B
363	vmulpd			%ymm13, %ymm12, %ymm15
364	vaddpd			%ymm6, %ymm15, %ymm6
365	addq	%r13, %r12
366
367	subl	$4, %r10d
368	addq	$128, %r11
369
370	cmpl	$4, %r10d
371	jg		1b // main loop
372
373
3740: // consider clean4-up
375
376	cmpl	$3, %r10d
377	jle		4f // clean1
378
379	// unroll 0
380	vmovupd			0(%r11), %ymm13 // A
381	vbroadcastsd	0(%r12), %ymm12 // B
382	vmulpd			%ymm13, %ymm12, %ymm15
383	vaddpd			%ymm0, %ymm15, %ymm0
384	vbroadcastsd	8(%r12), %ymm12 // B
385	vmulpd			%ymm13, %ymm12, %ymm15
386	vaddpd			%ymm1, %ymm15, %ymm1
387	vbroadcastsd	16(%r12), %ymm12 // B
388	vmulpd			%ymm13, %ymm12, %ymm15
389	vaddpd			%ymm2, %ymm15, %ymm2
390	addq	%r13, %r12
391
392	// unroll 1
393	vmovupd			32(%r11), %ymm13 // A
394	vbroadcastsd	0(%r12), %ymm12 // B
395	vmulpd			%ymm13, %ymm12, %ymm15
396	vaddpd			%ymm4, %ymm15, %ymm4
397	vbroadcastsd	8(%r12), %ymm12 // B
398	vmulpd			%ymm13, %ymm12, %ymm15
399	vaddpd			%ymm5, %ymm15, %ymm5
400	vbroadcastsd	16(%r12), %ymm12 // B
401	vmulpd			%ymm13, %ymm12, %ymm15
402	vaddpd			%ymm6, %ymm15, %ymm6
403	addq	%r13, %r12
404
405	// unroll 2
406	vmovupd			64(%r11), %ymm13 // A
407	vbroadcastsd	0(%r12), %ymm12 // B
408	vmulpd			%ymm13, %ymm12, %ymm15
409	vaddpd			%ymm0, %ymm15, %ymm0
410	vbroadcastsd	8(%r12), %ymm12 // B
411	vmulpd			%ymm13, %ymm12, %ymm15
412	vaddpd			%ymm1, %ymm15, %ymm1
413	vbroadcastsd	16(%r12), %ymm12 // B
414	vmulpd			%ymm13, %ymm12, %ymm15
415	vaddpd			%ymm2, %ymm15, %ymm2
416	addq	%r13, %r12
417
418	// unroll 3
419	vmovupd			96(%r11), %ymm13 // A
420	vbroadcastsd	0(%r12), %ymm12 // B
421	vmulpd			%ymm13, %ymm12, %ymm15
422	vaddpd			%ymm4, %ymm15, %ymm4
423	vbroadcastsd	8(%r12), %ymm12 // B
424	vmulpd			%ymm13, %ymm12, %ymm15
425	vaddpd			%ymm5, %ymm15, %ymm5
426	vbroadcastsd	16(%r12), %ymm12 // B
427	vmulpd			%ymm13, %ymm12, %ymm15
428	vaddpd			%ymm6, %ymm15, %ymm6
429	addq	%r13, %r12
430
431	subl	$4, %r10d
432	addq	$128, %r11
433
434	jmp		2f // return
435
436
4374: // consider clean1-up loop
438
439	cmpl	$0, %r10d
440	jle		2f // return
441
442	// clean-up loop
4433: // clean up loop
444
445	// unroll 0
446	vmovupd			0(%r11), %ymm13 // A
447	vbroadcastsd	0(%r12), %ymm12 // B
448	vmulpd			%ymm13, %ymm12, %ymm15
449	vaddpd			%ymm0, %ymm15, %ymm0
450	vbroadcastsd	8(%r12), %ymm12 // B
451	vmulpd			%ymm13, %ymm12, %ymm15
452	vaddpd			%ymm1, %ymm15, %ymm1
453	vbroadcastsd	16(%r12), %ymm12 // B
454	vmulpd			%ymm13, %ymm12, %ymm15
455	vaddpd			%ymm2, %ymm15, %ymm2
456	addq	%r13, %r12
457
458	subl	$1, %r10d
459	addq	$32, %r11
460
461	cmpl	$0, %r10d
462	jg		3b // clean up loop
463
464
4652: // return
466
467	vaddpd			%ymm4, %ymm0, %ymm0
468	vaddpd			%ymm5, %ymm1, %ymm1
469	vaddpd			%ymm6, %ymm2, %ymm2
470
4715: // return
472
473#if MACRO_LEVEL>=2
474	.endm
475#else
476	ret
477
478	FUN_END(inner_kernel_dgemm_nt_4x3_lib4c)
479#endif
480
481
482
483
484
485// common inner routine with file scope
486//
487// input arguments:
488// r10d  <- k
489// r11   <- A
490// r12   <- B
491// r13   <- ldb
492// ymm0  <- [d00 d10 d20 d30]
493// ymm1  <- [d01 d11 d21 d31]
494// ymm2  <- [d02 d12 d22 d32]
495// ymm3  <- [d03 d13 d23 d33]
496//
497// output arguments:
498
499#if MACRO_LEVEL>=2
500	.macro INNER_KERNEL_DGEMM_NT_4X2_LIB4C
501#else
502	.p2align 4,,15
503	FUN_START(inner_kernel_dgemm_nt_4x2_lib4c)
504#endif
505
506	cmpl	$0, %r10d
507	jle		5f // return
508
509	// preload
510
511	vxorpd			%ymm4, %ymm4, %ymm4
512	vmovapd			%ymm4, %ymm5
513
514	cmpl	$4, %r10d
515	jle		0f // consider clean-up loop
516
517	// main loop
518	.p2align 3
5191: // main loop
520
521//	prefetcht0	0(%r12, %r13, 2) // software prefetch
522//	prefetcht0	64(%r12, %r13, 2) // software prefetch
523
524	// unroll 0
525	vmovupd			0(%r11), %ymm13 // A
526	vbroadcastsd	0(%r12), %ymm12 // B
527	vmulpd			%ymm13, %ymm12, %ymm15
528	vaddpd			%ymm0, %ymm15, %ymm0
529	vbroadcastsd	8(%r12), %ymm12 // B
530	vmulpd			%ymm13, %ymm12, %ymm15
531	vaddpd			%ymm1, %ymm15, %ymm1
532	addq	%r13, %r12
533
534	// unroll 1
535	vmovupd			32(%r11), %ymm13 // A
536	vbroadcastsd	0(%r12), %ymm12 // B
537	vmulpd			%ymm13, %ymm12, %ymm15
538	vaddpd			%ymm4, %ymm15, %ymm4
539	vbroadcastsd	8(%r12), %ymm12 // B
540	vmulpd			%ymm13, %ymm12, %ymm15
541	vaddpd			%ymm5, %ymm15, %ymm5
542	addq	%r13, %r12
543
544	// unroll 2
545	vmovupd			64(%r11), %ymm13 // A
546	vbroadcastsd	0(%r12), %ymm12 // B
547	vmulpd			%ymm13, %ymm12, %ymm15
548	vaddpd			%ymm0, %ymm15, %ymm0
549	vbroadcastsd	8(%r12), %ymm12 // B
550	vmulpd			%ymm13, %ymm12, %ymm15
551	vaddpd			%ymm1, %ymm15, %ymm1
552	addq	%r13, %r12
553
554	// unroll 3
555	vmovupd			96(%r11), %ymm13 // A
556	vbroadcastsd	0(%r12), %ymm12 // B
557	vmulpd			%ymm13, %ymm12, %ymm15
558	vaddpd			%ymm4, %ymm15, %ymm4
559	vbroadcastsd	8(%r12), %ymm12 // B
560	vmulpd			%ymm13, %ymm12, %ymm15
561	vaddpd			%ymm5, %ymm15, %ymm5
562	addq	%r13, %r12
563
564	subl	$4, %r10d
565	addq	$128, %r11
566
567	cmpl	$4, %r10d
568	jg		1b // main loop
569
570
5710: // consider clean4-up
572
573	cmpl	$3, %r10d
574	jle		4f // clean1
575
576	// unroll 0
577	vmovupd			0(%r11), %ymm13 // A
578	vbroadcastsd	0(%r12), %ymm12 // B
579	vmulpd			%ymm13, %ymm12, %ymm15
580	vaddpd			%ymm0, %ymm15, %ymm0
581	vbroadcastsd	8(%r12), %ymm12 // B
582	vmulpd			%ymm13, %ymm12, %ymm15
583	vaddpd			%ymm1, %ymm15, %ymm1
584	addq	%r13, %r12
585
586	// unroll 1
587	vmovupd			32(%r11), %ymm13 // A
588	vbroadcastsd	0(%r12), %ymm12 // B
589	vmulpd			%ymm13, %ymm12, %ymm15
590	vaddpd			%ymm4, %ymm15, %ymm4
591	vbroadcastsd	8(%r12), %ymm12 // B
592	vmulpd			%ymm13, %ymm12, %ymm15
593	vaddpd			%ymm5, %ymm15, %ymm5
594	addq	%r13, %r12
595
596	// unroll 2
597	vmovupd			64(%r11), %ymm13 // A
598	vbroadcastsd	0(%r12), %ymm12 // B
599	vmulpd			%ymm13, %ymm12, %ymm15
600	vaddpd			%ymm0, %ymm15, %ymm0
601	vbroadcastsd	8(%r12), %ymm12 // B
602	vmulpd			%ymm13, %ymm12, %ymm15
603	vaddpd			%ymm1, %ymm15, %ymm1
604	addq	%r13, %r12
605
606	// unroll 3
607	vmovupd			96(%r11), %ymm13 // A
608	vbroadcastsd	0(%r12), %ymm12 // B
609	vmulpd			%ymm13, %ymm12, %ymm15
610	vaddpd			%ymm4, %ymm15, %ymm4
611	vbroadcastsd	8(%r12), %ymm12 // B
612	vmulpd			%ymm13, %ymm12, %ymm15
613	vaddpd			%ymm5, %ymm15, %ymm5
614	addq	%r13, %r12
615
616	subl	$4, %r10d
617	addq	$128, %r11
618
619	jmp		2f // return
620
621
6224: // consider clean1-up loop
623
624	cmpl	$0, %r10d
625	jle		2f // return
626
627	// clean-up loop
6283: // clean up loop
629
630	// unroll 0
631	vmovupd			0(%r11), %ymm13 // A
632	vbroadcastsd	0(%r12), %ymm12 // B
633	vmulpd			%ymm13, %ymm12, %ymm15
634	vaddpd			%ymm0, %ymm15, %ymm0
635	vbroadcastsd	8(%r12), %ymm12 // B
636	vmulpd			%ymm13, %ymm12, %ymm15
637	vaddpd			%ymm1, %ymm15, %ymm1
638	addq	%r13, %r12
639
640	subl	$1, %r10d
641	addq	$32, %r11
642
643	cmpl	$0, %r10d
644	jg		3b // clean up loop
645
646
6472: // return
648
649	vaddpd			%ymm4, %ymm0, %ymm0
650	vaddpd			%ymm5, %ymm1, %ymm1
651
6525: // return
653
654#if MACRO_LEVEL>=2
655	.endm
656#else
657	ret
658
659	FUN_END(inner_kernel_dgemm_nt_4x2_lib4c)
660#endif
661
662
663
664
665
666// common inner routine with file scope
667//
668// input arguments:
669// r10d  <- k
670// r11   <- A
671// r12   <- B
672// r13   <- ldb
673// ymm0  <- [d00 d10 d20 d30]
674// ymm1  <- [d01 d11 d21 d31]
675// ymm2  <- [d02 d12 d22 d32]
676// ymm3  <- [d03 d13 d23 d33]
677//
678// output arguments:
679
680#if MACRO_LEVEL>=2
681	.macro INNER_KERNEL_DGEMM_NT_4X1_LIB4C
682#else
683	.p2align 4,,15
684	FUN_START(inner_kernel_dgemm_nt_4x1_lib4c)
685#endif
686
687	cmpl	$0, %r10d
688	jle		5f // return
689
690	// preload
691
692	vxorpd			%ymm4, %ymm4, %ymm4
693
694	cmpl	$4, %r10d
695	jle		0f // consider clean-up loop
696
697	// main loop
698	.p2align 3
6991: // main loop
700
701//	prefetcht0	0(%r12, %r13, 2) // software prefetch
702//	prefetcht0	64(%r12, %r13, 2) // software prefetch
703
704	// unroll 0
705	vmovupd			0(%r11), %ymm13 // A
706	vbroadcastsd	0(%r12), %ymm12 // B
707	vmulpd			%ymm13, %ymm12, %ymm15
708	vaddpd			%ymm0, %ymm15, %ymm0
709	addq	%r13, %r12
710
711	// unroll 1
712	vmovupd			32(%r11), %ymm13 // A
713	vbroadcastsd	0(%r12), %ymm12 // B
714	vmulpd			%ymm13, %ymm12, %ymm15
715	vaddpd			%ymm4, %ymm15, %ymm4
716	addq	%r13, %r12
717
718	// unroll 2
719	vmovupd			64(%r11), %ymm13 // A
720	vbroadcastsd	0(%r12), %ymm12 // B
721	vmulpd			%ymm13, %ymm12, %ymm15
722	vaddpd			%ymm0, %ymm15, %ymm0
723	addq	%r13, %r12
724
725	// unroll 3
726	vmovupd			96(%r11), %ymm13 // A
727	vbroadcastsd	0(%r12), %ymm12 // B
728	vmulpd			%ymm13, %ymm12, %ymm15
729	vaddpd			%ymm4, %ymm15, %ymm4
730	addq	%r13, %r12
731
732	subl	$4, %r10d
733	addq	$128, %r11
734
735	cmpl	$4, %r10d
736	jg		1b // main loop
737
738
7390: // consider clean4-up
740
741	cmpl	$3, %r10d
742	jle		4f // clean1
743
744	// unroll 0
745	vmovupd			0(%r11), %ymm13 // A
746	vbroadcastsd	0(%r12), %ymm12 // B
747	vmulpd			%ymm13, %ymm12, %ymm15
748	vaddpd			%ymm0, %ymm15, %ymm0
749	addq	%r13, %r12
750
751	// unroll 1
752	vmovupd			32(%r11), %ymm13 // A
753	vbroadcastsd	0(%r12), %ymm12 // B
754	vmulpd			%ymm13, %ymm12, %ymm15
755	vaddpd			%ymm4, %ymm15, %ymm4
756	addq	%r13, %r12
757
758	// unroll 2
759	vmovupd			64(%r11), %ymm13 // A
760	vbroadcastsd	0(%r12), %ymm12 // B
761	vmulpd			%ymm13, %ymm12, %ymm15
762	vaddpd			%ymm0, %ymm15, %ymm0
763	addq	%r13, %r12
764
765	// unroll 3
766	vmovupd			96(%r11), %ymm13 // A
767	vbroadcastsd	0(%r12), %ymm12 // B
768	vmulpd			%ymm13, %ymm12, %ymm15
769	vaddpd			%ymm4, %ymm15, %ymm4
770	addq	%r13, %r12
771
772	subl	$4, %r10d
773	addq	$128, %r11
774
775	jmp		2f // return
776
777
7784: // consider clean1-up loop
779
780	cmpl	$0, %r10d
781	jle		2f // return
782
783	// clean-up loop
7843: // clean up loop
785
786	// unroll 0
787	vmovupd			0(%r11), %ymm13 // A
788	vbroadcastsd	0(%r12), %ymm12 // B
789	vmulpd			%ymm13, %ymm12, %ymm15
790	vaddpd			%ymm0, %ymm15, %ymm0
791	addq	%r13, %r12
792
793	subl	$1, %r10d
794	addq	$32, %r11
795
796	cmpl	$0, %r10d
797	jg		3b // clean up loop
798
799
8002: // return
801
802	vaddpd			%ymm4, %ymm0, %ymm0
803
8045: // return
805
806#if MACRO_LEVEL>=2
807	.endm
808#else
809	ret
810
811	FUN_END(inner_kernel_dgemm_nt_4x1_lib4c)
812#endif
813
814
815
816
817
818// common inner routine with file scope
819//
820// input arguments:
821// r10d  <- k
822// r11   <- A
823// r12   <- B
824// r13   <- ldb
825// ymm0  <- [d00 d10 d20 d30]
826// ymm1  <- [d01 d11 d21 d31]
827// ymm2  <- [d02 d12 d22 d32]
828// ymm3  <- [d03 d13 d23 d33]
829//
830// output arguments:
831
832#if MACRO_LEVEL>=2
833	.macro INNER_KERNEL_DGEMM_NN_4X4_LIB4C
834#else
835	.p2align 4,,15
836	FUN_START(inner_kernel_dgemm_nn_4x4_lib4c)
837#endif
838
839	cmpl	$0, %r10d
840	jle		5f // return
841
842	movq	%r12, %r15
843	addq	%r13, %r15
844	addq	%r13, %r15 // B+2*ldb
845
846	// preload
847
848	vxorpd			%ymm4, %ymm4, %ymm4
849	vmovapd			%ymm4, %ymm5
850	vmovapd			%ymm4, %ymm6
851	vmovapd			%ymm4, %ymm7
852
853	cmpl	$4, %r10d
854	jle		0f // consider clean-up loop
855
856	// main loop
857	.p2align 3
8581: // main loop
859
860//	prefetcht0	0(%r12, %r13, 2) // software prefetch
861//	prefetcht0	64(%r12, %r13, 2) // software prefetch
862
863	// unroll 0
864	vmovupd			0(%r11), %ymm13 // A
865	vbroadcastsd	0(%r12), %ymm12 // B
866	vmulpd			%ymm13, %ymm12, %ymm15
867	vaddpd			%ymm0, %ymm15, %ymm0
868	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
869	vmulpd			%ymm13, %ymm12, %ymm15
870	vaddpd			%ymm1, %ymm15, %ymm1
871	vbroadcastsd	0(%r15), %ymm12 // B
872	vmulpd			%ymm13, %ymm12, %ymm15
873	vaddpd			%ymm2, %ymm15, %ymm2
874	vbroadcastsd	0(%r15, %r13, 1), %ymm12 // B
875	vmulpd			%ymm13, %ymm12, %ymm15
876	vaddpd			%ymm3, %ymm15, %ymm3
877
878	// unroll 1
879	vmovupd			32(%r11), %ymm13 // A
880	vbroadcastsd	8(%r12), %ymm12 // B
881	vmulpd			%ymm13, %ymm12, %ymm15
882	vaddpd			%ymm4, %ymm15, %ymm4
883	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
884	vmulpd			%ymm13, %ymm12, %ymm15
885	vaddpd			%ymm5, %ymm15, %ymm5
886	vbroadcastsd	8(%r15), %ymm12 // B
887	vmulpd			%ymm13, %ymm12, %ymm15
888	vaddpd			%ymm6, %ymm15, %ymm6
889	vbroadcastsd	8(%r15, %r13, 1), %ymm12 // B
890	vmulpd			%ymm13, %ymm12, %ymm15
891	vaddpd			%ymm7, %ymm15, %ymm7
892
893	// unroll 2
894	vmovupd			64(%r11), %ymm13 // A
895	vbroadcastsd	16(%r12), %ymm12 // B
896	vmulpd			%ymm13, %ymm12, %ymm15
897	vaddpd			%ymm0, %ymm15, %ymm0
898	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
899	vmulpd			%ymm13, %ymm12, %ymm15
900	vaddpd			%ymm1, %ymm15, %ymm1
901	vbroadcastsd	16(%r15), %ymm12 // B
902	vmulpd			%ymm13, %ymm12, %ymm15
903	vaddpd			%ymm2, %ymm15, %ymm2
904	vbroadcastsd	16(%r15, %r13, 1), %ymm12 // B
905	vmulpd			%ymm13, %ymm12, %ymm15
906	vaddpd			%ymm3, %ymm15, %ymm3
907
908	// unroll 3
909	vmovupd			96(%r11), %ymm13 // A
910	vbroadcastsd	24(%r12), %ymm12 // B
911	vmulpd			%ymm13, %ymm12, %ymm15
912	vaddpd			%ymm4, %ymm15, %ymm4
913	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
914	vmulpd			%ymm13, %ymm12, %ymm15
915	vaddpd			%ymm5, %ymm15, %ymm5
916	vbroadcastsd	24(%r15), %ymm12 // B
917	vmulpd			%ymm13, %ymm12, %ymm15
918	vaddpd			%ymm6, %ymm15, %ymm6
919	vbroadcastsd	24(%r15, %r13, 1), %ymm12 // B
920	vmulpd			%ymm13, %ymm12, %ymm15
921	vaddpd			%ymm7, %ymm15, %ymm7
922
923	subl	$4, %r10d
924	addq	$32, %r12
925	addq	$32, %r15
926	addq	$128, %r11
927
928	cmpl	$4, %r10d
929	jg		1b // main loop
930
931
9320: // consider clean4-up
933
934	cmpl	$3, %r10d
935	jle		4f // clean1
936
937	// unroll 0
938	vmovupd			0(%r11), %ymm13 // A
939	vbroadcastsd	0(%r12), %ymm12 // B
940	vmulpd			%ymm13, %ymm12, %ymm15
941	vaddpd			%ymm0, %ymm15, %ymm0
942	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
943	vmulpd			%ymm13, %ymm12, %ymm15
944	vaddpd			%ymm1, %ymm15, %ymm1
945	vbroadcastsd	0(%r15), %ymm12 // B
946	vmulpd			%ymm13, %ymm12, %ymm15
947	vaddpd			%ymm2, %ymm15, %ymm2
948	vbroadcastsd	0(%r15, %r13, 1), %ymm12 // B
949	vmulpd			%ymm13, %ymm12, %ymm15
950	vaddpd			%ymm3, %ymm15, %ymm3
951
952	// unroll 1
953	vmovupd			32(%r11), %ymm13 // A
954	vbroadcastsd	8(%r12), %ymm12 // B
955	vmulpd			%ymm13, %ymm12, %ymm15
956	vaddpd			%ymm4, %ymm15, %ymm4
957	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
958	vmulpd			%ymm13, %ymm12, %ymm15
959	vaddpd			%ymm5, %ymm15, %ymm5
960	vbroadcastsd	8(%r15), %ymm12 // B
961	vmulpd			%ymm13, %ymm12, %ymm15
962	vaddpd			%ymm6, %ymm15, %ymm6
963	vbroadcastsd	8(%r15, %r13, 1), %ymm12 // B
964	vmulpd			%ymm13, %ymm12, %ymm15
965	vaddpd			%ymm7, %ymm15, %ymm7
966
967	// unroll 2
968	vmovupd			64(%r11), %ymm13 // A
969	vbroadcastsd	16(%r12), %ymm12 // B
970	vmulpd			%ymm13, %ymm12, %ymm15
971	vaddpd			%ymm0, %ymm15, %ymm0
972	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
973	vmulpd			%ymm13, %ymm12, %ymm15
974	vaddpd			%ymm1, %ymm15, %ymm1
975	vbroadcastsd	16(%r15), %ymm12 // B
976	vmulpd			%ymm13, %ymm12, %ymm15
977	vaddpd			%ymm2, %ymm15, %ymm2
978	vbroadcastsd	16(%r15, %r13, 1), %ymm12 // B
979	vmulpd			%ymm13, %ymm12, %ymm15
980	vaddpd			%ymm3, %ymm15, %ymm3
981
982	// unroll 3
983	vmovupd			96(%r11), %ymm13 // A
984	vbroadcastsd	24(%r12), %ymm12 // B
985	vmulpd			%ymm13, %ymm12, %ymm15
986	vaddpd			%ymm4, %ymm15, %ymm4
987	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
988	vmulpd			%ymm13, %ymm12, %ymm15
989	vaddpd			%ymm5, %ymm15, %ymm5
990	vbroadcastsd	24(%r15), %ymm12 // B
991	vmulpd			%ymm13, %ymm12, %ymm15
992	vaddpd			%ymm6, %ymm15, %ymm6
993	vbroadcastsd	24(%r15, %r13, 1), %ymm12 // B
994	vmulpd			%ymm13, %ymm12, %ymm15
995	vaddpd			%ymm7, %ymm15, %ymm7
996
997	subl	$4, %r10d
998	addq	$32, %r12
999	addq	$32, %r15
1000	addq	$128, %r11
1001
1002	jmp		2f // return
1003
1004
10054: // consider clean1-up loop
1006
1007	cmpl	$0, %r10d
1008	jle		2f // return
1009
1010	// clean-up loop
10113: // clean up loop
1012
1013	// unroll 0
1014	vmovupd			0(%r11), %ymm13 // A
1015	vbroadcastsd	0(%r12), %ymm12 // B
1016	vmulpd			%ymm13, %ymm12, %ymm15
1017	vaddpd			%ymm0, %ymm15, %ymm0
1018	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1019	vmulpd			%ymm13, %ymm12, %ymm15
1020	vaddpd			%ymm1, %ymm15, %ymm1
1021	vbroadcastsd	0(%r15), %ymm12 // B
1022	vmulpd			%ymm13, %ymm12, %ymm15
1023	vaddpd			%ymm2, %ymm15, %ymm2
1024	vbroadcastsd	0(%r15, %r13, 1), %ymm12 // B
1025	vmulpd			%ymm13, %ymm12, %ymm15
1026	vaddpd			%ymm3, %ymm15, %ymm3
1027
1028	subl	$1, %r10d
1029	addq	$8, %r12
1030	addq	$8, %r15
1031	addq	$32, %r11
1032
1033	cmpl	$0, %r10d
1034	jg		3b // clean up loop
1035
1036
10372: // return
1038
1039	vaddpd			%ymm4, %ymm0, %ymm0
1040	vaddpd			%ymm5, %ymm1, %ymm1
1041	vaddpd			%ymm6, %ymm2, %ymm2
1042	vaddpd			%ymm7, %ymm3, %ymm3
1043
10445: // return
1045
1046#if MACRO_LEVEL>=2
1047	.endm
1048#else
1049	ret
1050
1051	FUN_END(inner_kernel_dgemm_nn_4x4_lib4c)
1052#endif
1053
1054
1055
1056
1057
1058// common inner routine with file scope
1059//
1060// input arguments:
1061// r10d  <- k
1062// r11   <- A
1063// r12   <- B
1064// r13   <- ldb
1065// ymm0  <- [d00 d10 d20 d30]
1066// ymm1  <- [d01 d11 d21 d31]
1067// ymm2  <- [d02 d12 d22 d32]
1068// ymm3  <- [d03 d13 d23 d33]
1069//
1070// output arguments:
1071
1072#if MACRO_LEVEL>=2
1073	.macro INNER_KERNEL_DGEMM_NN_4X3_LIB4C
1074#else
1075	.p2align 4,,15
1076	FUN_START(inner_kernel_dgemm_nn_4x3_lib4c)
1077#endif
1078
1079	cmpl	$0, %r10d
1080	jle		5f // return
1081
1082	movq	%r12, %r15
1083	addq	%r13, %r15
1084	addq	%r13, %r15 // B+2*ldb
1085
1086	// preload
1087
1088	vxorpd			%ymm4, %ymm4, %ymm4
1089	vmovapd			%ymm4, %ymm5
1090	vmovapd			%ymm4, %ymm6
1091
1092	cmpl	$4, %r10d
1093	jle		0f // consider clean-up loop
1094
1095	// main loop
1096	.p2align 3
10971: // main loop
1098
1099//	prefetcht0	0(%r12, %r13, 2) // software prefetch
1100//	prefetcht0	64(%r12, %r13, 2) // software prefetch
1101
1102	// unroll 0
1103	vmovupd			0(%r11), %ymm13 // A
1104	vbroadcastsd	0(%r12), %ymm12 // B
1105	vmulpd			%ymm13, %ymm12, %ymm15
1106	vaddpd			%ymm0, %ymm15, %ymm0
1107	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1108	vmulpd			%ymm13, %ymm12, %ymm15
1109	vaddpd			%ymm1, %ymm15, %ymm1
1110	vbroadcastsd	0(%r15), %ymm12 // B
1111	vmulpd			%ymm13, %ymm12, %ymm15
1112	vaddpd			%ymm2, %ymm15, %ymm2
1113
1114	// unroll 1
1115	vmovupd			32(%r11), %ymm13 // A
1116	vbroadcastsd	8(%r12), %ymm12 // B
1117	vmulpd			%ymm13, %ymm12, %ymm15
1118	vaddpd			%ymm4, %ymm15, %ymm4
1119	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
1120	vmulpd			%ymm13, %ymm12, %ymm15
1121	vaddpd			%ymm5, %ymm15, %ymm5
1122	vbroadcastsd	8(%r15), %ymm12 // B
1123	vmulpd			%ymm13, %ymm12, %ymm15
1124	vaddpd			%ymm6, %ymm15, %ymm6
1125
1126	// unroll 2
1127	vmovupd			64(%r11), %ymm13 // A
1128	vbroadcastsd	16(%r12), %ymm12 // B
1129	vmulpd			%ymm13, %ymm12, %ymm15
1130	vaddpd			%ymm0, %ymm15, %ymm0
1131	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
1132	vmulpd			%ymm13, %ymm12, %ymm15
1133	vaddpd			%ymm1, %ymm15, %ymm1
1134	vbroadcastsd	16(%r15), %ymm12 // B
1135	vmulpd			%ymm13, %ymm12, %ymm15
1136	vaddpd			%ymm2, %ymm15, %ymm2
1137
1138	// unroll 3
1139	vmovupd			96(%r11), %ymm13 // A
1140	vbroadcastsd	24(%r12), %ymm12 // B
1141	vmulpd			%ymm13, %ymm12, %ymm15
1142	vaddpd			%ymm4, %ymm15, %ymm4
1143	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
1144	vmulpd			%ymm13, %ymm12, %ymm15
1145	vaddpd			%ymm5, %ymm15, %ymm5
1146	vbroadcastsd	24(%r15), %ymm12 // B
1147	vmulpd			%ymm13, %ymm12, %ymm15
1148	vaddpd			%ymm6, %ymm15, %ymm6
1149
1150	subl	$4, %r10d
1151	addq	$32, %r12
1152	addq	$32, %r15
1153	addq	$128, %r11
1154
1155	cmpl	$4, %r10d
1156	jg		1b // main loop
1157
1158
11590: // consider clean4-up
1160
1161	cmpl	$3, %r10d
1162	jle		4f // clean1
1163
1164	// unroll 0
1165	vmovupd			0(%r11), %ymm13 // A
1166	vbroadcastsd	0(%r12), %ymm12 // B
1167	vmulpd			%ymm13, %ymm12, %ymm15
1168	vaddpd			%ymm0, %ymm15, %ymm0
1169	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1170	vmulpd			%ymm13, %ymm12, %ymm15
1171	vaddpd			%ymm1, %ymm15, %ymm1
1172	vbroadcastsd	0(%r15), %ymm12 // B
1173	vmulpd			%ymm13, %ymm12, %ymm15
1174	vaddpd			%ymm2, %ymm15, %ymm2
1175
1176	// unroll 1
1177	vmovupd			32(%r11), %ymm13 // A
1178	vbroadcastsd	8(%r12), %ymm12 // B
1179	vmulpd			%ymm13, %ymm12, %ymm15
1180	vaddpd			%ymm4, %ymm15, %ymm4
1181	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
1182	vmulpd			%ymm13, %ymm12, %ymm15
1183	vaddpd			%ymm5, %ymm15, %ymm5
1184	vbroadcastsd	8(%r15), %ymm12 // B
1185	vmulpd			%ymm13, %ymm12, %ymm15
1186	vaddpd			%ymm6, %ymm15, %ymm6
1187
1188	// unroll 2
1189	vmovupd			64(%r11), %ymm13 // A
1190	vbroadcastsd	16(%r12), %ymm12 // B
1191	vmulpd			%ymm13, %ymm12, %ymm15
1192	vaddpd			%ymm0, %ymm15, %ymm0
1193	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
1194	vmulpd			%ymm13, %ymm12, %ymm15
1195	vaddpd			%ymm1, %ymm15, %ymm1
1196	vbroadcastsd	16(%r15), %ymm12 // B
1197	vmulpd			%ymm13, %ymm12, %ymm15
1198	vaddpd			%ymm2, %ymm15, %ymm2
1199
1200	// unroll 3
1201	vmovupd			96(%r11), %ymm13 // A
1202	vbroadcastsd	24(%r12), %ymm12 // B
1203	vmulpd			%ymm13, %ymm12, %ymm15
1204	vaddpd			%ymm4, %ymm15, %ymm4
1205	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
1206	vmulpd			%ymm13, %ymm12, %ymm15
1207	vaddpd			%ymm5, %ymm15, %ymm5
1208	vbroadcastsd	24(%r15), %ymm12 // B
1209	vmulpd			%ymm13, %ymm12, %ymm15
1210	vaddpd			%ymm6, %ymm15, %ymm6
1211
1212	subl	$4, %r10d
1213	addq	$32, %r12
1214	addq	$32, %r15
1215	addq	$128, %r11
1216
1217	jmp		2f // return
1218
1219
12204: // consider clean1-up loop
1221
1222	cmpl	$0, %r10d
1223	jle		2f // return
1224
1225	// clean-up loop
12263: // clean up loop
1227
1228	// unroll 0
1229	vmovupd			0(%r11), %ymm13 // A
1230	vbroadcastsd	0(%r12), %ymm12 // B
1231	vmulpd			%ymm13, %ymm12, %ymm15
1232	vaddpd			%ymm0, %ymm15, %ymm0
1233	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1234	vmulpd			%ymm13, %ymm12, %ymm15
1235	vaddpd			%ymm1, %ymm15, %ymm1
1236	vbroadcastsd	0(%r15), %ymm12 // B
1237	vmulpd			%ymm13, %ymm12, %ymm15
1238	vaddpd			%ymm2, %ymm15, %ymm2
1239
1240	subl	$1, %r10d
1241	addq	$8, %r12
1242	addq	$8, %r15
1243	addq	$32, %r11
1244
1245	cmpl	$0, %r10d
1246	jg		3b // clean up loop
1247
1248
12492: // return
1250
1251	vaddpd			%ymm4, %ymm0, %ymm0
1252	vaddpd			%ymm5, %ymm1, %ymm1
1253	vaddpd			%ymm6, %ymm2, %ymm2
1254
12555: // return
1256
1257#if MACRO_LEVEL>=2
1258	.endm
1259#else
1260	ret
1261
1262	FUN_END(inner_kernel_dgemm_nn_4x3_lib4c)
1263#endif
1264
1265
1266
1267
1268
1269// common inner routine with file scope
1270//
1271// input arguments:
1272// r10d  <- k
1273// r11   <- A
1274// r12   <- B
1275// r13   <- ldb
1276// ymm0  <- [d00 d10 d20 d30]
1277// ymm1  <- [d01 d11 d21 d31]
1278// ymm2  <- [d02 d12 d22 d32]
1279// ymm3  <- [d03 d13 d23 d33]
1280//
1281// output arguments:
1282
1283#if MACRO_LEVEL>=2
1284	.macro INNER_KERNEL_DGEMM_NN_4X2_LIB4C
1285#else
1286	.p2align 4,,15
1287	FUN_START(inner_kernel_dgemm_nn_4x2_lib4c)
1288#endif
1289
1290	cmpl	$0, %r10d
1291	jle		5f // return
1292
1293	movq	%r12, %r15
1294	addq	%r13, %r15
1295	addq	%r13, %r15 // B+2*ldb
1296
1297	// preload
1298
1299	vxorpd			%ymm4, %ymm4, %ymm4
1300	vmovapd			%ymm4, %ymm5
1301
1302	cmpl	$4, %r10d
1303	jle		0f // consider clean-up loop
1304
1305	// main loop
1306	.p2align 3
13071: // main loop
1308
1309//	prefetcht0	0(%r12, %r13, 2) // software prefetch
1310//	prefetcht0	64(%r12, %r13, 2) // software prefetch
1311
1312	// unroll 0
1313	vmovupd			0(%r11), %ymm13 // A
1314	vbroadcastsd	0(%r12), %ymm12 // B
1315	vmulpd			%ymm13, %ymm12, %ymm15
1316	vaddpd			%ymm0, %ymm15, %ymm0
1317	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1318	vmulpd			%ymm13, %ymm12, %ymm15
1319	vaddpd			%ymm1, %ymm15, %ymm1
1320
1321	// unroll 1
1322	vmovupd			32(%r11), %ymm13 // A
1323	vbroadcastsd	8(%r12), %ymm12 // B
1324	vmulpd			%ymm13, %ymm12, %ymm15
1325	vaddpd			%ymm4, %ymm15, %ymm4
1326	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
1327	vmulpd			%ymm13, %ymm12, %ymm15
1328	vaddpd			%ymm5, %ymm15, %ymm5
1329
1330	// unroll 2
1331	vmovupd			64(%r11), %ymm13 // A
1332	vbroadcastsd	16(%r12), %ymm12 // B
1333	vmulpd			%ymm13, %ymm12, %ymm15
1334	vaddpd			%ymm0, %ymm15, %ymm0
1335	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
1336	vmulpd			%ymm13, %ymm12, %ymm15
1337	vaddpd			%ymm1, %ymm15, %ymm1
1338
1339	// unroll 3
1340	vmovupd			96(%r11), %ymm13 // A
1341	vbroadcastsd	24(%r12), %ymm12 // B
1342	vmulpd			%ymm13, %ymm12, %ymm15
1343	vaddpd			%ymm4, %ymm15, %ymm4
1344	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
1345	vmulpd			%ymm13, %ymm12, %ymm15
1346	vaddpd			%ymm5, %ymm15, %ymm5
1347
1348	subl	$4, %r10d
1349	addq	$32, %r12
1350	addq	$32, %r15
1351	addq	$128, %r11
1352
1353	cmpl	$4, %r10d
1354	jg		1b // main loop
1355
1356
13570: // consider clean4-up
1358
1359	cmpl	$3, %r10d
1360	jle		4f // clean1
1361
1362	// unroll 0
1363	vmovupd			0(%r11), %ymm13 // A
1364	vbroadcastsd	0(%r12), %ymm12 // B
1365	vmulpd			%ymm13, %ymm12, %ymm15
1366	vaddpd			%ymm0, %ymm15, %ymm0
1367	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1368	vmulpd			%ymm13, %ymm12, %ymm15
1369	vaddpd			%ymm1, %ymm15, %ymm1
1370
1371	// unroll 1
1372	vmovupd			32(%r11), %ymm13 // A
1373	vbroadcastsd	8(%r12), %ymm12 // B
1374	vmulpd			%ymm13, %ymm12, %ymm15
1375	vaddpd			%ymm4, %ymm15, %ymm4
1376	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
1377	vmulpd			%ymm13, %ymm12, %ymm15
1378	vaddpd			%ymm5, %ymm15, %ymm5
1379
1380	// unroll 2
1381	vmovupd			64(%r11), %ymm13 // A
1382	vbroadcastsd	16(%r12), %ymm12 // B
1383	vmulpd			%ymm13, %ymm12, %ymm15
1384	vaddpd			%ymm0, %ymm15, %ymm0
1385	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
1386	vmulpd			%ymm13, %ymm12, %ymm15
1387	vaddpd			%ymm1, %ymm15, %ymm1
1388
1389	// unroll 3
1390	vmovupd			96(%r11), %ymm13 // A
1391	vbroadcastsd	24(%r12), %ymm12 // B
1392	vmulpd			%ymm13, %ymm12, %ymm15
1393	vaddpd			%ymm4, %ymm15, %ymm4
1394	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
1395	vmulpd			%ymm13, %ymm12, %ymm15
1396	vaddpd			%ymm5, %ymm15, %ymm5
1397
1398	subl	$4, %r10d
1399	addq	$32, %r12
1400	addq	$32, %r15
1401	addq	$128, %r11
1402
1403	jmp		2f // return
1404
1405
14064: // consider clean1-up loop
1407
1408	cmpl	$0, %r10d
1409	jle		2f // return
1410
1411	// clean-up loop
14123: // clean up loop
1413
1414	// unroll 0
1415	vmovupd			0(%r11), %ymm13 // A
1416	vbroadcastsd	0(%r12), %ymm12 // B
1417	vmulpd			%ymm13, %ymm12, %ymm15
1418	vaddpd			%ymm0, %ymm15, %ymm0
1419	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1420	vmulpd			%ymm13, %ymm12, %ymm15
1421	vaddpd			%ymm1, %ymm15, %ymm1
1422
1423	subl	$1, %r10d
1424	addq	$8, %r12
1425	addq	$8, %r15
1426	addq	$32, %r11
1427
1428	cmpl	$0, %r10d
1429	jg		3b // clean up loop
1430
1431
14322: // return
1433
1434	vaddpd			%ymm4, %ymm0, %ymm0
1435	vaddpd			%ymm5, %ymm1, %ymm1
1436
14375: // return
1438
1439#if MACRO_LEVEL>=2
1440	.endm
1441#else
1442	ret
1443
1444	FUN_END(inner_kernel_dgemm_nn_4x2_lib4c)
1445#endif
1446
1447
1448
1449
1450
1451// common inner routine with file scope
1452//
1453// input arguments:
1454// r10d  <- k
1455// r11   <- A
1456// r12   <- B
1457// r13   <- ldb
1458// ymm0  <- [d00 d10 d20 d30]
1459// ymm1  <- [d01 d11 d21 d31]
1460// ymm2  <- [d02 d12 d22 d32]
1461// ymm3  <- [d03 d13 d23 d33]
1462//
1463// output arguments:
1464
1465#if MACRO_LEVEL>=2
1466	.macro INNER_KERNEL_DGEMM_NN_4X1_LIB4C
1467#else
1468	.p2align 4,,15
1469	FUN_START(inner_kernel_dgemm_nn_4x1_lib4c)
1470#endif
1471
1472	cmpl	$0, %r10d
1473	jle		5f // return
1474
1475	movq	%r12, %r15
1476	addq	%r13, %r15
1477	addq	%r13, %r15 // B+2*ldb
1478
1479	// preload
1480
1481	vxorpd			%ymm4, %ymm4, %ymm4
1482
1483	cmpl	$4, %r10d
1484	jle		0f // consider clean-up loop
1485
1486	// main loop
1487	.p2align 3
14881: // main loop
1489
1490//	prefetcht0	0(%r12, %r13, 2) // software prefetch
1491//	prefetcht0	64(%r12, %r13, 2) // software prefetch
1492
1493	// unroll 0
1494	vmovupd			0(%r11), %ymm13 // A
1495	vbroadcastsd	0(%r12), %ymm12 // B
1496	vmulpd			%ymm13, %ymm12, %ymm15
1497	vaddpd			%ymm0, %ymm15, %ymm0
1498
1499	// unroll 1
1500	vmovupd			32(%r11), %ymm13 // A
1501	vbroadcastsd	8(%r12), %ymm12 // B
1502	vmulpd			%ymm13, %ymm12, %ymm15
1503	vaddpd			%ymm4, %ymm15, %ymm4
1504
1505	// unroll 2
1506	vmovupd			64(%r11), %ymm13 // A
1507	vbroadcastsd	16(%r12), %ymm12 // B
1508	vmulpd			%ymm13, %ymm12, %ymm15
1509	vaddpd			%ymm0, %ymm15, %ymm0
1510
1511	// unroll 3
1512	vmovupd			96(%r11), %ymm13 // A
1513	vbroadcastsd	24(%r12), %ymm12 // B
1514	vmulpd			%ymm13, %ymm12, %ymm15
1515	vaddpd			%ymm4, %ymm15, %ymm4
1516
1517	subl	$4, %r10d
1518	addq	$32, %r12
1519	addq	$32, %r15
1520	addq	$128, %r11
1521
1522	cmpl	$4, %r10d
1523	jg		1b // main loop
1524
1525
15260: // consider clean4-up
1527
1528	cmpl	$3, %r10d
1529	jle		4f // clean1
1530
1531	// unroll 0
1532	vmovupd			0(%r11), %ymm13 // A
1533	vbroadcastsd	0(%r12), %ymm12 // B
1534	vmulpd			%ymm13, %ymm12, %ymm15
1535	vaddpd			%ymm0, %ymm15, %ymm0
1536
1537	// unroll 1
1538	vmovupd			32(%r11), %ymm13 // A
1539	vbroadcastsd	8(%r12), %ymm12 // B
1540	vmulpd			%ymm13, %ymm12, %ymm15
1541	vaddpd			%ymm4, %ymm15, %ymm4
1542
1543	// unroll 2
1544	vmovupd			64(%r11), %ymm13 // A
1545	vbroadcastsd	16(%r12), %ymm12 // B
1546	vmulpd			%ymm13, %ymm12, %ymm15
1547	vaddpd			%ymm0, %ymm15, %ymm0
1548
1549	// unroll 3
1550	vmovupd			96(%r11), %ymm13 // A
1551	vbroadcastsd	24(%r12), %ymm12 // B
1552	vmulpd			%ymm13, %ymm12, %ymm15
1553	vaddpd			%ymm4, %ymm15, %ymm4
1554
1555	subl	$4, %r10d
1556	addq	$32, %r12
1557	addq	$32, %r15
1558	addq	$128, %r11
1559
1560	jmp		2f // return
1561
1562
15634: // consider clean1-up loop
1564
1565	cmpl	$0, %r10d
1566	jle		2f // return
1567
1568	// clean-up loop
15693: // clean up loop
1570
1571	// unroll 0
1572	vmovupd			0(%r11), %ymm13 // A
1573	vbroadcastsd	0(%r12), %ymm12 // B
1574	vmulpd			%ymm13, %ymm12, %ymm15
1575	vaddpd			%ymm0, %ymm15, %ymm0
1576
1577	subl	$1, %r10d
1578	addq	$8, %r12
1579	addq	$8, %r15
1580	addq	$32, %r11
1581
1582	cmpl	$0, %r10d
1583	jg		3b // clean up loop
1584
1585
15862: // return
1587
1588	vaddpd			%ymm4, %ymm0, %ymm0
1589
15905: // return
1591
1592#if MACRO_LEVEL>=2
1593	.endm
1594#else
1595	ret
1596
1597	FUN_END(inner_kernel_dgemm_nn_4x1_lib4c)
1598#endif
1599
1600
1601
1602
1603
1604// common inner routine with file scope
1605//
1606// edge for B lower triangular
1607//
1608// input arguments:
1609// r10d  <- k
1610// r11   <- A
1611// r12   <- B
1612// r13   <- ldb
1613// ymm0  <- [d00 d10 d20 d30]
1614// ymm1  <- [d01 d11 d21 d31]
1615// ymm2  <- [d02 d12 d22 d32]
1616// ymm3  <- [d03 d13 d23 d33]
1617//
1618// output arguments:
1619
1620#if MACRO_LEVEL>=1
1621	.macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4C
1622#else
1623	.p2align 4,,15
1624	FUN_START(inner_edge_dtrmm_nn_rl_4x4_lib4c)
1625#endif
1626
1627	movq	%r12, %r15
1628	addq	%r13, %r15
1629	addq	%r13, %r15 // B+2*ldb
1630
1631	// unroll 0
1632	vmovupd			0(%r11), %ymm13 // A
1633	vbroadcastsd	0(%r12), %ymm12 // B
1634	vmulpd			%ymm13, %ymm12, %ymm15
1635	vaddpd			%ymm0, %ymm15, %ymm0
1636
1637	// unroll 1
1638	vmovupd			32(%r11), %ymm13 // A
1639	vbroadcastsd	8(%r12), %ymm12 // B
1640	vmulpd			%ymm13, %ymm12, %ymm15
1641	vaddpd			%ymm0, %ymm15, %ymm0
1642	vbroadcastsd	8(%r12, %r13, 1), %ymm12 // B
1643	vmulpd			%ymm13, %ymm12, %ymm15
1644	vaddpd			%ymm1, %ymm15, %ymm1
1645
1646	// unroll 2
1647	vmovupd			64(%r11), %ymm13 // A
1648	vbroadcastsd	16(%r12), %ymm12 // B
1649	vmulpd			%ymm13, %ymm12, %ymm15
1650	vaddpd			%ymm0, %ymm15, %ymm0
1651	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
1652	vmulpd			%ymm13, %ymm12, %ymm15
1653	vaddpd			%ymm1, %ymm15, %ymm1
1654	vbroadcastsd	16(%r15), %ymm12 // B
1655	vmulpd			%ymm13, %ymm12, %ymm15
1656	vaddpd			%ymm2, %ymm15, %ymm2
1657
1658	// unroll 3
1659	vmovupd			96(%r11), %ymm13 // A
1660	vbroadcastsd	24(%r12), %ymm12 // B
1661	vmulpd			%ymm13, %ymm12, %ymm15
1662	vaddpd			%ymm0, %ymm15, %ymm0
1663	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
1664	vmulpd			%ymm13, %ymm12, %ymm15
1665	vaddpd			%ymm1, %ymm15, %ymm1
1666	vbroadcastsd	24(%r15), %ymm12 // B
1667	vmulpd			%ymm13, %ymm12, %ymm15
1668	vaddpd			%ymm2, %ymm15, %ymm2
1669	vbroadcastsd	24(%r15, %r13, 1), %ymm12 // B
1670	vmulpd			%ymm13, %ymm12, %ymm15
1671	vaddpd			%ymm3, %ymm15, %ymm3
1672
1673	subl	$4, %r10d
1674	addq	$32, %r12
1675	addq	$32, %r15
1676	addq	$128, %r11
1677
1678#if MACRO_LEVEL>=1
1679	.endm
1680#else
1681	ret
1682
1683	FUN_END(inner_edge_dtrmm_nn_rl_4x4_lib4c)
1684#endif
1685
1686
1687
1688
1689
1690// common inner routine with file scope
1691//
1692// edge for B lower triangular
1693//
1694// input arguments:
1695// r10d  <- k
1696// r11   <- A
1697// r12   <- B
1698// r13   <- ldb
1699// ymm0  <- [d00 d10 d20 d30]
1700// ymm1  <- [d01 d11 d21 d31]
1701// ymm2  <- [d02 d12 d22 d32]
1702// ymm3  <- [d03 d13 d23 d33]
1703//
1704// output arguments:
1705
1706#if MACRO_LEVEL>=1
1707	.macro INNER_EDGE_DTRMM_NN_RL_4X4_VS_LIB4C
1708#else
1709	.p2align 4,,15
1710	FUN_START(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c)
1711#endif
1712
1713	cmpl	$0, %r10d
1714	jle		0f // end
1715
1716	movq	%r12, %r15
1717	addq	%r13, %r15
1718	addq	%r13, %r15 // B+2*ldb
1719
1720	// unroll 0
1721	vmovupd			0(%r11), %ymm13 // A
1722	vbroadcastsd	0(%r12), %ymm12 // B
1723	vmulpd			%ymm13, %ymm12, %ymm15
1724	vaddpd			%ymm0, %ymm15, %ymm0
1725
1726	subl	$1, %r10d
1727	addq	$8, %r12
1728	addq	$8, %r15
1729	addq	$32, %r11
1730
1731	cmpl	$0, %r10d
1732	jle		0f // end
1733
1734	// unroll 1
1735	vmovupd			0(%r11), %ymm13 // A
1736	vbroadcastsd	0(%r12), %ymm12 // B
1737	vmulpd			%ymm13, %ymm12, %ymm15
1738	vaddpd			%ymm0, %ymm15, %ymm0
1739	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1740	vmulpd			%ymm13, %ymm12, %ymm15
1741	vaddpd			%ymm1, %ymm15, %ymm1
1742
1743	subl	$1, %r10d
1744	addq	$8, %r12
1745	addq	$8, %r15
1746	addq	$32, %r11
1747
1748	cmpl	$0, %r10d
1749	jle		0f // end
1750
1751	// unroll 2
1752	vmovupd			0(%r11), %ymm13 // A
1753	vbroadcastsd	0(%r12), %ymm12 // B
1754	vmulpd			%ymm13, %ymm12, %ymm15
1755	vaddpd			%ymm0, %ymm15, %ymm0
1756	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1757	vmulpd			%ymm13, %ymm12, %ymm15
1758	vaddpd			%ymm1, %ymm15, %ymm1
1759	vbroadcastsd	0(%r15), %ymm12 // B
1760	vmulpd			%ymm13, %ymm12, %ymm15
1761	vaddpd			%ymm2, %ymm15, %ymm2
1762
1763	subl	$1, %r10d
1764	addq	$8, %r12
1765	addq	$8, %r15
1766	addq	$32, %r11
1767
1768	cmpl	$0, %r10d
1769	jle		0f // end
1770
1771	// unroll 3
1772	vmovupd			0(%r11), %ymm13 // A
1773	vbroadcastsd	0(%r12), %ymm12 // B
1774	vmulpd			%ymm13, %ymm12, %ymm15
1775	vaddpd			%ymm0, %ymm15, %ymm0
1776	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1777	vmulpd			%ymm13, %ymm12, %ymm15
1778	vaddpd			%ymm1, %ymm15, %ymm1
1779	vbroadcastsd	0(%r15), %ymm12 // B
1780	vmulpd			%ymm13, %ymm12, %ymm15
1781	vaddpd			%ymm2, %ymm15, %ymm2
1782	vbroadcastsd	0(%r15, %r13, 1), %ymm12 // B
1783	vmulpd			%ymm13, %ymm12, %ymm15
1784	vaddpd			%ymm3, %ymm15, %ymm3
1785
1786	subl	$1, %r10d
1787	addq	$8, %r12
1788	addq	$8, %r15
1789	addq	$32, %r11
1790
17910:
1792
1793#if MACRO_LEVEL>=1
1794	.endm
1795#else
1796	ret
1797
1798	FUN_END(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c)
1799#endif
1800
1801
1802
1803
1804
1805// common inner routine with file scope
1806//
1807// edge for B lower triangular
1808//
1809// input arguments:
1810// r10d  <- k
1811// r11   <- A
1812// r12   <- B
1813// r13   <- ldb
1814// ymm0  <- [d00 d10 d20 d30]
1815// ymm1  <- [d01 d11 d21 d31]
1816// ymm2  <- [d02 d12 d22 d32]
1817// ymm3  <- [d03 d13 d23 d33]
1818//
1819// output arguments:
1820
1821#if MACRO_LEVEL>=1
1822	.macro INNER_EDGE_DTRMM_NN_RL_ONE_4X4_LIB4C
1823#else
1824	.p2align 4,,15
1825	FUN_START(inner_edge_dtrmm_nn_rl_one_4x4_lib4c)
1826#endif
1827
1828	movq	%r12, %r15
1829	addq	%r13, %r15
1830	addq	%r13, %r15 // B+2*ldb
1831
1832	// unroll 0
1833	vmovupd			0(%r11), %ymm13 // A
1834	vaddpd			%ymm0, %ymm13, %ymm0
1835
1836	// unroll 1
1837	vmovupd			32(%r11), %ymm13 // A
1838	vbroadcastsd	8(%r12), %ymm12 // B
1839	vmulpd			%ymm13, %ymm12, %ymm15
1840	vaddpd			%ymm0, %ymm15, %ymm0
1841	vaddpd			%ymm1, %ymm13, %ymm1
1842
1843	// unroll 2
1844	vmovupd			64(%r11), %ymm13 // A
1845	vbroadcastsd	16(%r12), %ymm12 // B
1846	vmulpd			%ymm13, %ymm12, %ymm15
1847	vaddpd			%ymm0, %ymm15, %ymm0
1848	vbroadcastsd	16(%r12, %r13, 1), %ymm12 // B
1849	vmulpd			%ymm13, %ymm12, %ymm15
1850	vaddpd			%ymm1, %ymm15, %ymm1
1851	vaddpd			%ymm2, %ymm13, %ymm2
1852
1853	// unroll 3
1854	vmovupd			96(%r11), %ymm13 // A
1855	vbroadcastsd	24(%r12), %ymm12 // B
1856	vmulpd			%ymm13, %ymm12, %ymm15
1857	vaddpd			%ymm0, %ymm15, %ymm0
1858	vbroadcastsd	24(%r12, %r13, 1), %ymm12 // B
1859	vmulpd			%ymm13, %ymm12, %ymm15
1860	vaddpd			%ymm1, %ymm15, %ymm1
1861	vbroadcastsd	24(%r15), %ymm12 // B
1862	vmulpd			%ymm13, %ymm12, %ymm15
1863	vaddpd			%ymm2, %ymm15, %ymm2
1864	vaddpd			%ymm3, %ymm13, %ymm3
1865
1866	subl	$4, %r10d
1867	addq	$32, %r12
1868	addq	$32, %r15
1869	addq	$128, %r11
1870
1871#if MACRO_LEVEL>=1
1872	.endm
1873#else
1874	ret
1875
1876	FUN_END(inner_edge_dtrmm_nn_rl_one_4x4_lib4c)
1877#endif
1878
1879
1880
1881
1882
1883// common inner routine with file scope
1884//
1885// edge for B lower triangular
1886//
1887// input arguments:
1888// r10d  <- k
1889// r11   <- A
1890// r12   <- B
1891// r13   <- ldb
1892// ymm0  <- [d00 d10 d20 d30]
1893// ymm1  <- [d01 d11 d21 d31]
1894// ymm2  <- [d02 d12 d22 d32]
1895// ymm3  <- [d03 d13 d23 d33]
1896//
1897// output arguments:
1898
1899#if MACRO_LEVEL>=1
1900	.macro INNER_EDGE_DTRMM_NN_RL_ONE_4X4_VS_LIB4C
1901#else
1902	.p2align 4,,15
1903	FUN_START(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c)
1904#endif
1905
1906	cmpl	$0, %r10d
1907	jle		0f // end
1908
1909	movq	%r12, %r15
1910	addq	%r13, %r15
1911	addq	%r13, %r15 // B+2*ldb
1912
1913	// unroll 0
1914	vmovupd			0(%r11), %ymm13 // A
1915	vaddpd			%ymm0, %ymm13, %ymm0
1916
1917	subl	$1, %r10d
1918	addq	$8, %r12
1919	addq	$8, %r15
1920	addq	$32, %r11
1921
1922	cmpl	$0, %r10d
1923	jle		0f // end
1924
1925	// unroll 1
1926	vmovupd			0(%r11), %ymm13 // A
1927	vbroadcastsd	0(%r12), %ymm12 // B
1928	vmulpd			%ymm13, %ymm12, %ymm15
1929	vaddpd			%ymm0, %ymm15, %ymm0
1930	vaddpd			%ymm1, %ymm13, %ymm1
1931
1932	subl	$1, %r10d
1933	addq	$8, %r12
1934	addq	$8, %r15
1935	addq	$32, %r11
1936
1937	cmpl	$0, %r10d
1938	jle		0f // end
1939
1940	// unroll 2
1941	vmovupd			0(%r11), %ymm13 // A
1942	vbroadcastsd	0(%r12), %ymm12 // B
1943	vmulpd			%ymm13, %ymm12, %ymm15
1944	vaddpd			%ymm0, %ymm15, %ymm0
1945	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1946	vmulpd			%ymm13, %ymm12, %ymm15
1947	vaddpd			%ymm1, %ymm15, %ymm1
1948	vaddpd			%ymm2, %ymm13, %ymm2
1949
1950	subl	$1, %r10d
1951	addq	$8, %r12
1952	addq	$8, %r15
1953	addq	$32, %r11
1954
1955	cmpl	$0, %r10d
1956	jle		0f // end
1957
1958	// unroll 3
1959	vmovupd			0(%r11), %ymm13 // A
1960	vbroadcastsd	0(%r12), %ymm12 // B
1961	vmulpd			%ymm13, %ymm12, %ymm15
1962	vaddpd			%ymm0, %ymm15, %ymm0
1963	vbroadcastsd	0(%r12, %r13, 1), %ymm12 // B
1964	vmulpd			%ymm13, %ymm12, %ymm15
1965	vaddpd			%ymm1, %ymm15, %ymm1
1966	vbroadcastsd	0(%r15), %ymm12 // B
1967	vmulpd			%ymm13, %ymm12, %ymm15
1968	vaddpd			%ymm2, %ymm15, %ymm2
1969	vaddpd			%ymm3, %ymm13, %ymm3
1970
1971	subl	$1, %r10d
1972	addq	$8, %r12
1973	addq	$8, %r15
1974	addq	$32, %r11
1975
19760:
1977
1978#if MACRO_LEVEL>=1
1979	.endm
1980#else
1981	ret
1982
1983	FUN_END(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c)
1984#endif
1985
1986
1987
1988
1989
1990// common inner routine with file scope
1991//
1992// edge for B upper triangular
1993//
1994// input arguments:
1995// r10d  <- k
1996// r11   <- A
1997// r12   <- B
1998// r13   <- ldb
1999// ymm0  <- [d00 d10 d20 d30]
2000// ymm1  <- [d01 d11 d21 d31]
2001// ymm2  <- [d02 d12 d22 d32]
2002// ymm3  <- [d03 d13 d23 d33]
2003//
2004// output arguments:
2005
2006#if MACRO_LEVEL>=1
2007	.macro INNER_EDGE_DTRMM_NN_RU_4X4_LIB4C
2008#else
2009	.p2align 4,,15
2010	FUN_START(inner_edge_dtrmm_nn_ru_4x4_lib4c)
2011#endif
2012
2013	movq	%r12, %r15
2014	addq	%r13, %r15
2015	addq	%r13, %r15 // B+2*ldb
2016
2017	// unroll 0
2018	vmovupd			0(%r11), %ymm13 // A
2019	vbroadcastsd	0(%r12), %ymm12 // B
2020	vmulpd			%ymm13, %ymm12, %ymm15
2021	vaddpd			%ymm0, %ymm15, %ymm0
2022	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2023	vmulpd			%ymm13, %ymm12, %ymm15
2024	vaddpd			%ymm1, %ymm15, %ymm1
2025	vbroadcastsd	0(%r15), %ymm12 // B
2026	vmulpd			%ymm13, %ymm12, %ymm15
2027	vaddpd			%ymm2, %ymm15, %ymm2
2028	vbroadcastsd	0(%r15, %r13), %ymm12 // B
2029	vmulpd			%ymm13, %ymm12, %ymm15
2030	vaddpd			%ymm3, %ymm15, %ymm3
2031
2032	// unroll 1
2033	vmovupd			32(%r11), %ymm13 // A
2034	vbroadcastsd	8(%r12, %r13), %ymm12 // B
2035	vmulpd			%ymm13, %ymm12, %ymm15
2036	vaddpd			%ymm1, %ymm15, %ymm1
2037	vbroadcastsd	8(%r15), %ymm12 // B
2038	vmulpd			%ymm13, %ymm12, %ymm15
2039	vaddpd			%ymm2, %ymm15, %ymm2
2040	vbroadcastsd	8(%r15, %r13), %ymm12 // B
2041	vmulpd			%ymm13, %ymm12, %ymm15
2042	vaddpd			%ymm3, %ymm15, %ymm3
2043
2044	// unroll 2
2045	vmovupd			64(%r11), %ymm13 // A
2046	vbroadcastsd	16(%r15), %ymm12 // B
2047	vmulpd			%ymm13, %ymm12, %ymm15
2048	vaddpd			%ymm2, %ymm15, %ymm2
2049	vbroadcastsd	16(%r15, %r13), %ymm12 // B
2050	vmulpd			%ymm13, %ymm12, %ymm15
2051	vaddpd			%ymm3, %ymm15, %ymm3
2052
2053	// unroll 3
2054	vmovupd			96(%r11), %ymm13 // A
2055	vbroadcastsd	24(%r15, %r13), %ymm12 // B
2056	vmulpd			%ymm13, %ymm12, %ymm15
2057	vaddpd			%ymm3, %ymm15, %ymm3
2058
2059	subl	$4, %r10d
2060	addq	$32, %r12
2061	addq	$32, %r15
2062	addq	$128, %r11
2063
2064#if MACRO_LEVEL>=1
2065	.endm
2066#else
2067	ret
2068
2069	FUN_END(inner_edge_dtrmm_nn_ru_4x4_lib4c)
2070#endif
2071
2072
2073
2074
2075
2076// common inner routine with file scope
2077//
2078// edge for B upper triangular
2079//
2080// input arguments:
2081// r10d  <- k
2082// r11   <- A
2083// r12   <- B
2084// r13   <- ldb
2085// r14   <- n1
2086// ymm0  <- [d00 d10 d20 d30]
2087// ymm1  <- [d01 d11 d21 d31]
2088// ymm2  <- [d02 d12 d22 d32]
2089// ymm3  <- [d03 d13 d23 d33]
2090//
2091// output arguments:
2092
2093#if MACRO_LEVEL>=1
2094	.macro INNER_EDGE_DTRMM_NN_RU_4X4_VS_LIB4C
2095#else
2096	.p2align 4,,15
2097	FUN_START(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c)
2098#endif
2099
2100	cmpl	$0, %r14d
2101	jle		0f // end
2102
2103	movq	%r12, %r15
2104	addq	%r13, %r15
2105	addq	%r13, %r15 // B+2*ldb
2106
2107	cmpl	$4, %r14d
2108	jl		1f // end
2109
2110	// unroll 0
2111	vmovupd			0(%r11), %ymm13 // A
2112	vbroadcastsd	0(%r12), %ymm12 // B
2113	vmulpd			%ymm13, %ymm12, %ymm15
2114	vaddpd			%ymm0, %ymm15, %ymm0
2115	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2116	vmulpd			%ymm13, %ymm12, %ymm15
2117	vaddpd			%ymm1, %ymm15, %ymm1
2118	vbroadcastsd	0(%r15), %ymm12 // B
2119	vmulpd			%ymm13, %ymm12, %ymm15
2120	vaddpd			%ymm2, %ymm15, %ymm2
2121	vbroadcastsd	0(%r15, %r13), %ymm12 // B
2122	vmulpd			%ymm13, %ymm12, %ymm15
2123	vaddpd			%ymm3, %ymm15, %ymm3
2124
2125	// unroll 1
2126	vmovupd			32(%r11), %ymm13 // A
2127	vbroadcastsd	8(%r12, %r13), %ymm12 // B
2128	vmulpd			%ymm13, %ymm12, %ymm15
2129	vaddpd			%ymm1, %ymm15, %ymm1
2130	vbroadcastsd	8(%r15), %ymm12 // B
2131	vmulpd			%ymm13, %ymm12, %ymm15
2132	vaddpd			%ymm2, %ymm15, %ymm2
2133	vbroadcastsd	8(%r15, %r13), %ymm12 // B
2134	vmulpd			%ymm13, %ymm12, %ymm15
2135	vaddpd			%ymm3, %ymm15, %ymm3
2136
2137	// unroll 2
2138	vmovupd			64(%r11), %ymm13 // A
2139	vbroadcastsd	16(%r15), %ymm12 // B
2140	vmulpd			%ymm13, %ymm12, %ymm15
2141	vaddpd			%ymm2, %ymm15, %ymm2
2142	vbroadcastsd	16(%r15, %r13), %ymm12 // B
2143	vmulpd			%ymm13, %ymm12, %ymm15
2144	vaddpd			%ymm3, %ymm15, %ymm3
2145
2146	// unroll 3
2147	vmovupd			96(%r11), %ymm13 // A
2148	vbroadcastsd	24(%r15, %r13), %ymm12 // B
2149	vmulpd			%ymm13, %ymm12, %ymm15
2150	vaddpd			%ymm3, %ymm15, %ymm3
2151
2152	subl	$4, %r10d
2153	addq	$32, %r12
2154	addq	$32, %r15
2155	addq	$128, %r11
2156
2157	jmp		0f
2158
21591:
2160
2161	cmpl	$3, %r14d
2162	jl		2f // end
2163
2164	// unroll 0
2165	vmovupd			0(%r11), %ymm13 // A
2166	vbroadcastsd	0(%r12), %ymm12 // B
2167	vmulpd			%ymm13, %ymm12, %ymm15
2168	vaddpd			%ymm0, %ymm15, %ymm0
2169	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2170	vmulpd			%ymm13, %ymm12, %ymm15
2171	vaddpd			%ymm1, %ymm15, %ymm1
2172	vbroadcastsd	0(%r15), %ymm12 // B
2173	vmulpd			%ymm13, %ymm12, %ymm15
2174	vaddpd			%ymm2, %ymm15, %ymm2
2175
2176	// unroll 1
2177	vmovupd			32(%r11), %ymm13 // A
2178	vbroadcastsd	8(%r12, %r13), %ymm12 // B
2179	vmulpd			%ymm13, %ymm12, %ymm15
2180	vaddpd			%ymm1, %ymm15, %ymm1
2181	vbroadcastsd	8(%r15), %ymm12 // B
2182	vmulpd			%ymm13, %ymm12, %ymm15
2183	vaddpd			%ymm2, %ymm15, %ymm2
2184
2185	// unroll 2
2186	vmovupd			64(%r11), %ymm13 // A
2187	vbroadcastsd	16(%r15), %ymm12 // B
2188	vmulpd			%ymm13, %ymm12, %ymm15
2189	vaddpd			%ymm2, %ymm15, %ymm2
2190
2191	// unroll 3
2192
2193	subl	$3, %r10d
2194	addq	$24, %r12
2195	addq	$24, %r15
2196	addq	$96, %r11
2197
2198	jmp		0f
2199
22002:
2201
2202	cmpl	$2, %r14d
2203	jl		3f // end
2204
2205	// unroll 0
2206	vmovupd			0(%r11), %ymm13 // A
2207	vbroadcastsd	0(%r12), %ymm12 // B
2208	vmulpd			%ymm13, %ymm12, %ymm15
2209	vaddpd			%ymm0, %ymm15, %ymm0
2210	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2211	vmulpd			%ymm13, %ymm12, %ymm15
2212	vaddpd			%ymm1, %ymm15, %ymm1
2213
2214	// unroll 1
2215	vmovupd			32(%r11), %ymm13 // A
2216	vbroadcastsd	8(%r12, %r13), %ymm12 // B
2217	vmulpd			%ymm13, %ymm12, %ymm15
2218	vaddpd			%ymm1, %ymm15, %ymm1
2219
2220	// unroll 2
2221
2222	// unroll 3
2223
2224	subl	$2, %r10d
2225	addq	$16, %r12
2226	addq	$16, %r15
2227	addq	$64, %r11
2228
2229	jmp		0f
2230
22313:
2232
2233//	cmpl	$1, %r14d
2234//	jl		0f // end
2235
2236	// unroll 0
2237	vmovupd			0(%r11), %ymm13 // A
2238	vbroadcastsd	0(%r12), %ymm12 // B
2239	vmulpd			%ymm13, %ymm12, %ymm15
2240	vaddpd			%ymm0, %ymm15, %ymm0
2241
2242	// unroll 1
2243
2244	// unroll 2
2245
2246	// unroll 3
2247
2248	subl	$1, %r10d
2249	addq	$8, %r12
2250	addq	$8, %r15
2251	addq	$32, %r11
2252
22530:
2254
2255#if MACRO_LEVEL>=1
2256	.endm
2257#else
2258	ret
2259
2260	FUN_END(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c)
2261#endif
2262
2263
2264
2265
2266
2267// common inner routine with file scope
2268//
2269// edge for B upper triangular
2270//
2271// input arguments:
2272// r10d  <- k
2273// r11   <- A
2274// r12   <- B
2275// r13   <- ldb
2276// ymm0  <- [d00 d10 d20 d30]
2277// ymm1  <- [d01 d11 d21 d31]
2278// ymm2  <- [d02 d12 d22 d32]
2279// ymm3  <- [d03 d13 d23 d33]
2280//
2281// output arguments:
2282
2283#if MACRO_LEVEL>=1
2284	.macro INNER_EDGE_DTRMM_NN_RU_ONE_4X4_LIB4C
2285#else
2286	.p2align 4,,15
2287	FUN_START(inner_edge_dtrmm_nn_ru_one_4x4_lib4c)
2288#endif
2289
2290	movq	%r12, %r15
2291	addq	%r13, %r15
2292	addq	%r13, %r15 // B+2*ldb
2293
2294	// unroll 0
2295	vmovupd			0(%r11), %ymm13 // A
2296	vaddpd			%ymm0, %ymm13, %ymm0
2297	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2298	vmulpd			%ymm13, %ymm12, %ymm15
2299	vaddpd			%ymm1, %ymm15, %ymm1
2300	vbroadcastsd	0(%r15), %ymm12 // B
2301	vmulpd			%ymm13, %ymm12, %ymm15
2302	vaddpd			%ymm2, %ymm15, %ymm2
2303	vbroadcastsd	0(%r15, %r13), %ymm12 // B
2304	vmulpd			%ymm13, %ymm12, %ymm15
2305	vaddpd			%ymm3, %ymm15, %ymm3
2306
2307	// unroll 1
2308	vmovupd			32(%r11), %ymm13 // A
2309	vaddpd			%ymm1, %ymm13, %ymm1
2310	vbroadcastsd	8(%r15), %ymm12 // B
2311	vmulpd			%ymm13, %ymm12, %ymm15
2312	vaddpd			%ymm2, %ymm15, %ymm2
2313	vbroadcastsd	8(%r15, %r13), %ymm12 // B
2314	vmulpd			%ymm13, %ymm12, %ymm15
2315	vaddpd			%ymm3, %ymm15, %ymm3
2316
2317	// unroll 2
2318	vmovupd			64(%r11), %ymm13 // A
2319	vaddpd			%ymm2, %ymm13, %ymm2
2320	vbroadcastsd	16(%r15, %r13), %ymm12 // B
2321	vmulpd			%ymm13, %ymm12, %ymm15
2322	vaddpd			%ymm3, %ymm15, %ymm3
2323
2324	// unroll 3
2325	vmovupd			96(%r11), %ymm13 // A
2326	vaddpd			%ymm3, %ymm13, %ymm3
2327
2328	subl	$4, %r10d
2329	addq	$32, %r12
2330	addq	$32, %r15
2331	addq	$128, %r11
2332
2333#if MACRO_LEVEL>=1
2334	.endm
2335#else
2336	ret
2337
2338	FUN_END(inner_edge_dtrmm_nn_ru_one_4x4_lib4c)
2339#endif
2340
2341
2342
2343
2344
2345// common inner routine with file scope
2346//
2347// edge for B upper triangular
2348//
2349// input arguments:
2350// r10d  <- k
2351// r11   <- A
2352// r12   <- B
2353// r13   <- ldb
2354// r14   <- n1
2355// ymm0  <- [d00 d10 d20 d30]
2356// ymm1  <- [d01 d11 d21 d31]
2357// ymm2  <- [d02 d12 d22 d32]
2358// ymm3  <- [d03 d13 d23 d33]
2359//
2360// output arguments:
2361
2362#if MACRO_LEVEL>=1
2363	.macro INNER_EDGE_DTRMM_NN_RU_ONE_4X4_VS_LIB4C
2364#else
2365	.p2align 4,,15
2366	FUN_START(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c)
2367#endif
2368
2369	cmpl	$0, %r14d
2370	jle		0f // end
2371
2372	movq	%r12, %r15
2373	addq	%r13, %r15
2374	addq	%r13, %r15 // B+2*ldb
2375
2376	cmpl	$4, %r14d
2377	jl		1f // end
2378
2379	// unroll 0
2380	vmovupd			0(%r11), %ymm13 // A
2381	vaddpd			%ymm0, %ymm13, %ymm0
2382	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2383	vmulpd			%ymm13, %ymm12, %ymm15
2384	vaddpd			%ymm1, %ymm15, %ymm1
2385	vbroadcastsd	0(%r15), %ymm12 // B
2386	vmulpd			%ymm13, %ymm12, %ymm15
2387	vaddpd			%ymm2, %ymm15, %ymm2
2388	vbroadcastsd	0(%r15, %r13), %ymm12 // B
2389	vmulpd			%ymm13, %ymm12, %ymm15
2390	vaddpd			%ymm3, %ymm15, %ymm3
2391
2392	// unroll 1
2393	vmovupd			32(%r11), %ymm13 // A
2394	vaddpd			%ymm1, %ymm13, %ymm1
2395	vbroadcastsd	8(%r15), %ymm12 // B
2396	vmulpd			%ymm13, %ymm12, %ymm15
2397	vaddpd			%ymm2, %ymm15, %ymm2
2398	vbroadcastsd	8(%r15, %r13), %ymm12 // B
2399	vmulpd			%ymm13, %ymm12, %ymm15
2400	vaddpd			%ymm3, %ymm15, %ymm3
2401
2402	// unroll 2
2403	vmovupd			64(%r11), %ymm13 // A
2404	vaddpd			%ymm2, %ymm13, %ymm2
2405	vbroadcastsd	16(%r15, %r13), %ymm12 // B
2406	vmulpd			%ymm13, %ymm12, %ymm15
2407	vaddpd			%ymm3, %ymm15, %ymm3
2408
2409	// unroll 3
2410	vmovupd			96(%r11), %ymm13 // A
2411	vaddpd			%ymm3, %ymm13, %ymm3
2412
2413	subl	$4, %r10d
2414	addq	$32, %r12
2415	addq	$32, %r15
2416	addq	$128, %r11
2417
2418	jmp		0f
2419
24201:
2421
2422	cmpl	$3, %r14d
2423	jl		2f // end
2424
2425	// unroll 0
2426	vmovupd			0(%r11), %ymm13 // A
2427	vaddpd			%ymm0, %ymm13, %ymm0
2428	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2429	vmulpd			%ymm13, %ymm12, %ymm15
2430	vaddpd			%ymm1, %ymm15, %ymm1
2431	vbroadcastsd	0(%r15), %ymm12 // B
2432	vmulpd			%ymm13, %ymm12, %ymm15
2433	vaddpd			%ymm2, %ymm15, %ymm2
2434
2435	// unroll 1
2436	vmovupd			32(%r11), %ymm13 // A
2437	vaddpd			%ymm1, %ymm13, %ymm1
2438	vbroadcastsd	8(%r15), %ymm12 // B
2439	vmulpd			%ymm13, %ymm12, %ymm15
2440	vaddpd			%ymm2, %ymm15, %ymm2
2441
2442	// unroll 2
2443	vmovupd			64(%r11), %ymm13 // A
2444	vaddpd			%ymm2, %ymm13, %ymm2
2445
2446	// unroll 3
2447
2448	subl	$3, %r10d
2449	addq	$24, %r12
2450	addq	$24, %r15
2451	addq	$96, %r11
2452
2453	jmp		0f
2454
24552:
2456
2457	cmpl	$2, %r14d
2458	jl		3f // end
2459
2460	// unroll 0
2461	vmovupd			0(%r11), %ymm13 // A
2462	vaddpd			%ymm0, %ymm13, %ymm0
2463	vbroadcastsd	0(%r12, %r13), %ymm12 // B
2464	vmulpd			%ymm13, %ymm12, %ymm15
2465	vaddpd			%ymm1, %ymm15, %ymm1
2466
2467	// unroll 1
2468	vmovupd			32(%r11), %ymm13 // A
2469	vaddpd			%ymm1, %ymm13, %ymm1
2470
2471	// unroll 2
2472
2473	// unroll 3
2474
2475	subl	$2, %r10d
2476	addq	$16, %r12
2477	addq	$16, %r15
2478	addq	$64, %r11
2479
2480	jmp		0f
2481
24823:
2483
2484//	cmpl	$1, %r14d
2485//	jl		0f // end
2486
2487	// unroll 0
2488	vmovupd			0(%r11), %ymm13 // A
2489	vaddpd			%ymm0, %ymm13, %ymm0
2490
2491	// unroll 1
2492
2493	// unroll 2
2494
2495	// unroll 3
2496
2497	subl	$1, %r10d
2498	addq	$8, %r12
2499	addq	$8, %r15
2500	addq	$32, %r11
2501
25020:
2503
2504#if MACRO_LEVEL>=1
2505	.endm
2506#else
2507	ret
2508
2509	FUN_END(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c)
2510#endif
2511
2512
2513
2514
2515
2516// common inner routine with file scope
2517//
2518// edge for B lower triangular
2519//
2520// input arguments:
2521// r10d  <- k
2522// r11   <- A
2523// r12   <- B
2524// r13   <- ldb
2525// ymm0  <- [d00 d10 d20 d30]
2526// ymm1  <- [d01 d11 d21 d31]
2527// ymm2  <- [d02 d12 d22 d32]
2528// ymm3  <- [d03 d13 d23 d33]
2529//
2530// output arguments:
2531
2532#if MACRO_LEVEL>=1
2533	.macro INNER_EDGE_DTRMM_NT_RL_4X4_LIB4C
2534#else
2535	.p2align 4,,15
2536	FUN_START(inner_edge_dtrmm_nt_rl_4x4_lib4c)
2537#endif
2538
2539	// unroll 0
2540	vmovupd			0(%r11), %ymm13 // A
2541	vbroadcastsd	0(%r12), %ymm12 // B
2542	vmulpd			%ymm13, %ymm12, %ymm15
2543	vaddpd			%ymm0, %ymm15, %ymm0
2544	vbroadcastsd	8(%r12), %ymm12 // B
2545	vmulpd			%ymm13, %ymm12, %ymm15
2546	vaddpd			%ymm1, %ymm15, %ymm1
2547	vbroadcastsd	16(%r12), %ymm12 // B
2548	vmulpd			%ymm13, %ymm12, %ymm15
2549	vaddpd			%ymm2, %ymm15, %ymm2
2550	vbroadcastsd	24(%r12), %ymm12 // B
2551	vmulpd			%ymm13, %ymm12, %ymm15
2552	vaddpd			%ymm3, %ymm15, %ymm3
2553	addq	%r13, %r12
2554
2555	// unroll 1
2556	vmovupd			32(%r11), %ymm13 // A
2557	vbroadcastsd	8(%r12), %ymm12 // B
2558	vmulpd			%ymm13, %ymm12, %ymm15
2559	vaddpd			%ymm1, %ymm15, %ymm1
2560	vbroadcastsd	16(%r12), %ymm12 // B
2561	vmulpd			%ymm13, %ymm12, %ymm15
2562	vaddpd			%ymm2, %ymm15, %ymm2
2563	vbroadcastsd	24(%r12), %ymm12 // B
2564	vmulpd			%ymm13, %ymm12, %ymm15
2565	vaddpd			%ymm3, %ymm15, %ymm3
2566	addq	%r13, %r12
2567
2568	// unroll 2
2569	vmovupd			64(%r11), %ymm13 // A
2570	vbroadcastsd	16(%r12), %ymm12 // B
2571	vmulpd			%ymm13, %ymm12, %ymm15
2572	vaddpd			%ymm2, %ymm15, %ymm2
2573	vbroadcastsd	24(%r12), %ymm12 // B
2574	vmulpd			%ymm13, %ymm12, %ymm15
2575	vaddpd			%ymm3, %ymm15, %ymm3
2576	addq	%r13, %r12
2577
2578	// unroll 3
2579	vmovupd			96(%r11), %ymm13 // A
2580	vbroadcastsd	24(%r12), %ymm12 // B
2581	vmulpd			%ymm13, %ymm12, %ymm15
2582	vaddpd			%ymm3, %ymm15, %ymm3
2583	addq	%r13, %r12
2584
2585	subl	$4, %r10d
2586	addq	$128, %r11
2587
2588#if MACRO_LEVEL>=1
2589	.endm
2590#else
2591	ret
2592
2593	FUN_END(inner_edge_dtrmm_nt_rl_4x4_lib4c)
2594#endif
2595
2596
2597
2598
2599
2600// common inner routine with file scope
2601//
2602// edge for B lower triangular
2603//
2604// input arguments:
2605// r10d  <- k
2606// r11   <- A
2607// r12   <- B
2608// r13   <- ldb
2609// r14   <- n1
2610// ymm0  <- [d00 d10 d20 d30]
2611// ymm1  <- [d01 d11 d21 d31]
2612// ymm2  <- [d02 d12 d22 d32]
2613// ymm3  <- [d03 d13 d23 d33]
2614//
2615// output arguments:
2616
2617#if MACRO_LEVEL>=1
2618	.macro INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4C
2619#else
2620	.p2align 4,,15
2621	FUN_START(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c)
2622#endif
2623
2624	cmpl	$0, %r14d
2625	jle		0f // end
2626
2627	cmpl	$4, %r14d
2628	jl		1f // end
2629
2630	// unroll 0
2631	vmovupd			0(%r11), %ymm13 // A
2632	vbroadcastsd	0(%r12), %ymm12 // B
2633	vmulpd			%ymm13, %ymm12, %ymm15
2634	vaddpd			%ymm0, %ymm15, %ymm0
2635	vbroadcastsd	8(%r12), %ymm12 // B
2636	vmulpd			%ymm13, %ymm12, %ymm15
2637	vaddpd			%ymm1, %ymm15, %ymm1
2638	vbroadcastsd	16(%r12), %ymm12 // B
2639	vmulpd			%ymm13, %ymm12, %ymm15
2640	vaddpd			%ymm2, %ymm15, %ymm2
2641	vbroadcastsd	24(%r12), %ymm12 // B
2642	vmulpd			%ymm13, %ymm12, %ymm15
2643	vaddpd			%ymm3, %ymm15, %ymm3
2644	addq	%r13, %r12
2645
2646	// unroll 1
2647	vmovupd			32(%r11), %ymm13 // A
2648	vbroadcastsd	8(%r12), %ymm12 // B
2649	vmulpd			%ymm13, %ymm12, %ymm15
2650	vaddpd			%ymm1, %ymm15, %ymm1
2651	vbroadcastsd	16(%r12), %ymm12 // B
2652	vmulpd			%ymm13, %ymm12, %ymm15
2653	vaddpd			%ymm2, %ymm15, %ymm2
2654	vbroadcastsd	24(%r12), %ymm12 // B
2655	vmulpd			%ymm13, %ymm12, %ymm15
2656	vaddpd			%ymm3, %ymm15, %ymm3
2657	addq	%r13, %r12
2658
2659	// unroll 2
2660	vmovupd			64(%r11), %ymm13 // A
2661	vbroadcastsd	16(%r12), %ymm12 // B
2662	vmulpd			%ymm13, %ymm12, %ymm15
2663	vaddpd			%ymm2, %ymm15, %ymm2
2664	vbroadcastsd	24(%r12), %ymm12 // B
2665	vmulpd			%ymm13, %ymm12, %ymm15
2666	vaddpd			%ymm3, %ymm15, %ymm3
2667	addq	%r13, %r12
2668
2669	// unroll 3
2670	vmovupd			96(%r11), %ymm13 // A
2671	vbroadcastsd	24(%r12), %ymm12 // B
2672	vmulpd			%ymm13, %ymm12, %ymm15
2673	vaddpd			%ymm3, %ymm15, %ymm3
2674	addq	%r13, %r12
2675
2676	subl	$4, %r10d
2677	addq	$128, %r11
2678
2679	jmp		0f
2680
26811:
2682
2683	cmpl	$3, %r14d
2684	jl		2f // end
2685
2686	// unroll 0
2687	vmovupd			0(%r11), %ymm13 // A
2688	vbroadcastsd	0(%r12), %ymm12 // B
2689	vmulpd			%ymm13, %ymm12, %ymm15
2690	vaddpd			%ymm0, %ymm15, %ymm0
2691	vbroadcastsd	8(%r12), %ymm12 // B
2692	vmulpd			%ymm13, %ymm12, %ymm15
2693	vaddpd			%ymm1, %ymm15, %ymm1
2694	vbroadcastsd	16(%r12), %ymm12 // B
2695	vmulpd			%ymm13, %ymm12, %ymm15
2696	vaddpd			%ymm2, %ymm15, %ymm2
2697	addq	%r13, %r12
2698
2699	// unroll 1
2700	vmovupd			32(%r11), %ymm13 // A
2701	vbroadcastsd	8(%r12), %ymm12 // B
2702	vmulpd			%ymm13, %ymm12, %ymm15
2703	vaddpd			%ymm1, %ymm15, %ymm1
2704	vbroadcastsd	16(%r12), %ymm12 // B
2705	vmulpd			%ymm13, %ymm12, %ymm15
2706	vaddpd			%ymm2, %ymm15, %ymm2
2707	addq	%r13, %r12
2708
2709	// unroll 2
2710	vmovupd			64(%r11), %ymm13 // A
2711	vbroadcastsd	16(%r12), %ymm12 // B
2712	vmulpd			%ymm13, %ymm12, %ymm15
2713	vaddpd			%ymm2, %ymm15, %ymm2
2714	addq	%r13, %r12
2715
2716	// unroll 3
2717
2718	subl	$3, %r10d
2719	addq	$96, %r11
2720
2721	jmp		0f
2722
27232:
2724
2725	cmpl	$2, %r14d
2726	jl		3f // end
2727
2728	// unroll 0
2729	vmovupd			0(%r11), %ymm13 // A
2730	vbroadcastsd	0(%r12), %ymm12 // B
2731	vmulpd			%ymm13, %ymm12, %ymm15
2732	vaddpd			%ymm0, %ymm15, %ymm0
2733	vbroadcastsd	8(%r12), %ymm12 // B
2734	vmulpd			%ymm13, %ymm12, %ymm15
2735	vaddpd			%ymm1, %ymm15, %ymm1
2736	addq	%r13, %r12
2737
2738	// unroll 1
2739	vmovupd			32(%r11), %ymm13 // A
2740	vbroadcastsd	8(%r12), %ymm12 // B
2741	vmulpd			%ymm13, %ymm12, %ymm15
2742	vaddpd			%ymm1, %ymm15, %ymm1
2743	addq	%r13, %r12
2744
2745	// unroll 2
2746
2747	// unroll 3
2748
2749	subl	$2, %r10d
2750	addq	$64, %r11
2751
2752	jmp		0f
2753
27543:
2755
2756//	cmpl	$1, %r14d
2757//	jl		0f // end
2758
2759	// unroll 0
2760	vmovupd			0(%r11), %ymm13 // A
2761	vbroadcastsd	0(%r12), %ymm12 // B
2762	vmulpd			%ymm13, %ymm12, %ymm15
2763	vaddpd			%ymm0, %ymm15, %ymm0
2764	addq	%r13, %r12
2765
2766	// unroll 1
2767
2768	// unroll 2
2769
2770	// unroll 3
2771
2772	subl	$1, %r10d
2773	addq	$32, %r11
2774
27750:
2776
2777#if MACRO_LEVEL>=1
2778	.endm
2779#else
2780	ret
2781
2782	FUN_END(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c)
2783#endif
2784
2785
2786
2787
2788
2789// common inner routine with file scope
2790//
2791// edge for B lower triangular
2792//
2793// input arguments:
2794// r10d  <- k
2795// r11   <- A
2796// r12   <- B
2797// r13   <- ldb
2798// ymm0  <- [d00 d10 d20 d30]
2799// ymm1  <- [d01 d11 d21 d31]
2800// ymm2  <- [d02 d12 d22 d32]
2801// ymm3  <- [d03 d13 d23 d33]
2802//
2803// output arguments:
2804
2805#if MACRO_LEVEL>=1
2806	.macro INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4C
2807#else
2808	.p2align 4,,15
2809	FUN_START(inner_edge_dtrmm_nt_rl_one_4x4_lib4c)
2810#endif
2811
2812	// unroll 0
2813	vmovupd			0(%r11), %ymm13 // A
2814	vaddpd			%ymm0, %ymm13, %ymm0
2815	vbroadcastsd	8(%r12), %ymm12 // B
2816	vmulpd			%ymm13, %ymm12, %ymm15
2817	vaddpd			%ymm1, %ymm15, %ymm1
2818	vbroadcastsd	16(%r12), %ymm12 // B
2819	vmulpd			%ymm13, %ymm12, %ymm15
2820	vaddpd			%ymm2, %ymm15, %ymm2
2821	vbroadcastsd	24(%r12), %ymm12 // B
2822	vmulpd			%ymm13, %ymm12, %ymm15
2823	vaddpd			%ymm3, %ymm15, %ymm3
2824	addq	%r13, %r12
2825
2826	// unroll 1
2827	vmovupd			32(%r11), %ymm13 // A
2828	vaddpd			%ymm1, %ymm13, %ymm1
2829	vbroadcastsd	16(%r12), %ymm12 // B
2830	vmulpd			%ymm13, %ymm12, %ymm15
2831	vaddpd			%ymm2, %ymm15, %ymm2
2832	vbroadcastsd	24(%r12), %ymm12 // B
2833	vmulpd			%ymm13, %ymm12, %ymm15
2834	vaddpd			%ymm3, %ymm15, %ymm3
2835	addq	%r13, %r12
2836
2837	// unroll 2
2838	vmovupd			64(%r11), %ymm13 // A
2839	vaddpd			%ymm2, %ymm13, %ymm2
2840	vbroadcastsd	24(%r12), %ymm12 // B
2841	vmulpd			%ymm13, %ymm12, %ymm15
2842	vaddpd			%ymm3, %ymm15, %ymm3
2843	addq	%r13, %r12
2844
2845	// unroll 3
2846	vmovupd			96(%r11), %ymm13 // A
2847	vaddpd			%ymm3, %ymm13, %ymm3
2848	addq	%r13, %r12
2849
2850	subl	$4, %r10d
2851	addq	$128, %r11
2852
2853#if MACRO_LEVEL>=1
2854	.endm
2855#else
2856	ret
2857
2858	FUN_END(inner_edge_dtrmm_nt_rl_one_4x4_lib4c)
2859#endif
2860
2861
2862
2863
2864
2865// common inner routine with file scope
2866//
2867// edge for B lower triangular
2868//
2869// input arguments:
2870// r10d  <- k
2871// r11   <- A
2872// r12   <- B
2873// r13   <- ldb
2874// r14   <- n1
2875// ymm0  <- [d00 d10 d20 d30]
2876// ymm1  <- [d01 d11 d21 d31]
2877// ymm2  <- [d02 d12 d22 d32]
2878// ymm3  <- [d03 d13 d23 d33]
2879//
2880// output arguments:
2881
2882#if MACRO_LEVEL>=1
2883	.macro INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4C
2884#else
2885	.p2align 4,,15
2886	FUN_START(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c)
2887#endif
2888
2889	cmpl	$0, %r14d
2890	jle		0f // end
2891
2892	cmpl	$4, %r14d
2893	jl		1f // end
2894
2895	// unroll 0
2896	vmovupd			0(%r11), %ymm13 // A
2897	vaddpd			%ymm0, %ymm13, %ymm0
2898	vbroadcastsd	8(%r12), %ymm12 // B
2899	vmulpd			%ymm13, %ymm12, %ymm15
2900	vaddpd			%ymm1, %ymm15, %ymm1
2901	vbroadcastsd	16(%r12), %ymm12 // B
2902	vmulpd			%ymm13, %ymm12, %ymm15
2903	vaddpd			%ymm2, %ymm15, %ymm2
2904	vbroadcastsd	24(%r12), %ymm12 // B
2905	vmulpd			%ymm13, %ymm12, %ymm15
2906	vaddpd			%ymm3, %ymm15, %ymm3
2907	addq	%r13, %r12
2908
2909	// unroll 1
2910	vmovupd			32(%r11), %ymm13 // A
2911	vaddpd			%ymm1, %ymm13, %ymm1
2912	vbroadcastsd	16(%r12), %ymm12 // B
2913	vmulpd			%ymm13, %ymm12, %ymm15
2914	vaddpd			%ymm2, %ymm15, %ymm2
2915	vbroadcastsd	24(%r12), %ymm12 // B
2916	vmulpd			%ymm13, %ymm12, %ymm15
2917	vaddpd			%ymm3, %ymm15, %ymm3
2918	addq	%r13, %r12
2919
2920	// unroll 2
2921	vmovupd			64(%r11), %ymm13 // A
2922	vaddpd			%ymm2, %ymm13, %ymm2
2923	vbroadcastsd	24(%r12), %ymm12 // B
2924	vmulpd			%ymm13, %ymm12, %ymm15
2925	vaddpd			%ymm3, %ymm15, %ymm3
2926	addq	%r13, %r12
2927
2928	// unroll 3
2929	vmovupd			96(%r11), %ymm13 // A
2930	vaddpd			%ymm3, %ymm13, %ymm3
2931	addq	%r13, %r12
2932
2933	subl	$4, %r10d
2934	addq	$128, %r11
2935
2936	jmp		0f
2937
29381:
2939
2940	cmpl	$3, %r14d
2941	jl		2f // end
2942
2943	// unroll 0
2944	vmovupd			0(%r11), %ymm13 // A
2945	vaddpd			%ymm0, %ymm13, %ymm0
2946	vbroadcastsd	8(%r12), %ymm12 // B
2947	vmulpd			%ymm13, %ymm12, %ymm15
2948	vaddpd			%ymm1, %ymm15, %ymm1
2949	vbroadcastsd	16(%r12), %ymm12 // B
2950	vmulpd			%ymm13, %ymm12, %ymm15
2951	vaddpd			%ymm2, %ymm15, %ymm2
2952	addq	%r13, %r12
2953
2954	// unroll 1
2955	vmovupd			32(%r11), %ymm13 // A
2956	vaddpd			%ymm1, %ymm13, %ymm1
2957	vbroadcastsd	16(%r12), %ymm12 // B
2958	vmulpd			%ymm13, %ymm12, %ymm15
2959	vaddpd			%ymm2, %ymm15, %ymm2
2960	addq	%r13, %r12
2961
2962	// unroll 2
2963	vmovupd			64(%r11), %ymm13 // A
2964	vaddpd			%ymm2, %ymm13, %ymm2
2965	addq	%r13, %r12
2966
2967	// unroll 3
2968
2969	subl	$3, %r10d
2970	addq	$96, %r11
2971
2972	jmp		0f
2973
29742:
2975
2976	cmpl	$2, %r14d
2977	jl		3f // end
2978
2979	// unroll 0
2980	vmovupd			0(%r11), %ymm13 // A
2981	vaddpd			%ymm0, %ymm13, %ymm0
2982	vbroadcastsd	8(%r12), %ymm12 // B
2983	vmulpd			%ymm13, %ymm12, %ymm15
2984	vaddpd			%ymm1, %ymm15, %ymm1
2985	addq	%r13, %r12
2986
2987	// unroll 1
2988	vmovupd			32(%r11), %ymm13 // A
2989	vaddpd			%ymm1, %ymm13, %ymm1
2990	addq	%r13, %r12
2991
2992	// unroll 2
2993
2994	// unroll 3
2995
2996	subl	$2, %r10d
2997	addq	$64, %r11
2998
2999	jmp		0f
3000
30013:
3002
3003//	cmpl	$1, %r14d
3004//	jl		0f // end
3005
3006	// unroll 0
3007	vmovupd			0(%r11), %ymm13 // A
3008	vaddpd			%ymm0, %ymm13, %ymm0
3009	addq	%r13, %r12
3010
3011	// unroll 1
3012
3013	// unroll 2
3014
3015	// unroll 3
3016
3017	subl	$1, %r10d
3018	addq	$32, %r11
3019
30200:
3021
3022#if MACRO_LEVEL>=1
3023	.endm
3024#else
3025	ret
3026
3027	FUN_END(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c)
3028#endif
3029
3030
3031
3032
3033
3034// common inner routine with file scope
3035//
3036// edge for B upper triangular
3037//
3038// input arguments:
3039// r10   <- kmax
3040// r11   <- A
3041// r12   <- B
3042// r13   <- ldb
3043// ymm0  <- [d00 d10 d20 d30]
3044// ymm1  <- [d01 d11 d21 d31]
3045// ymm2  <- [d02 d12 d22 d32]
3046// ymm3  <- [d03 d13 d23 d33]
3047
3048//
3049// output arguments:
3050
3051
3052#if MACRO_LEVEL>=1
3053	.macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4C
3054#else
3055	.p2align 4,,15
3056	FUN_START(inner_edge_dtrmm_nt_ru_4x4_lib4c)
3057#endif
3058
3059	vmovapd			0(%r11), %ymm8
3060	vbroadcastsd	0(%r12), %ymm12
3061	vmulpd			%ymm8, %ymm12, %ymm15
3062	vaddpd			%ymm0, %ymm15, %ymm0
3063	addq			%r13, %r12
3064
3065	vmovapd			32(%r11), %ymm8
3066	vbroadcastsd	0(%r12), %ymm12
3067	vmulpd			%ymm8, %ymm12, %ymm15
3068	vaddpd			%ymm0, %ymm15, %ymm0
3069	vbroadcastsd	8(%r12), %ymm12
3070	vmulpd			%ymm8, %ymm12, %ymm15
3071	vaddpd			%ymm1, %ymm15, %ymm1
3072	addq			%r13, %r12
3073
3074	vmovapd			64(%r11), %ymm8
3075	vbroadcastsd	0(%r12), %ymm12
3076	vmulpd			%ymm8, %ymm12, %ymm15
3077	vaddpd			%ymm0, %ymm15, %ymm0
3078	vbroadcastsd	8(%r12), %ymm12
3079	vmulpd			%ymm8, %ymm12, %ymm15
3080	vaddpd			%ymm1, %ymm15, %ymm1
3081	vbroadcastsd	16(%r12), %ymm12
3082	vmulpd			%ymm8, %ymm12, %ymm15
3083	vaddpd			%ymm2, %ymm15, %ymm2
3084	addq			%r13, %r12
3085
3086	vmovapd			96(%r11), %ymm8
3087	vbroadcastsd	0(%r12), %ymm12
3088	vmulpd			%ymm8, %ymm12, %ymm15
3089	vaddpd			%ymm0, %ymm15, %ymm0
3090	vbroadcastsd	8(%r12), %ymm12
3091	vmulpd			%ymm8, %ymm12, %ymm15
3092	vaddpd			%ymm1, %ymm15, %ymm1
3093	vbroadcastsd	16(%r12), %ymm12
3094	vmulpd			%ymm8, %ymm12, %ymm15
3095	vaddpd			%ymm2, %ymm15, %ymm2
3096	vbroadcastsd	24(%r12), %ymm12
3097	vmulpd			%ymm8, %ymm12, %ymm15
3098	vaddpd			%ymm3, %ymm15, %ymm3
3099	addq			%r13, %r12
3100
3101	subl	$4, %r10d
3102	addq	$128, %r11
3103
3104#if MACRO_LEVEL>=1
3105	.endm
3106#else
3107	ret
3108
3109	FUN_END(inner_edge_dtrmm_nt_ru_4x4_lib4c)
3110#endif
3111
3112
3113
3114
3115
3116// common inner routine with file scope
3117//
3118// edge for B upper triangular
3119//
3120// input arguments:
3121// r10d  <- k
3122// r11   <- A
3123// r12   <- B
3124// r13   <- ldb
3125// ymm0  <- [d00 d10 d20 d30]
3126// ymm1  <- [d01 d11 d21 d31]
3127// ymm2  <- [d02 d12 d22 d32]
3128// ymm3  <- [d03 d13 d23 d33]
3129
3130//
3131// output arguments:
3132
3133
3134#if MACRO_LEVEL>=1
3135	.macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4C
3136#else
3137	.p2align 4,,15
3138	FUN_START(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c)
3139#endif
3140
3141	cmpl	$0, %r10d
3142	jle		0f // end
3143
3144	vmovapd			0(%r11), %ymm8
3145	subl			$1, %r10d
3146	vbroadcastsd	0(%r12), %ymm12
3147	vmulpd			%ymm8, %ymm12, %ymm15
3148	vaddpd			%ymm0, %ymm15, %ymm0
3149	addq			%r13, %r12
3150	addq			$32, %r11
3151
3152	cmpl	$0, %r10d
3153	jle		0f
3154
3155	vmovapd			0(%r11), %ymm8
3156	subl			$1, %r10d
3157	vbroadcastsd	0(%r12), %ymm12
3158	vmulpd			%ymm8, %ymm12, %ymm15
3159	vaddpd			%ymm0, %ymm15, %ymm0
3160	vbroadcastsd	8(%r12), %ymm12
3161	addq			$32, %r11
3162	vmulpd			%ymm8, %ymm12, %ymm15
3163	vaddpd			%ymm1, %ymm15, %ymm1
3164	addq			%r13, %r12
3165
3166	cmpl	$0, %r10d
3167	jle		0f
3168
3169	vmovapd			0(%r11), %ymm8
3170	subl			$1, %r10d
3171	vbroadcastsd	0(%r12), %ymm12
3172	vmulpd			%ymm8, %ymm12, %ymm15
3173	vaddpd			%ymm0, %ymm15, %ymm0
3174	vbroadcastsd	8(%r12), %ymm12
3175	addq			$32, %r11
3176	vmulpd			%ymm8, %ymm12, %ymm15
3177	vaddpd			%ymm1, %ymm15, %ymm1
3178	vbroadcastsd	16(%r12), %ymm12
3179	vmulpd			%ymm8, %ymm12, %ymm15
3180	vaddpd			%ymm2, %ymm15, %ymm2
3181	addq			%r13, %r12
3182
3183	cmpl	$0, %r10d
3184	jle		0f
3185
3186	vmovapd			0(%r11), %ymm8
3187	subl			$1, %r10d
3188	vbroadcastsd	0(%r12), %ymm12
3189	vmulpd			%ymm8, %ymm12, %ymm15
3190	vaddpd			%ymm0, %ymm15, %ymm0
3191	vbroadcastsd	8(%r12), %ymm12
3192	addq			$32, %r11
3193	vmulpd			%ymm8, %ymm12, %ymm15
3194	vaddpd			%ymm1, %ymm15, %ymm1
3195	vbroadcastsd	16(%r12), %ymm12
3196	vmulpd			%ymm8, %ymm12, %ymm15
3197	vaddpd			%ymm2, %ymm15, %ymm2
3198	vbroadcastsd	24(%r12), %ymm12
3199	vmulpd			%ymm8, %ymm12, %ymm15
3200	vaddpd			%ymm3, %ymm15, %ymm3
3201	addq			%r13, %r12
3202
32030:
3204
3205#if MACRO_LEVEL>=1
3206	.endm
3207#else
3208	ret
3209
3210	FUN_END(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c)
3211#endif
3212
3213
3214
3215
3216
3217// common inner routine with file scope
3218//
3219// edge for B upper triangular
3220//
3221// input arguments:
3222// r10   <- kmax
3223// r11   <- A
3224// r12   <- B
3225// r13   <- ldb
3226// ymm0  <- [d00 d10 d20 d30]
3227// ymm1  <- [d01 d11 d21 d31]
3228// ymm2  <- [d02 d12 d22 d32]
3229// ymm3  <- [d03 d13 d23 d33]
3230
3231//
3232// output arguments:
3233
3234
3235#if MACRO_LEVEL>=1
3236	.macro INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4C
3237#else
3238	.p2align 4,,15
3239	FUN_START(inner_edge_dtrmm_nt_ru_one_4x4_lib4c)
3240#endif
3241
3242	vmovapd			0(%r11), %ymm8
3243	vaddpd			%ymm0, %ymm8, %ymm0
3244	addq			%r13, %r12
3245
3246	vmovapd			32(%r11), %ymm8
3247	vbroadcastsd	0(%r12), %ymm12
3248	vmulpd			%ymm8, %ymm12, %ymm15
3249	vaddpd			%ymm0, %ymm15, %ymm0
3250	vaddpd			%ymm1, %ymm8, %ymm1
3251	addq			%r13, %r12
3252
3253	vmovapd			64(%r11), %ymm8
3254	vbroadcastsd	0(%r12), %ymm12
3255	vmulpd			%ymm8, %ymm12, %ymm15
3256	vaddpd			%ymm0, %ymm15, %ymm0
3257	vbroadcastsd	8(%r12), %ymm12
3258	vmulpd			%ymm8, %ymm12, %ymm15
3259	vaddpd			%ymm1, %ymm15, %ymm1
3260	vaddpd			%ymm2, %ymm8, %ymm2
3261	addq			%r13, %r12
3262
3263	vmovapd			96(%r11), %ymm8
3264	vbroadcastsd	0(%r12), %ymm12
3265	vmulpd			%ymm8, %ymm12, %ymm15
3266	vaddpd			%ymm0, %ymm15, %ymm0
3267	vbroadcastsd	8(%r12), %ymm12
3268	vmulpd			%ymm8, %ymm12, %ymm15
3269	vaddpd			%ymm1, %ymm15, %ymm1
3270	vbroadcastsd	16(%r12), %ymm12
3271	vmulpd			%ymm8, %ymm12, %ymm15
3272	vaddpd			%ymm2, %ymm15, %ymm2
3273	vaddpd			%ymm3, %ymm8, %ymm3
3274	addq			%r13, %r12
3275
3276	subl	$4, %r10d
3277	addq	$128, %r11
3278
3279#if MACRO_LEVEL>=1
3280	.endm
3281#else
3282	ret
3283
3284	FUN_END(inner_edge_dtrmm_nt_ru_one_4x4_lib4c)
3285#endif
3286
3287
3288
3289
3290
3291// common inner routine with file scope
3292//
3293// edge for B upper triangular
3294//
3295// input arguments:
3296// r10d  <- k
3297// r11   <- A
3298// r12   <- B
3299// r13   <- ldb
3300// ymm0  <- [d00 d10 d20 d30]
3301// ymm1  <- [d01 d11 d21 d31]
3302// ymm2  <- [d02 d12 d22 d32]
3303// ymm3  <- [d03 d13 d23 d33]
3304
3305//
3306// output arguments:
3307
3308
3309#if MACRO_LEVEL>=1
3310	.macro INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4C
3311#else
3312	.p2align 4,,15
3313	FUN_START(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c)
3314#endif
3315
3316	cmpl	$0, %r10d
3317	jle		0f // end
3318
3319	vmovapd			0(%r11), %ymm8
3320	subl			$1, %r10d
3321	vaddpd			%ymm0, %ymm8, %ymm0
3322	addq			$32, %r11
3323	addq			%r13, %r12
3324
3325	cmpl	$0, %r10d
3326	jle		0f
3327
3328	vmovapd			0(%r11), %ymm8
3329	subl			$1, %r10d
3330	vbroadcastsd	0(%r12), %ymm12
3331	addq			$32, %r11
3332	vmulpd			%ymm8, %ymm12, %ymm15
3333	vaddpd			%ymm0, %ymm15, %ymm0
3334	vaddpd			%ymm1, %ymm8, %ymm1
3335	addq			%r13, %r12
3336
3337	cmpl	$0, %r10d
3338	jle		0f
3339
3340	vmovapd			0(%r11), %ymm8
3341	subl			$1, %r10d
3342	vbroadcastsd	0(%r12), %ymm12
3343	vmulpd			%ymm8, %ymm12, %ymm15
3344	vaddpd			%ymm0, %ymm15, %ymm0
3345	vbroadcastsd	8(%r12), %ymm12
3346	addq			$32, %r11
3347	vmulpd			%ymm8, %ymm12, %ymm15
3348	vaddpd			%ymm1, %ymm15, %ymm1
3349	vaddpd			%ymm2, %ymm8, %ymm2
3350	addq			%r13, %r12
3351
3352	cmpl	$0, %r10d
3353	jle		0f
3354
3355	vmovapd			0(%r11), %ymm8
3356	subl			$1, %r10d
3357	vbroadcastsd	0(%r12), %ymm12
3358	vmulpd			%ymm8, %ymm12, %ymm15
3359	vaddpd			%ymm0, %ymm15, %ymm0
3360	vbroadcastsd	8(%r12), %ymm12
3361	addq			$32, %r11
3362	vmulpd			%ymm8, %ymm12, %ymm15
3363	vaddpd			%ymm1, %ymm15, %ymm1
3364	vbroadcastsd	16(%r12), %ymm12
3365	vmulpd			%ymm8, %ymm12, %ymm15
3366	vaddpd			%ymm2, %ymm15, %ymm2
3367	vaddpd			%ymm3, %ymm8, %ymm3
3368	addq			%r13, %r12
3369
33700:
3371
3372#if MACRO_LEVEL>=1
3373	.endm
3374#else
3375	ret
3376
3377	FUN_END(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c)
3378#endif
3379
3380
3381
3382
3383
3384// common inner routine with file scope
3385//
3386// triangular substitution:
3387// side = left
3388// uplo = lower
3389// tran = not-transposed
3390// unit diagonal
3391//
3392// input arguments:
3393// r10  <- E
3394// r11  <- lde
3395// ymm0 <- [d00 d10 d20 d30]
3396// ymm1 <- [d01 d11 d21 d31]
3397// ymm2 <- [d02 d12 d22 d32]
3398// ymm3 <- [d03 d13 d23 d33]
3399//
3400// output arguments:
3401
3402#if MACRO_LEVEL>=1
3403	.macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB
3404#else
3405	.p2align 4,,15
3406	FUN_START(inner_edge_dtrsm_lln_one_4x4_lib)
3407#endif
3408
3409	vxorpd		%ymm14, %ymm14, %ymm14
3410
3411	vmovupd		0(%r10), %ymm12
3412	vblendpd	$0x1, %ymm14, %ymm12, %ymm12
3413	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
3414	vpermilpd	$0x0, %ymm13, %ymm13
3415	vmulpd		%ymm12, %ymm13, %ymm15
3416	vsubpd		%ymm15, %ymm0, %ymm0
3417	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
3418	vpermilpd	$0x0, %ymm13, %ymm13
3419	vmulpd		%ymm12, %ymm13, %ymm15
3420	vsubpd		%ymm15, %ymm1, %ymm1
3421	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
3422	vpermilpd	$0x0, %ymm13, %ymm13
3423	vmulpd		%ymm12, %ymm13, %ymm15
3424	vsubpd		%ymm15, %ymm2, %ymm2
3425	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
3426	vpermilpd	$0x0, %ymm13, %ymm13
3427	vmulpd		%ymm12, %ymm13, %ymm15
3428	vsubpd		%ymm15, %ymm3, %ymm3
3429	add			%r11, %r10
3430
3431	vmovupd		0(%r10), %ymm12
3432	vblendpd	$0x3, %ymm14, %ymm12, %ymm12
3433	vperm2f128	$0x00, %ymm0, %ymm0, %ymm13
3434	vpermilpd	$0xf, %ymm13, %ymm13
3435	vmulpd		%ymm12, %ymm13, %ymm15
3436	vsubpd		%ymm15, %ymm0, %ymm0
3437	vperm2f128	$0x00, %ymm1, %ymm1, %ymm13
3438	vpermilpd	$0xf, %ymm13, %ymm13
3439	vmulpd		%ymm12, %ymm13, %ymm15
3440	vsubpd		%ymm15, %ymm1, %ymm1
3441	vperm2f128	$0x00, %ymm2, %ymm2, %ymm13
3442	vpermilpd	$0xf, %ymm13, %ymm13
3443	vmulpd		%ymm12, %ymm13, %ymm15
3444	vsubpd		%ymm15, %ymm2, %ymm2
3445	vperm2f128	$0x00, %ymm3, %ymm3, %ymm13
3446	vpermilpd	$0xf, %ymm13, %ymm13
3447	vmulpd		%ymm12, %ymm13, %ymm15
3448	vsubpd		%ymm15, %ymm3, %ymm3
3449	add			%r11, %r10
3450
3451	vmovupd		0(%r10), %ymm12
3452	vblendpd	$0x7, %ymm14, %ymm12, %ymm12
3453	vperm2f128	$0x11, %ymm0, %ymm0, %ymm13
3454	vpermilpd	$0x0, %ymm13, %ymm13
3455	vmulpd		%ymm12, %ymm13, %ymm15
3456	vsubpd		%ymm15, %ymm0, %ymm0
3457	vperm2f128	$0x11, %ymm1, %ymm1, %ymm13
3458	vpermilpd	$0x0, %ymm13, %ymm13
3459	vmulpd		%ymm12, %ymm13, %ymm15
3460	vsubpd		%ymm15, %ymm1, %ymm1
3461	vperm2f128	$0x11, %ymm2, %ymm2, %ymm13
3462	vpermilpd	$0x0, %ymm13, %ymm13
3463	vmulpd		%ymm12, %ymm13, %ymm15
3464	vsubpd		%ymm15, %ymm2, %ymm2
3465	vperm2f128	$0x11, %ymm3, %ymm3, %ymm13
3466	vpermilpd	$0x0, %ymm13, %ymm13
3467	vmulpd		%ymm12, %ymm13, %ymm15
3468	vsubpd		%ymm15, %ymm3, %ymm3
3469
3470#if MACRO_LEVEL>=1
3471	.endm
3472#else
3473	ret
3474
3475	FUN_END(inner_edge_dtrsm_lln_one_4x4_lib)
3476#endif
3477
3478
3479
3480
3481
3482// common inner routine with file scope
3483//
3484// triangular substitution:
3485// side = right
3486// uplo = lower
3487// tran = not-transposed
3488// requires explicit inverse of diagonal
3489//
3490// input arguments:
3491// r10  <- E
3492// r11  <- lde
3493// r12  <- inv_diag_E
3494// ymm0 <- [d00 d10 d20 d30]
3495// ymm1 <- [d01 d11 d21 d31]
3496// ymm2 <- [d02 d12 d22 d32]
3497// ymm3 <- [d03 d13 d23 d33]
3498//
3499// output arguments:
3500
3501#if MACRO_LEVEL>=1
3502	.macro INNER_EDGE_DTRSM_RLN_INV_4X4_LIB
3503#else
3504	.p2align 4,,15
3505	FUN_START(inner_edge_dtrsm_rln_inv_4x4_lib)
3506#endif
3507
3508	// 4th column
3509	vbroadcastsd	24(%r12), %ymm13
3510	vmulpd			%ymm3, %ymm13, %ymm3
3511	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3512	vmulpd			%ymm3, %ymm13, %ymm15
3513	vsubpd			%ymm15, %ymm2, %ymm2
3514	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3515	vmulpd			%ymm3, %ymm13, %ymm15
3516	vsubpd			%ymm15, %ymm1, %ymm1
3517	vbroadcastsd	24(%r10), %ymm13
3518	vmulpd			%ymm3, %ymm13, %ymm15
3519	vsubpd			%ymm15, %ymm0, %ymm0
3520
3521	// 3rd column
3522	vbroadcastsd	16(%r12), %ymm13
3523	vmulpd			%ymm2, %ymm13, %ymm2
3524	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3525	vmulpd			%ymm2, %ymm13, %ymm15
3526	vsubpd			%ymm15, %ymm1, %ymm1
3527	vbroadcastsd	16(%r10), %ymm13
3528	vmulpd			%ymm2, %ymm13, %ymm15
3529	vsubpd			%ymm15, %ymm0, %ymm0
3530
3531	// 2nd column
3532	vbroadcastsd	8(%r12), %ymm13
3533	vmulpd			%ymm1, %ymm13, %ymm1
3534	vbroadcastsd	8(%r10), %ymm13
3535	vmulpd			%ymm1, %ymm13, %ymm15
3536	vsubpd			%ymm15, %ymm0, %ymm0
3537
3538	// 1st column
3539	vbroadcastsd	0(%r12), %ymm13
3540	vmulpd			%ymm0, %ymm13, %ymm0
3541
3542#if MACRO_LEVEL>=1
3543	.endm
3544#else
3545	ret
3546
3547	FUN_END(inner_edge_dtrsm_rln_inv_4x4_lib)
3548#endif
3549
3550
3551
3552
3553
3554// common inner routine with file scope
3555//
3556// triangular substitution:
3557// side = right
3558// uplo = lower
3559// tran = not-transposed
3560// requires explicit inverse of diagonal
3561//
3562// input arguments:
3563// r10  <- E
3564// r11  <- lde
3565// r12  <- inv_diag_E
3566// r13  <- n1
3567// ymm0 <- [d00 d10 d20 d30]
3568// ymm1 <- [d01 d11 d21 d31]
3569// ymm2 <- [d02 d12 d22 d32]
3570// ymm3 <- [d03 d13 d23 d33]
3571//
3572// output arguments:
3573
3574#if MACRO_LEVEL>=1
3575	.macro INNER_EDGE_DTRSM_RLN_INV_4X4_VS_LIB
3576#else
3577	.p2align 4,,15
3578	FUN_START(inner_edge_dtrsm_rln_inv_4x4_vs_lib)
3579#endif
3580
3581	cmpl			$3, %r13d
3582	jle				0f
3583
3584	// 4th column
3585	vbroadcastsd	24(%r12), %ymm13
3586	vmulpd			%ymm3, %ymm13, %ymm3
3587	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3588	vmulpd			%ymm3, %ymm13, %ymm15
3589	vsubpd			%ymm15, %ymm2, %ymm2
3590	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3591	vmulpd			%ymm3, %ymm13, %ymm15
3592	vsubpd			%ymm15, %ymm1, %ymm1
3593	vbroadcastsd	24(%r10), %ymm13
3594	vmulpd			%ymm3, %ymm13, %ymm15
3595	vsubpd			%ymm15, %ymm0, %ymm0
3596
35970:
3598	cmpl			$2, %r13d
3599	jle				0f
3600
3601	// 3rd column
3602	vbroadcastsd	16(%r12), %ymm13
3603	vmulpd			%ymm2, %ymm13, %ymm2
3604	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3605	vmulpd			%ymm2, %ymm13, %ymm15
3606	vsubpd			%ymm15, %ymm1, %ymm1
3607	vbroadcastsd	16(%r10), %ymm13
3608	vmulpd			%ymm2, %ymm13, %ymm15
3609	vsubpd			%ymm15, %ymm0, %ymm0
3610
36110:
3612	cmpl			$1, %r13d
3613	jle				0f
3614
3615	// 2nd column
3616	vbroadcastsd	8(%r12), %ymm13
3617	vmulpd			%ymm1, %ymm13, %ymm1
3618	vbroadcastsd	8(%r10), %ymm13
3619	vmulpd			%ymm1, %ymm13, %ymm15
3620	vsubpd			%ymm15, %ymm0, %ymm0
3621
36220:
3623
3624	// 1st column
3625	vbroadcastsd	0(%r12), %ymm13
3626	vmulpd			%ymm0, %ymm13, %ymm0
3627
3628#if MACRO_LEVEL>=1
3629	.endm
3630#else
3631	ret
3632
3633	FUN_END(inner_edge_dtrsm_rln_inv_4x4_vs_lib)
3634#endif
3635
3636
3637
3638
3639
3640// common inner routine with file scope
3641//
3642// triangular substitution:
3643// side = right
3644// uplo = lower
3645// tran = not-transposed
3646// unit diagonal
3647//
3648// input arguments:
3649// r10  <- E
3650// r11  <- lde
3651// ymm0 <- [d00 d10 d20 d30]
3652// ymm1 <- [d01 d11 d21 d31]
3653// ymm2 <- [d02 d12 d22 d32]
3654// ymm3 <- [d03 d13 d23 d33]
3655//
3656// output arguments:
3657
3658#if MACRO_LEVEL>=1
3659	.macro INNER_EDGE_DTRSM_RLN_ONE_4X4_LIB
3660#else
3661	.p2align 4,,15
3662	FUN_START(inner_edge_dtrsm_rln_one_4x4_lib)
3663#endif
3664
3665	// 4th column
3666	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3667	vmulpd			%ymm3, %ymm13, %ymm15
3668	vsubpd			%ymm15, %ymm2, %ymm2
3669	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3670	vmulpd			%ymm3, %ymm13, %ymm15
3671	vsubpd			%ymm15, %ymm1, %ymm1
3672	vbroadcastsd	24(%r10), %ymm13
3673	vmulpd			%ymm3, %ymm13, %ymm15
3674	vsubpd			%ymm15, %ymm0, %ymm0
3675
3676	// 3rd column
3677	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3678	vmulpd			%ymm2, %ymm13, %ymm15
3679	vsubpd			%ymm15, %ymm1, %ymm1
3680	vbroadcastsd	16(%r10), %ymm13
3681	vmulpd			%ymm2, %ymm13, %ymm15
3682	vsubpd			%ymm15, %ymm0, %ymm0
3683
3684	// 2nd column
3685	vbroadcastsd	8(%r10), %ymm13
3686	vmulpd			%ymm1, %ymm13, %ymm15
3687	vsubpd			%ymm15, %ymm0, %ymm0
3688
3689	// 1st column
3690
3691#if MACRO_LEVEL>=1
3692	.endm
3693#else
3694	ret
3695
3696	FUN_END(inner_edge_dtrsm_rln_one_4x4_lib)
3697#endif
3698
3699
3700
3701
3702
3703// common inner routine with file scope
3704//
3705// triangular substitution:
3706// side = right
3707// uplo = lower
3708// tran = not-transposed
3709// unit diagonal
3710//
3711// input arguments:
3712// r10  <- E
3713// r11  <- lde
3714// r12  <- n1
3715// ymm0 <- [d00 d10 d20 d30]
3716// ymm1 <- [d01 d11 d21 d31]
3717// ymm2 <- [d02 d12 d22 d32]
3718// ymm3 <- [d03 d13 d23 d33]
3719//
3720// output arguments:
3721
3722#if MACRO_LEVEL>=1
3723	.macro INNER_EDGE_DTRSM_RLN_ONE_4X4_VS_LIB
3724#else
3725	.p2align 4,,15
3726	FUN_START(inner_edge_dtrsm_rln_one_4x4_vs_lib)
3727#endif
3728
3729	cmpl			$3, %r12d
3730	jle				0f
3731
3732	// 4th column
3733	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3734	vmulpd			%ymm3, %ymm13, %ymm15
3735	vsubpd			%ymm15, %ymm2, %ymm2
3736	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3737	vmulpd			%ymm3, %ymm13, %ymm15
3738	vsubpd			%ymm15, %ymm1, %ymm1
3739	vbroadcastsd	24(%r10), %ymm13
3740	vmulpd			%ymm3, %ymm13, %ymm15
3741	vsubpd			%ymm15, %ymm0, %ymm0
3742
37430:
3744	cmpl			$2, %r12d
3745	jle				0f
3746
3747	// 3rd column
3748	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3749	vmulpd			%ymm2, %ymm13, %ymm15
3750	vsubpd			%ymm15, %ymm1, %ymm1
3751	vbroadcastsd	16(%r10), %ymm13
3752	vmulpd			%ymm2, %ymm13, %ymm15
3753	vsubpd			%ymm15, %ymm0, %ymm0
3754
37550:
3756	cmpl			$1, %r12d
3757	jle				0f
3758
3759	// 2nd column
3760	vbroadcastsd	8(%r10), %ymm13
3761	vmulpd			%ymm1, %ymm13, %ymm15
3762	vsubpd			%ymm15, %ymm0, %ymm0
3763
37640:
3765
3766	// 1st column
3767
3768#if MACRO_LEVEL>=1
3769	.endm
3770#else
3771	ret
3772
3773	FUN_END(inner_edge_dtrsm_rln_one_4x4_vs_lib)
3774#endif
3775
3776
3777
3778
3779
3780// common inner routine with file scope
3781//
3782// triangular substitution:
3783// side = right
3784// uplo = lower
3785// tran = transposed
3786// requires explicit inverse of diagonal
3787//
3788// input arguments:
3789// r10  <- E
3790// r11  <- lde
3791// r12  <- inv_diag_E
3792// ymm0 <- [d00 d10 d20 d30]
3793// ymm1 <- [d01 d11 d21 d31]
3794// ymm2 <- [d02 d12 d22 d32]
3795// ymm3 <- [d03 d13 d23 d33]
3796//
3797// output arguments:
3798
3799#if MACRO_LEVEL>=1
3800	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB
3801#else
3802	.p2align 4,,15
3803	FUN_START(inner_edge_dtrsm_rlt_inv_4x4_lib)
3804#endif
3805
3806	vbroadcastsd	0(%r12), %ymm13
3807	vmulpd			%ymm0, %ymm13, %ymm0
3808	vbroadcastsd	8(%r10), %ymm13
3809	vmulpd			%ymm0, %ymm13, %ymm15
3810	vsubpd			%ymm15, %ymm1, %ymm1
3811	vbroadcastsd	16(%r10), %ymm13
3812	vmulpd			%ymm0, %ymm13, %ymm15
3813	vsubpd			%ymm15, %ymm2, %ymm2
3814	vbroadcastsd	24(%r10), %ymm13
3815	vmulpd			%ymm0, %ymm13, %ymm15
3816	vsubpd			%ymm15, %ymm3, %ymm3
3817
3818	vbroadcastsd	8(%r12), %ymm13
3819	vmulpd			%ymm1, %ymm13, %ymm1
3820	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3821	vmulpd			%ymm1, %ymm13, %ymm15
3822	vsubpd			%ymm15, %ymm2, %ymm2
3823	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3824	vmulpd			%ymm1, %ymm13, %ymm15
3825	vsubpd			%ymm15, %ymm3, %ymm3
3826
3827	vbroadcastsd	16(%r12), %ymm13
3828	vmulpd			%ymm2, %ymm13, %ymm2
3829	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3830	vmulpd			%ymm2, %ymm13, %ymm15
3831	vsubpd			%ymm15, %ymm3, %ymm3
3832
3833	vbroadcastsd	24(%r12), %ymm13
3834	vmulpd			%ymm3, %ymm13, %ymm3
3835
3836#if MACRO_LEVEL>=1
3837	.endm
3838#else
3839	ret
3840
3841	FUN_END(inner_edge_dtrsm_rlt_inv_4x4_lib)
3842#endif
3843
3844
3845
3846
3847
3848// common inner routine with file scope
3849//
3850// triangular substitution:
3851// side = right
3852// uplo = lower
3853// tran = transposed
3854// requires explicit inverse of diagonal
3855//
3856// input arguments:
3857// r10  <- E
3858// r11  <- lde
3859// r12  <- inv_diag_E
3860// r13d <- kn
3861// ymm0 <- [d00 d10 d20 d30]
3862// ymm1 <- [d01 d11 d21 d31]
3863// ymm2 <- [d02 d12 d22 d32]
3864// ymm3 <- [d03 d13 d23 d33]
3865//
3866// output arguments:
3867
3868#if MACRO_LEVEL>=1
3869	.macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB
3870#else
3871	.p2align 4,,15
3872	FUN_START(inner_edge_dtrsm_rlt_inv_4x4_vs_lib)
3873#endif
3874
3875	vbroadcastsd	0(%r12), %ymm13
3876	vmulpd			%ymm0, %ymm13, %ymm0
3877
3878	cmpl			$2, %r13d
3879	jl				0f // ret
3880
3881	vbroadcastsd	8(%r10), %ymm13
3882	vmulpd			%ymm0, %ymm13, %ymm15
3883	vsubpd			%ymm15, %ymm1, %ymm1
3884	vbroadcastsd	8(%r12), %ymm13
3885	vmulpd			%ymm1, %ymm13, %ymm1
3886
3887	cmpl			$3, %r13d
3888	jl				0f // ret
3889
3890	vbroadcastsd	16(%r10), %ymm13
3891	vmulpd			%ymm0, %ymm13, %ymm15
3892	vsubpd			%ymm15, %ymm2, %ymm2
3893	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3894	vmulpd			%ymm1, %ymm13, %ymm15
3895	vsubpd			%ymm15, %ymm2, %ymm2
3896	vbroadcastsd	16(%r12), %ymm13
3897	vmulpd			%ymm2, %ymm13, %ymm2
3898
3899	cmpl			$4, %r13d
3900	jl				0f // ret
3901
3902	vbroadcastsd	24(%r10), %ymm13
3903	vmulpd			%ymm0, %ymm13, %ymm15
3904	vsubpd			%ymm15, %ymm3, %ymm3
3905	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3906	vmulpd			%ymm1, %ymm13, %ymm15
3907	vsubpd			%ymm15, %ymm3, %ymm3
3908	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3909	vmulpd			%ymm2, %ymm13, %ymm15
3910	vsubpd			%ymm15, %ymm3, %ymm3
3911	vbroadcastsd	24(%r12), %ymm13
3912	vmulpd			%ymm3, %ymm13, %ymm3
3913
39140:
3915
3916#if MACRO_LEVEL>=1
3917	.endm
3918#else
3919	ret
3920
3921	FUN_END(inner_edge_dtrsm_rlt_inv_4x4_vs_lib)
3922#endif
3923
3924
3925
3926
3927
3928// common inner routine with file scope
3929//
3930// triangular substitution:
3931// side = right
3932// uplo = lower
3933// tran = transposed
3934// unit diagonal
3935//
3936// input arguments:
3937// r10  <- E
3938// r11  <- lde
3939// ymm0 <- [d00 d10 d20 d30]
3940// ymm1 <- [d01 d11 d21 d31]
3941// ymm2 <- [d02 d12 d22 d32]
3942// ymm3 <- [d03 d13 d23 d33]
3943//
3944// output arguments:
3945
3946#if MACRO_LEVEL>=1
3947	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB
3948#else
3949	.p2align 4,,15
3950	FUN_START(inner_edge_dtrsm_rlt_one_4x4_lib)
3951#endif
3952
3953	vbroadcastsd	8(%r10), %ymm13
3954	vmulpd			%ymm0, %ymm13, %ymm15
3955	vsubpd			%ymm15, %ymm1, %ymm1
3956	vbroadcastsd	16(%r10), %ymm13
3957	vmulpd			%ymm0, %ymm13, %ymm15
3958	vsubpd			%ymm15, %ymm2, %ymm2
3959	vbroadcastsd	24(%r10), %ymm13
3960	vmulpd			%ymm0, %ymm13, %ymm15
3961	vsubpd			%ymm15, %ymm3, %ymm3
3962
3963	vbroadcastsd	16(%r10, %r11, 1), %ymm13
3964	vmulpd			%ymm1, %ymm13, %ymm15
3965	vsubpd			%ymm15, %ymm2, %ymm2
3966	vbroadcastsd	24(%r10, %r11, 1), %ymm13
3967	vmulpd			%ymm1, %ymm13, %ymm15
3968	vsubpd			%ymm15, %ymm3, %ymm3
3969
3970	vbroadcastsd	24(%r10, %r11, 2), %ymm13
3971	vmulpd			%ymm2, %ymm13, %ymm15
3972	vsubpd			%ymm15, %ymm3, %ymm3
3973
3974#if MACRO_LEVEL>=1
3975	.endm
3976#else
3977	ret
3978
3979	FUN_END(inner_edge_dtrsm_rlt_one_4x4_lib)
3980#endif
3981
3982
3983
3984
3985
3986// common inner routine with file scope
3987//
3988// triangular substitution:
3989// side = right
3990// uplo = lower
3991// tran = transposed
3992// unit diagonal
3993//
3994// input arguments:
3995// r10  <- E
3996// r11  <- lde
3997// r12d <- kn
3998// ymm0 <- [d00 d10 d20 d30]
3999// ymm1 <- [d01 d11 d21 d31]
4000// ymm2 <- [d02 d12 d22 d32]
4001// ymm3 <- [d03 d13 d23 d33]
4002//
4003// output arguments:
4004
4005#if MACRO_LEVEL>=1
4006	.macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB
4007#else
4008	.p2align 4,,15
4009	FUN_START(inner_edge_dtrsm_rlt_one_4x4_vs_lib)
4010#endif
4011
4012	cmpl			$2, %r12d
4013	jl				0f // ret
4014
4015	vbroadcastsd	8(%r10), %ymm13
4016	vmulpd			%ymm0, %ymm13, %ymm15
4017	vsubpd			%ymm15, %ymm1, %ymm1
4018
4019	cmpl			$3, %r12d
4020	jl				0f // ret
4021
4022	vbroadcastsd	16(%r10), %ymm13
4023	vmulpd			%ymm0, %ymm13, %ymm15
4024	vsubpd			%ymm15, %ymm2, %ymm2
4025	vbroadcastsd	16(%r10, %r11, 1), %ymm13
4026	vmulpd			%ymm1, %ymm13, %ymm15
4027	vsubpd			%ymm15, %ymm2, %ymm2
4028
4029	cmpl			$4, %r12d
4030	jl				0f // ret
4031
4032	vbroadcastsd	24(%r10), %ymm13
4033	vmulpd			%ymm0, %ymm13, %ymm15
4034	vsubpd			%ymm15, %ymm3, %ymm3
4035	vbroadcastsd	24(%r10, %r11, 1), %ymm13
4036	vmulpd			%ymm1, %ymm13, %ymm15
4037	vsubpd			%ymm15, %ymm3, %ymm3
4038	vbroadcastsd	24(%r10, %r11, 2), %ymm13
4039	vmulpd			%ymm2, %ymm13, %ymm15
4040	vsubpd			%ymm15, %ymm3, %ymm3
4041
40420:
4043
4044#if MACRO_LEVEL>=1
4045	.endm
4046#else
4047	ret
4048
4049	FUN_END(inner_edge_dtrsm_rlt_one_4x4_vs_lib)
4050#endif
4051
4052
4053
4054
4055
4056// common inner routine with file scope
4057//
4058// triangular substitution:
4059// side = right
4060// uplo = upper
4061// tran = not-transposed
4062// requires explicit inverse of diagonal
4063//
4064// input arguments:
4065// r10  <- E
4066// r11  <- lde
4067// r12  <- inv_diag_E
4068// ymm0 <- [d00 d10 d20 d30]
4069// ymm1 <- [d01 d11 d21 d31]
4070// ymm2 <- [d02 d12 d22 d32]
4071// ymm3 <- [d03 d13 d23 d33]
4072//
4073// output arguments:
4074
4075#if MACRO_LEVEL>=1
4076	.macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB
4077#else
4078	.p2align 4,,15
4079	FUN_START(inner_edge_dtrsm_run_inv_4x4_lib)
4080#endif
4081
4082	addq	%r11, %r10
4083
4084	vbroadcastsd	0(%r12), %ymm13
4085	vmulpd			%ymm0, %ymm13, %ymm0
4086	vbroadcastsd	0(%r10), %ymm13
4087	vmulpd			%ymm0, %ymm13, %ymm15
4088	vsubpd			%ymm15, %ymm1, %ymm1
4089	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4090	vmulpd			%ymm0, %ymm13, %ymm15
4091	vsubpd			%ymm15, %ymm2, %ymm2
4092	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4093	vmulpd			%ymm0, %ymm13, %ymm15
4094	vsubpd			%ymm15, %ymm3, %ymm3
4095
4096	vbroadcastsd	8(%r12), %ymm13
4097	vmulpd			%ymm1, %ymm13, %ymm1
4098	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4099	vmulpd			%ymm1, %ymm13, %ymm15
4100	vsubpd			%ymm15, %ymm2, %ymm2
4101	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4102	vmulpd			%ymm1, %ymm13, %ymm15
4103	vsubpd			%ymm15, %ymm3, %ymm3
4104
4105	vbroadcastsd	16(%r12), %ymm13
4106	vmulpd			%ymm2, %ymm13, %ymm2
4107	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4108	vmulpd			%ymm2, %ymm13, %ymm15
4109	vsubpd			%ymm15, %ymm3, %ymm3
4110
4111	vbroadcastsd	24(%r12), %ymm13
4112	vmulpd			%ymm3, %ymm13, %ymm3
4113
4114#if MACRO_LEVEL>=1
4115	.endm
4116#else
4117	ret
4118
4119	FUN_END(inner_edge_dtrsm_run_inv_4x4_lib)
4120#endif
4121
4122
4123
4124
4125
4126// common inner routine with file scope
4127//
4128// triangular substitution:
4129// side = right
4130// uplo = upper
4131// tran = not-transposed
4132// requires explicit inverse of diagonal
4133//
4134// input arguments:
4135// r10  <- E
4136// r11  <- lde
4137// r12  <- inv_diag_E
4138// r13d <- kn
4139// ymm0 <- [d00 d10 d20 d30]
4140// ymm1 <- [d01 d11 d21 d31]
4141// ymm2 <- [d02 d12 d22 d32]
4142// ymm3 <- [d03 d13 d23 d33]
4143//
4144// output arguments:
4145
4146#if MACRO_LEVEL>=1
4147	.macro INNER_EDGE_DTRSM_RUN_INV_4X4_VS_LIB
4148#else
4149	.p2align 4,,15
4150	FUN_START(inner_edge_dtrsm_run_inv_4x4_vs_lib)
4151#endif
4152
4153	addq	%r11, %r10
4154
4155	vbroadcastsd	0(%r12), %ymm13
4156	vmulpd			%ymm0, %ymm13, %ymm0
4157
4158	cmpl			$2, %r13d
4159	jl				0f // ret
4160
4161	vbroadcastsd	0(%r10), %ymm13
4162	vmulpd			%ymm0, %ymm13, %ymm15
4163	vsubpd			%ymm15, %ymm1, %ymm1
4164	vbroadcastsd	8(%r12), %ymm13
4165	vmulpd			%ymm1, %ymm13, %ymm1
4166
4167	cmpl			$3, %r13d
4168	jl				0f // ret
4169
4170	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4171	vmulpd			%ymm0, %ymm13, %ymm15
4172	vsubpd			%ymm15, %ymm2, %ymm2
4173	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4174	vmulpd			%ymm1, %ymm13, %ymm15
4175	vsubpd			%ymm15, %ymm2, %ymm2
4176	vbroadcastsd	16(%r12), %ymm13
4177	vmulpd			%ymm2, %ymm13, %ymm2
4178
4179	cmpl			$4, %r13d
4180	jl				0f // ret
4181
4182	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4183	vmulpd			%ymm0, %ymm13, %ymm15
4184	vsubpd			%ymm15, %ymm3, %ymm3
4185	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4186	vmulpd			%ymm1, %ymm13, %ymm15
4187	vsubpd			%ymm15, %ymm3, %ymm3
4188	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4189	vmulpd			%ymm2, %ymm13, %ymm15
4190	vsubpd			%ymm15, %ymm3, %ymm3
4191	vbroadcastsd	24(%r12), %ymm13
4192	vmulpd			%ymm3, %ymm13, %ymm3
4193
41940:
4195
4196#if MACRO_LEVEL>=1
4197	.endm
4198#else
4199	ret
4200
4201	FUN_END(inner_edge_dtrsm_run_inv_4x4_vs_lib)
4202#endif
4203
4204
4205
4206
4207
4208// common inner routine with file scope
4209//
4210// triangular substitution:
4211// side = right
4212// uplo = upper
4213// tran = not-transposed
4214// unit diagonal
4215//
4216// input arguments:
4217// r10  <- E
4218// r11  <- lde
4219// ymm0 <- [d00 d10 d20 d30]
4220// ymm1 <- [d01 d11 d21 d31]
4221// ymm2 <- [d02 d12 d22 d32]
4222// ymm3 <- [d03 d13 d23 d33]
4223//
4224// output arguments:
4225
4226#if MACRO_LEVEL>=1
4227	.macro INNER_EDGE_DTRSM_RUN_ONE_4X4_LIB
4228#else
4229	.p2align 4,,15
4230	FUN_START(inner_edge_dtrsm_run_one_4x4_lib)
4231#endif
4232
4233	addq	%r11, %r10
4234
4235	vbroadcastsd	0(%r10), %ymm13
4236	vmulpd			%ymm0, %ymm13, %ymm15
4237	vsubpd			%ymm15, %ymm1, %ymm1
4238	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4239	vmulpd			%ymm0, %ymm13, %ymm15
4240	vsubpd			%ymm15, %ymm2, %ymm2
4241	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4242	vmulpd			%ymm0, %ymm13, %ymm15
4243	vsubpd			%ymm15, %ymm3, %ymm3
4244
4245	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4246	vmulpd			%ymm1, %ymm13, %ymm15
4247	vsubpd			%ymm15, %ymm2, %ymm2
4248	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4249	vmulpd			%ymm1, %ymm13, %ymm15
4250	vsubpd			%ymm15, %ymm3, %ymm3
4251
4252	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4253	vmulpd			%ymm2, %ymm13, %ymm15
4254	vsubpd			%ymm15, %ymm3, %ymm3
4255
4256#if MACRO_LEVEL>=1
4257	.endm
4258#else
4259	ret
4260
4261	FUN_END(inner_edge_dtrsm_run_one_4x4_lib)
4262#endif
4263
4264
4265
4266
4267
4268// common inner routine with file scope
4269//
4270// triangular substitution:
4271// side = right
4272// uplo = upper
4273// tran = not-transposed
4274// unit diagonal
4275//
4276// input arguments:
4277// r10  <- E
4278// r11  <- lde
4279// r12d <- kn
4280// ymm0 <- [d00 d10 d20 d30]
4281// ymm1 <- [d01 d11 d21 d31]
4282// ymm2 <- [d02 d12 d22 d32]
4283// ymm3 <- [d03 d13 d23 d33]
4284//
4285// output arguments:
4286
4287#if MACRO_LEVEL>=1
4288	.macro INNER_EDGE_DTRSM_RUN_ONE_4X4_VS_LIB
4289#else
4290	.p2align 4,,15
4291	FUN_START(inner_edge_dtrsm_run_one_4x4_vs_lib)
4292#endif
4293
4294	addq	%r11, %r10
4295
4296	cmpl			$2, %r12d
4297	jl				0f // ret
4298
4299	vbroadcastsd	0(%r10), %ymm13
4300	vmulpd			%ymm0, %ymm13, %ymm15
4301	vsubpd			%ymm15, %ymm1, %ymm1
4302
4303	cmpl			$3, %r12d
4304	jl				0f // ret
4305
4306	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4307	vmulpd			%ymm0, %ymm13, %ymm15
4308	vsubpd			%ymm15, %ymm2, %ymm2
4309	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4310	vmulpd			%ymm1, %ymm13, %ymm15
4311	vsubpd			%ymm15, %ymm2, %ymm2
4312
4313	cmpl			$4, %r12d
4314	jl				0f // ret
4315
4316	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4317	vmulpd			%ymm0, %ymm13, %ymm15
4318	vsubpd			%ymm15, %ymm3, %ymm3
4319	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4320	vmulpd			%ymm1, %ymm13, %ymm15
4321	vsubpd			%ymm15, %ymm3, %ymm3
4322	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4323	vmulpd			%ymm2, %ymm13, %ymm15
4324	vsubpd			%ymm15, %ymm3, %ymm3
4325
43260:
4327
4328#if MACRO_LEVEL>=1
4329	.endm
4330#else
4331	ret
4332
4333	FUN_END(inner_edge_dtrsm_run_one_4x4_vs_lib)
4334#endif
4335
4336
4337
4338
4339
4340// common inner routine with file scope
4341//
4342// triangular substitution:
4343// side = right
4344// uplo = upper
4345// tran = transposed
4346// requires explicit inverse of diagonal
4347//
4348// input arguments:
4349// r10  <- E
4350// r11  <- lde
4351// r12  <- inv_diag_E
4352// ymm0 <- [d00 d10 d20 d30]
4353// ymm1 <- [d01 d11 d21 d31]
4354// ymm2 <- [d02 d12 d22 d32]
4355// ymm3 <- [d03 d13 d23 d33]
4356//
4357// output arguments:
4358
4359#if MACRO_LEVEL>=1
4360	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB
4361#else
4362	.p2align 4,,15
4363	FUN_START(inner_edge_dtrsm_rut_inv_4x4_lib)
4364#endif
4365
4366	addq	%r11, %r10
4367
4368	// 4th column
4369	vbroadcastsd	24(%r12), %ymm13
4370	vmulpd			%ymm3, %ymm13, %ymm3
4371	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4372	vmulpd			%ymm3, %ymm13, %ymm15
4373	vsubpd			%ymm15, %ymm2, %ymm2
4374	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4375	vmulpd			%ymm3, %ymm13, %ymm15
4376	vsubpd			%ymm15, %ymm1, %ymm1
4377	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4378	vmulpd			%ymm3, %ymm13, %ymm15
4379	vsubpd			%ymm15, %ymm0, %ymm0
4380
4381	// 3rd column
4382	vbroadcastsd	16(%r12), %ymm13
4383	vmulpd			%ymm2, %ymm13, %ymm2
4384	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4385	vmulpd			%ymm2, %ymm13, %ymm15
4386	vsubpd			%ymm15, %ymm1, %ymm1
4387	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4388	vmulpd			%ymm2, %ymm13, %ymm15
4389	vsubpd			%ymm15, %ymm0, %ymm0
4390
4391	// 2nd column
4392	vbroadcastsd	8(%r12), %ymm13
4393	vmulpd			%ymm1, %ymm13, %ymm1
4394	vbroadcastsd	0(%r10), %ymm13
4395	vmulpd			%ymm1, %ymm13, %ymm15
4396	vsubpd			%ymm15, %ymm0, %ymm0
4397
4398	// 1st column
4399	vbroadcastsd	0(%r12), %ymm13
4400	vmulpd			%ymm0, %ymm13, %ymm0
4401
4402#if MACRO_LEVEL>=1
4403	.endm
4404#else
4405	ret
4406
4407	FUN_END(inner_edge_dtrsm_rut_inv_4x4_lib)
4408#endif
4409
4410
4411
4412
4413
4414// common inner routine with file scope
4415//
4416// triangular substitution:
4417// side = right
4418// uplo = upper
4419// tran = transposed
4420// requires explicit inverse of diagonal
4421//
4422// input arguments:
4423// r10  <- E
4424// r11  <- lde
4425// r12  <- inv_diag_E
4426// r13  <- n1
4427// ymm0 <- [d00 d10 d20 d30]
4428// ymm1 <- [d01 d11 d21 d31]
4429// ymm2 <- [d02 d12 d22 d32]
4430// ymm3 <- [d03 d13 d23 d33]
4431//
4432// output arguments:
4433
4434#if MACRO_LEVEL>=1
4435	.macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB
4436#else
4437	.p2align 4,,15
4438	FUN_START(inner_edge_dtrsm_rut_inv_4x4_vs_lib)
4439#endif
4440
4441	addq	%r11, %r10
4442
4443	cmpl			$3, %r13d
4444	jle				0f
4445
4446	// 4th column
4447	vbroadcastsd	24(%r12), %ymm13
4448	vmulpd			%ymm3, %ymm13, %ymm3
4449	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4450	vmulpd			%ymm3, %ymm13, %ymm15
4451	vsubpd			%ymm15, %ymm2, %ymm2
4452	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4453	vmulpd			%ymm3, %ymm13, %ymm15
4454	vsubpd			%ymm15, %ymm1, %ymm1
4455	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4456	vmulpd			%ymm3, %ymm13, %ymm15
4457	vsubpd			%ymm15, %ymm0, %ymm0
4458
44590:
4460	cmpl			$2, %r13d
4461	jle				0f
4462
4463	// 3rd column
4464	vbroadcastsd	16(%r12), %ymm13
4465	vmulpd			%ymm2, %ymm13, %ymm2
4466	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4467	vmulpd			%ymm2, %ymm13, %ymm15
4468	vsubpd			%ymm15, %ymm1, %ymm1
4469	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4470	vmulpd			%ymm2, %ymm13, %ymm15
4471	vsubpd			%ymm15, %ymm0, %ymm0
4472
44730:
4474	cmpl			$1, %r13d
4475	jle				0f
4476
4477	// 2nd column
4478	vbroadcastsd	8(%r12), %ymm13
4479	vmulpd			%ymm1, %ymm13, %ymm1
4480	vbroadcastsd	0(%r10), %ymm13
4481	vmulpd			%ymm1, %ymm13, %ymm15
4482	vsubpd			%ymm15, %ymm0, %ymm0
4483
44840:
4485
4486	// 1st column
4487	vbroadcastsd	0(%r12), %ymm13
4488	vmulpd			%ymm0, %ymm13, %ymm0
4489
4490#if MACRO_LEVEL>=1
4491	.endm
4492#else
4493	ret
4494
4495	FUN_END(inner_edge_dtrsm_rut_inv_4x4_vs_lib)
4496#endif
4497
4498
4499
4500
4501
4502// common inner routine with file scope
4503//
4504// triangular substitution:
4505// side = right
4506// uplo = upper
4507// tran = transposed
4508// unit diagonal
4509//
4510// input arguments:
4511// r10  <- E
4512// r11  <- lde
4513// ymm0 <- [d00 d10 d20 d30]
4514// ymm1 <- [d01 d11 d21 d31]
4515// ymm2 <- [d02 d12 d22 d32]
4516// ymm3 <- [d03 d13 d23 d33]
4517//
4518// output arguments:
4519
4520#if MACRO_LEVEL>=1
4521	.macro INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB
4522#else
4523	.p2align 4,,15
4524	FUN_START(inner_edge_dtrsm_rut_one_4x4_lib)
4525#endif
4526
4527	addq	%r11, %r10
4528
4529	// 4th column
4530	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4531	vmulpd			%ymm3, %ymm13, %ymm15
4532	vsubpd			%ymm15, %ymm2, %ymm2
4533	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4534	vmulpd			%ymm3, %ymm13, %ymm15
4535	vsubpd			%ymm15, %ymm1, %ymm1
4536	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4537	vmulpd			%ymm3, %ymm13, %ymm15
4538	vsubpd			%ymm15, %ymm0, %ymm0
4539
4540	// 3rd column
4541	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4542	vmulpd			%ymm2, %ymm13, %ymm15
4543	vsubpd			%ymm15, %ymm1, %ymm1
4544	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4545	vmulpd			%ymm2, %ymm13, %ymm15
4546	vsubpd			%ymm15, %ymm0, %ymm0
4547
4548	// 2nd column
4549	vbroadcastsd	0(%r10), %ymm13
4550	vmulpd			%ymm1, %ymm13, %ymm15
4551	vsubpd			%ymm15, %ymm0, %ymm0
4552
4553	// 1st column
4554
4555#if MACRO_LEVEL>=1
4556	.endm
4557#else
4558	ret
4559
4560	FUN_END(inner_edge_dtrsm_rut_one_4x4_lib)
4561#endif
4562
4563
4564
4565
4566
4567// common inner routine with file scope
4568//
4569// triangular substitution:
4570// side = right
4571// uplo = upper
4572// tran = transposed
4573//
4574// input arguments:
4575// r10  <- E
4576// r11  <- lde
4577// r12  <- n1
4578// ymm0 <- [d00 d10 d20 d30]
4579// ymm1 <- [d01 d11 d21 d31]
4580// ymm2 <- [d02 d12 d22 d32]
4581// ymm3 <- [d03 d13 d23 d33]
4582//
4583// output arguments:
4584
4585#if MACRO_LEVEL>=1
4586	.macro INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB
4587#else
4588	.p2align 4,,15
4589	FUN_START(inner_edge_dtrsm_rut_one_4x4_vs_lib)
4590#endif
4591
4592	addq	%r11, %r10
4593
4594	cmpl			$3, %r12d
4595	jle				0f
4596
4597	// 4th column
4598	vbroadcastsd	16(%r10, %r11, 2), %ymm13
4599	vmulpd			%ymm3, %ymm13, %ymm15
4600	vsubpd			%ymm15, %ymm2, %ymm2
4601	vbroadcastsd	8(%r10, %r11, 2), %ymm13
4602	vmulpd			%ymm3, %ymm13, %ymm15
4603	vsubpd			%ymm15, %ymm1, %ymm1
4604	vbroadcastsd	0(%r10, %r11, 2), %ymm13
4605	vmulpd			%ymm3, %ymm13, %ymm15
4606	vsubpd			%ymm15, %ymm0, %ymm0
4607
46080:
4609	cmpl			$2, %r12d
4610	jle				0f
4611
4612	// 3rd column
4613	vbroadcastsd	8(%r10, %r11, 1), %ymm13
4614	vmulpd			%ymm2, %ymm13, %ymm15
4615	vsubpd			%ymm15, %ymm1, %ymm1
4616	vbroadcastsd	0(%r10, %r11, 1), %ymm13
4617	vmulpd			%ymm2, %ymm13, %ymm15
4618	vsubpd			%ymm15, %ymm0, %ymm0
4619
46200:
4621	cmpl			$1, %r12d
4622	jle				0f
4623
4624	// 2nd column
4625	vbroadcastsd	0(%r10), %ymm13
4626	vmulpd			%ymm1, %ymm13, %ymm15
4627	vsubpd			%ymm15, %ymm0, %ymm0
4628
46290:
4630
4631	// 1st column
4632
4633#if MACRO_LEVEL>=1
4634	.endm
4635#else
4636	ret
4637
4638	FUN_END(inner_edge_dtrsm_rut_one_4x4_vs_lib)
4639#endif
4640
4641
4642
4643
4644
4645// common inner routine with file scope
4646//
4647// scale for generic alpha and beta
4648//
4649// input arguments:
4650// r10   <- alpha
4651// r11   <- beta
4652// r12   <- C
4653// r13   <- ldc
4654// ymm0 <- [d00 d11 d22 d33]
4655// ymm1 <- [d01 d10 d23 d32]
4656// ymm2 <- [d03 d12 d21 d30]
4657// ymm3 <- [d02 d13 d20 d31]
4658//
4659// output arguments:
4660
4661#if MACRO_LEVEL>=1
4662	.macro INNER_BLEND_SCALE_AB_4X4_LIB
4663#else
4664	.p2align 4,,15
4665	FUN_START(inner_blend_scale_ab_4x4_lib)
4666#endif
4667
4668	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
4669	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
4670	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
4671	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
4672
4673	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
4674	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
4675	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
4676	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
4677
4678	// alpha
4679	vbroadcastsd	0(%r10), %ymm15
4680
4681	vmulpd		%ymm0, %ymm15, %ymm0
4682	vmulpd		%ymm1, %ymm15, %ymm1
4683	vmulpd		%ymm2, %ymm15, %ymm2
4684	vmulpd		%ymm3, %ymm15, %ymm3
4685
4686	// beta
4687	vbroadcastsd	0(%r11), %ymm14
4688
4689	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
4690
4691	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
4692	je			0f // end
4693
4694	vmovupd		0(%r12), %ymm15
4695	vmulpd		%ymm14, %ymm15, %ymm15
4696	vaddpd		%ymm0, %ymm15, %ymm0
4697	addq		%r13, %r12
4698	vmovupd		0(%r12), %ymm15
4699	vmulpd		%ymm14, %ymm15, %ymm15
4700	vaddpd		%ymm1, %ymm15, %ymm1
4701	addq		%r13, %r12
4702	vmovupd		0(%r12), %ymm15
4703	vmulpd		%ymm14, %ymm15, %ymm15
4704	vaddpd		%ymm2, %ymm15, %ymm2
4705	addq		%r13, %r12
4706	vmovupd		0(%r12), %ymm15
4707	vmulpd		%ymm14, %ymm15, %ymm15
4708	vaddpd		%ymm3, %ymm15, %ymm3
4709//	addq		%r13, %r12
4710
47110:
4712
4713#if MACRO_LEVEL>=1
4714	.endm
4715#else
4716	ret
4717
4718	FUN_END(inner_blend_scale_ab_4x4_lib)
4719#endif
4720
4721
4722
4723
4724
4725// common inner routine with file scope
4726//
4727// scale for generic alpha and beta
4728//
4729// input arguments:
4730// r10   <- alpha
4731// r11   <- beta
4732// r12   <- C
4733// r13   <- ldc
4734// r14d   <- km
4735// r15d   <- kn
4736// ymm0 <- [d00 d11 d22 d33]
4737// ymm1 <- [d01 d10 d23 d32]
4738// ymm2 <- [d03 d12 d21 d30]
4739// ymm3 <- [d02 d13 d20 d31]
4740//
4741// output arguments:
4742
4743#if MACRO_LEVEL>=1
4744	.macro INNER_BLEND_SCALE_AB_4X4_VS_LIB
4745#else
4746	.p2align 4,,15
4747	FUN_START(inner_blend_scale_ab_4x4_vs_lib)
4748#endif
4749
4750	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
4751	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
4752	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
4753	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
4754
4755	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
4756	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
4757	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
4758	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
4759
4760	// alpha
4761	vbroadcastsd	0(%r10), %ymm15
4762
4763	vmulpd		%ymm0, %ymm15, %ymm0
4764	vmulpd		%ymm1, %ymm15, %ymm1
4765	vmulpd		%ymm2, %ymm15, %ymm2
4766	vmulpd		%ymm3, %ymm15, %ymm3
4767
4768	// beta
4769	vbroadcastsd	0(%r11), %ymm14
4770
4771	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
4772	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
4773	je			0f // end
4774
4775
4776	vcvtsi2sd	%r14d, %xmm15, %xmm15
4777#if defined(OS_LINUX) | defined(OS_WINDOWS)
4778	vmovupd		.LC02(%rip), %ymm13
4779#elif defined(OS_MAC)
4780	vmovupd		LC02(%rip), %ymm13
4781#endif
4782	vmovddup	%xmm15, %xmm15
4783	vinsertf128	$1, %xmm15, %ymm15, %ymm15
4784	vsubpd		%ymm15, %ymm13, %ymm13
4785
4786
4787	vmaskmovpd	0(%r12), %ymm13, %ymm15
4788	vmulpd		%ymm14, %ymm15, %ymm15
4789	vaddpd		%ymm0, %ymm15, %ymm0
4790	addq		%r13, %r12
4791	cmpl		$2, %r15d
4792	jl			0f // end
4793	vmaskmovpd	0(%r12), %ymm13, %ymm15
4794	vmulpd		%ymm14, %ymm15, %ymm15
4795	vaddpd		%ymm1, %ymm15, %ymm1
4796	addq		%r13, %r12
4797	cmpl		$3, %r15d
4798	jl			0f // end
4799	vmaskmovpd	0(%r12), %ymm13, %ymm15
4800	vmulpd		%ymm14, %ymm15, %ymm15
4801	vaddpd		%ymm2, %ymm15, %ymm2
4802	addq		%r13, %r12
4803	cmpl		$3, %r15d
4804	je			0f // end
4805	vmaskmovpd	0(%r12), %ymm13, %ymm15
4806	vmulpd		%ymm14, %ymm15, %ymm15
4807	vaddpd		%ymm3, %ymm15, %ymm3
4808//	addq		%r13, %r12
4809
48100:
4811
4812#if MACRO_LEVEL>=1
4813	.endm
4814#else
4815	ret
4816
4817	FUN_END(inner_blend_scale_ab_4x4_vs_lib)
4818#endif
4819
4820
4821
4822
4823
4824// common inner routine with file scope
4825//
4826// scale for alpha=-1 and generic beta
4827//
4828// input arguments:
4829// r10   <- beta
4830// r11   <- C
4831// r12   <- ldc
4832// ymm0 <- [d00 d11 d22 d33]
4833// ymm1 <- [d01 d10 d23 d32]
4834// ymm2 <- [d03 d12 d21 d30]
4835// ymm3 <- [d02 d13 d20 d31]
4836//
4837// output arguments:
4838
4839#if MACRO_LEVEL>=1
4840	.macro INNER_BLEND_SCALE_M1B_4X4_LIB
4841#else
4842	.p2align 4,,15
4843	FUN_START(inner_blend_scale_m1b_4x4_lib)
4844#endif
4845
4846	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
4847	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
4848	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
4849	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
4850
4851	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
4852	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
4853	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
4854	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
4855
4856	// beta
4857	vbroadcastsd	0(%r10), %ymm14
4858
4859	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
4860
4861	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
4862	je			0f // end
4863
4864	vmovupd		0(%r11), %ymm15
4865	vmulpd		%ymm14, %ymm15, %ymm15
4866	vsubpd		%ymm0, %ymm15, %ymm0
4867	addq		%r12, %r11
4868	vmovupd		0(%r11), %ymm15
4869	vmulpd		%ymm14, %ymm15, %ymm15
4870	vsubpd		%ymm1, %ymm15, %ymm1
4871	addq		%r12, %r11
4872	vmovupd		0(%r11), %ymm15
4873	vmulpd		%ymm14, %ymm15, %ymm15
4874	vsubpd		%ymm2, %ymm15, %ymm2
4875	addq		%r12, %r11
4876	vmovupd		0(%r11), %ymm15
4877	vmulpd		%ymm14, %ymm15, %ymm15
4878	vsubpd		%ymm3, %ymm15, %ymm3
4879//	addq		%r12, %r11
4880
48810:
4882
4883#if MACRO_LEVEL>=1
4884	.endm
4885#else
4886	ret
4887
4888	FUN_END(inner_blend_scale_m1b_4x4_lib)
4889#endif
4890
4891
4892
4893
4894
4895// common inner routine with file scope
4896//
4897// scale for generic alpha and beta
4898//
4899// input arguments:
4900// r10   <- beta
4901// r11   <- C
4902// r12   <- ldc
4903// r13d   <- km
4904// r14d   <- kn
4905// ymm0 <- [d00 d11 d22 d33]
4906// ymm1 <- [d01 d10 d23 d32]
4907// ymm2 <- [d03 d12 d21 d30]
4908// ymm3 <- [d02 d13 d20 d31]
4909//
4910// output arguments:
4911
4912#if MACRO_LEVEL>=1
4913	.macro INNER_BLEND_SCALE_M1B_4X4_VS_LIB
4914#else
4915	.p2align 4,,15
4916	FUN_START(inner_blend_scale_m1b_4x4_vs_lib)
4917#endif
4918
4919	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
4920	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
4921	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
4922	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
4923
4924	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
4925	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
4926	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
4927	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
4928
4929	// beta
4930	vbroadcastsd	0(%r10), %ymm14
4931
4932	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
4933	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
4934	je			0f // end
4935
4936
4937	vcvtsi2sd	%r13d, %xmm15, %xmm15
4938#if defined(OS_LINUX) | defined(OS_WINDOWS)
4939	vmovupd		.LC02(%rip), %ymm13
4940#elif defined(OS_MAC)
4941	vmovupd		LC02(%rip), %ymm13
4942#endif
4943	vmovddup	%xmm15, %xmm15
4944	vinsertf128	$1, %xmm15, %ymm15, %ymm15
4945	vsubpd		%ymm15, %ymm13, %ymm13
4946
4947
4948	vmaskmovpd	0(%r11), %ymm13, %ymm15
4949	vmulpd		%ymm14, %ymm15, %ymm15
4950	vsubpd		%ymm0, %ymm15, %ymm0
4951	addq		%r12, %r11
4952	cmpl		$2, %r14d
4953	jl			0f // end
4954	vmaskmovpd	0(%r11), %ymm13, %ymm15
4955	vmulpd		%ymm14, %ymm15, %ymm15
4956	vsubpd		%ymm1, %ymm15, %ymm1
4957	addq		%r12, %r11
4958	cmpl		$3, %r14d
4959	jl			0f // end
4960	vmaskmovpd	0(%r11), %ymm13, %ymm15
4961	vmulpd		%ymm14, %ymm15, %ymm15
4962	vsubpd		%ymm2, %ymm15, %ymm2
4963	addq		%r12, %r11
4964	cmpl		$3, %r14d
4965	je			0f // end
4966	vmaskmovpd	0(%r11), %ymm13, %ymm15
4967	vmulpd		%ymm14, %ymm15, %ymm15
4968	vsubpd		%ymm3, %ymm15, %ymm3
4969//	addq		%r12, %r11
4970
49710:
4972
4973#if MACRO_LEVEL>=1
4974	.endm
4975#else
4976	ret
4977
4978	FUN_END(inner_blend_scale_m1b_4x4_vs_lib)
4979#endif
4980
4981
4982
4983
4984
4985// common inner routine with file scope
4986//
4987// scale for alpha=-1 and beta=1
4988//
4989// input arguments:
4990// r10   <- C
4991// r11   <- ldc
4992// ymm0 <- [d00 d11 d22 d33]
4993// ymm1 <- [d01 d10 d23 d32]
4994// ymm2 <- [d03 d12 d21 d30]
4995// ymm3 <- [d02 d13 d20 d31]
4996//
4997// output arguments:
4998
4999#if MACRO_LEVEL>=1
5000	.macro INNER_BLEND_SCALE_M11_4X4_LIB
5001#else
5002	.p2align 4,,15
5003	FUN_START(inner_blend_scale_m11_4x4_lib)
5004#endif
5005
5006	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
5007	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
5008	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
5009	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
5010
5011	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
5012	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
5013	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
5014	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
5015
5016	vmovupd		0(%r10), %ymm15
5017	vsubpd		%ymm0, %ymm15, %ymm0
5018	addq		%r11, %r10
5019	vmovupd		0(%r10), %ymm15
5020	vsubpd		%ymm1, %ymm15, %ymm1
5021	addq		%r11, %r10
5022	vmovupd		0(%r10), %ymm15
5023	vsubpd		%ymm2, %ymm15, %ymm2
5024	addq		%r11, %r10
5025	vmovupd		0(%r10), %ymm15
5026	vsubpd		%ymm3, %ymm15, %ymm3
5027//	addq		%r11, %r10
5028
50290:
5030
5031#if MACRO_LEVEL>=1
5032	.endm
5033#else
5034	ret
5035
5036	FUN_END(inner_blend_scale_m11_4x4_lib)
5037#endif
5038
5039
5040
5041
5042
5043// common inner routine with file scope
5044//
5045// scale for alpha=-1 and beta=1
5046//
5047// input arguments:
5048// r10   <- C
5049// r11   <- ldc
5050// r12d   <- km
5051// r13d   <- kn
5052// ymm0 <- [d00 d11 d22 d33]
5053// ymm1 <- [d01 d10 d23 d32]
5054// ymm2 <- [d03 d12 d21 d30]
5055// ymm3 <- [d02 d13 d20 d31]
5056//
5057// output arguments:
5058
5059#if MACRO_LEVEL>=1
5060	.macro INNER_BLEND_SCALE_M11_4X4_VS_LIB
5061#else
5062	.p2align 4,,15
5063	FUN_START(inner_blend_scale_m11_4x4_vs_lib)
5064#endif
5065
5066	vblendpd	$0xa, %ymm1, %ymm0, %ymm8
5067	vblendpd	$0x5, %ymm1, %ymm0, %ymm9
5068	vblendpd	$0xa, %ymm3, %ymm2, %ymm10
5069	vblendpd	$0x5, %ymm3, %ymm2, %ymm11
5070
5071	vblendpd	$0xc, %ymm10, %ymm8, %ymm0
5072	vblendpd	$0x3, %ymm10, %ymm8, %ymm2
5073	vblendpd	$0xc, %ymm11, %ymm9, %ymm1
5074	vblendpd	$0x3, %ymm11, %ymm9, %ymm3
5075
5076	vcvtsi2sd	%r12d, %xmm15, %xmm15
5077#if defined(OS_LINUX) | defined(OS_WINDOWS)
5078	vmovupd		.LC02(%rip), %ymm13
5079#elif defined(OS_MAC)
5080	vmovupd		LC02(%rip), %ymm13
5081#endif
5082	vmovddup	%xmm15, %xmm15
5083	vinsertf128	$1, %xmm15, %ymm15, %ymm15
5084	vsubpd		%ymm15, %ymm13, %ymm13
5085
5086
5087	vmaskmovpd	0(%r10), %ymm13, %ymm15
5088	vsubpd		%ymm0, %ymm15, %ymm0
5089	addq		%r11, %r10
5090	cmpl		$2, %r13d
5091	jl			0f // end
5092	vmaskmovpd	0(%r10), %ymm13, %ymm15
5093	vsubpd		%ymm1, %ymm15, %ymm1
5094	addq		%r11, %r10
5095	cmpl		$3, %r13d
5096	jl			0f // end
5097	vmaskmovpd	0(%r10), %ymm13, %ymm15
5098	vsubpd		%ymm2, %ymm15, %ymm2
5099	addq		%r11, %r10
5100	cmpl		$3, %r13d
5101	je			0f // end
5102	vmaskmovpd	0(%r10), %ymm13, %ymm15
5103	vsubpd		%ymm3, %ymm15, %ymm3
5104//	addq		%r11, %r10
5105
51060:
5107
5108#if MACRO_LEVEL>=1
5109	.endm
5110#else
5111	ret
5112
5113	FUN_END(inner_blend_scale_m11_4x4_vs_lib)
5114#endif
5115
5116
5117
5118
5119
5120// common inner routine with file scope
5121//
5122// scale for generic alpha and beta
5123//
5124// input arguments:
5125// r10   <- alpha
5126// r11   <- beta
5127// r12   <- C
5128// r13   <- ldc
5129// ymm0 <- [d00 d11 d22 d33]
5130// ymm1 <- [d01 d10 d23 d32]
5131// ymm2 <- [d03 d12 d21 d30]
5132// ymm3 <- [d02 d13 d20 d31]
5133//
5134// output arguments:
5135
5136#if MACRO_LEVEL>=1
5137	.macro INNER_SCALE_AB_4X4_LIB
5138#else
5139	.p2align 4,,15
5140	FUN_START(inner_scale_ab_4x4_lib)
5141#endif
5142
5143	// alpha
5144	vbroadcastsd	0(%r10), %ymm15
5145
5146	vmulpd		%ymm0, %ymm15, %ymm0
5147	vmulpd		%ymm1, %ymm15, %ymm1
5148	vmulpd		%ymm2, %ymm15, %ymm2
5149	vmulpd		%ymm3, %ymm15, %ymm3
5150
5151	// beta
5152	vbroadcastsd	0(%r11), %ymm14
5153
5154	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
5155
5156	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
5157	je			0f // end
5158
5159	vmovupd		0(%r12), %ymm15
5160	vmulpd		%ymm14, %ymm15, %ymm15
5161	vaddpd		%ymm0, %ymm15, %ymm0
5162	addq		%r13, %r12
5163	vmovupd		0(%r12), %ymm15
5164	vmulpd		%ymm14, %ymm15, %ymm15
5165	vaddpd		%ymm1, %ymm15, %ymm1
5166	addq		%r13, %r12
5167	vmovupd		0(%r12), %ymm15
5168	vmulpd		%ymm14, %ymm15, %ymm15
5169	vaddpd		%ymm2, %ymm15, %ymm2
5170	addq		%r13, %r12
5171	vmovupd		0(%r12), %ymm15
5172	vmulpd		%ymm14, %ymm15, %ymm15
5173	vaddpd		%ymm3, %ymm15, %ymm3
5174//	addq		%r13, %r12
5175
51760:
5177
5178#if MACRO_LEVEL>=1
5179	.endm
5180#else
5181	ret
5182
5183	FUN_END(inner_scale_ab_4x4_lib)
5184#endif
5185
5186
5187
5188
5189
5190// common inner routine with file scope
5191//
5192// scale for generic alpha and beta
5193//
5194// input arguments:
5195// r10   <- alpha
5196// r11   <- beta
5197// r12   <- C
5198// r13   <- ldc
5199// r14d   <- km
5200// r15d   <- kn
5201// ymm0 <- [d00 d11 d22 d33]
5202// ymm1 <- [d01 d10 d23 d32]
5203// ymm2 <- [d03 d12 d21 d30]
5204// ymm3 <- [d02 d13 d20 d31]
5205//
5206// output arguments:
5207
5208#if MACRO_LEVEL>=1
5209	.macro INNER_SCALE_AB_4X4_VS_LIB
5210#else
5211	.p2align 4,,15
5212	FUN_START(inner_scale_ab_4x4_vs_lib)
5213#endif
5214
5215	// alpha
5216	vbroadcastsd	0(%r10), %ymm15
5217
5218	vmulpd		%ymm0, %ymm15, %ymm0
5219	vmulpd		%ymm1, %ymm15, %ymm1
5220	vmulpd		%ymm2, %ymm15, %ymm2
5221	vmulpd		%ymm3, %ymm15, %ymm3
5222
5223	// beta
5224	vbroadcastsd	0(%r11), %ymm14
5225
5226	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
5227	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
5228	je			0f // end
5229
5230
5231	vcvtsi2sd	%r14d, %xmm15, %xmm15
5232#if defined(OS_LINUX) | defined(OS_WINDOWS)
5233	vmovupd		.LC02(%rip), %ymm13
5234#elif defined(OS_MAC)
5235	vmovupd		LC02(%rip), %ymm13
5236#endif
5237	vmovddup	%xmm15, %xmm15
5238	vinsertf128	$1, %xmm15, %ymm15, %ymm15
5239	vsubpd		%ymm15, %ymm13, %ymm13
5240
5241
5242	vmaskmovpd	0(%r12), %ymm13, %ymm15
5243	vmulpd		%ymm14, %ymm15, %ymm15
5244	vaddpd		%ymm0, %ymm15, %ymm0
5245	addq		%r13, %r12
5246	cmpl		$2, %r15d
5247	jl			0f // end
5248	vmaskmovpd	0(%r12), %ymm13, %ymm15
5249	vmulpd		%ymm14, %ymm15, %ymm15
5250	vaddpd		%ymm1, %ymm15, %ymm1
5251	addq		%r13, %r12
5252	cmpl		$3, %r15d
5253	jl			0f // end
5254	vmaskmovpd	0(%r12), %ymm13, %ymm15
5255	vmulpd		%ymm14, %ymm15, %ymm15
5256	vaddpd		%ymm2, %ymm15, %ymm2
5257	addq		%r13, %r12
5258	cmpl		$3, %r15d
5259	je			0f // end
5260	vmaskmovpd	0(%r12), %ymm13, %ymm15
5261	vmulpd		%ymm14, %ymm15, %ymm15
5262	vaddpd		%ymm3, %ymm15, %ymm3
5263//	addq		%r13, %r12
5264
52650:
5266
5267#if MACRO_LEVEL>=1
5268	.endm
5269#else
5270	ret
5271
5272	FUN_END(inner_scale_ab_4x4_vs_lib)
5273#endif
5274
5275
5276
5277
5278
5279// common inner routine with file scope
5280//
5281// tran_scale for generic alpha and beta
5282//
5283// input arguments:
5284// r10   <- alpha
5285// r11   <- beta
5286// r12   <- C
5287// r13   <- ldc
5288// ymm0 <- [d00 d11 d22 d33]
5289// ymm1 <- [d01 d10 d23 d32]
5290// ymm2 <- [d03 d12 d21 d30]
5291// ymm3 <- [d02 d13 d20 d31]
5292//
5293// output arguments:
5294
5295#if MACRO_LEVEL>=1
5296	.macro INNER_TRAN_SCALE_AB_4X4_LIB
5297#else
5298	.p2align 4,,15
5299	FUN_START(inner_tran_scale_ab_4x4_lib)
5300#endif
5301
5302	vunpcklpd	%ymm1, %ymm0, %ymm12
5303	vunpckhpd	%ymm1, %ymm0, %ymm13
5304	vunpcklpd	%ymm3, %ymm2, %ymm14
5305	vunpckhpd	%ymm3, %ymm2, %ymm15
5306
5307	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
5308	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
5309	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
5310	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
5311
5312	// alpha
5313	vbroadcastsd	0(%r10), %ymm15
5314
5315	vmulpd		%ymm0, %ymm15, %ymm0
5316	vmulpd		%ymm1, %ymm15, %ymm1
5317	vmulpd		%ymm2, %ymm15, %ymm2
5318	vmulpd		%ymm3, %ymm15, %ymm3
5319
5320	// beta
5321	vbroadcastsd	0(%r11), %ymm14
5322
5323	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
5324
5325	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
5326	je			0f // end
5327
5328	vmovupd		0(%r12), %ymm15
5329	vmulpd		%ymm14, %ymm15, %ymm15
5330	vaddpd		%ymm0, %ymm15, %ymm0
5331	addq		%r13, %r12
5332	vmovupd		0(%r12), %ymm15
5333	vmulpd		%ymm14, %ymm15, %ymm15
5334	vaddpd		%ymm1, %ymm15, %ymm1
5335	addq		%r13, %r12
5336	vmovupd		0(%r12), %ymm15
5337	vmulpd		%ymm14, %ymm15, %ymm15
5338	vaddpd		%ymm2, %ymm15, %ymm2
5339	addq		%r13, %r12
5340	vmovupd		0(%r12), %ymm15
5341	vmulpd		%ymm14, %ymm15, %ymm15
5342	vaddpd		%ymm3, %ymm15, %ymm3
5343//	addq		%r13, %r12
5344
53450:
5346
5347#if MACRO_LEVEL>=1
5348	.endm
5349#else
5350	ret
5351
5352	FUN_END(inner_tran_scale_ab_4x4_lib)
5353#endif
5354
5355
5356
5357
5358
5359// common inner routine with file scope
5360//
5361// tran scale for generic alpha and beta
5362//
5363// input arguments:
5364// r10   <- alpha
5365// r11   <- beta
5366// r12   <- C
5367// r13   <- ldc
5368// r14d   <- km
5369// r15d   <- kn
5370// ymm0 <- [d00 d11 d22 d33]
5371// ymm1 <- [d01 d10 d23 d32]
5372// ymm2 <- [d03 d12 d21 d30]
5373// ymm3 <- [d02 d13 d20 d31]
5374//
5375// output arguments:
5376
5377#if MACRO_LEVEL>=1
5378	.macro INNER_TRAN_SCALE_AB_4X4_VS_LIB
5379#else
5380	.p2align 4,,15
5381	FUN_START(inner_tran_scale_ab_4x4_vs_lib)
5382#endif
5383
5384	vunpcklpd	%ymm1, %ymm0, %ymm12
5385	vunpckhpd	%ymm1, %ymm0, %ymm13
5386	vunpcklpd	%ymm3, %ymm2, %ymm14
5387	vunpckhpd	%ymm3, %ymm2, %ymm15
5388
5389	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
5390	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
5391	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
5392	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
5393
5394	// alpha
5395	vbroadcastsd	0(%r10), %ymm15
5396
5397	vmulpd		%ymm0, %ymm15, %ymm0
5398	vmulpd		%ymm1, %ymm15, %ymm1
5399	vmulpd		%ymm2, %ymm15, %ymm2
5400	vmulpd		%ymm3, %ymm15, %ymm3
5401
5402	// beta
5403	vbroadcastsd	0(%r11), %ymm14
5404
5405	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
5406	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
5407	je			0f // end
5408
5409
5410	vcvtsi2sd	%r14d, %xmm15, %xmm15
5411#if defined(OS_LINUX) | defined(OS_WINDOWS)
5412	vmovupd		.LC02(%rip), %ymm13
5413#elif defined(OS_MAC)
5414	vmovupd		LC02(%rip), %ymm13
5415#endif
5416	vmovddup	%xmm15, %xmm15
5417	vinsertf128	$1, %xmm15, %ymm15, %ymm15
5418	vsubpd		%ymm15, %ymm13, %ymm13
5419
5420
5421	vmaskmovpd	0(%r12), %ymm13, %ymm15
5422	vmulpd		%ymm14, %ymm15, %ymm15
5423	vaddpd		%ymm0, %ymm15, %ymm0
5424	addq		%r13, %r12
5425	cmpl		$2, %r15d
5426	jl			0f // end
5427	vmaskmovpd	0(%r12), %ymm13, %ymm15
5428	vmulpd		%ymm14, %ymm15, %ymm15
5429	vaddpd		%ymm1, %ymm15, %ymm1
5430	addq		%r13, %r12
5431	cmpl		$3, %r15d
5432	jl			0f // end
5433	vmaskmovpd	0(%r12), %ymm13, %ymm15
5434	vmulpd		%ymm14, %ymm15, %ymm15
5435	vaddpd		%ymm2, %ymm15, %ymm2
5436	addq		%r13, %r12
5437	cmpl		$3, %r15d
5438	je			0f // end
5439	vmaskmovpd	0(%r12), %ymm13, %ymm15
5440	vmulpd		%ymm14, %ymm15, %ymm15
5441	vaddpd		%ymm3, %ymm15, %ymm3
5442//	addq		%r13, %r12
5443
54440:
5445
5446#if MACRO_LEVEL>=1
5447	.endm
5448#else
5449	ret
5450
5451	FUN_END(inner_tran_scale_ab_4x4_vs_lib)
5452#endif
5453
5454
5455
5456
5457
5458// common inner routine with file scope
5459//
5460// scale for alpha=-1 and generic beta
5461//
5462// input arguments:
5463// r10   <- beta
5464// r11   <- C
5465// r12   <- ldc
5466// ymm0 <- [d00 d11 d22 d33]
5467// ymm1 <- [d01 d10 d23 d32]
5468// ymm2 <- [d03 d12 d21 d30]
5469// ymm3 <- [d02 d13 d20 d31]
5470//
5471// output arguments:
5472
5473#if MACRO_LEVEL>=1
5474	.macro INNER_SCALE_M1B_4X4_LIB
5475#else
5476	.p2align 4,,15
5477	FUN_START(inner_scale_m1b_4x4_lib)
5478#endif
5479
5480	// beta
5481	vbroadcastsd	0(%r10), %ymm14
5482
5483	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
5484
5485	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
5486	je			0f // end
5487
5488	vmovupd		0(%r11), %ymm15
5489	vmulpd		%ymm14, %ymm15, %ymm15
5490	vsubpd		%ymm0, %ymm15, %ymm0
5491	addq		%r12, %r11
5492	vmovupd		0(%r11), %ymm15
5493	vmulpd		%ymm14, %ymm15, %ymm15
5494	vsubpd		%ymm1, %ymm15, %ymm1
5495	addq		%r12, %r11
5496	vmovupd		0(%r11), %ymm15
5497	vmulpd		%ymm14, %ymm15, %ymm15
5498	vsubpd		%ymm2, %ymm15, %ymm2
5499	addq		%r12, %r11
5500	vmovupd		0(%r11), %ymm15
5501	vmulpd		%ymm14, %ymm15, %ymm15
5502	vsubpd		%ymm3, %ymm15, %ymm3
5503//	addq		%r12, %r11
5504
55050:
5506
5507#if MACRO_LEVEL>=1
5508	.endm
5509#else
5510	ret
5511
5512	FUN_END(inner_scale_m1b_4x4_lib)
5513#endif
5514
5515
5516
5517
5518
5519// common inner routine with file scope
5520//
5521// scale for generic alpha and beta
5522//
5523// input arguments:
5524// r10   <- beta
5525// r11   <- C
5526// r12   <- ldc
5527// r13d   <- km
5528// r14d   <- kn
5529// ymm0 <- [d00 d11 d22 d33]
5530// ymm1 <- [d01 d10 d23 d32]
5531// ymm2 <- [d03 d12 d21 d30]
5532// ymm3 <- [d02 d13 d20 d31]
5533//
5534// output arguments:
5535
5536#if MACRO_LEVEL>=1
5537	.macro INNER_SCALE_M1B_4X4_VS_LIB
5538#else
5539	.p2align 4,,15
5540	FUN_START(inner_scale_m1b_4x4_vs_lib)
5541#endif
5542
5543	// beta
5544	vbroadcastsd	0(%r10), %ymm14
5545
5546	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0
5547	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
5548	je			0f // end
5549
5550
5551	vcvtsi2sd	%r13d, %xmm15, %xmm15
5552#if defined(OS_LINUX) | defined(OS_WINDOWS)
5553	vmovupd		.LC02(%rip), %ymm13
5554#elif defined(OS_MAC)
5555	vmovupd		LC02(%rip), %ymm13
5556#endif
5557	vmovddup	%xmm15, %xmm15
5558	vinsertf128	$1, %xmm15, %ymm15, %ymm15
5559	vsubpd		%ymm15, %ymm13, %ymm13
5560
5561
5562	vmaskmovpd	0(%r11), %ymm13, %ymm15
5563	vmulpd		%ymm14, %ymm15, %ymm15
5564	vsubpd		%ymm0, %ymm15, %ymm0
5565	addq		%r12, %r11
5566	cmpl		$2, %r14d
5567	jl			0f // end
5568	vmaskmovpd	0(%r11), %ymm13, %ymm15
5569	vmulpd		%ymm14, %ymm15, %ymm15
5570	vsubpd		%ymm1, %ymm15, %ymm1
5571	addq		%r12, %r11
5572	cmpl		$3, %r14d
5573	jl			0f // end
5574	vmaskmovpd	0(%r11), %ymm13, %ymm15
5575	vmulpd		%ymm14, %ymm15, %ymm15
5576	vsubpd		%ymm2, %ymm15, %ymm2
5577	addq		%r12, %r11
5578	cmpl		$3, %r14d
5579	je			0f // end
5580	vmaskmovpd	0(%r11), %ymm13, %ymm15
5581	vmulpd		%ymm14, %ymm15, %ymm15
5582	vsubpd		%ymm3, %ymm15, %ymm3
5583//	addq		%r12, %r11
5584
55850:
5586
5587#if MACRO_LEVEL>=1
5588	.endm
5589#else
5590	ret
5591
5592	FUN_END(inner_scale_m1b_4x4_vs_lib)
5593#endif
5594
5595
5596
5597
5598
5599// common inner routine with file scope
5600//
5601// scale for alpha=-1 and beta=1
5602//
5603// input arguments:
5604// r10   <- C
5605// r11   <- ldc
5606// ymm0 <- [d00 d11 d22 d33]
5607// ymm1 <- [d01 d10 d23 d32]
5608// ymm2 <- [d03 d12 d21 d30]
5609// ymm3 <- [d02 d13 d20 d31]
5610//
5611// output arguments:
5612
5613#if MACRO_LEVEL>=1
5614	.macro INNER_SCALE_M11_4X4_LIB
5615#else
5616	.p2align 4,,15
5617	FUN_START(inner_scale_m11_4x4_lib)
5618#endif
5619
5620	vmovupd		0(%r10), %ymm15
5621	vsubpd		%ymm0, %ymm15, %ymm0
5622	addq		%r11, %r10
5623	vmovupd		0(%r10), %ymm15
5624	vsubpd		%ymm1, %ymm15, %ymm1
5625	addq		%r11, %r10
5626	vmovupd		0(%r10), %ymm15
5627	vsubpd		%ymm2, %ymm15, %ymm2
5628	addq		%r11, %r10
5629	vmovupd		0(%r10), %ymm15
5630	vsubpd		%ymm3, %ymm15, %ymm3
5631//	addq		%r11, %r10
5632
56330:
5634
5635#if MACRO_LEVEL>=1
5636	.endm
5637#else
5638	ret
5639
5640	FUN_END(inner_scale_m11_4x4_lib)
5641#endif
5642
5643
5644
5645
5646
5647// common inner routine with file scope
5648//
5649// scale for alpha=-1 and beta=1
5650//
5651// input arguments:
5652// r10   <- C
5653// r11   <- ldc
5654// r12d   <- km
5655// r13d   <- kn
5656// ymm0 <- [d00 d11 d22 d33]
5657// ymm1 <- [d01 d10 d23 d32]
5658// ymm2 <- [d03 d12 d21 d30]
5659// ymm3 <- [d02 d13 d20 d31]
5660//
5661// output arguments:
5662
5663#if MACRO_LEVEL>=1
5664	.macro INNER_SCALE_M11_4X4_VS_LIB
5665#else
5666	.p2align 4,,15
5667	FUN_START(inner_scale_m11_4x4_vs_lib)
5668#endif
5669
5670	vcvtsi2sd	%r12d, %xmm15, %xmm15
5671#if defined(OS_LINUX) | defined(OS_WINDOWS)
5672	vmovupd		.LC02(%rip), %ymm13
5673#elif defined(OS_MAC)
5674	vmovupd		LC02(%rip), %ymm13
5675#endif
5676	vmovddup	%xmm15, %xmm15
5677	vinsertf128	$1, %xmm15, %ymm15, %ymm15
5678	vsubpd		%ymm15, %ymm13, %ymm13
5679
5680
5681	vmaskmovpd	0(%r10), %ymm13, %ymm15
5682	vsubpd		%ymm0, %ymm15, %ymm0
5683	addq		%r11, %r10
5684	cmpl		$2, %r13d
5685	jl			0f // end
5686	vmaskmovpd	0(%r10), %ymm13, %ymm15
5687	vsubpd		%ymm1, %ymm15, %ymm1
5688	addq		%r11, %r10
5689	cmpl		$3, %r13d
5690	jl			0f // end
5691	vmaskmovpd	0(%r10), %ymm13, %ymm15
5692	vsubpd		%ymm2, %ymm15, %ymm2
5693	addq		%r11, %r10
5694	cmpl		$3, %r13d
5695	je			0f // end
5696	vmaskmovpd	0(%r10), %ymm13, %ymm15
5697	vsubpd		%ymm3, %ymm15, %ymm3
5698//	addq		%r11, %r10
5699
57000:
5701
5702#if MACRO_LEVEL>=1
5703	.endm
5704#else
5705	ret
5706
5707	FUN_END(inner_scale_m11_4x4_vs_lib)
5708#endif
5709
5710
5711
5712
5713
5714// common inner routine with file scope
5715//
5716// store n
5717//
5718// input arguments:
5719// r10  <- D
5720// r11  <- ldd
5721// ymm0 <- [d00 d11 d22 d33]
5722// ymm1 <- [d01 d10 d23 d32]
5723// ymm2 <- [d03 d12 d21 d30]
5724// ymm3 <- [d02 d13 d20 d31]
5725//
5726// output arguments:
5727
5728#if MACRO_LEVEL>=1
5729	.macro INNER_STORE_4X4_LIB
5730#else
5731	.p2align 4,,15
5732	FUN_START(inner_store_4x4_lib)
5733#endif
5734
5735	vmovupd		%ymm0, 0(%r10)
5736	addq		%r11, %r10
5737	vmovupd		%ymm1, 0(%r10)
5738	addq		%r11, %r10
5739	vmovupd		%ymm2, 0(%r10)
5740	addq		%r11, %r10
5741	vmovupd		%ymm3, 0(%r10)
5742//	addq	%r11, %r10
5743
5744#if MACRO_LEVEL>=1
5745	.endm
5746#else
5747	ret
5748
5749	FUN_END(inner_store_4x4_lib)
5750#endif
5751
5752
5753
5754
5755
5756// common inner routine with file scope
5757//
5758// store n
5759//
5760// input arguments:
5761// r10  <- D
5762// r11  <- ldd
5763// ymm0 <- [d00 d11 d22 d33]
5764// ymm1 <- [d01 d10 d23 d32]
5765// ymm2 <- [d03 d12 d21 d30]
5766// ymm3 <- [d02 d13 d20 d31]
5767//
5768// output arguments:
5769
5770#if MACRO_LEVEL>=1
5771	.macro INNER_TRAN_STORE_4X4_LIB
5772#else
5773	.p2align 4,,15
5774	FUN_START(inner_tran_store_4x4_lib)
5775#endif
5776
5777	vunpcklpd	%ymm1, %ymm0, %ymm12
5778	vunpckhpd	%ymm1, %ymm0, %ymm13
5779	vunpcklpd	%ymm3, %ymm2, %ymm14
5780	vunpckhpd	%ymm3, %ymm2, %ymm15
5781
5782	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
5783	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
5784	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
5785	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
5786
5787	vmovupd		%ymm0, 0(%r10)
5788	addq		%r11, %r10
5789	vmovupd		%ymm1, 0(%r10)
5790	addq		%r11, %r10
5791	vmovupd		%ymm2, 0(%r10)
5792	addq		%r11, %r10
5793	vmovupd		%ymm3, 0(%r10)
5794//	addq	%r11, %r10
5795
5796#if MACRO_LEVEL>=1
5797	.endm
5798#else
5799	ret
5800
5801	FUN_END(inner_tran_store_4x4_lib)
5802#endif
5803
5804
5805
5806
5807
5808// common inner routine with file scope
5809//
5810// store n
5811//
5812// input arguments:
5813// r10  <- D
5814// r11  <- ldd
5815// ymm0 <- [d00 d11 d22 d33]
5816// ymm1 <- [d01 d10 d23 d32]
5817// ymm2 <- [d03 d12 d21 d30]
5818// ymm3 <- [d02 d13 d20 d31]
5819//
5820// output arguments:
5821
5822#if MACRO_LEVEL>=1
5823	.macro INNER_STORE_L_4X4_LIB
5824#else
5825	.p2align 4,,15
5826	FUN_START(inner_store_l_4x4_lib)
5827#endif
5828
5829	vmovupd		%ymm0, 0(%r10)
5830	addq		%r11, %r10
5831	vmovupd		0(%r10), %ymm15
5832	vblendpd	$0x1, %ymm15, %ymm1, %ymm1
5833	vmovupd		%ymm1, 0(%r10)
5834	addq		%r11, %r10
5835	vmovupd		0(%r10), %ymm15
5836	vblendpd	$0x3, %ymm15, %ymm2, %ymm2
5837	vmovupd		%ymm2, 0(%r10)
5838	addq		%r11, %r10
5839	vmovupd		0(%r10), %ymm15
5840	vblendpd	$0x7, %ymm15, %ymm3, %ymm3
5841	vmovupd		%ymm3, 0(%r10)
5842//	addq	%r11, %r10
5843
5844#if MACRO_LEVEL>=1
5845	.endm
5846#else
5847	ret
5848
5849	FUN_END(inner_store_l_4x4_lib)
5850#endif
5851
5852
5853
5854
5855
5856// common inner routine with file scope
5857//
5858// store n
5859//
5860// input arguments:
5861// r10  <- D
5862// r11  <- ldd
5863// ymm0 <- [d00 d11 d22 d33]
5864// ymm1 <- [d01 d10 d23 d32]
5865// ymm2 <- [d03 d12 d21 d30]
5866// ymm3 <- [d02 d13 d20 d31]
5867//
5868// output arguments:
5869
5870#if MACRO_LEVEL>=1
5871	.macro INNER_STORE_U_4X4_LIB
5872#else
5873	.p2align 4,,15
5874	FUN_START(inner_store_u_4x4_lib)
5875#endif
5876
5877	vmovupd		0(%r10), %ymm15
5878	vblendpd	$0x1, %ymm0, %ymm15, %ymm0
5879	vmovupd		%ymm0, 0(%r10)
5880	addq		%r11, %r10
5881	vmovupd		0(%r10), %ymm15
5882	vblendpd	$0x3, %ymm1, %ymm15, %ymm1
5883	vmovupd		%ymm1, 0(%r10)
5884	addq		%r11, %r10
5885	vmovupd		0(%r10), %ymm15
5886	vblendpd	$0x7, %ymm2, %ymm15, %ymm2
5887	vmovupd		%ymm2, 0(%r10)
5888	addq		%r11, %r10
5889	vmovupd		%ymm3, 0(%r10)
5890//	addq	%r11, %r10
5891
5892#if MACRO_LEVEL>=1
5893	.endm
5894#else
5895	ret
5896
5897	FUN_END(inner_store_u_4x4_lib)
5898#endif
5899
5900
5901
5902
5903
5904// common inner routine with file scope
5905//
5906// store n vs
5907//
5908// input arguments:
5909// r10   <- D
5910// r11  <- ldd
5911// r12d   <- km
5912// r13d   <- kn
5913// ymm0  <- [d00 d11 d22 d33]
5914// ymm1  <- [d01 d10 d23 d32]
5915// ymm2  <- [d03 d12 d21 d30]
5916// ymm3  <- [d02 d13 d20 d31]
5917//
5918// output arguments:
5919
5920#if MACRO_LEVEL>=1
5921	.macro INNER_STORE_4X4_VS_LIB
5922#else
5923	.p2align 4,,15
5924	FUN_START(inner_store_4x4_vs_lib)
5925#endif
5926
5927	vcvtsi2sd	%r12d, %xmm15, %xmm15
5928#if defined(OS_LINUX) | defined(OS_WINDOWS)
5929	vmovupd		.LC02(%rip), %ymm14
5930#elif defined(OS_MAC)
5931	vmovupd		LC02(%rip), %ymm14
5932#endif
5933	vmovddup	%xmm15, %xmm15
5934	vinsertf128	$1, %xmm15, %ymm15, %ymm15
5935	vsubpd		%ymm15, %ymm14, %ymm15
5936
5937	vmaskmovpd	%ymm0, %ymm15, 0(%r10)
5938	addq		%r11, %r10
5939	cmpl		$2, %r13d
5940	jl			0f // end
5941	vmaskmovpd	%ymm1, %ymm15, 0(%r10)
5942	addq		%r11, %r10
5943	cmpl		$3, %r13d
5944	jl			0f // end
5945	vmaskmovpd	%ymm2, %ymm15, 0(%r10)
5946	addq		%r11, %r10
5947	cmpl		$3, %r13d
5948	je			0f // end
5949	vmaskmovpd	%ymm3, %ymm15, 0(%r10)
5950//	addq	%r11, %r10
5951
59520:
5953
5954#if MACRO_LEVEL>=1
5955	.endm
5956#else
5957	ret
5958
5959	FUN_END(inner_store_4x4_vs_lib)
5960#endif
5961
5962
5963
5964
5965
5966// common inner routine with file scope
5967//
5968// store n vs
5969//
5970// input arguments:
5971// r10   <- D
5972// r11  <- ldd
5973// r12d   <- km
5974// r13d   <- kn
5975// ymm0  <- [d00 d11 d22 d33]
5976// ymm1  <- [d01 d10 d23 d32]
5977// ymm2  <- [d03 d12 d21 d30]
5978// ymm3  <- [d02 d13 d20 d31]
5979//
5980// output arguments:
5981
5982#if MACRO_LEVEL>=1
5983	.macro INNER_TRAN_STORE_4X4_VS_LIB
5984#else
5985	.p2align 4,,15
5986	FUN_START(inner_tran_store_4x4_vs_lib)
5987#endif
5988
5989	vunpcklpd	%ymm1, %ymm0, %ymm12
5990	vunpckhpd	%ymm1, %ymm0, %ymm13
5991	vunpcklpd	%ymm3, %ymm2, %ymm14
5992	vunpckhpd	%ymm3, %ymm2, %ymm15
5993
5994	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
5995	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
5996	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
5997	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3
5998
5999	vcvtsi2sd	%r12d, %xmm15, %xmm15
6000#if defined(OS_LINUX) | defined(OS_WINDOWS)
6001	vmovupd		.LC02(%rip), %ymm14
6002#elif defined(OS_MAC)
6003	vmovupd		LC02(%rip), %ymm14
6004#endif
6005	vmovddup	%xmm15, %xmm15
6006	vinsertf128	$1, %xmm15, %ymm15, %ymm15
6007	vsubpd		%ymm15, %ymm14, %ymm15
6008
6009	vmaskmovpd	%ymm0, %ymm15, 0(%r10)
6010	addq		%r11, %r10
6011	cmpl		$2, %r13d
6012	jl			0f // end
6013	vmaskmovpd	%ymm1, %ymm15, 0(%r10)
6014	addq		%r11, %r10
6015	cmpl		$3, %r13d
6016	jl			0f // end
6017	vmaskmovpd	%ymm2, %ymm15, 0(%r10)
6018	addq		%r11, %r10
6019	cmpl		$3, %r13d
6020	je			0f // end
6021	vmaskmovpd	%ymm3, %ymm15, 0(%r10)
6022//	addq	%r11, %r10
6023
60240:
6025
6026#if MACRO_LEVEL>=1
6027	.endm
6028#else
6029	ret
6030
6031	FUN_END(inner_tran_store_4x4_vs_lib)
6032#endif
6033
6034
6035
6036
6037
6038// common inner routine with file scope
6039//
6040// store n
6041//
6042// input arguments:
6043// r10  <- D
6044// r11  <- ldd
6045// r12d   <- km
6046// r13d   <- kn
6047// ymm0 <- [d00 d11 d22 d33]
6048// ymm1 <- [d01 d10 d23 d32]
6049// ymm2 <- [d03 d12 d21 d30]
6050// ymm3 <- [d02 d13 d20 d31]
6051//
6052// output arguments:
6053
6054#if MACRO_LEVEL>=1
6055	.macro INNER_STORE_L_4X4_VS_LIB
6056#else
6057	.p2align 4,,15
6058	FUN_START(inner_store_l_4x4_vs_lib)
6059#endif
6060
6061	vcvtsi2sd	%r12d, %xmm15, %xmm15
6062#if defined(OS_LINUX) | defined(OS_WINDOWS)
6063	vmovupd		.LC02(%rip), %ymm14
6064#elif defined(OS_MAC)
6065	vmovupd		LC02(%rip), %ymm14
6066#endif
6067	vmovddup	%xmm15, %xmm15
6068	vinsertf128	$1, %xmm15, %ymm15, %ymm15
6069	vsubpd		%ymm15, %ymm14, %ymm14
6070
6071	vmaskmovpd	%ymm0, %ymm14, 0(%r10)
6072	addq		%r11, %r10
6073	cmpl		$2, %r13d
6074	jl			0f // end
6075	vmovupd		0(%r10), %ymm15
6076	vblendpd	$0x1, %ymm15, %ymm1, %ymm1
6077	vmaskmovpd	%ymm1, %ymm14, 0(%r10)
6078	addq		%r11, %r10
6079	cmpl		$3, %r13d
6080	jl			0f // end
6081	vmovupd		0(%r10), %ymm15
6082	vblendpd	$0x3, %ymm15, %ymm2, %ymm2
6083	vmaskmovpd	%ymm2, %ymm14, 0(%r10)
6084	addq		%r11, %r10
6085	cmpl		$3, %r13d
6086	je			0f // end
6087	vmovupd		0(%r10), %ymm15
6088	vblendpd	$0x7, %ymm15, %ymm3, %ymm3
6089	vmaskmovpd	%ymm3, %ymm14, 0(%r10)
6090//	addq	%r11, %r10
6091
60920:
6093
6094#if MACRO_LEVEL>=1
6095	.endm
6096#else
6097	ret
6098
6099	FUN_END(inner_store_l_4x4_vs_lib)
6100#endif
6101
6102
6103
6104
6105
6106// common inner routine with file scope
6107//
6108// store n
6109//
6110// input arguments:
6111// r10  <- D
6112// r11  <- ldd
6113// r12d   <- km
6114// r13d   <- kn
6115// ymm0 <- [d00 d11 d22 d33]
6116// ymm1 <- [d01 d10 d23 d32]
6117// ymm2 <- [d03 d12 d21 d30]
6118// ymm3 <- [d02 d13 d20 d31]
6119//
6120// output arguments:
6121
6122#if MACRO_LEVEL>=1
6123	.macro INNER_STORE_U_4X4_VS_LIB
6124#else
6125	.p2align 4,,15
6126	FUN_START(inner_store_u_4x4_vs_lib)
6127#endif
6128
6129	vcvtsi2sd	%r12d, %xmm15, %xmm15
6130#if defined(OS_LINUX) | defined(OS_WINDOWS)
6131	vmovupd		.LC02(%rip), %ymm14
6132#elif defined(OS_MAC)
6133	vmovupd		LC02(%rip), %ymm14
6134#endif
6135	vmovddup	%xmm15, %xmm15
6136	vinsertf128	$1, %xmm15, %ymm15, %ymm15
6137	vsubpd		%ymm15, %ymm14, %ymm14
6138
6139	vmovupd		0(%r10), %ymm15
6140	vblendpd	$0x1, %ymm0, %ymm15, %ymm0
6141	vmaskmovpd	%ymm0, %ymm14, 0(%r10)
6142	addq		%r11, %r10
6143	cmpl		$2, %r13d
6144	jl			0f // end
6145	vmovupd		0(%r10), %ymm15
6146	vblendpd	$0x3, %ymm1, %ymm15, %ymm1
6147	vmaskmovpd	%ymm1, %ymm14, 0(%r10)
6148	addq		%r11, %r10
6149	cmpl		$3, %r13d
6150	jl			0f // end
6151	vmovupd		0(%r10), %ymm15
6152	vblendpd	$0x7, %ymm2, %ymm15, %ymm2
6153	vmaskmovpd	%ymm2, %ymm14, 0(%r10)
6154	addq		%r11, %r10
6155	cmpl		$3, %r13d
6156	je			0f // end
6157	vmaskmovpd	%ymm3, %ymm14, 0(%r10)
6158//	addq	%r11, %r10
6159
61600:
6161
6162#if MACRO_LEVEL>=1
6163	.endm
6164#else
6165	ret
6166
6167	FUN_END(inner_store_u_4x4_vs_lib)
6168#endif
6169
6170
6171
6172
6173
6174//                                 1      2              3          4          5        6             7          8        9          10
6175// void kernel_dgemm_nt_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
6176
6177	.p2align 4,,15
6178	GLOB_FUN_START(kernel_dgemm_nt_4x4_lib4ccc)
6179
6180	PROLOGUE
6181
6182	// zero accumulation registers
6183
6184	ZERO_ACC
6185
6186
6187	// call inner dgemm kernel nn
6188
6189	movq	ARG1, %r10 // k
6190	movq	ARG3, %r11  // A
6191	movq	ARG4, %r12  // B
6192	movq	ARG5, %r13  // ldb
6193	sall	$3, %r13d
6194
6195#if MACRO_LEVEL>=2
6196	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
6197#else
6198	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
6199#endif
6200
6201
6202	// call inner blend
6203
6204	movq	ARG2, %r10 // alpha
6205	movq	ARG6, %r11 // beta
6206	movq	ARG7, %r12   // C
6207	movq	ARG8, %r13   // ldc
6208	sall	$3, %r13d
6209
6210#if MACRO_LEVEL>=1
6211	INNER_SCALE_AB_4X4_LIB
6212#else
6213	CALL(inner_scale_ab_4x4_lib)
6214#endif
6215
6216
6217	// store n
6218
6219	movq	ARG9, %r10 // D
6220	movq	ARG10, %r11 // ldd
6221	sall	$3, %r11d
6222
6223#if MACRO_LEVEL>=1
6224	INNER_STORE_4X4_LIB
6225#else
6226	CALL(inner_store_4x4_lib)
6227#endif
6228
6229
6230	EPILOGUE
6231
6232	ret
6233
6234	FUN_END(kernel_dgemm_nt_4x4_lib4ccc)
6235
6236
6237
6238
6239
6240//                                     1      2              3          4          5        6             7          8        9          10       11      12
6241// void kernel_dgemm_nt_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
6242
6243	.p2align 4,,15
6244	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib4ccc)
6245
6246	PROLOGUE
6247
6248	// zero accumulation registers
6249
6250	ZERO_ACC
6251
6252
6253	// call inner dgemm kernel nn
6254
6255	movq	ARG1, %r10 // k
6256	movq	ARG3, %r11  // A
6257	movq	ARG4, %r12  // B
6258	movq	ARG5, %r13  // ldb
6259	sall	$3, %r13d
6260
6261	movq	ARG12, %r14  // n1
6262	cmpl	$1, %r14d
6263	jg		100f
6264
6265#if MACRO_LEVEL>=2
6266	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
6267#else
6268	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
6269#endif
6270
6271	jmp		103f
6272
6273100:
6274
6275	movq	ARG12, %r14  // n1
6276	cmpl	$2, %r14d
6277	jg		101f
6278
6279#if MACRO_LEVEL>=2
6280	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
6281#else
6282	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
6283#endif
6284
6285	jmp		103f
6286
6287101:
6288
6289	movq	ARG12, %r14  // n1
6290	cmpl	$3, %r14d
6291	jg		102f
6292
6293#if MACRO_LEVEL>=2
6294	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
6295#else
6296	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
6297#endif
6298
6299	jmp		103f
6300
6301102:
6302
6303#if MACRO_LEVEL>=2
6304	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
6305#else
6306	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
6307#endif
6308
6309103:
6310
6311
6312	// call inner blend
6313
6314	movq	ARG2, %r10 // alpha
6315	movq	ARG6, %r11 // beta
6316	movq	ARG7, %r12   // C
6317	movq	ARG8, %r13   // ldc
6318	sall	$3, %r13d
6319	movq	ARG11, %r14 // m1
6320	movq	ARG12, %r15 // n1
6321
6322#if MACRO_LEVEL>=1
6323	INNER_SCALE_AB_4X4_VS_LIB
6324#else
6325	CALL(inner_scale_ab_4x4_vs_lib)
6326#endif
6327
6328
6329	// store n
6330
6331	movq	ARG9, %r10 // D
6332	movq	ARG10, %r11 // ldd
6333	sall	$3, %r11d
6334	movq	ARG11, %r12 // m1
6335	movq	ARG12, %r13 // n1
6336
6337#if MACRO_LEVEL>=1
6338	INNER_STORE_4X4_VS_LIB
6339#else
6340	CALL(inner_store_4x4_vs_lib)
6341#endif
6342
6343
6344	EPILOGUE
6345
6346	ret
6347
6348	FUN_END(kernel_dgemm_nt_4x4_vs_lib4ccc)
6349
6350
6351
6352
6353
6354//                                 1      2              3          4          5             6          7        8          9
6355// void kernel_dgemm_nt_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
6356
6357	.p2align 4,,15
6358	GLOB_FUN_START(kernel_dgemm_nt_4x4_lib44cc)
6359
6360	PROLOGUE
6361
6362	// zero accumulation registers
6363
6364	ZERO_ACC
6365
6366
6367	// call inner dgemm kernel nn
6368
6369	movq	ARG1, %r10 // k
6370	movq	ARG3, %r11  // A
6371	movq	ARG4, %r12  // B
6372
6373#if MACRO_LEVEL>=2
6374	INNER_KERNEL_DGEMM_NT_4X4_LIB4
6375#else
6376	CALL(inner_kernel_dgemm_nt_4x4_lib4)
6377#endif
6378
6379
6380	// call inner blend
6381
6382	movq	ARG2, %r10 // alpha
6383	movq	ARG5, %r11 // beta
6384	movq	ARG6, %r12   // C
6385	movq	ARG7, %r13   // ldc
6386	sall	$3, %r13d
6387
6388#if MACRO_LEVEL>=1
6389	INNER_BLEND_SCALE_AB_4X4_LIB
6390#else
6391	CALL(inner_blend_scale_ab_4x4_lib)
6392#endif
6393
6394
6395	// store n
6396
6397	movq	ARG8, %r10 // D
6398	movq	ARG9, %r11 // ldd
6399	sall	$3, %r11d
6400
6401#if MACRO_LEVEL>=1
6402	INNER_STORE_4X4_LIB
6403#else
6404	CALL(inner_store_4x4_lib)
6405#endif
6406
6407
6408	EPILOGUE
6409
6410	ret
6411
6412	FUN_END(kernel_dgemm_nt_4x4_lib44cc)
6413
6414
6415
6416
6417
6418//                                     1      2              3          4          5             6          7        8          9        10      11
6419// void kernel_dgemm_nt_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
6420
6421	.p2align 4,,15
6422	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib44cc)
6423
6424	PROLOGUE
6425
6426	// zero accumulation registers
6427
6428	ZERO_ACC
6429
6430
6431	// call inner dgemm kernel nn
6432
6433	movq	ARG1, %r10 // k
6434	movq	ARG3, %r11  // A
6435	movq	ARG4, %r12  // B
6436
6437#if MACRO_LEVEL>=2
6438	INNER_KERNEL_DGEMM_NT_4X4_LIB4
6439#else
6440	CALL(inner_kernel_dgemm_nt_4x4_lib4)
6441#endif
6442
6443
6444	// call inner blend
6445
6446	movq	ARG2, %r10 // alpha
6447	movq	ARG5, %r11 // beta
6448	movq	ARG6, %r12   // C
6449	movq	ARG7, %r13   // ldc
6450	sall	$3, %r13d
6451	movq	ARG10, %r14 // m1
6452	movq	ARG11, %r15 // n1
6453
6454#if MACRO_LEVEL>=1
6455	INNER_BLEND_SCALE_AB_4X4_VS_LIB
6456#else
6457	CALL(inner_blend_scale_ab_4x4_vs_lib)
6458#endif
6459
6460
6461	// store n
6462
6463	movq	ARG8, %r10 // D
6464	movq	ARG9, %r11 // ldd
6465	sall	$3, %r11d
6466	movq	ARG10, %r12 // m1
6467	movq	ARG11, %r13 // n1
6468
6469#if MACRO_LEVEL>=1
6470	INNER_STORE_4X4_VS_LIB
6471#else
6472	CALL(inner_store_4x4_vs_lib)
6473#endif
6474
6475
6476	EPILOGUE
6477
6478	ret
6479
6480	FUN_END(kernel_dgemm_nt_4x4_vs_lib44cc)
6481
6482
6483
6484
6485
6486//                                 1      2              3          4        5          6             7          8        9          10
6487// void kernel_dgemm_nt_4x4_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd);
6488
6489	.p2align 4,,15
6490	GLOB_FUN_START(kernel_dgemm_nt_4x4_libc4cc)
6491
6492	PROLOGUE
6493
6494	// zero accumulation registers
6495
6496	ZERO_ACC
6497
6498
6499	// call inner dgemm kernel nn
6500
6501	movq	ARG1, %r10 // k
6502	movq	ARG5, %r11  // B
6503	movq	ARG3, %r12  // A
6504	movq	ARG4, %r13  // lda
6505	sall	$3, %r13d
6506
6507#if MACRO_LEVEL>=2
6508	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
6509#else
6510	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
6511#endif
6512
6513
6514	// call inner blend
6515
6516	movq	ARG2, %r10 // alpha
6517	movq	ARG6, %r11 // beta
6518	movq	ARG7, %r12   // C
6519	movq	ARG8, %r13   // ldc
6520	sall	$3, %r13d
6521
6522#if MACRO_LEVEL>=1
6523	INNER_TRAN_SCALE_AB_4X4_LIB
6524#else
6525	CALL(inner_tran_scale_ab_4x4_lib)
6526#endif
6527
6528
6529	// store n
6530
6531	movq	ARG9, %r10 // D
6532	movq	ARG10, %r11 // ldd
6533	sall	$3, %r11d
6534
6535#if MACRO_LEVEL>=1
6536	INNER_STORE_4X4_LIB
6537#else
6538	CALL(inner_store_4x4_lib)
6539#endif
6540
6541
6542	EPILOGUE
6543
6544	ret
6545
6546	FUN_END(kernel_dgemm_nt_4x4_libc4cc)
6547
6548
6549
6550
6551
6552//                                     1      2              3          4        5          6             7          8        9          10       11      12
6553// void kernel_dgemm_nt_4x4_vs_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
6554
6555	.p2align 4,,15
6556	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_libc4cc)
6557
6558	PROLOGUE
6559
6560	// zero accumulation registers
6561
6562	ZERO_ACC
6563
6564
6565	// call inner dgemm kernel nn
6566
6567	movq	ARG1, %r10 // k
6568	movq	ARG5, %r11  // B
6569	movq	ARG3, %r12  // A
6570	movq	ARG4, %r13  // lda
6571	sall	$3, %r13d
6572
6573	movq	ARG11, %r14  // m1
6574	cmpl	$1, %r14d
6575	jg		100f
6576
6577#if MACRO_LEVEL>=2
6578	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
6579#else
6580	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
6581#endif
6582
6583	jmp		103f
6584
6585100:
6586
6587	movq	ARG11, %r14  // m1
6588	cmpl	$2, %r14d
6589	jg		101f
6590
6591#if MACRO_LEVEL>=2
6592	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
6593#else
6594	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
6595#endif
6596
6597	jmp		103f
6598
6599101:
6600
6601	movq	ARG11, %r14  // m1
6602	cmpl	$3, %r14d
6603	jg		102f
6604
6605#if MACRO_LEVEL>=2
6606	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
6607#else
6608	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
6609#endif
6610
6611	jmp		103f
6612
6613102:
6614
6615#if MACRO_LEVEL>=2
6616	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
6617#else
6618	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
6619#endif
6620
6621103:
6622
6623
6624	// call inner blend
6625
6626	movq	ARG2, %r10 // alpha
6627	movq	ARG6, %r11 // beta
6628	movq	ARG7, %r12   // C
6629	movq	ARG8, %r13   // ldc
6630	sall	$3, %r13d
6631	movq	ARG11, %r14 // m1
6632	movq	ARG12, %r15 // n1
6633
6634#if MACRO_LEVEL>=1
6635	INNER_TRAN_SCALE_AB_4X4_VS_LIB
6636#else
6637	CALL(inner_tran_scale_ab_4x4_vs_lib)
6638#endif
6639
6640
6641	// store n
6642
6643	movq	ARG9, %r10 // D
6644	movq	ARG10, %r11 // ldd
6645	sall	$3, %r11d
6646	movq	ARG11, %r12 // m1
6647	movq	ARG12, %r13 // n1
6648
6649#if MACRO_LEVEL>=1
6650	INNER_STORE_4X4_VS_LIB
6651#else
6652	CALL(inner_store_4x4_vs_lib)
6653#endif
6654
6655
6656	EPILOGUE
6657
6658	ret
6659
6660	FUN_END(kernel_dgemm_nt_4x4_vs_libc4cc)
6661
6662
6663
6664
6665
6666//                                 1      2              3          4          5        6             7          8        9          10
6667// void kernel_dgemm_nn_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
6668
6669	.p2align 4,,15
6670	GLOB_FUN_START(kernel_dgemm_nn_4x4_lib4ccc)
6671
6672	PROLOGUE
6673
6674	// zero accumulation registers
6675
6676	ZERO_ACC
6677
6678
6679	// call inner dgemm kernel nn
6680
6681	movq	ARG1, %r10 // k
6682	movq	ARG3, %r11  // A
6683	movq	ARG4, %r12  // B
6684	movq	ARG5, %r13  // ldb
6685	sall	$3, %r13d
6686
6687#if MACRO_LEVEL>=2
6688	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
6689#else
6690	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
6691#endif
6692
6693
6694	// call inner blend
6695
6696	movq	ARG2, %r10 // alpha
6697	movq	ARG6, %r11 // beta
6698	movq	ARG7, %r12   // C
6699	movq	ARG8, %r13   // ldc
6700	sall	$3, %r13d
6701
6702#if MACRO_LEVEL>=1
6703	INNER_SCALE_AB_4X4_LIB
6704#else
6705	CALL(inner_scale_ab_4x4_lib)
6706#endif
6707
6708
6709	// store n
6710
6711	movq	ARG9, %r10 // D
6712	movq	ARG10, %r11 // ldd
6713	sall	$3, %r11d
6714
6715#if MACRO_LEVEL>=1
6716	INNER_STORE_4X4_LIB
6717#else
6718	CALL(inner_store_4x4_lib)
6719#endif
6720
6721
6722	EPILOGUE
6723
6724	ret
6725
6726	FUN_END(kernel_dgemm_nn_4x4_lib4ccc)
6727
6728
6729
6730
6731
6732//                                     1      2              3          4          5        6             7          8        9          10       11      12
6733// void kernel_dgemm_nn_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
6734
6735	.p2align 4,,15
6736	GLOB_FUN_START(kernel_dgemm_nn_4x4_vs_lib4ccc)
6737
6738	PROLOGUE
6739
6740	// zero accumulation registers
6741
6742	ZERO_ACC
6743
6744
6745	// call inner dgemm kernel nn
6746
6747	movq	ARG1, %r10 // k
6748	movq	ARG3, %r11  // A
6749	movq	ARG4, %r12  // B
6750	movq	ARG5, %r13  // ldb
6751	sall	$3, %r13d
6752
6753	movq	ARG12, %r14  // n1
6754	cmpl	$1, %r14d
6755	jg		100f
6756
6757#if MACRO_LEVEL>=2
6758	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
6759#else
6760	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
6761#endif
6762
6763	jmp		103f
6764
6765100:
6766
6767	movq	ARG12, %r14  // n1
6768	cmpl	$2, %r14d
6769	jg		101f
6770
6771#if MACRO_LEVEL>=2
6772	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
6773#else
6774	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
6775#endif
6776
6777	jmp		103f
6778
6779101:
6780
6781	movq	ARG12, %r14  // n1
6782	cmpl	$3, %r14d
6783	jg		102f
6784
6785#if MACRO_LEVEL>=2
6786	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
6787#else
6788	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
6789#endif
6790
6791	jmp		103f
6792
6793102:
6794
6795#if MACRO_LEVEL>=2
6796	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
6797#else
6798	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
6799#endif
6800
6801103:
6802
6803
6804	// call inner blend
6805
6806	movq	ARG2, %r10 // alpha
6807	movq	ARG6, %r11 // beta
6808	movq	ARG7, %r12   // C
6809	movq	ARG8, %r13   // ldc
6810	sall	$3, %r13d
6811	movq	ARG11, %r14 // m1
6812	movq	ARG12, %r15 // n1
6813
6814#if MACRO_LEVEL>=1
6815	INNER_SCALE_AB_4X4_VS_LIB
6816#else
6817	CALL(inner_scale_ab_4x4_vs_lib)
6818#endif
6819
6820
6821	// store n
6822
6823	movq	ARG9, %r10 // D
6824	movq	ARG10, %r11 // ldd
6825	sall	$3, %r11d
6826	movq	ARG11, %r12 // m1
6827	movq	ARG12, %r13 // n1
6828
6829#if MACRO_LEVEL>=1
6830	INNER_STORE_4X4_VS_LIB
6831#else
6832	CALL(inner_store_4x4_vs_lib)
6833#endif
6834
6835
6836	EPILOGUE
6837
6838	ret
6839
6840	FUN_END(kernel_dgemm_nn_4x4_vs_lib4ccc)
6841
6842
6843
6844
6845
6846//                                 1      2              3          4        5          6             7          8        9          10
6847// void kernel_dgemm_tt_4x4_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd);
6848
6849	.p2align 4,,15
6850	GLOB_FUN_START(kernel_dgemm_tt_4x4_libc4cc)
6851
6852	PROLOGUE
6853
6854	// zero accumulation registers
6855
6856	ZERO_ACC
6857
6858
6859	// call inner dgemm kernel nn
6860
6861	movq	ARG1, %r10 // k
6862	movq	ARG5, %r11  // B
6863	movq	ARG3, %r12  // A
6864	movq	ARG4, %r13  // lda
6865	sall	$3, %r13d
6866
6867#if MACRO_LEVEL>=2
6868	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
6869#else
6870	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
6871#endif
6872
6873
6874	// call inner blend
6875
6876	movq	ARG2, %r10 // alpha
6877	movq	ARG6, %r11 // beta
6878	movq	ARG7, %r12   // C
6879	movq	ARG8, %r13   // ldc
6880	sall	$3, %r13d
6881
6882#if MACRO_LEVEL>=1
6883	INNER_TRAN_SCALE_AB_4X4_LIB
6884#else
6885	CALL(inner_tran_scale_ab_4x4_lib)
6886#endif
6887
6888
6889	// store n
6890
6891	movq	ARG9, %r10 // D
6892	movq	ARG10, %r11 // ldd
6893	sall	$3, %r11d
6894
6895#if MACRO_LEVEL>=1
6896	INNER_STORE_4X4_LIB
6897#else
6898	CALL(inner_store_4x4_lib)
6899#endif
6900
6901
6902	EPILOGUE
6903
6904	ret
6905
6906	FUN_END(kernel_dgemm_tt_4x4_libc4cc)
6907
6908
6909
6910
6911
6912//                                     1      2              3          4        5          6             7          8        9          10       11      12
6913// void kernel_dgemm_tt_4x4_vs_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
6914
6915	.p2align 4,,15
6916	GLOB_FUN_START(kernel_dgemm_tt_4x4_vs_libc4cc)
6917
6918	PROLOGUE
6919
6920	// zero accumulation registers
6921
6922	ZERO_ACC
6923
6924
6925	// call inner dgemm kernel nn
6926
6927	movq	ARG1, %r10 // k
6928	movq	ARG5, %r11  // B
6929	movq	ARG3, %r12  // A
6930	movq	ARG4, %r13  // lda
6931	sall	$3, %r13d
6932
6933	movq	ARG11, %r14  // m1
6934	cmpl	$1, %r14d
6935	jg		100f
6936
6937#if MACRO_LEVEL>=2
6938	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
6939#else
6940	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
6941#endif
6942
6943	jmp		103f
6944
6945100:
6946
6947	movq	ARG11, %r14  // m1
6948	cmpl	$2, %r14d
6949	jg		101f
6950
6951#if MACRO_LEVEL>=2
6952	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
6953#else
6954	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
6955#endif
6956
6957	jmp		103f
6958
6959101:
6960
6961	movq	ARG11, %r14  // m1
6962	cmpl	$3, %r14d
6963	jg		102f
6964
6965#if MACRO_LEVEL>=2
6966	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
6967#else
6968	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
6969#endif
6970
6971	jmp		103f
6972
6973102:
6974
6975#if MACRO_LEVEL>=2
6976	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
6977#else
6978	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
6979#endif
6980
6981103:
6982
6983
6984	// call inner blend
6985
6986	movq	ARG2, %r10 // alpha
6987	movq	ARG6, %r11 // beta
6988	movq	ARG7, %r12   // C
6989	movq	ARG8, %r13   // ldc
6990	sall	$3, %r13d
6991	movq	ARG11, %r14 // m1
6992	movq	ARG12, %r15 // n1
6993
6994#if MACRO_LEVEL>=1
6995	INNER_TRAN_SCALE_AB_4X4_VS_LIB
6996#else
6997	CALL(inner_tran_scale_ab_4x4_vs_lib)
6998#endif
6999
7000
7001	// store n
7002
7003	movq	ARG9, %r10 // D
7004	movq	ARG10, %r11 // ldd
7005	sall	$3, %r11d
7006	movq	ARG11, %r12 // m1
7007	movq	ARG12, %r13 // n1
7008
7009#if MACRO_LEVEL>=1
7010	INNER_STORE_4X4_VS_LIB
7011#else
7012	CALL(inner_store_4x4_vs_lib)
7013#endif
7014
7015
7016	EPILOGUE
7017
7018	ret
7019
7020	FUN_END(kernel_dgemm_tt_4x4_vs_libc4cc)
7021
7022
7023
7024
7025
7026//                                   1      2              3          4          5             6          7        8          9
7027// void kernel_dsyrk_nt_l_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
7028
7029	.p2align 4,,15
7030	GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_lib44cc)
7031
7032	PROLOGUE
7033
7034	// zero accumulation registers
7035
7036	ZERO_ACC
7037
7038
7039	// call inner dgemm kernel nn
7040
7041	movq	ARG1, %r10 // k
7042	movq	ARG3, %r11  // A
7043	movq	ARG4, %r12  // B
7044
7045#if MACRO_LEVEL>=2
7046	INNER_KERNEL_DGEMM_NT_4X4_LIB4
7047#else
7048	CALL(inner_kernel_dgemm_nt_4x4_lib4)
7049#endif
7050
7051
7052	// call inner blend
7053
7054	movq	ARG2, %r10 // alpha
7055	movq	ARG5, %r11 // beta
7056	movq	ARG6, %r12   // C
7057	movq	ARG7, %r13   // ldc
7058	sall	$3, %r13d
7059
7060#if MACRO_LEVEL>=1
7061	INNER_BLEND_SCALE_AB_4X4_LIB
7062#else
7063	CALL(inner_blend_scale_ab_4x4_lib)
7064#endif
7065
7066
7067	// store n
7068
7069	movq	ARG8, %r10 // D
7070	movq	ARG9, %r11 // ldd
7071	sall	$3, %r11d
7072
7073#if MACRO_LEVEL>=1
7074	INNER_STORE_L_4X4_LIB
7075#else
7076	CALL(inner_store_l_4x4_lib)
7077#endif
7078
7079
7080	EPILOGUE
7081
7082	ret
7083
7084	FUN_END(kernel_dsyrk_nt_l_4x4_lib44cc)
7085
7086
7087
7088
7089
7090//                                       1      2              3          4          5             6          7        8          9        10      11
7091// void kernel_dsyrk_nt_l_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
7092
7093	.p2align 4,,15
7094	GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_vs_lib44cc)
7095
7096	PROLOGUE
7097
7098	// zero accumulation registers
7099
7100	ZERO_ACC
7101
7102
7103	// call inner dgemm kernel nn
7104
7105	movq	ARG1, %r10 // k
7106	movq	ARG3, %r11  // A
7107	movq	ARG4, %r12  // B
7108
7109#if MACRO_LEVEL>=2
7110	INNER_KERNEL_DGEMM_NT_4X4_LIB4
7111#else
7112	CALL(inner_kernel_dgemm_nt_4x4_lib4)
7113#endif
7114
7115
7116	// call inner blend
7117
7118	movq	ARG2, %r10 // alpha
7119	movq	ARG5, %r11 // beta
7120	movq	ARG6, %r12   // C
7121	movq	ARG7, %r13   // ldc
7122	sall	$3, %r13d
7123	movq	ARG10, %r14 // m1
7124	movq	ARG11, %r15 // n1
7125
7126#if MACRO_LEVEL>=1
7127	INNER_BLEND_SCALE_AB_4X4_VS_LIB
7128#else
7129	CALL(inner_blend_scale_ab_4x4_vs_lib)
7130#endif
7131
7132
7133	// store n
7134
7135	movq	ARG8, %r10 // D
7136	movq	ARG9, %r11 // ldd
7137	sall	$3, %r11d
7138	movq	ARG10, %r12 // m1
7139	movq	ARG11, %r13 // n1
7140
7141#if MACRO_LEVEL>=1
7142	INNER_STORE_L_4X4_VS_LIB
7143#else
7144	CALL(inner_store_l_4x4_vs_lib)
7145#endif
7146
7147
7148	EPILOGUE
7149
7150	ret
7151
7152	FUN_END(kernel_dsyrk_nt_l_4x4_vs_lib44cc)
7153
7154
7155
7156
7157
7158//                                   1      2              3          4          5             6          7        8          9
7159// void kernel_dsyrk_nt_u_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
7160
7161	.p2align 4,,15
7162	GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_lib44cc)
7163
7164	PROLOGUE
7165
7166	// zero accumulation registers
7167
7168	ZERO_ACC
7169
7170
7171	// call inner dgemm kernel nn
7172
7173	movq	ARG1, %r10 // k
7174	movq	ARG3, %r11  // A
7175	movq	ARG4, %r12  // B
7176
7177#if MACRO_LEVEL>=2
7178	INNER_KERNEL_DGEMM_NT_4X4_LIB4
7179#else
7180	CALL(inner_kernel_dgemm_nt_4x4_lib4)
7181#endif
7182
7183
7184	// call inner blend
7185
7186	movq	ARG2, %r10 // alpha
7187	movq	ARG5, %r11 // beta
7188	movq	ARG6, %r12   // C
7189	movq	ARG7, %r13   // ldc
7190	sall	$3, %r13d
7191
7192#if MACRO_LEVEL>=1
7193	INNER_BLEND_SCALE_AB_4X4_LIB
7194#else
7195	CALL(inner_blend_scale_ab_4x4_lib)
7196#endif
7197
7198
7199	// store n
7200
7201	movq	ARG8, %r10 // D
7202	movq	ARG9, %r11 // ldd
7203	sall	$3, %r11d
7204
7205#if MACRO_LEVEL>=1
7206	INNER_STORE_U_4X4_LIB
7207#else
7208	CALL(inner_store_u_4x4_lib)
7209#endif
7210
7211
7212	EPILOGUE
7213
7214	ret
7215
7216	FUN_END(kernel_dsyrk_nt_u_4x4_lib44cc)
7217
7218
7219
7220
7221
7222//                                       1      2              3          4          5             6          7        8          9        10      11
7223// void kernel_dsyrk_nt_u_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
7224
7225	.p2align 4,,15
7226	GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_vs_lib44cc)
7227
7228	PROLOGUE
7229
7230	// zero accumulation registers
7231
7232	ZERO_ACC
7233
7234
7235	// call inner dgemm kernel nn
7236
7237	movq	ARG1, %r10 // k
7238	movq	ARG3, %r11  // A
7239	movq	ARG4, %r12  // B
7240
7241#if MACRO_LEVEL>=2
7242	INNER_KERNEL_DGEMM_NT_4X4_LIB4
7243#else
7244	CALL(inner_kernel_dgemm_nt_4x4_lib4)
7245#endif
7246
7247
7248	// call inner blend
7249
7250	movq	ARG2, %r10 // alpha
7251	movq	ARG5, %r11 // beta
7252	movq	ARG6, %r12   // C
7253	movq	ARG7, %r13   // ldc
7254	sall	$3, %r13d
7255	movq	ARG10, %r14 // m1
7256	movq	ARG11, %r15 // n1
7257
7258#if MACRO_LEVEL>=1
7259	INNER_BLEND_SCALE_AB_4X4_VS_LIB
7260#else
7261	CALL(inner_blend_scale_ab_4x4_vs_lib)
7262#endif
7263
7264
7265	// store n
7266
7267	movq	ARG8, %r10 // D
7268	movq	ARG9, %r11 // ldd
7269	sall	$3, %r11d
7270	movq	ARG10, %r12 // m1
7271	movq	ARG11, %r13 // n1
7272
7273#if MACRO_LEVEL>=1
7274	INNER_STORE_U_4X4_VS_LIB
7275#else
7276	CALL(inner_store_u_4x4_vs_lib)
7277#endif
7278
7279
7280	EPILOGUE
7281
7282	ret
7283
7284	FUN_END(kernel_dsyrk_nt_u_4x4_vs_lib44cc)
7285
7286
7287
7288
7289
7290//                                    1      2              3          4          5        6             7          8        9          10
7291// void kernel_dtrmm_nn_rl_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
7292
7293	.p2align 4,,15
7294	GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_lib4ccc)
7295
7296	PROLOGUE
7297
7298	// zero accumulation registers
7299
7300	ZERO_ACC
7301
7302
7303	// call inner dgemm kernel nn
7304
7305	movq	ARG1, %r10 // k
7306	movq	ARG3, %r11  // A
7307	movq	ARG4, %r12  // B
7308	movq	ARG5, %r13  // ldb
7309	sall	$3, %r13d
7310
7311#if MACRO_LEVEL>=1
7312	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4C
7313#else
7314	CALL(inner_edge_dtrmm_nn_rl_4x4_lib4c)
7315#endif
7316
7317#if MACRO_LEVEL>=2
7318	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7319#else
7320	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7321#endif
7322
7323
7324	// call inner blend
7325
7326	movq	ARG2, %r10 // alpha
7327	movq	ARG6, %r11 // beta
7328	movq	ARG7, %r12   // C
7329	movq	ARG8, %r13   // ldc
7330	sall	$3, %r13d
7331
7332#if MACRO_LEVEL>=1
7333	INNER_SCALE_AB_4X4_LIB
7334#else
7335	CALL(inner_scale_ab_4x4_lib)
7336#endif
7337
7338
7339	// store n
7340
7341	movq	ARG9, %r10 // D
7342	movq	ARG10, %r11 // ldd
7343	sall	$3, %r11d
7344
7345#if MACRO_LEVEL>=1
7346	INNER_STORE_4X4_LIB
7347#else
7348	CALL(inner_store_4x4_lib)
7349#endif
7350
7351
7352	EPILOGUE
7353
7354	ret
7355
7356	FUN_END(kernel_dtrmm_nn_rl_4x4_lib4ccc)
7357
7358
7359
7360
7361
7362//                                        1      2              3          4          5        6             7          8        9          10       11      12
7363// void kernel_dtrmm_nn_rl_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
7364
7365	.p2align 4,,15
7366	GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_vs_lib4ccc)
7367
7368	PROLOGUE
7369
7370	// zero accumulation registers
7371
7372	ZERO_ACC
7373
7374
7375	// call inner dgemm kernel nn
7376
7377	movq	ARG1, %r10 // k
7378	movq	ARG3, %r11  // A
7379	movq	ARG4, %r12  // B
7380	movq	ARG5, %r13  // ldb
7381	sall	$3, %r13d
7382
7383#if MACRO_LEVEL>=1
7384	INNER_EDGE_DTRMM_NN_RL_4X4_VS_LIB4C
7385#else
7386	CALL(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c)
7387#endif
7388
7389	movq	ARG12, %r14  // n1
7390	cmpl	$1, %r14d
7391	jg		100f
7392
7393#if MACRO_LEVEL>=2
7394	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
7395#else
7396	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
7397#endif
7398
7399	jmp		103f
7400
7401100:
7402
7403	movq	ARG12, %r14  // n1
7404	cmpl	$2, %r14d
7405	jg		101f
7406
7407#if MACRO_LEVEL>=2
7408	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
7409#else
7410	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
7411#endif
7412
7413	jmp		103f
7414
7415101:
7416
7417	movq	ARG12, %r14  // n1
7418	cmpl	$3, %r14d
7419	jg		102f
7420
7421#if MACRO_LEVEL>=2
7422	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
7423#else
7424	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
7425#endif
7426
7427	jmp		103f
7428
7429102:
7430
7431#if MACRO_LEVEL>=2
7432	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7433#else
7434	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7435#endif
7436
7437103:
7438
7439
7440	// call inner blend
7441
7442	movq	ARG2, %r10 // alpha
7443	movq	ARG6, %r11 // beta
7444	movq	ARG7, %r12   // C
7445	movq	ARG8, %r13   // ldc
7446	sall	$3, %r13d
7447	movq	ARG11, %r14 // m1
7448	movq	ARG12, %r15 // n1
7449
7450#if MACRO_LEVEL>=1
7451	INNER_SCALE_AB_4X4_VS_LIB
7452#else
7453	CALL(inner_scale_ab_4x4_vs_lib)
7454#endif
7455
7456
7457	// store n
7458
7459	movq	ARG9, %r10 // D
7460	movq	ARG10, %r11 // ldd
7461	sall	$3, %r11d
7462	movq	ARG11, %r12 // m1
7463	movq	ARG12, %r13 // n1
7464
7465#if MACRO_LEVEL>=1
7466	INNER_STORE_4X4_VS_LIB
7467#else
7468	CALL(inner_store_4x4_vs_lib)
7469#endif
7470
7471
7472	EPILOGUE
7473
7474	ret
7475
7476	FUN_END(kernel_dtrmm_nn_rl_4x4_vs_lib4ccc)
7477
7478
7479
7480
7481
7482//                                    1      2              3          4          5        6             7          8        9          10
7483// void kernel_dtrmm_nn_rl_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
7484
7485	.p2align 4,,15
7486	GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_tran_lib4c4c)
7487
7488	PROLOGUE
7489
7490	// zero accumulation registers
7491
7492	ZERO_ACC
7493
7494
7495	// call inner dgemm kernel nn
7496
7497	movq	ARG1, %r10 // k
7498	movq	ARG3, %r11  // A
7499	movq	ARG4, %r12  // B
7500	movq	ARG5, %r13  // ldb
7501	sall	$3, %r13d
7502
7503#if MACRO_LEVEL>=1
7504	INNER_EDGE_DTRMM_NN_RL_4X4_LIB4C
7505#else
7506	CALL(inner_edge_dtrmm_nn_rl_4x4_lib4c)
7507#endif
7508
7509#if MACRO_LEVEL>=2
7510	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7511#else
7512	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7513#endif
7514
7515
7516	// call inner blend
7517
7518	movq	ARG2, %r10 // alpha
7519	movq	ARG6, %r11 // beta
7520	movq	ARG7, %r12   // C
7521
7522#if MACRO_LEVEL>=1
7523	INNER_SCALE_AB_4X4_LIB4
7524#else
7525	CALL(inner_scale_ab_4x4_lib4)
7526#endif
7527
7528
7529	// store n
7530
7531	movq	ARG8, %r10 // D
7532	movq	ARG9, %r11 // ldd
7533	sall	$3, %r11d
7534
7535#if MACRO_LEVEL>=1
7536	INNER_TRAN_STORE_4X4_LIB
7537#else
7538	CALL(inner_tran_store_4x4_lib)
7539#endif
7540
7541
7542	EPILOGUE
7543
7544	ret
7545
7546	FUN_END(kernel_dtrmm_nn_rl_4x4_tran_lib4c4c)
7547
7548
7549
7550
7551
7552//                                       1      2              3          4          5        6             7          8        9          10       11      12
7553// void kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
7554
7555	.p2align 4,,15
7556	GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c)
7557
7558	PROLOGUE
7559
7560	// zero accumulation registers
7561
7562	ZERO_ACC
7563
7564
7565	// call inner dgemm kernel nn
7566
7567	movq	ARG1, %r10 // k
7568	movq	ARG3, %r11  // A
7569	movq	ARG4, %r12  // B
7570	movq	ARG5, %r13  // ldb
7571	sall	$3, %r13d
7572
7573#if MACRO_LEVEL>=1
7574	INNER_EDGE_DTRMM_NN_RL_4X4_VS_LIB4C
7575#else
7576	CALL(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c)
7577#endif
7578
7579	movq	ARG10, %r14  // m1
7580	cmpl	$1, %r14d
7581	jg		100f
7582
7583#if MACRO_LEVEL>=2
7584	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
7585#else
7586	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
7587#endif
7588
7589	jmp		103f
7590
7591100:
7592
7593	movq	ARG10, %r14  // m1
7594	cmpl	$2, %r14d
7595	jg		101f
7596
7597#if MACRO_LEVEL>=2
7598	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
7599#else
7600	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
7601#endif
7602
7603	jmp		103f
7604
7605101:
7606
7607	movq	ARG10, %r14  // m1
7608	cmpl	$3, %r14d
7609	jg		102f
7610
7611#if MACRO_LEVEL>=2
7612	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
7613#else
7614	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
7615#endif
7616
7617	jmp		103f
7618
7619102:
7620
7621#if MACRO_LEVEL>=2
7622	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7623#else
7624	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7625#endif
7626
7627103:
7628
7629
7630	// call inner blend
7631
7632	movq	ARG2, %r10 // alpha
7633	movq	ARG6, %r11 // beta
7634	movq	ARG7, %r12   // C
7635
7636#if MACRO_LEVEL>=1
7637	INNER_SCALE_AB_4X4_LIB4
7638#else
7639	CALL(inner_scale_ab_4x4_lib4)
7640#endif
7641
7642
7643	// store n
7644
7645	movq	ARG8, %r10 // D
7646	movq	ARG9, %r11 // ldd
7647	sall	$3, %r11d
7648	movq	ARG10, %r12 // m1
7649	movq	ARG11, %r13 // n1
7650
7651#if MACRO_LEVEL>=1
7652	INNER_TRAN_STORE_4X4_VS_LIB
7653#else
7654	CALL(inner_tran_store_4x4_vs_lib)
7655#endif
7656
7657
7658	EPILOGUE
7659
7660	ret
7661
7662	FUN_END(kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c)
7663
7664
7665
7666
7667
7668//                                    1      2              3          4          5        6             7          8        9          10
7669// void kernel_dtrmm_nn_rl_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
7670
7671	.p2align 4,,15
7672	GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_lib4ccc)
7673
7674	PROLOGUE
7675
7676	// zero accumulation registers
7677
7678	ZERO_ACC
7679
7680
7681	// call inner dgemm kernel nn
7682
7683	movq	ARG1, %r10 // k
7684	movq	ARG3, %r11  // A
7685	movq	ARG4, %r12  // B
7686	movq	ARG5, %r13  // ldb
7687	sall	$3, %r13d
7688
7689#if MACRO_LEVEL>=1
7690	INNER_EDGE_DTRMM_NN_RL_ONE_4X4_LIB4C
7691#else
7692	CALL(inner_edge_dtrmm_nn_rl_one_4x4_lib4c)
7693#endif
7694
7695#if MACRO_LEVEL>=2
7696	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7697#else
7698	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7699#endif
7700
7701
7702	// call inner blend
7703
7704	movq	ARG2, %r10 // alpha
7705	movq	ARG6, %r11 // beta
7706	movq	ARG7, %r12   // C
7707	movq	ARG8, %r13   // ldc
7708	sall	$3, %r13d
7709
7710#if MACRO_LEVEL>=1
7711	INNER_SCALE_AB_4X4_LIB
7712#else
7713	CALL(inner_scale_ab_4x4_lib)
7714#endif
7715
7716
7717	// store n
7718
7719	movq	ARG9, %r10 // D
7720	movq	ARG10, %r11 // ldd
7721	sall	$3, %r11d
7722
7723#if MACRO_LEVEL>=1
7724	INNER_STORE_4X4_LIB
7725#else
7726	CALL(inner_store_4x4_lib)
7727#endif
7728
7729
7730	EPILOGUE
7731
7732	ret
7733
7734	FUN_END(kernel_dtrmm_nn_rl_one_4x4_lib4ccc)
7735
7736
7737
7738
7739
7740//                                            1      2              3          4          5        6             7          8        9          10       11      12
7741// void kernel_dtrmm_nn_rl_4x4_one_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
7742
7743	.p2align 4,,15
7744	GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_vs_lib4ccc)
7745
7746	PROLOGUE
7747
7748	// zero accumulation registers
7749
7750	ZERO_ACC
7751
7752
7753	// call inner dgemm kernel nn
7754
7755	movq	ARG1, %r10 // k
7756	movq	ARG3, %r11  // A
7757	movq	ARG4, %r12  // B
7758	movq	ARG5, %r13  // ldb
7759	sall	$3, %r13d
7760
7761#if MACRO_LEVEL>=1
7762	INNER_EDGE_DTRMM_NN_RL_ONE_4X4_VS_LIB4C
7763#else
7764	CALL(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c)
7765#endif
7766
7767	movq	ARG12, %r14  // n1
7768	cmpl	$1, %r14d
7769	jg		100f
7770
7771#if MACRO_LEVEL>=2
7772	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
7773#else
7774	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
7775#endif
7776
7777	jmp		103f
7778
7779100:
7780
7781	movq	ARG12, %r14  // n1
7782	cmpl	$2, %r14d
7783	jg		101f
7784
7785#if MACRO_LEVEL>=2
7786	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
7787#else
7788	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
7789#endif
7790
7791	jmp		103f
7792
7793101:
7794
7795	movq	ARG12, %r14  // n1
7796	cmpl	$3, %r14d
7797	jg		102f
7798
7799#if MACRO_LEVEL>=2
7800	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
7801#else
7802	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
7803#endif
7804
7805	jmp		103f
7806
7807102:
7808
7809#if MACRO_LEVEL>=2
7810	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7811#else
7812	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7813#endif
7814
7815103:
7816
7817
7818	// call inner blend
7819
7820	movq	ARG2, %r10 // alpha
7821	movq	ARG6, %r11 // beta
7822	movq	ARG7, %r12   // C
7823	movq	ARG8, %r13   // ldc
7824	sall	$3, %r13d
7825	movq	ARG11, %r14 // m1
7826	movq	ARG12, %r15 // n1
7827
7828#if MACRO_LEVEL>=1
7829	INNER_SCALE_AB_4X4_VS_LIB
7830#else
7831	CALL(inner_scale_ab_4x4_vs_lib)
7832#endif
7833
7834
7835	// store n
7836
7837	movq	ARG9, %r10 // D
7838	movq	ARG10, %r11 // ldd
7839	sall	$3, %r11d
7840	movq	ARG11, %r12 // m1
7841	movq	ARG12, %r13 // n1
7842
7843#if MACRO_LEVEL>=1
7844	INNER_STORE_4X4_VS_LIB
7845#else
7846	CALL(inner_store_4x4_vs_lib)
7847#endif
7848
7849
7850	EPILOGUE
7851
7852	ret
7853
7854	FUN_END(kernel_dtrmm_nn_rl_one_4x4_vs_lib4ccc)
7855
7856
7857
7858
7859
7860//                                    1      2              3          4          5        6             7          8        9          10
7861// void kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
7862
7863	.p2align 4,,15
7864	GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c)
7865
7866	PROLOGUE
7867
7868	// zero accumulation registers
7869
7870	ZERO_ACC
7871
7872
7873	// call inner dgemm kernel nn
7874
7875	movq	ARG1, %r10 // k
7876	movq	ARG3, %r11  // A
7877	movq	ARG4, %r12  // B
7878	movq	ARG5, %r13  // ldb
7879	sall	$3, %r13d
7880
7881#if MACRO_LEVEL>=1
7882	INNER_EDGE_DTRMM_NN_RL_ONE_4X4_LIB4C
7883#else
7884	CALL(inner_edge_dtrmm_nn_rl_one_4x4_lib4c)
7885#endif
7886
7887#if MACRO_LEVEL>=2
7888	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
7889#else
7890	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
7891#endif
7892
7893
7894	// call inner blend
7895
7896	movq	ARG2, %r10 // alpha
7897	movq	ARG6, %r11 // beta
7898	movq	ARG7, %r12   // C
7899
7900#if MACRO_LEVEL>=1
7901	INNER_SCALE_AB_4X4_LIB4
7902#else
7903	CALL(inner_scale_ab_4x4_lib4)
7904#endif
7905
7906
7907	// store n
7908
7909	movq	ARG8, %r10 // D
7910	movq	ARG9, %r11 // ldd
7911	sall	$3, %r11d
7912
7913#if MACRO_LEVEL>=1
7914	INNER_TRAN_STORE_4X4_LIB
7915#else
7916	CALL(inner_tran_store_4x4_lib)
7917#endif
7918
7919
7920	EPILOGUE
7921
7922	ret
7923
7924	FUN_END(kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c)
7925
7926
7927
7928
7929
7930//                                       1      2              3          4          5        6             7          8        9          10       11      12
7931// void kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
7932
7933	.p2align 4,,15
7934	GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c)
7935
7936	PROLOGUE
7937
7938	// zero accumulation registers
7939
7940	ZERO_ACC
7941
7942
7943	// call inner dgemm kernel nn
7944
7945	movq	ARG1, %r10 // k
7946	movq	ARG3, %r11  // A
7947	movq	ARG4, %r12  // B
7948	movq	ARG5, %r13  // ldb
7949	sall	$3, %r13d
7950
7951#if MACRO_LEVEL>=1
7952	INNER_EDGE_DTRMM_NN_RL_ONE_4X4_VS_LIB4C
7953#else
7954	CALL(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c)
7955#endif
7956
7957	movq	ARG10, %r14  // m1
7958	cmpl	$1, %r14d
7959	jg		100f
7960
7961#if MACRO_LEVEL>=2
7962	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
7963#else
7964	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
7965#endif
7966
7967	jmp		103f
7968
7969100:
7970
7971	movq	ARG10, %r14  // m1
7972	cmpl	$2, %r14d
7973	jg		101f
7974
7975#if MACRO_LEVEL>=2
7976	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
7977#else
7978	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
7979#endif
7980
7981	jmp		103f
7982
7983101:
7984
7985	movq	ARG10, %r14  // m1
7986	cmpl	$3, %r14d
7987	jg		102f
7988
7989#if MACRO_LEVEL>=2
7990	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
7991#else
7992	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
7993#endif
7994
7995	jmp		103f
7996
7997102:
7998
7999#if MACRO_LEVEL>=2
8000	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8001#else
8002	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8003#endif
8004
8005103:
8006
8007
8008	// call inner blend
8009
8010	movq	ARG2, %r10 // alpha
8011	movq	ARG6, %r11 // beta
8012	movq	ARG7, %r12   // C
8013
8014#if MACRO_LEVEL>=1
8015	INNER_SCALE_AB_4X4_LIB4
8016#else
8017	CALL(inner_scale_ab_4x4_lib4)
8018#endif
8019
8020
8021	// store n
8022
8023	movq	ARG8, %r10 // D
8024	movq	ARG9, %r11 // ldd
8025	sall	$3, %r11d
8026	movq	ARG10, %r12 // m1
8027	movq	ARG11, %r13 // n1
8028
8029#if MACRO_LEVEL>=1
8030	INNER_TRAN_STORE_4X4_VS_LIB
8031#else
8032	CALL(inner_tran_store_4x4_vs_lib)
8033#endif
8034
8035
8036	EPILOGUE
8037
8038	ret
8039
8040	FUN_END(kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c)
8041
8042
8043
8044
8045
8046//                                    1      2              3          4          5        6             7          8        9          10
8047// void kernel_dtrmm_nn_ru_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
8048
8049	.p2align 4,,15
8050	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_lib4ccc)
8051
8052	PROLOGUE
8053
8054	// zero accumulation registers
8055
8056	ZERO_ACC
8057
8058
8059	// call inner dgemm kernel nn
8060
8061	movq	ARG1, %r10 // k
8062	movq	ARG3, %r11  // A
8063	movq	ARG4, %r12  // B
8064	movq	ARG5, %r13  // ldb
8065	sall	$3, %r13d
8066
8067#if MACRO_LEVEL>=2
8068	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8069#else
8070	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8071#endif
8072
8073#if MACRO_LEVEL>=1
8074	INNER_EDGE_DTRMM_NN_RU_4X4_LIB4C
8075#else
8076	CALL(inner_edge_dtrmm_nn_ru_4x4_lib4c)
8077#endif
8078
8079
8080	// call inner blend
8081
8082	movq	ARG2, %r10 // alpha
8083	movq	ARG6, %r11 // beta
8084	movq	ARG7, %r12   // C
8085	movq	ARG8, %r13   // ldc
8086	sall	$3, %r13d
8087
8088#if MACRO_LEVEL>=1
8089	INNER_SCALE_AB_4X4_LIB
8090#else
8091	CALL(inner_scale_ab_4x4_lib)
8092#endif
8093
8094
8095	// store n
8096
8097	movq	ARG9, %r10 // D
8098	movq	ARG10, %r11 // ldd
8099	sall	$3, %r11d
8100
8101#if MACRO_LEVEL>=1
8102	INNER_STORE_4X4_LIB
8103#else
8104	CALL(inner_store_4x4_lib)
8105#endif
8106
8107
8108	EPILOGUE
8109
8110	ret
8111
8112	FUN_END(kernel_dtrmm_nn_ru_4x4_lib4ccc)
8113
8114
8115
8116
8117
8118//                                        1      2              3          4          5        6             7          8        9          10       11      12
8119// void kernel_dtrmm_nn_ru_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
8120
8121	.p2align 4,,15
8122	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_vs_lib4ccc)
8123
8124	PROLOGUE
8125
8126	// zero accumulation registers
8127
8128	ZERO_ACC
8129
8130
8131	// call inner dgemm kernel nn
8132
8133	movq	ARG1, %r10 // k
8134	movq	ARG3, %r11  // A
8135	movq	ARG4, %r12  // B
8136	movq	ARG5, %r13  // ldb
8137	sall	$3, %r13d
8138
8139	movq	ARG12, %r14  // n1
8140	cmpl	$1, %r14d
8141	jg		100f
8142
8143#if MACRO_LEVEL>=2
8144	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
8145#else
8146	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
8147#endif
8148
8149	jmp		103f
8150
8151100:
8152
8153	movq	ARG12, %r14  // n1
8154	cmpl	$2, %r14d
8155	jg		101f
8156
8157#if MACRO_LEVEL>=2
8158	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
8159#else
8160	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
8161#endif
8162
8163	jmp		103f
8164
8165101:
8166
8167	movq	ARG12, %r14  // n1
8168	cmpl	$3, %r14d
8169	jg		102f
8170
8171#if MACRO_LEVEL>=2
8172	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
8173#else
8174	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
8175#endif
8176
8177	jmp		103f
8178
8179102:
8180
8181#if MACRO_LEVEL>=2
8182	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8183#else
8184	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8185#endif
8186
8187103:
8188
8189	movq	ARG12, %r14 // n1
8190
8191#if MACRO_LEVEL>=1
8192	INNER_EDGE_DTRMM_NN_RU_4X4_VS_LIB4C
8193#else
8194	CALL(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c)
8195#endif
8196
8197
8198	// call inner blend
8199
8200	movq	ARG2, %r10 // alpha
8201	movq	ARG6, %r11 // beta
8202	movq	ARG7, %r12   // C
8203	movq	ARG8, %r13   // ldc
8204	sall	$3, %r13d
8205	movq	ARG11, %r14 // m1
8206	movq	ARG12, %r15 // n1
8207
8208#if MACRO_LEVEL>=1
8209	INNER_SCALE_AB_4X4_VS_LIB
8210#else
8211	CALL(inner_scale_ab_4x4_vs_lib)
8212#endif
8213
8214
8215	// store n
8216
8217	movq	ARG9, %r10 // D
8218	movq	ARG10, %r11 // ldd
8219	sall	$3, %r11d
8220	movq	ARG11, %r12 // m1
8221	movq	ARG12, %r13 // n1
8222
8223#if MACRO_LEVEL>=1
8224	INNER_STORE_4X4_VS_LIB
8225#else
8226	CALL(inner_store_4x4_vs_lib)
8227#endif
8228
8229
8230	EPILOGUE
8231
8232	ret
8233
8234	FUN_END(kernel_dtrmm_nn_ru_4x4_vs_lib4ccc)
8235
8236
8237
8238
8239
8240//                                    1      2              3          4          5        6             7          8        9          10
8241// void kernel_dtrmm_nn_ru_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
8242
8243	.p2align 4,,15
8244	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_tran_lib4c4c)
8245
8246	PROLOGUE
8247
8248	// zero accumulation registers
8249
8250	ZERO_ACC
8251
8252
8253	// call inner dgemm kernel nn
8254
8255	movq	ARG1, %r10 // k
8256	movq	ARG3, %r11  // A
8257	movq	ARG4, %r12  // B
8258	movq	ARG5, %r13  // ldb
8259	sall	$3, %r13d
8260
8261#if MACRO_LEVEL>=2
8262	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8263#else
8264	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8265#endif
8266
8267#if MACRO_LEVEL>=1
8268	INNER_EDGE_DTRMM_NN_RU_4X4_LIB4C
8269#else
8270	CALL(inner_edge_dtrmm_nn_ru_4x4_lib4c)
8271#endif
8272
8273
8274	// call inner blend
8275
8276	movq	ARG2, %r10 // alpha
8277	movq	ARG6, %r11 // beta
8278	movq	ARG7, %r12   // C
8279
8280#if MACRO_LEVEL>=1
8281	INNER_SCALE_AB_4X4_LIB4
8282#else
8283	CALL(inner_scale_ab_4x4_lib4)
8284#endif
8285
8286
8287	// store n
8288
8289	movq	ARG8, %r10 // D
8290	movq	ARG9, %r11 // ldd
8291	sall	$3, %r11d
8292
8293#if MACRO_LEVEL>=1
8294	INNER_TRAN_STORE_4X4_LIB
8295#else
8296	CALL(inner_tran_store_4x4_lib)
8297#endif
8298
8299
8300	EPILOGUE
8301
8302	ret
8303
8304	FUN_END(kernel_dtrmm_nn_ru_4x4_tran_lib4c4c)
8305
8306
8307
8308
8309
8310//                                       1      2              3          4          5        6             7          8        9          10       11      12
8311// void kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
8312
8313	.p2align 4,,15
8314	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c)
8315
8316	PROLOGUE
8317
8318	// zero accumulation registers
8319
8320	ZERO_ACC
8321
8322
8323	// call inner dgemm kernel nn
8324
8325	movq	ARG1, %r10 // k
8326	movq	ARG3, %r11  // A
8327	movq	ARG4, %r12  // B
8328	movq	ARG5, %r13  // ldb
8329	sall	$3, %r13d
8330
8331	movq	ARG10, %r14  // m1
8332	cmpl	$1, %r14d
8333	jg		100f
8334
8335#if MACRO_LEVEL>=2
8336	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
8337#else
8338	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
8339#endif
8340
8341	jmp		103f
8342
8343100:
8344
8345	movq	ARG10, %r14  // m1
8346	cmpl	$2, %r14d
8347	jg		101f
8348
8349#if MACRO_LEVEL>=2
8350	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
8351#else
8352	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
8353#endif
8354
8355	jmp		103f
8356
8357101:
8358
8359	movq	ARG10, %r14  // m1
8360	cmpl	$3, %r14d
8361	jg		102f
8362
8363#if MACRO_LEVEL>=2
8364	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
8365#else
8366	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
8367#endif
8368
8369	jmp		103f
8370
8371102:
8372
8373#if MACRO_LEVEL>=2
8374	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8375#else
8376	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8377#endif
8378
8379103:
8380
8381	movq	ARG10, %r14 // m1
8382
8383#if MACRO_LEVEL>=1
8384	INNER_EDGE_DTRMM_NN_RU_4X4_VS_LIB4C
8385#else
8386	CALL(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c)
8387#endif
8388
8389
8390	// call inner blend
8391
8392	movq	ARG2, %r10 // alpha
8393	movq	ARG6, %r11 // beta
8394	movq	ARG7, %r12   // C
8395
8396#if MACRO_LEVEL>=1
8397	INNER_SCALE_AB_4X4_LIB4
8398#else
8399	CALL(inner_scale_ab_4x4_lib4)
8400#endif
8401
8402
8403	// store n
8404
8405	movq	ARG8, %r10 // D
8406	movq	ARG9, %r11 // ldd
8407	sall	$3, %r11d
8408	movq	ARG10, %r12 // m1
8409	movq	ARG11, %r13 // n1
8410
8411#if MACRO_LEVEL>=1
8412	INNER_TRAN_STORE_4X4_VS_LIB
8413#else
8414	CALL(inner_tran_store_4x4_vs_lib)
8415#endif
8416
8417
8418	EPILOGUE
8419
8420	ret
8421
8422	FUN_END(kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c)
8423
8424
8425
8426
8427
8428//                                    1      2              3          4          5        6             7          8        9          10
8429// void kernel_dtrmm_nn_ru_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
8430
8431	.p2align 4,,15
8432	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_lib4ccc)
8433
8434	PROLOGUE
8435
8436	// zero accumulation registers
8437
8438	ZERO_ACC
8439
8440
8441	// call inner dgemm kernel nn
8442
8443	movq	ARG1, %r10 // k
8444	movq	ARG3, %r11  // A
8445	movq	ARG4, %r12  // B
8446	movq	ARG5, %r13  // ldb
8447	sall	$3, %r13d
8448
8449#if MACRO_LEVEL>=2
8450	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8451#else
8452	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8453#endif
8454
8455#if MACRO_LEVEL>=1
8456	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_LIB4C
8457#else
8458	CALL(inner_edge_dtrmm_nn_ru_one_4x4_lib4c)
8459#endif
8460
8461
8462	// call inner blend
8463
8464	movq	ARG2, %r10 // alpha
8465	movq	ARG6, %r11 // beta
8466	movq	ARG7, %r12   // C
8467	movq	ARG8, %r13   // ldc
8468	sall	$3, %r13d
8469
8470#if MACRO_LEVEL>=1
8471	INNER_SCALE_AB_4X4_LIB
8472#else
8473	CALL(inner_scale_ab_4x4_lib)
8474#endif
8475
8476
8477	// store n
8478
8479	movq	ARG9, %r10 // D
8480	movq	ARG10, %r11 // ldd
8481	sall	$3, %r11d
8482
8483#if MACRO_LEVEL>=1
8484	INNER_STORE_4X4_LIB
8485#else
8486	CALL(inner_store_4x4_lib)
8487#endif
8488
8489
8490	EPILOGUE
8491
8492	ret
8493
8494	FUN_END(kernel_dtrmm_nn_ru_one_4x4_lib4ccc)
8495
8496
8497
8498
8499
8500//                                            1      2              3          4          5        6             7          8        9          10       11      12
8501// void kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
8502
8503	.p2align 4,,15
8504	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc)
8505
8506	PROLOGUE
8507
8508	// zero accumulation registers
8509
8510	ZERO_ACC
8511
8512
8513	// call inner dgemm kernel nn
8514
8515	movq	ARG1, %r10 // k
8516	movq	ARG3, %r11  // A
8517	movq	ARG4, %r12  // B
8518	movq	ARG5, %r13  // ldb
8519	sall	$3, %r13d
8520
8521	movq	ARG12, %r14  // n1
8522	cmpl	$1, %r14d
8523	jg		100f
8524
8525#if MACRO_LEVEL>=2
8526	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
8527#else
8528	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
8529#endif
8530
8531	jmp		103f
8532
8533100:
8534
8535	movq	ARG12, %r14  // n1
8536	cmpl	$2, %r14d
8537	jg		101f
8538
8539#if MACRO_LEVEL>=2
8540	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
8541#else
8542	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
8543#endif
8544
8545	jmp		103f
8546
8547101:
8548
8549	movq	ARG12, %r14  // n1
8550	cmpl	$3, %r14d
8551	jg		102f
8552
8553#if MACRO_LEVEL>=2
8554	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
8555#else
8556	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
8557#endif
8558
8559	jmp		103f
8560
8561102:
8562
8563#if MACRO_LEVEL>=2
8564	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8565#else
8566	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8567#endif
8568
8569103:
8570
8571	movq	ARG12, %r14 // n1
8572
8573#if MACRO_LEVEL>=1
8574	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_VS_LIB4C
8575#else
8576	CALL(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c)
8577#endif
8578
8579
8580	// call inner blend
8581
8582	movq	ARG2, %r10 // alpha
8583	movq	ARG6, %r11 // beta
8584	movq	ARG7, %r12   // C
8585	movq	ARG8, %r13   // ldc
8586	sall	$3, %r13d
8587	movq	ARG11, %r14 // m1
8588	movq	ARG12, %r15 // n1
8589
8590#if MACRO_LEVEL>=1
8591	INNER_SCALE_AB_4X4_VS_LIB
8592#else
8593	CALL(inner_scale_ab_4x4_vs_lib)
8594#endif
8595
8596
8597	// store n
8598
8599	movq	ARG9, %r10 // D
8600	movq	ARG10, %r11 // ldd
8601	sall	$3, %r11d
8602	movq	ARG11, %r12 // m1
8603	movq	ARG12, %r13 // n1
8604
8605#if MACRO_LEVEL>=1
8606	INNER_STORE_4X4_VS_LIB
8607#else
8608	CALL(inner_store_4x4_vs_lib)
8609#endif
8610
8611
8612	EPILOGUE
8613
8614	ret
8615
8616	FUN_END(kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc)
8617
8618
8619
8620
8621
8622//                                    1      2              3          4          5        6             7          8        9          10
8623// void kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
8624
8625	.p2align 4,,15
8626	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c)
8627
8628	PROLOGUE
8629
8630	// zero accumulation registers
8631
8632	ZERO_ACC
8633
8634
8635	// call inner dgemm kernel nn
8636
8637	movq	ARG1, %r10 // k
8638	movq	ARG3, %r11  // A
8639	movq	ARG4, %r12  // B
8640	movq	ARG5, %r13  // ldb
8641	sall	$3, %r13d
8642
8643#if MACRO_LEVEL>=2
8644	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8645#else
8646	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8647#endif
8648
8649#if MACRO_LEVEL>=1
8650	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_LIB4C
8651#else
8652	CALL(inner_edge_dtrmm_nn_ru_one_4x4_lib4c)
8653#endif
8654
8655
8656	// call inner blend
8657
8658	movq	ARG2, %r10 // alpha
8659	movq	ARG6, %r11 // beta
8660	movq	ARG7, %r12   // C
8661
8662#if MACRO_LEVEL>=1
8663	INNER_SCALE_AB_4X4_LIB4
8664#else
8665	CALL(inner_scale_ab_4x4_lib4)
8666#endif
8667
8668
8669	// store n
8670
8671	movq	ARG8, %r10 // D
8672	movq	ARG9, %r11 // ldd
8673	sall	$3, %r11d
8674
8675#if MACRO_LEVEL>=1
8676	INNER_TRAN_STORE_4X4_LIB
8677#else
8678	CALL(inner_tran_store_4x4_lib)
8679#endif
8680
8681
8682	EPILOGUE
8683
8684	ret
8685
8686	FUN_END(kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c)
8687
8688
8689
8690
8691
8692//                                       1      2              3          4          5        6             7          8        9          10       11      12
8693// void kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
8694
8695	.p2align 4,,15
8696	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c)
8697
8698	PROLOGUE
8699
8700	// zero accumulation registers
8701
8702	ZERO_ACC
8703
8704
8705	// call inner dgemm kernel nn
8706
8707	movq	ARG1, %r10 // k
8708	movq	ARG3, %r11  // A
8709	movq	ARG4, %r12  // B
8710	movq	ARG5, %r13  // ldb
8711	sall	$3, %r13d
8712
8713	movq	ARG10, %r14  // m1
8714	cmpl	$1, %r14d
8715	jg		100f
8716
8717#if MACRO_LEVEL>=2
8718	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
8719#else
8720	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
8721#endif
8722
8723	jmp		103f
8724
8725100:
8726
8727	movq	ARG10, %r14  // m1
8728	cmpl	$2, %r14d
8729	jg		101f
8730
8731#if MACRO_LEVEL>=2
8732	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
8733#else
8734	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
8735#endif
8736
8737	jmp		103f
8738
8739101:
8740
8741	movq	ARG10, %r14  // m1
8742	cmpl	$3, %r14d
8743	jg		102f
8744
8745#if MACRO_LEVEL>=2
8746	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
8747#else
8748	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
8749#endif
8750
8751	jmp		103f
8752
8753102:
8754
8755#if MACRO_LEVEL>=2
8756	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
8757#else
8758	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
8759#endif
8760
8761103:
8762
8763	movq	ARG10, %r14 // m1
8764
8765#if MACRO_LEVEL>=1
8766	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_VS_LIB4C
8767#else
8768	CALL(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c)
8769#endif
8770
8771
8772	// call inner blend
8773
8774	movq	ARG2, %r10 // alpha
8775	movq	ARG6, %r11 // beta
8776	movq	ARG7, %r12   // C
8777
8778#if MACRO_LEVEL>=1
8779	INNER_SCALE_AB_4X4_LIB4
8780#else
8781	CALL(inner_scale_ab_4x4_lib4)
8782#endif
8783
8784
8785	// store n
8786
8787	movq	ARG8, %r10 // D
8788	movq	ARG9, %r11 // ldd
8789	sall	$3, %r11d
8790	movq	ARG10, %r12 // m1
8791	movq	ARG11, %r13 // n1
8792
8793#if MACRO_LEVEL>=1
8794	INNER_TRAN_STORE_4X4_VS_LIB
8795#else
8796	CALL(inner_tran_store_4x4_vs_lib)
8797#endif
8798
8799
8800	EPILOGUE
8801
8802	ret
8803
8804	FUN_END(kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c)
8805
8806
8807
8808
8809
8810//                                    1      2              3          4          5             6          7        8          9
8811// void kernel_dtrmm_nt_rl_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
8812
8813	.p2align 4,,15
8814	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_lib44cc)
8815
8816	PROLOGUE
8817
8818	// zero accumulation registers
8819
8820	ZERO_ACC
8821
8822
8823	// call inner dgemm kernel nn
8824
8825	movq	ARG1, %r10 // k
8826	movq	ARG3, %r11 // A
8827	movq	ARG4, %r12 // B
8828
8829#if MACRO_LEVEL>=1
8830//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
8831#else
8832//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
8833#endif
8834
8835#if MACRO_LEVEL>=2
8836	INNER_KERNEL_DGEMM_NT_4X4_LIB4
8837#else
8838	CALL(inner_kernel_dgemm_nt_4x4_lib4)
8839#endif
8840
8841
8842	// call inner blend
8843
8844#if MACRO_LEVEL>=1
8845	INNER_BLEND_4X4_LIB4
8846#else
8847	CALL(inner_blend_4x4_lib4)
8848#endif
8849
8850
8851	// final triangle
8852
8853//	movq	ARG1, %r10
8854//	movq	ARG3, %r11
8855//	movq	ARG4, %r12
8856
8857#if MACRO_LEVEL>=1
8858	INNER_EDGE_DTRMM_NT_RL_4X4_LIB4
8859#else
8860	CALL(inner_edge_dtrmm_nt_rl_4x4_lib4)
8861#endif
8862
8863
8864	// call inner blend
8865
8866	movq	ARG2, %r10 // alpha
8867	movq	ARG5, %r11 // beta
8868	movq	ARG6, %r12   // C
8869	movq	ARG7, %r13   // ldc
8870	sall	$3, %r13d
8871
8872#if MACRO_LEVEL>=1
8873	INNER_SCALE_AB_4X4_LIB
8874#else
8875	CALL(inner_scale_ab_4x4_lib)
8876#endif
8877
8878
8879	// store n
8880
8881	movq	ARG8, %r10 // D
8882	movq	ARG9, %r11 // ldd
8883	sall	$3, %r11d
8884
8885#if MACRO_LEVEL>=1
8886	INNER_STORE_4X4_LIB
8887#else
8888	CALL(inner_store_4x4_lib)
8889#endif
8890
8891
8892	EPILOGUE
8893
8894	ret
8895
8896	FUN_END(kernel_dtrmm_nt_rl_4x4_lib44cc)
8897
8898
8899
8900
8901
8902//                                        1      2              3          4          5             6          7        8          9        10      11
8903// void kernel_dtrmm_nt_rl_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
8904
8905	.p2align 4,,15
8906	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_vs_lib44cc)
8907
8908	PROLOGUE
8909
8910	// zero accumulation registers
8911
8912	ZERO_ACC
8913
8914
8915	// call inner dgemm kernel nn
8916
8917	movq	ARG1, %r10 // k
8918//	subl	$4, %r10d
8919	movq	ARG3, %r11 // A
8920//	addq	$128, %r11
8921	movq	ARG4, %r12 // B
8922//	addq	$128, %r12
8923
8924#if MACRO_LEVEL>=1
8925//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
8926#else
8927//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
8928#endif
8929
8930#if MACRO_LEVEL>=2
8931	INNER_KERNEL_DGEMM_NT_4X4_LIB4
8932#else
8933	CALL(inner_kernel_dgemm_nt_4x4_lib4)
8934#endif
8935
8936
8937	// call inner blend
8938
8939#if MACRO_LEVEL>=1
8940	INNER_BLEND_4X4_LIB4
8941#else
8942	CALL(inner_blend_4x4_lib4)
8943#endif
8944
8945
8946	// initial triangle
8947
8948//	movq	ARG1, %r10
8949//	movq	ARG3, %r11
8950//	movq	ARG4, %r12
8951	movq	ARG11, %r13
8952
8953#if MACRO_LEVEL>=1
8954	INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4
8955#else
8956	CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4)
8957#endif
8958
8959
8960	// call inner blend
8961
8962	movq	ARG2, %r10 // alpha
8963	movq	ARG5, %r11 // beta
8964	movq	ARG6, %r12   // C
8965	movq	ARG7, %r13   // ldc
8966	sall	$3, %r13d
8967	movq	ARG10, %r14   // m1
8968	movq	ARG11, %r15   // n1
8969
8970#if MACRO_LEVEL>=1
8971	INNER_SCALE_AB_4X4_VS_LIB
8972#else
8973	CALL(inner_scale_ab_4x4_vs_lib)
8974#endif
8975
8976
8977	// store n
8978
8979	movq	ARG8, %r10 // D
8980	movq	ARG9, %r11 // ldd
8981	sall	$3, %r11d
8982	movq	ARG10, %r12   // m1
8983	movq	ARG11, %r13   // n1
8984
8985#if MACRO_LEVEL>=1
8986	INNER_STORE_4X4_VS_LIB
8987#else
8988	CALL(inner_store_4x4_vs_lib)
8989#endif
8990
8991
8992	EPILOGUE
8993
8994	ret
8995
8996	FUN_END(kernel_dtrmm_nt_rl_4x4_vs_lib44cc)
8997
8998
8999
9000
9001
9002//                                    1      2              3          4          5             6          7        8          9
9003// void kernel_dtrmm_nt_rl_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd);
9004
9005	.p2align 4,,15
9006	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_lib444c)
9007
9008	PROLOGUE
9009
9010	// zero accumulation registers
9011
9012	ZERO_ACC
9013
9014
9015	// call inner dgemm kernel nn
9016
9017	movq	ARG1, %r10 // k
9018	movq	ARG3, %r11 // A
9019	movq	ARG4, %r12 // B
9020
9021#if MACRO_LEVEL>=1
9022//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
9023#else
9024//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
9025#endif
9026
9027#if MACRO_LEVEL>=2
9028	INNER_KERNEL_DGEMM_NT_4X4_LIB4
9029#else
9030	CALL(inner_kernel_dgemm_nt_4x4_lib4)
9031#endif
9032
9033
9034	// call inner blend
9035
9036#if MACRO_LEVEL>=1
9037	INNER_BLEND_4X4_LIB4
9038#else
9039	CALL(inner_blend_4x4_lib4)
9040#endif
9041
9042
9043	// final triangle
9044
9045//	movq	ARG1, %r10
9046//	movq	ARG3, %r11
9047//	movq	ARG4, %r12
9048
9049#if MACRO_LEVEL>=1
9050	INNER_EDGE_DTRMM_NT_RL_4X4_LIB4
9051#else
9052	CALL(inner_edge_dtrmm_nt_rl_4x4_lib4)
9053#endif
9054
9055
9056	// call inner blend
9057
9058	movq	ARG2, %r10 // alpha
9059	movq	ARG5, %r11 // beta
9060	movq	ARG6, %r12   // C
9061
9062#if MACRO_LEVEL>=1
9063	INNER_SCALE_AB_4X4_LIB4
9064#else
9065	CALL(inner_scale_ab_4x4_lib4)
9066#endif
9067
9068
9069	// store n
9070
9071	movq	ARG7, %r10 // D
9072	movq	ARG8, %r11 // ldd
9073	sall	$3, %r11d
9074
9075#if MACRO_LEVEL>=1
9076	INNER_TRAN_STORE_4X4_LIB
9077#else
9078	CALL(inner_tran_store_4x4_lib)
9079#endif
9080
9081
9082	EPILOGUE
9083
9084	ret
9085
9086	FUN_END(kernel_dtrmm_nt_rl_4x4_tran_lib444c)
9087
9088
9089
9090
9091
9092//                                       1      2              3          4          5             6          7        8          9        10      11
9093// void kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1);
9094
9095	.p2align 4,,15
9096	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c)
9097
9098	PROLOGUE
9099
9100	// zero accumulation registers
9101
9102	ZERO_ACC
9103
9104
9105	// call inner dgemm kernel nn
9106
9107	movq	ARG1, %r10 // k
9108//	subl	$4, %r10d
9109	movq	ARG3, %r11 // A
9110//	addq	$128, %r11
9111	movq	ARG4, %r12 // B
9112//	addq	$128, %r12
9113
9114#if MACRO_LEVEL>=1
9115//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
9116#else
9117//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
9118#endif
9119
9120#if MACRO_LEVEL>=2
9121	INNER_KERNEL_DGEMM_NT_4X4_LIB4
9122#else
9123	CALL(inner_kernel_dgemm_nt_4x4_lib4)
9124#endif
9125
9126
9127	// call inner blend
9128
9129#if MACRO_LEVEL>=1
9130	INNER_BLEND_4X4_LIB4
9131#else
9132	CALL(inner_blend_4x4_lib4)
9133#endif
9134
9135
9136	// initial triangle
9137
9138//	movq	ARG1, %r10
9139//	movq	ARG3, %r11
9140//	movq	ARG4, %r12
9141	movq	ARG9, %r13 // m1
9142
9143#if MACRO_LEVEL>=1
9144	INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4
9145#else
9146	CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4)
9147#endif
9148
9149
9150	// call inner blend
9151
9152	movq	ARG2, %r10 // alpha
9153	movq	ARG5, %r11 // beta
9154	movq	ARG6, %r12   // C
9155
9156#if MACRO_LEVEL>=1
9157	INNER_SCALE_AB_4X4_LIB4
9158#else
9159	CALL(inner_scale_ab_4x4_lib4)
9160#endif
9161
9162
9163	// store n
9164
9165	movq	ARG7, %r10 // D
9166	movq	ARG8, %r11 // ldd
9167	sall	$3, %r11d
9168	movq	ARG9, %r12   // m1
9169	movq	ARG10, %r13   // n1
9170
9171#if MACRO_LEVEL>=1
9172	INNER_TRAN_STORE_4X4_VS_LIB
9173#else
9174	CALL(inner_tran_store_4x4_vs_lib)
9175#endif
9176
9177
9178	EPILOGUE
9179
9180	ret
9181
9182	FUN_END(kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c)
9183
9184
9185
9186
9187
9188//                                    1      2              3          4          5             6          7        8          9
9189// void kernel_dtrmm_nt_rl_one_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
9190
9191	.p2align 4,,15
9192	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_lib44cc)
9193
9194	PROLOGUE
9195
9196	// zero accumulation registers
9197
9198	ZERO_ACC
9199
9200
9201	// call inner dgemm kernel nn
9202
9203	movq	ARG1, %r10 // k
9204	movq	ARG3, %r11 // A
9205	movq	ARG4, %r12 // B
9206
9207#if MACRO_LEVEL>=1
9208//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
9209#else
9210//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
9211#endif
9212
9213#if MACRO_LEVEL>=2
9214	INNER_KERNEL_DGEMM_NT_4X4_LIB4
9215#else
9216	CALL(inner_kernel_dgemm_nt_4x4_lib4)
9217#endif
9218
9219
9220	// call inner blend
9221
9222#if MACRO_LEVEL>=1
9223	INNER_BLEND_4X4_LIB4
9224#else
9225	CALL(inner_blend_4x4_lib4)
9226#endif
9227
9228
9229	// final triangle
9230
9231//	movq	ARG1, %r10
9232//	movq	ARG3, %r11
9233//	movq	ARG4, %r12
9234
9235#if MACRO_LEVEL>=1
9236	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4
9237#else
9238	CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4)
9239#endif
9240
9241
9242	// call inner blend
9243
9244	movq	ARG2, %r10 // alpha
9245	movq	ARG5, %r11 // beta
9246	movq	ARG6, %r12   // C
9247	movq	ARG7, %r13   // ldc
9248	sall	$3, %r13d
9249
9250#if MACRO_LEVEL>=1
9251	INNER_SCALE_AB_4X4_LIB
9252#else
9253	CALL(inner_scale_ab_4x4_lib)
9254#endif
9255
9256
9257	// store n
9258
9259	movq	ARG8, %r10 // D
9260	movq	ARG9, %r11 // ldd
9261	sall	$3, %r11d
9262
9263#if MACRO_LEVEL>=1
9264	INNER_STORE_4X4_LIB
9265#else
9266	CALL(inner_store_4x4_lib)
9267#endif
9268
9269
9270	EPILOGUE
9271
9272	ret
9273
9274	FUN_END(kernel_dtrmm_nt_rl_one_4x4_lib44cc)
9275
9276
9277
9278
9279
9280//                                            1      2              3          4          5             6          7        8          9        10      11
9281// void kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
9282
9283	.p2align 4,,15
9284	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc)
9285
9286	PROLOGUE
9287
9288	// zero accumulation registers
9289
9290	ZERO_ACC
9291
9292
9293	// call inner dgemm kernel nn
9294
9295	movq	ARG1, %r10 // k
9296//	subl	$4, %r10d
9297	movq	ARG3, %r11 // A
9298//	addq	$128, %r11
9299	movq	ARG4, %r12 // B
9300//	addq	$128, %r12
9301
9302#if MACRO_LEVEL>=1
9303//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
9304#else
9305//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
9306#endif
9307
9308#if MACRO_LEVEL>=2
9309	INNER_KERNEL_DGEMM_NT_4X4_LIB4
9310#else
9311	CALL(inner_kernel_dgemm_nt_4x4_lib4)
9312#endif
9313
9314
9315	// call inner blend
9316
9317#if MACRO_LEVEL>=1
9318	INNER_BLEND_4X4_LIB4
9319#else
9320	CALL(inner_blend_4x4_lib4)
9321#endif
9322
9323
9324	// initial triangle
9325
9326//	movq	ARG1, %r10
9327//	movq	ARG3, %r11
9328//	movq	ARG4, %r12
9329	movq	ARG11, %r13
9330
9331#if MACRO_LEVEL>=1
9332	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4
9333#else
9334	CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4)
9335#endif
9336
9337
9338	// call inner blend
9339
9340	movq	ARG2, %r10 // alpha
9341	movq	ARG5, %r11 // beta
9342	movq	ARG6, %r12   // C
9343	movq	ARG7, %r13   // ldc
9344	sall	$3, %r13d
9345	movq	ARG10, %r14   // m1
9346	movq	ARG11, %r15   // n1
9347
9348#if MACRO_LEVEL>=1
9349	INNER_SCALE_AB_4X4_VS_LIB
9350#else
9351	CALL(inner_scale_ab_4x4_vs_lib)
9352#endif
9353
9354
9355	// store n
9356
9357	movq	ARG8, %r10 // D
9358	movq	ARG9, %r11 // ldd
9359	sall	$3, %r11d
9360	movq	ARG10, %r12   // m1
9361	movq	ARG11, %r13   // n1
9362
9363#if MACRO_LEVEL>=1
9364	INNER_STORE_4X4_VS_LIB
9365#else
9366	CALL(inner_store_4x4_vs_lib)
9367#endif
9368
9369
9370	EPILOGUE
9371
9372	ret
9373
9374	FUN_END(kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc)
9375
9376
9377
9378
9379
9380//                                    1      2              3          4          5             6          7        8          9
9381// void kernel_dtrmm_nt_rl_one_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd);
9382
9383	.p2align 4,,15
9384	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_lib444c)
9385
9386	PROLOGUE
9387
9388	// zero accumulation registers
9389
9390	ZERO_ACC
9391
9392
9393	// call inner dgemm kernel nn
9394
9395	movq	ARG1, %r10 // k
9396	movq	ARG3, %r11 // A
9397	movq	ARG4, %r12 // B
9398
9399#if MACRO_LEVEL>=1
9400//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
9401#else
9402//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
9403#endif
9404
9405#if MACRO_LEVEL>=2
9406	INNER_KERNEL_DGEMM_NT_4X4_LIB4
9407#else
9408	CALL(inner_kernel_dgemm_nt_4x4_lib4)
9409#endif
9410
9411
9412	// call inner blend
9413
9414#if MACRO_LEVEL>=1
9415	INNER_BLEND_4X4_LIB4
9416#else
9417	CALL(inner_blend_4x4_lib4)
9418#endif
9419
9420
9421	// final triangle
9422
9423//	movq	ARG1, %r10
9424//	movq	ARG3, %r11
9425//	movq	ARG4, %r12
9426
9427#if MACRO_LEVEL>=1
9428	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4
9429#else
9430	CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4)
9431#endif
9432
9433
9434	// call inner blend
9435
9436	movq	ARG2, %r10 // alpha
9437	movq	ARG5, %r11 // beta
9438	movq	ARG6, %r12   // C
9439
9440#if MACRO_LEVEL>=1
9441	INNER_SCALE_AB_4X4_LIB4
9442#else
9443	CALL(inner_scale_ab_4x4_lib4)
9444#endif
9445
9446
9447	// store n
9448
9449	movq	ARG7, %r10 // D
9450	movq	ARG8, %r11 // ldd
9451	sall	$3, %r11d
9452
9453#if MACRO_LEVEL>=1
9454	INNER_TRAN_STORE_4X4_LIB
9455#else
9456	CALL(inner_tran_store_4x4_lib)
9457#endif
9458
9459
9460	EPILOGUE
9461
9462	ret
9463
9464	FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_lib444c)
9465
9466
9467
9468
9469
9470//                                       1      2              3          4          5             6          7        8          9        10      11
9471// void kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1);
9472
9473	.p2align 4,,15
9474	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c)
9475
9476	PROLOGUE
9477
9478	// zero accumulation registers
9479
9480	ZERO_ACC
9481
9482
9483	// call inner dgemm kernel nn
9484
9485	movq	ARG1, %r10 // k
9486//	subl	$4, %r10d
9487	movq	ARG3, %r11 // A
9488//	addq	$128, %r11
9489	movq	ARG4, %r12 // B
9490//	addq	$128, %r12
9491
9492#if MACRO_LEVEL>=1
9493//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
9494#else
9495//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
9496#endif
9497
9498#if MACRO_LEVEL>=2
9499	INNER_KERNEL_DGEMM_NT_4X4_LIB4
9500#else
9501	CALL(inner_kernel_dgemm_nt_4x4_lib4)
9502#endif
9503
9504
9505	// call inner blend
9506
9507#if MACRO_LEVEL>=1
9508	INNER_BLEND_4X4_LIB4
9509#else
9510	CALL(inner_blend_4x4_lib4)
9511#endif
9512
9513
9514	// initial triangle
9515
9516//	movq	ARG1, %r10
9517//	movq	ARG3, %r11
9518//	movq	ARG4, %r12
9519	movq	ARG9, %r13 // m1
9520
9521#if MACRO_LEVEL>=1
9522	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4
9523#else
9524	CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4)
9525#endif
9526
9527
9528	// call inner blend
9529
9530	movq	ARG2, %r10 // alpha
9531	movq	ARG5, %r11 // beta
9532	movq	ARG6, %r12   // C
9533
9534#if MACRO_LEVEL>=1
9535	INNER_SCALE_AB_4X4_LIB4
9536#else
9537	CALL(inner_scale_ab_4x4_lib4)
9538#endif
9539
9540
9541	// store n
9542
9543	movq	ARG7, %r10 // D
9544	movq	ARG8, %r11 // ldd
9545	sall	$3, %r11d
9546	movq	ARG9, %r12   // m1
9547	movq	ARG10, %r13   // n1
9548
9549#if MACRO_LEVEL>=1
9550	INNER_TRAN_STORE_4X4_VS_LIB
9551#else
9552	CALL(inner_tran_store_4x4_vs_lib)
9553#endif
9554
9555
9556	EPILOGUE
9557
9558	ret
9559
9560	FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c)
9561
9562
9563
9564
9565
9566//                                    1      2              3          4          5        6             7          8        9          10
9567// void kernel_dtrmm_nt_rl_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
9568
9569	.p2align 4,,15
9570	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_lib4ccc)
9571
9572	PROLOGUE
9573
9574	// zero accumulation registers
9575
9576	ZERO_ACC
9577
9578
9579	// call inner dgemm kernel nn
9580
9581	movq	ARG1, %r10 // k
9582	movq	ARG3, %r11  // A
9583	movq	ARG4, %r12  // B
9584	movq	ARG5, %r13  // ldb
9585	sall	$3, %r13d
9586
9587#if MACRO_LEVEL>=2
9588	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
9589#else
9590	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
9591#endif
9592
9593#if MACRO_LEVEL>=1
9594	INNER_EDGE_DTRMM_NT_RL_4X4_LIB4C
9595#else
9596	CALL(inner_edge_dtrmm_nt_rl_4x4_lib4c)
9597#endif
9598
9599
9600	// call inner blend
9601
9602	movq	ARG2, %r10 // alpha
9603	movq	ARG6, %r11 // beta
9604	movq	ARG7, %r12   // C
9605	movq	ARG8, %r13   // ldc
9606	sall	$3, %r13d
9607
9608#if MACRO_LEVEL>=1
9609	INNER_SCALE_AB_4X4_LIB
9610#else
9611	CALL(inner_scale_ab_4x4_lib)
9612#endif
9613
9614
9615	// store n
9616
9617	movq	ARG9, %r10 // D
9618	movq	ARG10, %r11 // ldd
9619	sall	$3, %r11d
9620
9621#if MACRO_LEVEL>=1
9622	INNER_STORE_4X4_LIB
9623#else
9624	CALL(inner_store_4x4_lib)
9625#endif
9626
9627
9628	EPILOGUE
9629
9630	ret
9631
9632	FUN_END(kernel_dtrmm_nt_rl_4x4_lib4ccc)
9633
9634
9635
9636
9637
9638//                                        1      2              3          4          5        6             7          8        9          10       11      12
9639// void kernel_dtrmm_nt_rl_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
9640
9641	.p2align 4,,15
9642	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_vs_lib4ccc)
9643
9644	PROLOGUE
9645
9646	// zero accumulation registers
9647
9648	ZERO_ACC
9649
9650
9651	// call inner dgemm kernel nn
9652
9653	movq	ARG1, %r10 // k
9654	movq	ARG3, %r11  // A
9655	movq	ARG4, %r12  // B
9656	movq	ARG5, %r13  // ldb
9657	sall	$3, %r13d
9658
9659	movq	ARG12, %r14  // n1
9660	cmpl	$1, %r14d
9661	jg		100f
9662
9663#if MACRO_LEVEL>=2
9664	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
9665#else
9666	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
9667#endif
9668
9669	jmp		103f
9670
9671100:
9672
9673	movq	ARG12, %r14  // n1
9674	cmpl	$2, %r14d
9675	jg		101f
9676
9677#if MACRO_LEVEL>=2
9678	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
9679#else
9680	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
9681#endif
9682
9683	jmp		103f
9684
9685101:
9686
9687	movq	ARG12, %r14  // n1
9688	cmpl	$3, %r14d
9689	jg		102f
9690
9691#if MACRO_LEVEL>=2
9692	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
9693#else
9694	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
9695#endif
9696
9697	jmp		103f
9698
9699102:
9700
9701#if MACRO_LEVEL>=2
9702	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
9703#else
9704	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
9705#endif
9706
9707103:
9708
9709	movq	ARG12, %r14 // n1
9710
9711#if MACRO_LEVEL>=1
9712	INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4C
9713#else
9714	CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c)
9715#endif
9716
9717
9718	// call inner blend
9719
9720	movq	ARG2, %r10 // alpha
9721	movq	ARG6, %r11 // beta
9722	movq	ARG7, %r12   // C
9723	movq	ARG8, %r13   // ldc
9724	sall	$3, %r13d
9725	movq	ARG11, %r14 // m1
9726	movq	ARG12, %r15 // n1
9727
9728#if MACRO_LEVEL>=1
9729	INNER_SCALE_AB_4X4_VS_LIB
9730#else
9731	CALL(inner_scale_ab_4x4_vs_lib)
9732#endif
9733
9734
9735	// store n
9736
9737	movq	ARG9, %r10 // D
9738	movq	ARG10, %r11 // ldd
9739	sall	$3, %r11d
9740	movq	ARG11, %r12 // m1
9741	movq	ARG12, %r13 // n1
9742
9743#if MACRO_LEVEL>=1
9744	INNER_STORE_4X4_VS_LIB
9745#else
9746	CALL(inner_store_4x4_vs_lib)
9747#endif
9748
9749
9750	EPILOGUE
9751
9752	ret
9753
9754	FUN_END(kernel_dtrmm_nt_rl_4x4_vs_lib4ccc)
9755
9756
9757
9758
9759
9760//                                         1      2              3          4          5        6             7          8          9
9761// void kernel_dtrmm_nt_rl_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
9762
9763	.p2align 4,,15
9764	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_lib4c4c)
9765
9766	PROLOGUE
9767
9768	// zero accumulation registers
9769
9770	ZERO_ACC
9771
9772
9773	// call inner dgemm kernel nn
9774
9775	movq	ARG1, %r10 // k
9776	movq	ARG3, %r11  // A
9777	movq	ARG4, %r12  // B
9778	movq	ARG5, %r13  // ldb
9779	sall	$3, %r13d
9780
9781#if MACRO_LEVEL>=2
9782	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
9783#else
9784	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
9785#endif
9786
9787#if MACRO_LEVEL>=1
9788	INNER_EDGE_DTRMM_NT_RL_4X4_LIB4C
9789#else
9790	CALL(inner_edge_dtrmm_nt_rl_4x4_lib4c)
9791#endif
9792
9793
9794	// call inner blend
9795
9796	movq	ARG2, %r10 // alpha
9797	movq	ARG6, %r11 // beta
9798	movq	ARG7, %r12   // C
9799
9800#if MACRO_LEVEL>=1
9801	INNER_SCALE_AB_4X4_LIB4
9802#else
9803	CALL(inner_scale_ab_4x4_lib4)
9804#endif
9805
9806
9807	// store n
9808
9809	movq	ARG8, %r10 // D
9810	movq	ARG9, %r11 // ldd
9811	sall	$3, %r11d
9812
9813#if MACRO_LEVEL>=1
9814	INNER_TRAN_STORE_4X4_LIB
9815#else
9816	CALL(inner_tran_store_4x4_lib)
9817#endif
9818
9819
9820	EPILOGUE
9821
9822	ret
9823
9824	FUN_END(kernel_dtrmm_nt_rl_4x4_tran_lib4c4c)
9825
9826
9827
9828
9829
9830//                                            1      2              3          4          5        6             7          8          9        10      11
9831// void kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
9832
9833	.p2align 4,,15
9834	GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c)
9835
9836	PROLOGUE
9837
9838	// zero accumulation registers
9839
9840	ZERO_ACC
9841
9842
9843	// call inner dgemm kernel nn
9844
9845	movq	ARG1, %r10 // k
9846	movq	ARG3, %r11  // A
9847	movq	ARG4, %r12  // B
9848	movq	ARG5, %r13  // ldb
9849	sall	$3, %r13d
9850
9851	movq	ARG10, %r14  // m1
9852	cmpl	$1, %r14d
9853	jg		100f
9854
9855#if MACRO_LEVEL>=2
9856	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
9857#else
9858	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
9859#endif
9860
9861	jmp		103f
9862
9863100:
9864
9865	movq	ARG10, %r14  // m1
9866	cmpl	$2, %r14d
9867	jg		101f
9868
9869#if MACRO_LEVEL>=2
9870	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
9871#else
9872	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
9873#endif
9874
9875	jmp		103f
9876
9877101:
9878
9879	movq	ARG10, %r14  // m1
9880	cmpl	$3, %r14d
9881	jg		102f
9882
9883#if MACRO_LEVEL>=2
9884	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
9885#else
9886	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
9887#endif
9888
9889	jmp		103f
9890
9891102:
9892
9893#if MACRO_LEVEL>=2
9894	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
9895#else
9896	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
9897#endif
9898
9899103:
9900
9901	movq	ARG10, %r14 // m1
9902
9903#if MACRO_LEVEL>=1
9904	INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4C
9905#else
9906	CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c)
9907#endif
9908
9909
9910	// call inner blend
9911
9912	movq	ARG2, %r10 // alpha
9913	movq	ARG6, %r11 // beta
9914	movq	ARG7, %r12   // C
9915
9916#if MACRO_LEVEL>=1
9917	INNER_SCALE_AB_4X4_LIB4
9918#else
9919	CALL(inner_scale_ab_4x4_lib4)
9920#endif
9921
9922
9923	// store n
9924
9925	movq	ARG8, %r10 // D
9926	movq	ARG9, %r11 // ldd
9927	sall	$3, %r11d
9928	movq	ARG10, %r12 // m1
9929	movq	ARG11, %r13 // n1
9930
9931#if MACRO_LEVEL>=1
9932	INNER_TRAN_STORE_4X4_VS_LIB
9933#else
9934	CALL(inner_tran_store_4x4_vs_lib)
9935#endif
9936
9937
9938	EPILOGUE
9939
9940	ret
9941
9942	FUN_END(kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c)
9943
9944
9945
9946
9947
9948//                                    1      2              3          4          5        6             7          8        9          10
9949// void kernel_dtrmm_nt_rl_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
9950
9951	.p2align 4,,15
9952	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_lib4ccc)
9953
9954	PROLOGUE
9955
9956	// zero accumulation registers
9957
9958	ZERO_ACC
9959
9960
9961	// call inner dgemm kernel nn
9962
9963	movq	ARG1, %r10 // k
9964	movq	ARG3, %r11  // A
9965	movq	ARG4, %r12  // B
9966	movq	ARG5, %r13  // ldb
9967	sall	$3, %r13d
9968
9969#if MACRO_LEVEL>=2
9970	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
9971#else
9972	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
9973#endif
9974
9975#if MACRO_LEVEL>=1
9976	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4C
9977#else
9978	CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4c)
9979#endif
9980
9981
9982	// call inner blend
9983
9984	movq	ARG2, %r10 // alpha
9985	movq	ARG6, %r11 // beta
9986	movq	ARG7, %r12   // C
9987	movq	ARG8, %r13   // ldc
9988	sall	$3, %r13d
9989
9990#if MACRO_LEVEL>=1
9991	INNER_SCALE_AB_4X4_LIB
9992#else
9993	CALL(inner_scale_ab_4x4_lib)
9994#endif
9995
9996
9997	// store n
9998
9999	movq	ARG9, %r10 // D
10000	movq	ARG10, %r11 // ldd
10001	sall	$3, %r11d
10002
10003#if MACRO_LEVEL>=1
10004	INNER_STORE_4X4_LIB
10005#else
10006	CALL(inner_store_4x4_lib)
10007#endif
10008
10009
10010	EPILOGUE
10011
10012	ret
10013
10014	FUN_END(kernel_dtrmm_nt_rl_one_4x4_lib4ccc)
10015
10016
10017
10018
10019
10020//                                            1      2              3          4          5        6             7          8        9          10       11      12
10021// void kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
10022
10023	.p2align 4,,15
10024	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc)
10025
10026	PROLOGUE
10027
10028	// zero accumulation registers
10029
10030	ZERO_ACC
10031
10032
10033	// call inner dgemm kernel nn
10034
10035	movq	ARG1, %r10 // k
10036	movq	ARG3, %r11  // A
10037	movq	ARG4, %r12  // B
10038	movq	ARG5, %r13  // ldb
10039	sall	$3, %r13d
10040
10041	movq	ARG12, %r14  // n1
10042	cmpl	$1, %r14d
10043	jg		100f
10044
10045#if MACRO_LEVEL>=2
10046	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
10047#else
10048	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
10049#endif
10050
10051	jmp		103f
10052
10053100:
10054
10055	movq	ARG12, %r14  // n1
10056	cmpl	$2, %r14d
10057	jg		101f
10058
10059#if MACRO_LEVEL>=2
10060	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
10061#else
10062	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
10063#endif
10064
10065	jmp		103f
10066
10067101:
10068
10069	movq	ARG12, %r14  // n1
10070	cmpl	$3, %r14d
10071	jg		102f
10072
10073#if MACRO_LEVEL>=2
10074	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
10075#else
10076	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
10077#endif
10078
10079	jmp		103f
10080
10081102:
10082
10083#if MACRO_LEVEL>=2
10084	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
10085#else
10086	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
10087#endif
10088
10089103:
10090
10091	movq	ARG12, %r14 // n1
10092
10093#if MACRO_LEVEL>=1
10094	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4C
10095#else
10096	CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c)
10097#endif
10098
10099
10100	// call inner blend
10101
10102	movq	ARG2, %r10 // alpha
10103	movq	ARG6, %r11 // beta
10104	movq	ARG7, %r12   // C
10105	movq	ARG8, %r13   // ldc
10106	sall	$3, %r13d
10107	movq	ARG11, %r14 // m1
10108	movq	ARG12, %r15 // n1
10109
10110#if MACRO_LEVEL>=1
10111	INNER_SCALE_AB_4X4_VS_LIB
10112#else
10113	CALL(inner_scale_ab_4x4_vs_lib)
10114#endif
10115
10116
10117	// store n
10118
10119	movq	ARG9, %r10 // D
10120	movq	ARG10, %r11 // ldd
10121	sall	$3, %r11d
10122	movq	ARG11, %r12 // m1
10123	movq	ARG12, %r13 // n1
10124
10125#if MACRO_LEVEL>=1
10126	INNER_STORE_4X4_VS_LIB
10127#else
10128	CALL(inner_store_4x4_vs_lib)
10129#endif
10130
10131
10132	EPILOGUE
10133
10134	ret
10135
10136	FUN_END(kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc)
10137
10138
10139
10140
10141
10142//                                         1      2              3          4          5        6             7          8          9
10143// void kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
10144
10145	.p2align 4,,15
10146	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c)
10147
10148	PROLOGUE
10149
10150	// zero accumulation registers
10151
10152	ZERO_ACC
10153
10154
10155	// call inner dgemm kernel nn
10156
10157	movq	ARG1, %r10 // k
10158	movq	ARG3, %r11  // A
10159	movq	ARG4, %r12  // B
10160	movq	ARG5, %r13  // ldb
10161	sall	$3, %r13d
10162
10163#if MACRO_LEVEL>=2
10164	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
10165#else
10166	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
10167#endif
10168
10169#if MACRO_LEVEL>=1
10170	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4C
10171#else
10172	CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4c)
10173#endif
10174
10175
10176	// call inner blend
10177
10178	movq	ARG2, %r10 // alpha
10179	movq	ARG6, %r11 // beta
10180	movq	ARG7, %r12   // C
10181
10182#if MACRO_LEVEL>=1
10183	INNER_SCALE_AB_4X4_LIB4
10184#else
10185	CALL(inner_scale_ab_4x4_lib4)
10186#endif
10187
10188
10189	// store n
10190
10191	movq	ARG8, %r10 // D
10192	movq	ARG9, %r11 // ldd
10193	sall	$3, %r11d
10194
10195#if MACRO_LEVEL>=1
10196	INNER_TRAN_STORE_4X4_LIB
10197#else
10198	CALL(inner_tran_store_4x4_lib)
10199#endif
10200
10201
10202	EPILOGUE
10203
10204	ret
10205
10206	FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c)
10207
10208
10209
10210
10211
10212//                                            1      2              3          4          5        6             7          8          9        10      11
10213// void kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
10214
10215	.p2align 4,,15
10216	GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c)
10217
10218	PROLOGUE
10219
10220	// zero accumulation registers
10221
10222	ZERO_ACC
10223
10224
10225	// call inner dgemm kernel nn
10226
10227	movq	ARG1, %r10 // k
10228	movq	ARG3, %r11  // A
10229	movq	ARG4, %r12  // B
10230	movq	ARG5, %r13  // ldb
10231	sall	$3, %r13d
10232
10233	movq	ARG10, %r14  // m1
10234	cmpl	$1, %r14d
10235	jg		100f
10236
10237#if MACRO_LEVEL>=2
10238	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
10239#else
10240	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
10241#endif
10242
10243	jmp		103f
10244
10245100:
10246
10247	movq	ARG10, %r14  // m1
10248	cmpl	$2, %r14d
10249	jg		101f
10250
10251#if MACRO_LEVEL>=2
10252	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
10253#else
10254	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
10255#endif
10256
10257	jmp		103f
10258
10259101:
10260
10261	movq	ARG10, %r14  // m1
10262	cmpl	$3, %r14d
10263	jg		102f
10264
10265#if MACRO_LEVEL>=2
10266	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
10267#else
10268	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
10269#endif
10270
10271	jmp		103f
10272
10273102:
10274
10275#if MACRO_LEVEL>=2
10276	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
10277#else
10278	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
10279#endif
10280
10281103:
10282
10283	movq	ARG10, %r14 // m1
10284
10285#if MACRO_LEVEL>=1
10286	INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4C
10287#else
10288	CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c)
10289#endif
10290
10291
10292	// call inner blend
10293
10294	movq	ARG2, %r10 // alpha
10295	movq	ARG6, %r11 // beta
10296	movq	ARG7, %r12   // C
10297
10298#if MACRO_LEVEL>=1
10299	INNER_SCALE_AB_4X4_LIB4
10300#else
10301	CALL(inner_scale_ab_4x4_lib4)
10302#endif
10303
10304
10305	// store n
10306
10307	movq	ARG8, %r10 // D
10308	movq	ARG9, %r11 // ldd
10309	sall	$3, %r11d
10310	movq	ARG10, %r12 // m1
10311	movq	ARG11, %r13 // n1
10312
10313#if MACRO_LEVEL>=1
10314	INNER_TRAN_STORE_4X4_VS_LIB
10315#else
10316	CALL(inner_tran_store_4x4_vs_lib)
10317#endif
10318
10319
10320	EPILOGUE
10321
10322	ret
10323
10324	FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c)
10325
10326
10327
10328
10329
10330//                                    1      2              3          4          5             6          7        8          9
10331// void kernel_dtrmm_nt_ru_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
10332
10333	.p2align 4,,15
10334	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_lib44cc)
10335
10336	PROLOGUE
10337
10338	// zero accumulation registers
10339
10340	ZERO_ACC
10341
10342
10343	// call inner dgemm kernel nn
10344
10345	movq	ARG1, %r10 // k
10346	subl	$4, %r10d
10347	movq	ARG3, %r11 // A
10348	addq	$128, %r11
10349	movq	ARG4, %r12 // B
10350	addq	$128, %r12
10351
10352#if MACRO_LEVEL>=1
10353//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
10354#else
10355//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
10356#endif
10357
10358#if MACRO_LEVEL>=2
10359	INNER_KERNEL_DGEMM_NT_4X4_LIB4
10360#else
10361	CALL(inner_kernel_dgemm_nt_4x4_lib4)
10362#endif
10363
10364
10365	// call inner blend
10366
10367#if MACRO_LEVEL>=1
10368	INNER_BLEND_4X4_LIB4
10369#else
10370	CALL(inner_blend_4x4_lib4)
10371#endif
10372
10373
10374	// initial triangle
10375
10376	movq	ARG1, %r10
10377	movq	ARG3, %r11
10378	movq	ARG4, %r12
10379
10380#if MACRO_LEVEL>=1
10381	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
10382#else
10383	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
10384#endif
10385
10386
10387	// call inner blend
10388
10389	movq	ARG2, %r10 // alpha
10390	movq	ARG5, %r11 // beta
10391	movq	ARG6, %r12   // C
10392	movq	ARG7, %r13   // ldc
10393	sall	$3, %r13d
10394
10395#if MACRO_LEVEL>=1
10396	INNER_SCALE_AB_4X4_LIB
10397#else
10398	CALL(inner_scale_ab_4x4_lib)
10399#endif
10400
10401
10402	// store n
10403
10404	movq	ARG8, %r10 // D
10405	movq	ARG9, %r11 // ldd
10406	sall	$3, %r11d
10407
10408#if MACRO_LEVEL>=1
10409	INNER_STORE_4X4_LIB
10410#else
10411	CALL(inner_store_4x4_lib)
10412#endif
10413
10414
10415	EPILOGUE
10416
10417	ret
10418
10419	FUN_END(kernel_dtrmm_nt_ru_4x4_lib44cc)
10420
10421
10422
10423
10424
10425//                                        1      2              3          4          5             6          7        8          9        10      11
10426// void kernel_dtrmm_nt_ru_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
10427
10428	.p2align 4,,15
10429	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_vs_lib44cc)
10430
10431	PROLOGUE
10432
10433	// zero accumulation registers
10434
10435	ZERO_ACC
10436
10437
10438	// call inner dgemm kernel nn
10439
10440	movq	ARG1, %r10 // k
10441	subl	$4, %r10d
10442	movq	ARG3, %r11 // A
10443	addq	$128, %r11
10444	movq	ARG4, %r12 // B
10445	addq	$128, %r12
10446
10447#if MACRO_LEVEL>=1
10448//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
10449#else
10450//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
10451#endif
10452
10453#if MACRO_LEVEL>=2
10454	INNER_KERNEL_DGEMM_NT_4X4_LIB4
10455#else
10456	CALL(inner_kernel_dgemm_nt_4x4_lib4)
10457#endif
10458
10459
10460	// call inner blend
10461
10462#if MACRO_LEVEL>=1
10463	INNER_BLEND_4X4_LIB4
10464#else
10465	CALL(inner_blend_4x4_lib4)
10466#endif
10467
10468
10469	// initial triangle
10470
10471	movq	ARG1, %r10
10472	movq	ARG3, %r11
10473	movq	ARG4, %r12
10474
10475#if MACRO_LEVEL>=1
10476	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
10477#else
10478	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
10479#endif
10480
10481
10482	// call inner blend
10483
10484	movq	ARG2, %r10 // alpha
10485	movq	ARG5, %r11 // beta
10486	movq	ARG6, %r12   // C
10487	movq	ARG7, %r13   // ldc
10488	sall	$3, %r13d
10489	movq	ARG10, %r14   // m1
10490	movq	ARG11, %r15   // n1
10491
10492#if MACRO_LEVEL>=1
10493	INNER_SCALE_AB_4X4_VS_LIB
10494#else
10495	CALL(inner_scale_ab_4x4_vs_lib)
10496#endif
10497
10498
10499	// store n
10500
10501	movq	ARG8, %r10 // D
10502	movq	ARG9, %r11 // ldd
10503	sall	$3, %r11d
10504	movq	ARG10, %r12   // m1
10505	movq	ARG11, %r13   // n1
10506
10507#if MACRO_LEVEL>=1
10508	INNER_STORE_4X4_VS_LIB
10509#else
10510	CALL(inner_store_4x4_vs_lib)
10511#endif
10512
10513
10514	EPILOGUE
10515
10516	ret
10517
10518	FUN_END(kernel_dtrmm_nt_ru_4x4_vs_lib44cc)
10519
10520
10521
10522
10523
10524//                                    1      2              3          4          5             6          7        8          9
10525// void kernel_dtrmm_nt_ru_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd);
10526
10527	.p2align 4,,15
10528	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_lib444c)
10529
10530	PROLOGUE
10531
10532	// zero accumulation registers
10533
10534	ZERO_ACC
10535
10536
10537	// call inner dgemm kernel nn
10538
10539	movq	ARG1, %r10 // k
10540	subl	$4, %r10d
10541	movq	ARG3, %r11 // A
10542	addq	$128, %r11
10543	movq	ARG4, %r12 // B
10544	addq	$128, %r12
10545
10546#if MACRO_LEVEL>=1
10547//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
10548#else
10549//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
10550#endif
10551
10552#if MACRO_LEVEL>=2
10553	INNER_KERNEL_DGEMM_NT_4X4_LIB4
10554#else
10555	CALL(inner_kernel_dgemm_nt_4x4_lib4)
10556#endif
10557
10558
10559	// call inner blend
10560
10561#if MACRO_LEVEL>=1
10562	INNER_BLEND_4X4_LIB4
10563#else
10564	CALL(inner_blend_4x4_lib4)
10565#endif
10566
10567
10568	// initial triangle
10569
10570	movq	ARG1, %r10
10571	movq	ARG3, %r11
10572	movq	ARG4, %r12
10573
10574#if MACRO_LEVEL>=1
10575	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
10576#else
10577	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
10578#endif
10579
10580
10581	// call inner blend
10582
10583	movq	ARG2, %r10 // alpha
10584	movq	ARG5, %r11 // beta
10585	movq	ARG6, %r12   // C
10586
10587#if MACRO_LEVEL>=1
10588	INNER_SCALE_AB_4X4_LIB4
10589#else
10590	CALL(inner_scale_ab_4x4_lib4)
10591#endif
10592
10593
10594	// store n
10595
10596	movq	ARG7, %r10 // D
10597	movq	ARG8, %r11 // ldd
10598	sall	$3, %r11d
10599
10600#if MACRO_LEVEL>=1
10601	INNER_TRAN_STORE_4X4_LIB
10602#else
10603	CALL(inner_tran_store_4x4_lib)
10604#endif
10605
10606
10607	EPILOGUE
10608
10609	ret
10610
10611	FUN_END(kernel_dtrmm_nt_ru_4x4_tran_lib444c)
10612
10613
10614
10615
10616
10617//                                       1      2              3          4          5             6          7        8          9        10      11
10618// void kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1);
10619
10620	.p2align 4,,15
10621	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c)
10622
10623	PROLOGUE
10624
10625	// zero accumulation registers
10626
10627	ZERO_ACC
10628
10629
10630	// call inner dgemm kernel nn
10631
10632	movq	ARG1, %r10 // k
10633	subl	$4, %r10d
10634	movq	ARG3, %r11 // A
10635	addq	$128, %r11
10636	movq	ARG4, %r12 // B
10637	addq	$128, %r12
10638
10639#if MACRO_LEVEL>=1
10640//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
10641#else
10642//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
10643#endif
10644
10645#if MACRO_LEVEL>=2
10646	INNER_KERNEL_DGEMM_NT_4X4_LIB4
10647#else
10648	CALL(inner_kernel_dgemm_nt_4x4_lib4)
10649#endif
10650
10651
10652	// call inner blend
10653
10654#if MACRO_LEVEL>=1
10655	INNER_BLEND_4X4_LIB4
10656#else
10657	CALL(inner_blend_4x4_lib4)
10658#endif
10659
10660
10661	// initial triangle
10662
10663	movq	ARG1, %r10
10664	movq	ARG3, %r11
10665	movq	ARG4, %r12
10666
10667#if MACRO_LEVEL>=1
10668	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
10669#else
10670	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
10671#endif
10672
10673
10674	// call inner blend
10675
10676	movq	ARG2, %r10 // alpha
10677	movq	ARG5, %r11 // beta
10678	movq	ARG6, %r12   // C
10679
10680#if MACRO_LEVEL>=1
10681	INNER_SCALE_AB_4X4_LIB4
10682#else
10683	CALL(inner_scale_ab_4x4_lib4)
10684#endif
10685
10686
10687	// store n
10688
10689	movq	ARG7, %r10 // D
10690	movq	ARG8, %r11 // ldd
10691	sall	$3, %r11d
10692	movq	ARG9, %r12   // m1
10693	movq	ARG10, %r13   // n1
10694
10695#if MACRO_LEVEL>=1
10696	INNER_TRAN_STORE_4X4_VS_LIB
10697#else
10698	CALL(inner_tran_store_4x4_vs_lib)
10699#endif
10700
10701
10702	EPILOGUE
10703
10704	ret
10705
10706	FUN_END(kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c)
10707
10708
10709
10710
10711
10712//                                    1      2              3          4          5        6             7          8        9          10
10713// void kernel_dtrmm_nt_ru_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
10714
10715	.p2align 4,,15
10716	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_lib4ccc)
10717
10718	PROLOGUE
10719
10720	// zero accumulation registers
10721
10722	ZERO_ACC
10723
10724
10725	// call inner dgemm kernel nn
10726
10727	movq	ARG1, %r10 // k
10728	movq	ARG3, %r11  // A
10729	movq	ARG4, %r12  // B
10730	movq	ARG5, %r13  // ldb
10731	sall	$3, %r13d
10732
10733#if MACRO_LEVEL>=1
10734	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4C
10735#else
10736	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4c)
10737#endif
10738
10739#if MACRO_LEVEL>=2
10740	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
10741#else
10742	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
10743#endif
10744
10745
10746	// call inner blend
10747
10748	movq	ARG2, %r10 // alpha
10749	movq	ARG6, %r11 // beta
10750	movq	ARG7, %r12   // C
10751	movq	ARG8, %r13   // ldc
10752	sall	$3, %r13d
10753
10754#if MACRO_LEVEL>=1
10755	INNER_SCALE_AB_4X4_LIB
10756#else
10757	CALL(inner_scale_ab_4x4_lib)
10758#endif
10759
10760
10761	// store n
10762
10763	movq	ARG9, %r10 // D
10764	movq	ARG10, %r11 // ldd
10765	sall	$3, %r11d
10766
10767#if MACRO_LEVEL>=1
10768	INNER_STORE_4X4_LIB
10769#else
10770	CALL(inner_store_4x4_lib)
10771#endif
10772
10773
10774	EPILOGUE
10775
10776	ret
10777
10778	FUN_END(kernel_dtrmm_nt_ru_4x4_lib4ccc)
10779
10780
10781
10782
10783
10784//                                        1      2              3          4          5        6             7          8        9          10       11      12
10785// void kernel_dtrmm_nt_ru_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
10786
10787	.p2align 4,,15
10788	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_vs_lib4ccc)
10789
10790	PROLOGUE
10791
10792	// zero accumulation registers
10793
10794	ZERO_ACC
10795
10796
10797	// call inner dgemm kernel nn
10798
10799	movq	ARG1, %r10 // k
10800	movq	ARG3, %r11  // A
10801	movq	ARG4, %r12  // B
10802	movq	ARG5, %r13  // ldb
10803	sall	$3, %r13d
10804
10805#if MACRO_LEVEL>=1
10806	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4C
10807#else
10808	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c)
10809#endif
10810
10811	movq	ARG12, %r14  // n1
10812	cmpl	$1, %r14d
10813	jg		100f
10814
10815#if MACRO_LEVEL>=2
10816	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
10817#else
10818	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
10819#endif
10820
10821	jmp		103f
10822
10823100:
10824
10825	movq	ARG12, %r14  // n1
10826	cmpl	$2, %r14d
10827	jg		101f
10828
10829#if MACRO_LEVEL>=2
10830	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
10831#else
10832	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
10833#endif
10834
10835	jmp		103f
10836
10837101:
10838
10839	movq	ARG12, %r14  // n1
10840	cmpl	$3, %r14d
10841	jg		102f
10842
10843#if MACRO_LEVEL>=2
10844	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
10845#else
10846	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
10847#endif
10848
10849	jmp		103f
10850
10851102:
10852
10853#if MACRO_LEVEL>=2
10854	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
10855#else
10856	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
10857#endif
10858
10859103:
10860
10861
10862	// call inner blend
10863
10864	movq	ARG2, %r10 // alpha
10865	movq	ARG6, %r11 // beta
10866	movq	ARG7, %r12   // C
10867	movq	ARG8, %r13   // ldc
10868	sall	$3, %r13d
10869	movq	ARG11, %r14 // m1
10870	movq	ARG12, %r15 // n1
10871
10872#if MACRO_LEVEL>=1
10873	INNER_SCALE_AB_4X4_VS_LIB
10874#else
10875	CALL(inner_scale_ab_4x4_vs_lib)
10876#endif
10877
10878
10879	// store n
10880
10881	movq	ARG9, %r10 // D
10882	movq	ARG10, %r11 // ldd
10883	sall	$3, %r11d
10884	movq	ARG11, %r12 // m1
10885	movq	ARG12, %r13 // n1
10886
10887#if MACRO_LEVEL>=1
10888	INNER_STORE_4X4_VS_LIB
10889#else
10890	CALL(inner_store_4x4_vs_lib)
10891#endif
10892
10893
10894	EPILOGUE
10895
10896	ret
10897
10898	FUN_END(kernel_dtrmm_nt_ru_4x4_vs_lib4ccc)
10899
10900
10901
10902
10903
10904//                                    1      2              3          4          5        6             7          8        9          10
10905// void kernel_dtrmm_nt_ru_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
10906
10907	.p2align 4,,15
10908	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_lib4c4c)
10909
10910	PROLOGUE
10911
10912	// zero accumulation registers
10913
10914	ZERO_ACC
10915
10916
10917	// call inner dgemm kernel nn
10918
10919	movq	ARG1, %r10 // k
10920	movq	ARG3, %r11  // A
10921	movq	ARG4, %r12  // B
10922	movq	ARG5, %r13  // ldb
10923	sall	$3, %r13d
10924
10925#if MACRO_LEVEL>=1
10926	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4C
10927#else
10928	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4c)
10929#endif
10930
10931#if MACRO_LEVEL>=2
10932	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
10933#else
10934	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
10935#endif
10936
10937
10938	// call inner blend
10939
10940	movq	ARG2, %r10 // alpha
10941	movq	ARG6, %r11 // beta
10942	movq	ARG7, %r12   // C
10943
10944#if MACRO_LEVEL>=1
10945	INNER_SCALE_AB_4X4_LIB4
10946#else
10947	CALL(inner_scale_ab_4x4_lib4)
10948#endif
10949
10950
10951	// store n
10952
10953	movq	ARG8, %r10 // D
10954	movq	ARG9, %r11 // ldd
10955	sall	$3, %r11d
10956
10957#if MACRO_LEVEL>=1
10958	INNER_TRAN_STORE_4X4_LIB
10959#else
10960	CALL(inner_tran_store_4x4_lib)
10961#endif
10962
10963
10964	EPILOGUE
10965
10966	ret
10967
10968	FUN_END(kernel_dtrmm_nt_ru_4x4_tran_lib4c4c)
10969
10970
10971
10972
10973
10974//                                       1      2              3          4          5        6             7          8        9          10       11      12
10975// void kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
10976
10977	.p2align 4,,15
10978	GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c)
10979
10980	PROLOGUE
10981
10982	// zero accumulation registers
10983
10984	ZERO_ACC
10985
10986
10987	// call inner dgemm kernel nn
10988
10989	movq	ARG1, %r10 // k
10990	movq	ARG3, %r11  // A
10991	movq	ARG4, %r12  // B
10992	movq	ARG5, %r13  // ldb
10993	sall	$3, %r13d
10994
10995#if MACRO_LEVEL>=1
10996	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4C
10997#else
10998	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c)
10999#endif
11000
11001	movq	ARG10, %r14  // m1
11002	cmpl	$1, %r14d
11003	jg		100f
11004
11005#if MACRO_LEVEL>=2
11006	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
11007#else
11008	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
11009#endif
11010
11011	jmp		103f
11012
11013100:
11014
11015	movq	ARG10, %r14  // m1
11016	cmpl	$2, %r14d
11017	jg		101f
11018
11019#if MACRO_LEVEL>=2
11020	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
11021#else
11022	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
11023#endif
11024
11025	jmp		103f
11026
11027101:
11028
11029	movq	ARG10, %r14  // m1
11030	cmpl	$3, %r14d
11031	jg		102f
11032
11033#if MACRO_LEVEL>=2
11034	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
11035#else
11036	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
11037#endif
11038
11039	jmp		103f
11040
11041102:
11042
11043#if MACRO_LEVEL>=2
11044	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
11045#else
11046	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
11047#endif
11048
11049103:
11050
11051
11052	// call inner blend
11053
11054	movq	ARG2, %r10 // alpha
11055	movq	ARG6, %r11 // beta
11056	movq	ARG7, %r12   // C
11057
11058#if MACRO_LEVEL>=1
11059	INNER_SCALE_AB_4X4_LIB4
11060#else
11061	CALL(inner_scale_ab_4x4_lib4)
11062#endif
11063
11064
11065	// store n
11066
11067	movq	ARG8, %r10 // D
11068	movq	ARG9, %r11 // ldd
11069	sall	$3, %r11d
11070	movq	ARG10, %r12 // m1
11071	movq	ARG11, %r13 // n1
11072
11073#if MACRO_LEVEL>=1
11074	INNER_TRAN_STORE_4X4_VS_LIB
11075#else
11076	CALL(inner_tran_store_4x4_vs_lib)
11077#endif
11078
11079
11080	EPILOGUE
11081
11082	ret
11083
11084	FUN_END(kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c)
11085
11086
11087
11088
11089
11090//                                        1      2              3          4          5             6          7        8          9
11091// void kernel_dtrmm_nt_ru_one_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
11092
11093	.p2align 4,,15
11094	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_lib44cc)
11095
11096	PROLOGUE
11097
11098	// zero accumulation registers
11099
11100	ZERO_ACC
11101
11102
11103	// call inner dgemm kernel nn
11104
11105	movq	ARG1, %r10 // k
11106	subl	$4, %r10d
11107	movq	ARG3, %r11 // A
11108	addq	$128, %r11
11109	movq	ARG4, %r12 // B
11110	addq	$128, %r12
11111
11112#if MACRO_LEVEL>=1
11113//	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4
11114#else
11115//	CALL(inner_edge_dtrmm_one_nt_ru_4x4_lib4)
11116#endif
11117
11118#if MACRO_LEVEL>=2
11119	INNER_KERNEL_DGEMM_NT_4X4_LIB4
11120#else
11121	CALL(inner_kernel_dgemm_nt_4x4_lib4)
11122#endif
11123
11124
11125	// call inner blend
11126
11127#if MACRO_LEVEL>=1
11128	INNER_BLEND_4X4_LIB4
11129#else
11130	CALL(inner_blend_4x4_lib4)
11131#endif
11132
11133
11134	// initial triangle
11135
11136	movq	ARG1, %r10
11137	movq	ARG3, %r11
11138	movq	ARG4, %r12
11139
11140#if MACRO_LEVEL>=1
11141	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4
11142#else
11143	CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4)
11144#endif
11145
11146
11147	// call inner blend
11148
11149	movq	ARG2, %r10 // alpha
11150	movq	ARG5, %r11 // beta
11151	movq	ARG6, %r12   // C
11152	movq	ARG7, %r13   // ldc
11153	sall	$3, %r13d
11154
11155#if MACRO_LEVEL>=1
11156	INNER_SCALE_AB_4X4_LIB
11157#else
11158	CALL(inner_scale_ab_4x4_lib)
11159#endif
11160
11161
11162	// store n
11163
11164	movq	ARG8, %r10 // D
11165	movq	ARG9, %r11 // ldd
11166	sall	$3, %r11d
11167
11168#if MACRO_LEVEL>=1
11169	INNER_STORE_4X4_LIB
11170#else
11171	CALL(inner_store_4x4_lib)
11172#endif
11173
11174
11175	EPILOGUE
11176
11177	ret
11178
11179	FUN_END(kernel_dtrmm_nt_ru_one_4x4_lib44cc)
11180
11181
11182
11183
11184
11185//                                           1      2              3          4          5             6          7        8          9        10      11
11186// void kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
11187
11188	.p2align 4,,15
11189	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc)
11190
11191	PROLOGUE
11192
11193	// zero accumulation registers
11194
11195	ZERO_ACC
11196
11197
11198	// call inner dgemm kernel nn
11199
11200	movq	ARG1, %r10 // k
11201	subl	$4, %r10d
11202	movq	ARG3, %r11 // A
11203	addq	$128, %r11
11204	movq	ARG4, %r12 // B
11205	addq	$128, %r12
11206
11207#if MACRO_LEVEL>=1
11208//	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4
11209#else
11210//	CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4)
11211#endif
11212
11213#if MACRO_LEVEL>=2
11214	INNER_KERNEL_DGEMM_NT_4X4_LIB4
11215#else
11216	CALL(inner_kernel_dgemm_nt_4x4_lib4)
11217#endif
11218
11219
11220	// call inner blend
11221
11222#if MACRO_LEVEL>=1
11223	INNER_BLEND_4X4_LIB4
11224#else
11225	CALL(inner_blend_4x4_lib4)
11226#endif
11227
11228
11229	// initial triangle
11230
11231	movq	ARG1, %r10
11232	movq	ARG3, %r11
11233	movq	ARG4, %r12
11234
11235#if MACRO_LEVEL>=1
11236	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4
11237#else
11238	CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4)
11239#endif
11240
11241
11242	// call inner blend
11243
11244	movq	ARG2, %r10 // alpha
11245	movq	ARG5, %r11 // beta
11246	movq	ARG6, %r12   // C
11247	movq	ARG7, %r13   // ldc
11248	sall	$3, %r13d
11249	movq	ARG10, %r14   // m1
11250	movq	ARG11, %r15   // n1
11251
11252#if MACRO_LEVEL>=1
11253	INNER_SCALE_AB_4X4_VS_LIB
11254#else
11255	CALL(inner_scale_ab_4x4_vs_lib)
11256#endif
11257
11258
11259	// store n
11260
11261	movq	ARG8, %r10 // D
11262	movq	ARG9, %r11 // ldd
11263	sall	$3, %r11d
11264	movq	ARG10, %r12   // m1
11265	movq	ARG11, %r13   // n1
11266
11267#if MACRO_LEVEL>=1
11268	INNER_STORE_4X4_LIB
11269#else
11270	CALL(inner_store_4x4_lib)
11271#endif
11272
11273
11274	EPILOGUE
11275
11276	ret
11277
11278	FUN_END(kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc)
11279
11280
11281
11282
11283
11284//                                    1      2              3          4          5             6          7        8          9
11285// void kernel_dtrmm_nt_ru_one_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd);
11286
11287	.p2align 4,,15
11288	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_lib444c)
11289
11290	PROLOGUE
11291
11292	// zero accumulation registers
11293
11294	ZERO_ACC
11295
11296
11297	// call inner dgemm kernel nn
11298
11299	movq	ARG1, %r10 // k
11300	subl	$4, %r10d
11301	movq	ARG3, %r11 // A
11302	addq	$128, %r11
11303	movq	ARG4, %r12 // B
11304	addq	$128, %r12
11305
11306#if MACRO_LEVEL>=1
11307//	INNER_EDGE_DTRMM_NT_RU_4X4_LIB4
11308#else
11309//	CALL(inner_edge_dtrmm_nt_ru_4x4_lib4)
11310#endif
11311
11312#if MACRO_LEVEL>=2
11313	INNER_KERNEL_DGEMM_NT_4X4_LIB4
11314#else
11315	CALL(inner_kernel_dgemm_nt_4x4_lib4)
11316#endif
11317
11318
11319	// call inner blend
11320
11321#if MACRO_LEVEL>=1
11322	INNER_BLEND_4X4_LIB4
11323#else
11324	CALL(inner_blend_4x4_lib4)
11325#endif
11326
11327
11328	// initial triangle
11329
11330	movq	ARG1, %r10
11331	movq	ARG3, %r11
11332	movq	ARG4, %r12
11333
11334#if MACRO_LEVEL>=1
11335	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4
11336#else
11337	CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4)
11338#endif
11339
11340
11341	// call inner blend
11342
11343	movq	ARG2, %r10 // alpha
11344	movq	ARG5, %r11 // beta
11345	movq	ARG6, %r12   // C
11346
11347#if MACRO_LEVEL>=1
11348	INNER_SCALE_AB_4X4_LIB4
11349#else
11350	CALL(inner_scale_ab_4x4_lib4)
11351#endif
11352
11353
11354	// store n
11355
11356	movq	ARG7, %r10 // D
11357	movq	ARG8, %r11 // ldd
11358	sall	$3, %r11d
11359
11360#if MACRO_LEVEL>=1
11361	INNER_TRAN_STORE_4X4_LIB
11362#else
11363	CALL(inner_tran_store_4x4_lib)
11364#endif
11365
11366
11367	EPILOGUE
11368
11369	ret
11370
11371	FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_lib444c)
11372
11373
11374
11375
11376
11377//                                       1      2              3          4          5             6          7        8          9        10      11
11378// void kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1);
11379
11380	.p2align 4,,15
11381	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c)
11382
11383	PROLOGUE
11384
11385	// zero accumulation registers
11386
11387	ZERO_ACC
11388
11389
11390	// call inner dgemm kernel nn
11391
11392	movq	ARG1, %r10 // k
11393	subl	$4, %r10d
11394	movq	ARG3, %r11 // A
11395	addq	$128, %r11
11396	movq	ARG4, %r12 // B
11397	addq	$128, %r12
11398
11399#if MACRO_LEVEL>=1
11400//	INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4
11401#else
11402//	CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4)
11403#endif
11404
11405#if MACRO_LEVEL>=2
11406	INNER_KERNEL_DGEMM_NT_4X4_LIB4
11407#else
11408	CALL(inner_kernel_dgemm_nt_4x4_lib4)
11409#endif
11410
11411
11412	// call inner blend
11413
11414#if MACRO_LEVEL>=1
11415	INNER_BLEND_4X4_LIB4
11416#else
11417	CALL(inner_blend_4x4_lib4)
11418#endif
11419
11420
11421	// initial triangle
11422
11423	movq	ARG1, %r10
11424	movq	ARG3, %r11
11425	movq	ARG4, %r12
11426
11427#if MACRO_LEVEL>=1
11428	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4
11429#else
11430	CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4)
11431#endif
11432
11433
11434	// call inner blend
11435
11436	movq	ARG2, %r10 // alpha
11437	movq	ARG5, %r11 // beta
11438	movq	ARG6, %r12   // C
11439
11440#if MACRO_LEVEL>=1
11441	INNER_SCALE_AB_4X4_LIB4
11442#else
11443	CALL(inner_scale_ab_4x4_lib4)
11444#endif
11445
11446
11447	// store n
11448
11449	movq	ARG7, %r10 // D
11450	movq	ARG8, %r11 // ldd
11451	sall	$3, %r11d
11452	movq	ARG9, %r12   // m1
11453	movq	ARG10, %r13   // n1
11454
11455#if MACRO_LEVEL>=1
11456	INNER_TRAN_STORE_4X4_VS_LIB
11457#else
11458	CALL(inner_tran_store_4x4_vs_lib)
11459#endif
11460
11461
11462	EPILOGUE
11463
11464	ret
11465
11466	FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c)
11467
11468
11469
11470
11471
11472//                                        1      2              3          4          5        6             7          8        9          10
11473// void kernel_dtrmm_nt_ru_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
11474
11475	.p2align 4,,15
11476	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_lib4ccc)
11477
11478	PROLOGUE
11479
11480	// zero accumulation registers
11481
11482	ZERO_ACC
11483
11484
11485	// call inner dgemm kernel nn
11486
11487	movq	ARG1, %r10 // k
11488	movq	ARG3, %r11  // A
11489	movq	ARG4, %r12  // B
11490	movq	ARG5, %r13  // ldb
11491	sall	$3, %r13d
11492
11493#if MACRO_LEVEL>=1
11494	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4C
11495#else
11496	CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4c)
11497#endif
11498
11499#if MACRO_LEVEL>=2
11500	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
11501#else
11502	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
11503#endif
11504
11505
11506	// call inner blend
11507
11508	movq	ARG2, %r10 // alpha
11509	movq	ARG6, %r11 // beta
11510	movq	ARG7, %r12   // C
11511	movq	ARG8, %r13   // ldc
11512	sall	$3, %r13d
11513
11514#if MACRO_LEVEL>=1
11515	INNER_SCALE_AB_4X4_LIB
11516#else
11517	CALL(inner_scale_ab_4x4_lib)
11518#endif
11519
11520
11521	// store n
11522
11523	movq	ARG9, %r10 // D
11524	movq	ARG10, %r11 // ldd
11525	sall	$3, %r11d
11526
11527#if MACRO_LEVEL>=1
11528	INNER_STORE_4X4_LIB
11529#else
11530	CALL(inner_store_4x4_lib)
11531#endif
11532
11533
11534	EPILOGUE
11535
11536	ret
11537
11538	FUN_END(kernel_dtrmm_nt_ru_one_4x4_lib4ccc)
11539
11540
11541
11542
11543
11544//                                            1      2              3          4          5        6             7          8        9          10       11      12
11545// void kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
11546
11547	.p2align 4,,15
11548	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc)
11549
11550	PROLOGUE
11551
11552	// zero accumulation registers
11553
11554	ZERO_ACC
11555
11556
11557	// call inner dgemm kernel nn
11558
11559	movq	ARG1, %r10 // k
11560	movq	ARG3, %r11  // A
11561	movq	ARG4, %r12  // B
11562	movq	ARG5, %r13  // ldb
11563	sall	$3, %r13d
11564
11565#if MACRO_LEVEL>=1
11566	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4C
11567#else
11568	CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c)
11569#endif
11570
11571	movq	ARG12, %r14  // n1
11572	cmpl	$1, %r14d
11573	jg		100f
11574
11575#if MACRO_LEVEL>=2
11576	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
11577#else
11578	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
11579#endif
11580
11581	jmp		103f
11582
11583100:
11584
11585	movq	ARG12, %r14  // n1
11586	cmpl	$2, %r14d
11587	jg		101f
11588
11589#if MACRO_LEVEL>=2
11590	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
11591#else
11592	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
11593#endif
11594
11595	jmp		103f
11596
11597101:
11598
11599	movq	ARG12, %r14  // n1
11600	cmpl	$3, %r14d
11601	jg		102f
11602
11603#if MACRO_LEVEL>=2
11604	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
11605#else
11606	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
11607#endif
11608
11609	jmp		103f
11610
11611102:
11612
11613#if MACRO_LEVEL>=2
11614	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
11615#else
11616	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
11617#endif
11618
11619103:
11620
11621
11622	// call inner blend
11623
11624	movq	ARG2, %r10 // alpha
11625	movq	ARG6, %r11 // beta
11626	movq	ARG7, %r12   // C
11627	movq	ARG8, %r13   // ldc
11628	sall	$3, %r13d
11629	movq	ARG11, %r14 // m1
11630	movq	ARG12, %r15 // n1
11631
11632#if MACRO_LEVEL>=1
11633	INNER_SCALE_AB_4X4_VS_LIB
11634#else
11635	CALL(inner_scale_ab_4x4_vs_lib)
11636#endif
11637
11638
11639	// store n
11640
11641	movq	ARG9, %r10 // D
11642	movq	ARG10, %r11 // ldd
11643	sall	$3, %r11d
11644	movq	ARG11, %r12 // m1
11645	movq	ARG12, %r13 // n1
11646
11647#if MACRO_LEVEL>=1
11648	INNER_STORE_4X4_VS_LIB
11649#else
11650	CALL(inner_store_4x4_vs_lib)
11651#endif
11652
11653
11654	EPILOGUE
11655
11656	ret
11657
11658	FUN_END(kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc)
11659
11660
11661
11662
11663
11664//                                    1      2              3          4          5        6             7          8        9          10
11665// void kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);
11666
11667	.p2align 4,,15
11668	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c)
11669
11670	PROLOGUE
11671
11672	// zero accumulation registers
11673
11674	ZERO_ACC
11675
11676
11677	// call inner dgemm kernel nn
11678
11679	movq	ARG1, %r10 // k
11680	movq	ARG3, %r11  // A
11681	movq	ARG4, %r12  // B
11682	movq	ARG5, %r13  // ldb
11683	sall	$3, %r13d
11684
11685#if MACRO_LEVEL>=1
11686	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4C
11687#else
11688	CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4c)
11689#endif
11690
11691#if MACRO_LEVEL>=2
11692	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
11693#else
11694	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
11695#endif
11696
11697
11698	// call inner blend
11699
11700	movq	ARG2, %r10 // alpha
11701	movq	ARG6, %r11 // beta
11702	movq	ARG7, %r12   // C
11703
11704#if MACRO_LEVEL>=1
11705	INNER_SCALE_AB_4X4_LIB4
11706#else
11707	CALL(inner_scale_ab_4x4_lib4)
11708#endif
11709
11710
11711	// store n
11712
11713	movq	ARG8, %r10 // D
11714	movq	ARG9, %r11 // ldd
11715	sall	$3, %r11d
11716
11717#if MACRO_LEVEL>=1
11718	INNER_TRAN_STORE_4X4_LIB
11719#else
11720	CALL(inner_tran_store_4x4_lib)
11721#endif
11722
11723
11724	EPILOGUE
11725
11726	ret
11727
11728	FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c)
11729
11730
11731
11732
11733
11734//                                       1      2              3          4          5        6             7          8        9          10       11      12
11735// void kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);
11736
11737	.p2align 4,,15
11738	GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c)
11739
11740	PROLOGUE
11741
11742	// zero accumulation registers
11743
11744	ZERO_ACC
11745
11746
11747	// call inner dgemm kernel nn
11748
11749	movq	ARG1, %r10 // k
11750	movq	ARG3, %r11  // A
11751	movq	ARG4, %r12  // B
11752	movq	ARG5, %r13  // ldb
11753	sall	$3, %r13d
11754
11755#if MACRO_LEVEL>=1
11756	INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4C
11757#else
11758	CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c)
11759#endif
11760
11761	movq	ARG10, %r14  // m1
11762	cmpl	$1, %r14d
11763	jg		100f
11764
11765#if MACRO_LEVEL>=2
11766	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
11767#else
11768	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
11769#endif
11770
11771	jmp		103f
11772
11773100:
11774
11775	movq	ARG10, %r14  // m1
11776	cmpl	$2, %r14d
11777	jg		101f
11778
11779#if MACRO_LEVEL>=2
11780	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
11781#else
11782	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
11783#endif
11784
11785	jmp		103f
11786
11787101:
11788
11789	movq	ARG10, %r14  // m1
11790	cmpl	$3, %r14d
11791	jg		102f
11792
11793#if MACRO_LEVEL>=2
11794	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
11795#else
11796	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
11797#endif
11798
11799	jmp		103f
11800
11801102:
11802
11803#if MACRO_LEVEL>=2
11804	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
11805#else
11806	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
11807#endif
11808
11809103:
11810
11811
11812	// call inner blend
11813
11814	movq	ARG2, %r10 // alpha
11815	movq	ARG6, %r11 // beta
11816	movq	ARG7, %r12   // C
11817
11818#if MACRO_LEVEL>=1
11819	INNER_SCALE_AB_4X4_LIB4
11820#else
11821	CALL(inner_scale_ab_4x4_lib4)
11822#endif
11823
11824
11825	// store n
11826
11827	movq	ARG8, %r10 // D
11828	movq	ARG9, %r11 // ldd
11829	sall	$3, %r11d
11830	movq	ARG10, %r12 // m1
11831	movq	ARG11, %r13 // n1
11832
11833#if MACRO_LEVEL>=1
11834	INNER_TRAN_STORE_4X4_VS_LIB
11835#else
11836	CALL(inner_tran_store_4x4_vs_lib)
11837#endif
11838
11839
11840	EPILOGUE
11841
11842	ret
11843
11844	FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c)
11845
11846
11847
11848
11849
11850//                                    1      2          3          4          5        6          7        8
11851// void kernel_dpotrf_nt_l_4x4_lib44cc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D);
11852
11853	.p2align 4,,15
11854	GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_lib44cc)
11855
11856	PROLOGUE
11857
11858	// zero accumulation registers
11859
11860	ZERO_ACC
11861
11862
11863	// call inner dgemm kernel nt
11864
11865	movq	ARG1, %r10 // kmax
11866	movq	ARG2, %r11 // A
11867	movq	ARG3, %r12 // B
11868
11869#if MACRO_LEVEL>=2
11870	INNER_KERNEL_DGEMM_NT_4X4_LIB4
11871#else
11872	CALL(inner_kernel_dgemm_nt_4x4_lib4)
11873#endif
11874
11875
11876	// call inner blender_loader nn
11877
11878	movq	ARG4, %r10 // C
11879	movq	ARG5, %r11 // ldc
11880	sall	$3, %r11d
11881
11882#if MACRO_LEVEL>=1
11883	INNER_BLEND_SCALE_M11_4X4_LIB
11884#else
11885	CALL(inner_blend_scale_m11_4x4_lib)
11886#endif
11887
11888
11889	// factorization
11890
11891	movq	ARG8, %r10  // inv_diag_D
11892	movl	$4, %r11d
11893
11894#if MACRO_LEVEL>=1
11895	INNER_EDGE_DPOTRF_4X4_VS_LIB4
11896#else
11897	CALL(inner_edge_dpotrf_4x4_vs_lib4)
11898#endif
11899
11900
11901	// store
11902
11903	movq	ARG6, %r10 // D
11904	movq	ARG7, %r11 // ldd
11905	sall	$3, %r11d
11906
11907#if MACRO_LEVEL>=1
11908	INNER_STORE_L_4X4_LIB
11909#else
11910	CALL(inner_store_l_4x4_lib)
11911#endif
11912
11913
11914	EPILOGUE
11915
11916	ret
11917
11918	FUN_END(kernel_dpotrf_nt_l_4x4_lib44cc)
11919
11920
11921
11922
11923
11924//                                       1      2          3          4          5        6          7        8                   9       10
11925// void kernel_dpotrf_nt_l_4x4_vs_lib44cc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1);
11926
11927	.p2align 4,,15
11928	GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_vs_lib44cc)
11929
11930	PROLOGUE
11931
11932	// zero accumulation registers
11933
11934	ZERO_ACC
11935
11936
11937	// call inner dgemm kernel nt
11938
11939	movq	ARG1, %r10 // kmax
11940	movq	ARG2, %r11 // A
11941	movq	ARG3, %r12 // B
11942
11943#if MACRO_LEVEL>=2
11944	INNER_KERNEL_DGEMM_NT_4X4_LIB4
11945#else
11946	CALL(inner_kernel_dgemm_nt_4x4_lib4)
11947#endif
11948
11949
11950	// call inner blender_loader nn
11951
11952	movq	ARG4, %r10 // C
11953	movq	ARG5, %r11 // ldc
11954	sall	$3, %r11d
11955	movq	ARG9, %r12 // m1
11956	movq	ARG10, %r13 // n1
11957
11958#if MACRO_LEVEL>=1
11959	INNER_BLEND_SCALE_M11_4X4_VS_LIB
11960#else
11961	CALL(inner_blend_scale_m11_4x4_vs_lib)
11962#endif
11963
11964
11965	// factorization
11966
11967	movq	ARG8, %r10  // inv_diag_D
11968	movq	ARG10, %r11 // n1
11969
11970#if MACRO_LEVEL>=1
11971	INNER_EDGE_DPOTRF_4X4_VS_LIB4
11972#else
11973	CALL(inner_edge_dpotrf_4x4_vs_lib4)
11974#endif
11975
11976
11977	// store
11978
11979	movq	ARG6, %r10 // D
11980	movq	ARG7, %r11 // ldd
11981	sall	$3, %r11d
11982	movq	ARG9, %r12 // m1
11983	movq	ARG10, %r13 // n1
11984
11985#if MACRO_LEVEL>=1
11986	INNER_STORE_L_4X4_VS_LIB
11987#else
11988	CALL(inner_store_l_4x4_vs_lib)
11989#endif
11990
11991
11992	EPILOGUE
11993
11994	ret
11995
11996	FUN_END(kernel_dpotrf_nt_l_4x4_vs_lib44cc)
11997
11998
11999
12000
12001
12002//                                         1      2          3          4        5             6          7          8          9        10
12003// void kernel_dtrsm_nn_rl_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E);
12004
12005	.p2align 4,,15
12006	GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_lib4c44c)
12007
12008	PROLOGUE
12009
12010	// zero accumulation registers
12011
12012	ZERO_ACC
12013
12014
12015	// call inner dgemm kernel nt
12016
12017	movq	ARG1, %r10 // kmax
12018	movq	ARG2, %r11 // A
12019	movq	ARG3, %r12 // B
12020	movq	ARG4, %r13 // ldb
12021	sall	$3, %r13d
12022
12023#if MACRO_LEVEL>=2
12024	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12025#else
12026	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12027#endif
12028
12029
12030	// call inner blender_loader nn
12031
12032	movq	ARG5, %r10 // beta
12033	movq	ARG6, %r11 // C
12034
12035#if MACRO_LEVEL>=1
12036	INNER_SCALE_M1B_4X4_LIB4
12037#else
12038	CALL(inner_scale_m1b_4x4_lib4)
12039#endif
12040
12041
12042	// solve
12043
12044	movq	ARG8, %r10  // E
12045	movq	ARG9, %r11 // lde
12046	sall	$3, %r11d
12047	movq	ARG10, %r12  // inv_diag_E
12048
12049#if MACRO_LEVEL>=1
12050	INNER_EDGE_DTRSM_RLN_INV_4X4_LIB
12051#else
12052	CALL(inner_edge_dtrsm_rln_inv_4x4_lib)
12053#endif
12054
12055
12056	// store
12057
12058	movq	ARG7, %r10 // D
12059
12060#if MACRO_LEVEL>=1
12061	INNER_STORE_4X4_LIB4
12062#else
12063	CALL(inner_store_4x4_lib4)
12064#endif
12065
12066
12067	EPILOGUE
12068
12069	ret
12070
12071	FUN_END(kernel_dtrsm_nn_rl_inv_4x4_lib4c44c)
12072
12073
12074
12075
12076
12077//                                            1      2          3          4        5             6          7          8          9        10                  11      12
12078// void kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1);
12079
12080	.p2align 4,,15
12081	GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c)
12082
12083	PROLOGUE
12084
12085	// zero accumulation registers
12086
12087	ZERO_ACC
12088
12089
12090	// call inner dgemm kernel nt
12091
12092	movq	ARG1, %r10 // kmax
12093	movq	ARG2, %r11 // A
12094	movq	ARG3, %r12 // B
12095	movq	ARG4, %r13 // ldb
12096	sall	$3, %r13d
12097
12098	movq	ARG12, %r14  // n1
12099	cmpl	$1, %r14d
12100	jg		100f
12101
12102#if MACRO_LEVEL>=2
12103	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
12104#else
12105	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
12106#endif
12107
12108	jmp		103f
12109
12110100:
12111
12112	movq	ARG12, %r14  // n1
12113	cmpl	$2, %r14d
12114	jg		101f
12115
12116#if MACRO_LEVEL>=2
12117	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
12118#else
12119	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
12120#endif
12121
12122	jmp		103f
12123
12124101:
12125
12126	movq	ARG12, %r14  // n1
12127	cmpl	$3, %r14d
12128	jg		102f
12129
12130#if MACRO_LEVEL>=2
12131	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
12132#else
12133	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
12134#endif
12135
12136	jmp		103f
12137
12138102:
12139
12140#if MACRO_LEVEL>=2
12141	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12142#else
12143	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12144#endif
12145
12146103:
12147
12148
12149	// call inner blender_loader nn
12150
12151	movq	ARG5, %r10 // beta
12152	movq	ARG6, %r11 // C
12153
12154#if MACRO_LEVEL>=1
12155	INNER_SCALE_M1B_4X4_LIB4
12156#else
12157	CALL(inner_scale_m1b_4x4_lib4)
12158#endif
12159
12160
12161	// solve
12162
12163	movq	ARG8, %r10  // E
12164	movq	ARG9, %r11 // lde
12165	sall	$3, %r11d
12166	movq	ARG10, %r12  // inv_diag_E
12167	movq	ARG12, %r13  // n1
12168
12169#if MACRO_LEVEL>=1
12170	INNER_EDGE_DTRSM_RLN_INV_4X4_VS_LIB
12171#else
12172	CALL(inner_edge_dtrsm_rln_inv_4x4_vs_lib)
12173#endif
12174
12175
12176	// store
12177
12178	movq	ARG7, %r10 // D
12179	movq	ARG11, %r11  // m1
12180	movq	ARG12, %r12  // n1
12181
12182#if MACRO_LEVEL>=1
12183	INNER_STORE_4X4_VS_LIB4
12184#else
12185	CALL(inner_store_4x4_vs_lib4)
12186#endif
12187
12188
12189	EPILOGUE
12190
12191	ret
12192
12193	FUN_END(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c)
12194
12195
12196
12197
12198
12199//                                         1      2          3          4        5             6          7        8          9        10         11       12
12200// void kernel_dtrsm_nn_rl_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E);
12201
12202	.p2align 4,,15
12203	GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_lib4cccc)
12204
12205	PROLOGUE
12206
12207	// zero accumulation registers
12208
12209	ZERO_ACC
12210
12211
12212	// call inner dgemm kernel nt
12213
12214	movq	ARG1, %r10
12215	movq	ARG2, %r11
12216	movq	ARG3, %r12
12217	movq	ARG4, %r13 // ldb
12218	sall	$3, %r13d
12219
12220#if MACRO_LEVEL>=2
12221	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12222#else
12223	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12224#endif
12225
12226
12227	// call inner blender_loader nn
12228
12229	movq	ARG5, %r10 // beta
12230	movq	ARG6, %r11 // C
12231	movq	ARG7, %r12 // ldc
12232	sall	$3, %r12d
12233
12234#if MACRO_LEVEL>=1
12235	INNER_SCALE_M1B_4X4_LIB
12236#else
12237	CALL(inner_scale_m1b_4x4_lib)
12238#endif
12239
12240
12241	// solve
12242
12243	movq	ARG10, %r10  // E
12244	movq	ARG11, %r11 // lde
12245	sall	$3, %r11d
12246	movq	ARG12, %r12  // inv_diag_E
12247
12248#if MACRO_LEVEL>=1
12249	INNER_EDGE_DTRSM_RLN_INV_4X4_LIB
12250#else
12251	CALL(inner_edge_dtrsm_rln_inv_4x4_lib)
12252#endif
12253
12254
12255	// store
12256
12257	movq	ARG8, %r10 // D
12258	movq	ARG9, %r11 // ldd
12259	sall	$3, %r11d
12260
12261#if MACRO_LEVEL>=1
12262	INNER_STORE_4X4_LIB
12263#else
12264	CALL(inner_store_4x4_lib)
12265#endif
12266
12267
12268	EPILOGUE
12269
12270	ret
12271
12272	FUN_END(kernel_dtrsm_nn_rl_inv_4x4_lib4cccc)
12273
12274
12275
12276
12277
12278//                                             1      2          3          4        5             6          7        8          9        10         11       12                  13      14
12279// void kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1);
12280
12281	.p2align 4,,15
12282	GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc)
12283
12284	PROLOGUE
12285
12286	// zero accumulation registers
12287
12288	ZERO_ACC
12289
12290
12291	// call inner dgemm kernel nt
12292
12293	movq	ARG1, %r10
12294	movq	ARG2, %r11
12295	movq	ARG3, %r12
12296	movq	ARG4, %r13 // ldb
12297	sall	$3, %r13d
12298
12299
12300	movq	ARG14, %r14  // n1
12301	cmpl	$1, %r14d
12302	jg		100f
12303
12304#if MACRO_LEVEL>=2
12305	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
12306#else
12307	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
12308#endif
12309
12310	jmp		103f
12311
12312100:
12313
12314	movq	ARG14, %r14  // n1
12315	cmpl	$2, %r14d
12316	jg		101f
12317
12318#if MACRO_LEVEL>=2
12319	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
12320#else
12321	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
12322#endif
12323
12324	jmp		103f
12325
12326101:
12327
12328	movq	ARG14, %r14  // n1
12329	cmpl	$3, %r14d
12330	jg		102f
12331
12332#if MACRO_LEVEL>=2
12333	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
12334#else
12335	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
12336#endif
12337
12338	jmp		103f
12339
12340102:
12341
12342#if MACRO_LEVEL>=2
12343	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12344#else
12345	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12346#endif
12347
12348103:
12349
12350
12351	// call inner blender_loader nn
12352
12353	movq	ARG5, %r10 // beta
12354	movq	ARG6, %r11 // C
12355	movq	ARG7, %r12 // ldc
12356	sall	$3, %r12d
12357	movq	ARG13, %r13 // m1
12358	movq	ARG14, %r14 // n1
12359
12360#if MACRO_LEVEL>=1
12361	INNER_SCALE_M1B_4X4_VS_LIB
12362#else
12363	CALL(inner_scale_m1b_4x4_vs_lib)
12364#endif
12365
12366
12367	// solve
12368
12369	movq	ARG10, %r10  // E
12370	movq	ARG11, %r11 // lde
12371	sall	$3, %r11d
12372	movq	ARG12, %r12  // inv_diag_E
12373	movq	ARG14, %r13 // n1
12374
12375#if MACRO_LEVEL>=1
12376	INNER_EDGE_DTRSM_RLN_INV_4X4_VS_LIB
12377#else
12378	CALL(inner_edge_dtrsm_rln_inv_4x4_vs_lib)
12379#endif
12380
12381
12382	// store
12383
12384	movq	ARG8, %r10 // D
12385	movq	ARG9, %r11 // ldd
12386	sall	$3, %r11d
12387	movq	ARG13, %r12 // m1
12388	movq	ARG14, %r13 // n1
12389
12390#if MACRO_LEVEL>=1
12391	INNER_STORE_4X4_VS_LIB
12392#else
12393	CALL(inner_store_4x4_vs_lib)
12394#endif
12395
12396
12397	EPILOGUE
12398
12399	ret
12400
12401	FUN_END(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc)
12402
12403
12404
12405
12406
12407//                                         1      2          3          4        5             6          7          8          9
12408// void kernel_dtrsm_nn_rl_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde);
12409
12410	.p2align 4,,15
12411	GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_lib4c44c)
12412
12413	PROLOGUE
12414
12415	// zero accumulation registers
12416
12417	ZERO_ACC
12418
12419
12420	// call inner dgemm kernel nt
12421
12422	movq	ARG1, %r10 // kmax
12423	movq	ARG2, %r11 // A
12424	movq	ARG3, %r12 // B
12425	movq	ARG4, %r13 // ldb
12426	sall	$3, %r13d
12427
12428#if MACRO_LEVEL>=2
12429	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12430#else
12431	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12432#endif
12433
12434
12435	// call inner blender_loader nn
12436
12437	movq	ARG5, %r10 // beta
12438	movq	ARG6, %r11 // C
12439
12440#if MACRO_LEVEL>=1
12441	INNER_SCALE_M1B_4X4_LIB4
12442#else
12443	CALL(inner_scale_m1b_4x4_lib4)
12444#endif
12445
12446
12447	// solve
12448
12449	movq	ARG8, %r10  // E
12450	movq	ARG9, %r11 // lde
12451	sall	$3, %r11d
12452
12453#if MACRO_LEVEL>=1
12454	INNER_EDGE_DTRSM_RLN_ONE_4X4_LIB
12455#else
12456	CALL(inner_edge_dtrsm_rln_one_4x4_lib)
12457#endif
12458
12459
12460	// store
12461
12462	movq	ARG7, %r10 // D
12463
12464#if MACRO_LEVEL>=1
12465	INNER_STORE_4X4_LIB4
12466#else
12467	CALL(inner_store_4x4_lib4)
12468#endif
12469
12470
12471	EPILOGUE
12472
12473	ret
12474
12475	FUN_END(kernel_dtrsm_nn_rl_one_4x4_lib4c44c)
12476
12477
12478
12479
12480
12481//                                            1      2          3          4        5             6          7          8          9        10      11
12482// void kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1);
12483
12484	.p2align 4,,15
12485	GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c)
12486
12487	PROLOGUE
12488
12489	// zero accumulation registers
12490
12491	ZERO_ACC
12492
12493
12494	// call inner dgemm kernel nt
12495
12496	movq	ARG1, %r10 // kmax
12497	movq	ARG2, %r11 // A
12498	movq	ARG3, %r12 // B
12499	movq	ARG4, %r13 // ldb
12500	sall	$3, %r13d
12501
12502	movq	ARG11, %r14  // n1
12503	cmpl	$1, %r14d
12504	jg		100f
12505
12506#if MACRO_LEVEL>=2
12507	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
12508#else
12509	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
12510#endif
12511
12512	jmp		103f
12513
12514100:
12515
12516	movq	ARG11, %r14  // n1
12517	cmpl	$2, %r14d
12518	jg		101f
12519
12520#if MACRO_LEVEL>=2
12521	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
12522#else
12523	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
12524#endif
12525
12526	jmp		103f
12527
12528101:
12529
12530	movq	ARG11, %r14  // n1
12531	cmpl	$3, %r14d
12532	jg		102f
12533
12534#if MACRO_LEVEL>=2
12535	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
12536#else
12537	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
12538#endif
12539
12540	jmp		103f
12541
12542102:
12543
12544#if MACRO_LEVEL>=2
12545	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12546#else
12547	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12548#endif
12549
12550103:
12551
12552
12553	// call inner blender_loader nn
12554
12555	movq	ARG5, %r10 // beta
12556	movq	ARG6, %r11 // C
12557
12558#if MACRO_LEVEL>=1
12559	INNER_SCALE_M1B_4X4_LIB4
12560#else
12561	CALL(inner_scale_m1b_4x4_lib4)
12562#endif
12563
12564
12565	// solve
12566
12567	movq	ARG8, %r10  // E
12568	movq	ARG9, %r11 // lde
12569	sall	$3, %r11d
12570	movq	ARG11, %r12  // n1
12571
12572#if MACRO_LEVEL>=1
12573	INNER_EDGE_DTRSM_RLN_ONE_4X4_VS_LIB
12574#else
12575	CALL(inner_edge_dtrsm_rln_one_4x4_vs_lib)
12576#endif
12577
12578
12579	// store
12580
12581	movq	ARG7, %r10 // D
12582	movq	ARG10, %r11  // m1
12583	movq	ARG11, %r12  // n1
12584
12585#if MACRO_LEVEL>=1
12586	INNER_STORE_4X4_VS_LIB4
12587#else
12588	CALL(inner_store_4x4_vs_lib4)
12589#endif
12590
12591
12592	EPILOGUE
12593
12594	ret
12595
12596	FUN_END(kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c)
12597
12598
12599
12600
12601
12602//                                         1      2          3          4        5             6          7        8          9        10         11
12603// void kernel_dtrsm_nn_rl_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde);
12604
12605	.p2align 4,,15
12606	GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_lib4cccc)
12607
12608	PROLOGUE
12609
12610	// zero accumulation registers
12611
12612	ZERO_ACC
12613
12614
12615	// call inner dgemm kernel nt
12616
12617	movq	ARG1, %r10
12618	movq	ARG2, %r11
12619	movq	ARG3, %r12
12620	movq	ARG4, %r13 // ldb
12621	sall	$3, %r13d
12622
12623#if MACRO_LEVEL>=2
12624	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12625#else
12626	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12627#endif
12628
12629
12630	// call inner blender_loader nn
12631
12632	movq	ARG5, %r10 // beta
12633	movq	ARG6, %r11 // C
12634	movq	ARG7, %r12 // ldc
12635	sall	$3, %r12d
12636
12637#if MACRO_LEVEL>=1
12638	INNER_SCALE_M1B_4X4_LIB
12639#else
12640	CALL(inner_scale_m1b_4x4_lib)
12641#endif
12642
12643
12644	// solve
12645
12646	movq	ARG10, %r10  // E
12647	movq	ARG11, %r11 // lde
12648	sall	$3, %r11d
12649
12650#if MACRO_LEVEL>=1
12651	INNER_EDGE_DTRSM_RLN_ONE_4X4_LIB
12652#else
12653	CALL(inner_edge_dtrsm_rln_one_4x4_lib)
12654#endif
12655
12656
12657	// store
12658
12659	movq	ARG8, %r10 // D
12660	movq	ARG9, %r11 // ldd
12661	sall	$3, %r11d
12662
12663#if MACRO_LEVEL>=1
12664	INNER_STORE_4X4_LIB
12665#else
12666	CALL(inner_store_4x4_lib)
12667#endif
12668
12669
12670	EPILOGUE
12671
12672	ret
12673
12674	FUN_END(kernel_dtrsm_nn_rl_one_4x4_lib4cccc)
12675
12676
12677
12678
12679
12680//                                             1      2          3          4        5             6          7        8          9        10         11       12      13
12681// void kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1);
12682
12683	.p2align 4,,15
12684	GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc)
12685
12686	PROLOGUE
12687
12688	// zero accumulation registers
12689
12690	ZERO_ACC
12691
12692
12693	// call inner dgemm kernel nt
12694
12695	movq	ARG1, %r10
12696	movq	ARG2, %r11
12697	movq	ARG3, %r12
12698	movq	ARG4, %r13 // ldb
12699	sall	$3, %r13d
12700
12701
12702	movq	ARG13, %r14  // n1
12703	cmpl	$1, %r14d
12704	jg		100f
12705
12706#if MACRO_LEVEL>=2
12707	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
12708#else
12709	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
12710#endif
12711
12712	jmp		103f
12713
12714100:
12715
12716	movq	ARG13, %r14  // n1
12717	cmpl	$2, %r14d
12718	jg		101f
12719
12720#if MACRO_LEVEL>=2
12721	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
12722#else
12723	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
12724#endif
12725
12726	jmp		103f
12727
12728101:
12729
12730	movq	ARG13, %r14  // n1
12731	cmpl	$3, %r14d
12732	jg		102f
12733
12734#if MACRO_LEVEL>=2
12735	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
12736#else
12737	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
12738#endif
12739
12740	jmp		103f
12741
12742102:
12743
12744#if MACRO_LEVEL>=2
12745	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
12746#else
12747	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
12748#endif
12749
12750103:
12751
12752
12753	// call inner blender_loader nn
12754
12755	movq	ARG5, %r10 // beta
12756	movq	ARG6, %r11 // C
12757	movq	ARG7, %r12 // ldc
12758	sall	$3, %r12d
12759	movq	ARG12, %r13 // m1
12760	movq	ARG13, %r14 // n1
12761
12762#if MACRO_LEVEL>=1
12763	INNER_SCALE_M1B_4X4_VS_LIB
12764#else
12765	CALL(inner_scale_m1b_4x4_vs_lib)
12766#endif
12767
12768
12769	// solve
12770
12771	movq	ARG10, %r10  // E
12772	movq	ARG11, %r11 // lde
12773	sall	$3, %r11d
12774	movq	ARG13, %r13 // n1
12775
12776#if MACRO_LEVEL>=1
12777	INNER_EDGE_DTRSM_RLN_ONE_4X4_VS_LIB
12778#else
12779	CALL(inner_edge_dtrsm_rln_one_4x4_vs_lib)
12780#endif
12781
12782
12783	// store
12784
12785	movq	ARG8, %r10 // D
12786	movq	ARG9, %r11 // ldd
12787	sall	$3, %r11d
12788	movq	ARG12, %r12 // m1
12789	movq	ARG13, %r13 // n1
12790
12791#if MACRO_LEVEL>=1
12792	INNER_STORE_4X4_VS_LIB
12793#else
12794	CALL(inner_store_4x4_vs_lib)
12795#endif
12796
12797
12798	EPILOGUE
12799
12800	ret
12801
12802	FUN_END(kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc)
12803
12804
12805
12806
12807
12808//                                         1      2          3          4             5          6        7          8        9          10
12809// void kernel_dtrsm_nt_rl_inv_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E);
12810
12811	.p2align 4,,15
12812	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib44cc4)
12813
12814	PROLOGUE
12815
12816	// zero accumulation registers
12817
12818	ZERO_ACC
12819
12820
12821	// call inner dgemm kernel nt
12822
12823	movq	ARG1, %r10
12824	movq	ARG2, %r11
12825	movq	ARG3, %r12
12826
12827#if MACRO_LEVEL>=2
12828	INNER_KERNEL_DGEMM_NT_4X4_LIB4
12829#else
12830	CALL(inner_kernel_dgemm_nt_4x4_lib4)
12831#endif
12832
12833
12834	// call inner blender_loader nn
12835
12836	movq	ARG4, %r10 // beta
12837	movq	ARG5, %r11 // C
12838	movq	ARG6, %r12 // ldc
12839	sall	$3, %r12d
12840
12841#if MACRO_LEVEL>=1
12842	INNER_BLEND_SCALE_M1B_4X4_LIB
12843#else
12844	CALL(inner_blend_scale_m1b_4x4_lib)
12845#endif
12846
12847
12848	// solve
12849
12850	movq	ARG9, %r10  // E
12851	movq	ARG10, %r11  // inv_diag_E
12852
12853#if MACRO_LEVEL>=1
12854	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
12855#else
12856	CALL(inner_edge_dtrsm_rlt_inv_4x4_lib4)
12857#endif
12858
12859
12860	// store
12861
12862	movq	ARG7, %r10 // D
12863	movq	ARG8, %r11 // ldd
12864	sall	$3, %r11d
12865
12866#if MACRO_LEVEL>=1
12867	INNER_STORE_4X4_LIB
12868#else
12869	CALL(inner_store_4x4_lib)
12870#endif
12871
12872
12873	EPILOGUE
12874
12875	ret
12876
12877	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib44cc4)
12878
12879
12880
12881
12882
12883//                                             1      2          3          4             5          6        7          8        9          10                  11      12
12884// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E, int m1, int n1);
12885
12886	.p2align 4,,15
12887	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4)
12888
12889	PROLOGUE
12890
12891	// zero accumulation registers
12892
12893	ZERO_ACC
12894
12895
12896	// call inner dgemm kernel nt
12897
12898	movq	ARG1, %r10
12899	movq	ARG2, %r11
12900	movq	ARG3, %r12
12901
12902#if MACRO_LEVEL>=2
12903	INNER_KERNEL_DGEMM_NT_4X4_LIB4
12904#else
12905	CALL(inner_kernel_dgemm_nt_4x4_lib4)
12906#endif
12907
12908
12909	// call inner blender_loader nn
12910
12911	movq	ARG4, %r10 // beta
12912	movq	ARG5, %r11 // C
12913	movq	ARG6, %r12 // ldc
12914	sall	$3, %r12d
12915	movq	ARG11, %r13 // m1
12916	movq	ARG12, %r14 // n1
12917
12918#if MACRO_LEVEL>=1
12919	INNER_BLEND_SCALE_M1B_4X4_VS_LIB
12920#else
12921	CALL(inner_blend_scale_m1b_4x4_vs_lib)
12922#endif
12923
12924
12925	// solve
12926
12927	movq	ARG9, %r10  // E
12928	movq	ARG10, %r11  // inv_diag_E
12929
12930#if MACRO_LEVEL>=1
12931	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4
12932#else
12933	CALL(inner_edge_dtrsm_rlt_inv_4x4_lib4)
12934#endif
12935
12936
12937	// store
12938
12939	movq	ARG7, %r10 // D
12940	movq	ARG8, %r11 // ldd
12941	sall	$3, %r11d
12942	movq	ARG11, %r12 // m1
12943	movq	ARG12, %r13 // n1
12944
12945#if MACRO_LEVEL>=1
12946	INNER_STORE_4X4_VS_LIB
12947#else
12948	CALL(inner_store_4x4_vs_lib)
12949#endif
12950
12951
12952	EPILOGUE
12953
12954	ret
12955
12956	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4)
12957
12958
12959
12960
12961
12962
12963//                                         1      2          3          4         5        6          7        8          9        10
12964// void kernel_dtrsm_nt_rl_inv_4x4_lib44ccc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E);
12965
12966	.p2align 4,,15
12967	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc)
12968
12969	PROLOGUE
12970
12971	// zero accumulation registers
12972
12973	ZERO_ACC
12974
12975
12976	// call inner dgemm kernel nt
12977
12978	movq	ARG1, %r10
12979	movq	ARG2, %r11
12980	movq	ARG3, %r12
12981
12982#if MACRO_LEVEL>=2
12983	INNER_KERNEL_DGEMM_NT_4X4_LIB4
12984#else
12985	CALL(inner_kernel_dgemm_nt_4x4_lib4)
12986#endif
12987
12988
12989	// call inner blender_loader nn
12990
12991	movq	ARG4, %r10 // C
12992	movq	ARG5, %r11 // ldc
12993	sall	$3, %r11d
12994
12995#if MACRO_LEVEL>=1
12996	INNER_BLEND_SCALE_M11_4X4_LIB
12997#else
12998	CALL(inner_blend_scale_m11_4x4_lib)
12999#endif
13000
13001
13002	// solve
13003
13004	movq	ARG8, %r10  // E
13005	movq	ARG9, %r11 // lde
13006	sall	$3, %r11d
13007	movq	ARG10, %r12  // inv_diag_E
13008
13009#if MACRO_LEVEL>=1
13010	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB
13011#else
13012	CALL(inner_edge_dtrsm_rlt_inv_4x4_lib)
13013#endif
13014
13015
13016	// store
13017
13018	movq	ARG6, %r10 // D
13019	movq	ARG7, %r11 // ldd
13020	sall	$3, %r11d
13021
13022#if MACRO_LEVEL>=1
13023	INNER_STORE_4X4_LIB
13024#else
13025	CALL(inner_store_4x4_lib)
13026#endif
13027
13028
13029	EPILOGUE
13030
13031	ret
13032
13033	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc)
13034
13035
13036
13037
13038//                                             1      2          3          4         5        6          7        8          9        10                  11      12
13039// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1);
13040
13041	.p2align 4,,15
13042	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc)
13043
13044	PROLOGUE
13045
13046	// zero accumulation registers
13047
13048	ZERO_ACC
13049
13050
13051	// call inner dgemm kernel nt
13052
13053	movq	ARG1, %r10
13054	movq	ARG2, %r11
13055	movq	ARG3, %r12
13056
13057#if MACRO_LEVEL>=2
13058	INNER_KERNEL_DGEMM_NT_4X4_LIB4
13059#else
13060	CALL(inner_kernel_dgemm_nt_4x4_lib4)
13061#endif
13062
13063
13064	// call inner blender_loader nn
13065
13066	movq	ARG4, %r10 // C
13067	movq	ARG5, %r11 // ldc
13068	sall	$3, %r11d
13069	movq	ARG11, %r12 // m1
13070	movq	ARG12, %r13 // n1
13071
13072#if MACRO_LEVEL>=1
13073	INNER_BLEND_SCALE_M11_4X4_VS_LIB
13074#else
13075	CALL(inner_blend_scale_m11_4x4_vs_lib)
13076#endif
13077
13078
13079	// solve
13080
13081	movq	ARG8, %r10  // E
13082	movq	ARG9, %r11 // lde
13083	sall	$3, %r11d
13084	movq	ARG10, %r12  // inv_diag_E
13085	movq	ARG12, %r13 // n1
13086
13087#if MACRO_LEVEL>=1
13088	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB
13089#else
13090	CALL(inner_edge_dtrsm_rlt_inv_4x4_vs_lib)
13091#endif
13092
13093
13094	// store
13095
13096	movq	ARG6, %r10 // D
13097	movq	ARG7, %r11 // ldd
13098	sall	$3, %r11d
13099	movq	ARG11, %r12 // m1
13100	movq	ARG12, %r13 // n1
13101
13102#if MACRO_LEVEL>=1
13103	INNER_STORE_4X4_VS_LIB
13104#else
13105	CALL(inner_store_4x4_vs_lib)
13106#endif
13107
13108
13109	EPILOGUE
13110
13111	ret
13112
13113	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc)
13114
13115
13116
13117
13118
13119//                                         1      2          3          4        5             6          7          8          9        10
13120// void kernel_dtrsm_nt_rl_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E);
13121
13122	.p2align 4,,15
13123	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib4c44c)
13124
13125	PROLOGUE
13126
13127	// zero accumulation registers
13128
13129	ZERO_ACC
13130
13131
13132	// call inner dgemm kernel nt
13133
13134	movq	ARG1, %r10 // kmax
13135	movq	ARG2, %r11 // A
13136	movq	ARG3, %r12 // B
13137	movq	ARG4, %r13 // ldb
13138	sall	$3, %r13d
13139
13140#if MACRO_LEVEL>=2
13141	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13142#else
13143	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13144#endif
13145
13146
13147	// call inner blender_loader nn
13148
13149	movq	ARG5, %r10 // beta
13150	movq	ARG6, %r11 // C
13151
13152#if MACRO_LEVEL>=1
13153	INNER_SCALE_M1B_4X4_LIB4
13154#else
13155	CALL(inner_scale_m1b_4x4_lib4)
13156#endif
13157
13158
13159	// solve
13160
13161	movq	ARG8, %r10  // E
13162	movq	ARG9, %r11 // lde
13163	sall	$3, %r11d
13164	movq	ARG10, %r12  // inv_diag_E
13165
13166#if MACRO_LEVEL>=1
13167	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB
13168#else
13169	CALL(inner_edge_dtrsm_rlt_inv_4x4_lib)
13170#endif
13171
13172
13173	// store
13174
13175	movq	ARG7, %r10 // D
13176
13177#if MACRO_LEVEL>=1
13178	INNER_STORE_4X4_LIB4
13179#else
13180	CALL(inner_store_4x4_lib4)
13181#endif
13182
13183
13184	EPILOGUE
13185
13186	ret
13187
13188	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib4c44c)
13189
13190
13191
13192
13193
13194//                                            1      2          3          4        5             6          7          8          9        10                  11      12
13195// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1);
13196
13197	.p2align 4,,15
13198	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c)
13199
13200	PROLOGUE
13201
13202	// zero accumulation registers
13203
13204	ZERO_ACC
13205
13206
13207	// call inner dgemm kernel nt
13208
13209	movq	ARG1, %r10 // kmax
13210	movq	ARG2, %r11 // A
13211	movq	ARG3, %r12 // B
13212	movq	ARG4, %r13 // ldb
13213	sall	$3, %r13d
13214
13215	movq	ARG12, %r14  // n1
13216	cmpl	$1, %r14d
13217	jg		100f
13218
13219#if MACRO_LEVEL>=2
13220	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
13221#else
13222	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
13223#endif
13224
13225	jmp		103f
13226
13227100:
13228
13229	movq	ARG12, %r14  // n1
13230	cmpl	$2, %r14d
13231	jg		101f
13232
13233#if MACRO_LEVEL>=2
13234	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
13235#else
13236	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
13237#endif
13238
13239	jmp		103f
13240
13241101:
13242
13243	movq	ARG12, %r14  // n1
13244	cmpl	$3, %r14d
13245	jg		102f
13246
13247#if MACRO_LEVEL>=2
13248	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
13249#else
13250	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
13251#endif
13252
13253	jmp		103f
13254
13255102:
13256
13257#if MACRO_LEVEL>=2
13258	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13259#else
13260	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13261#endif
13262
13263103:
13264
13265
13266	// call inner blender_loader nn
13267
13268	movq	ARG5, %r10 // beta
13269	movq	ARG6, %r11 // C
13270
13271#if MACRO_LEVEL>=1
13272	INNER_SCALE_M1B_4X4_LIB4
13273#else
13274	CALL(inner_scale_m1b_4x4_lib4)
13275#endif
13276
13277
13278	// solve
13279
13280	movq	ARG8, %r10  // E
13281	movq	ARG9, %r11 // lde
13282	sall	$3, %r11d
13283	movq	ARG10, %r12  // inv_diag_E
13284	movq	ARG12, %r13 // n1
13285
13286#if MACRO_LEVEL>=1
13287	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB
13288#else
13289	CALL(inner_edge_dtrsm_rlt_inv_4x4_vs_lib)
13290#endif
13291
13292
13293	// store
13294
13295	movq	ARG7, %r10 // D
13296	movq	ARG11, %r11 // m1
13297	movq	ARG12, %r12 // n1
13298
13299#if MACRO_LEVEL>=1
13300	INNER_STORE_4X4_VS_LIB4
13301#else
13302	CALL(inner_store_4x4_vs_lib4)
13303#endif
13304
13305
13306	EPILOGUE
13307
13308	ret
13309
13310	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c)
13311
13312
13313
13314
13315
13316//                                         1      2          3          4        5             6          7        8          9        10         11       12
13317// void kernel_dtrsm_nt_rl_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E);
13318
13319	.p2align 4,,15
13320	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib4cccc)
13321
13322	PROLOGUE
13323
13324	// zero accumulation registers
13325
13326	ZERO_ACC
13327
13328
13329	// call inner dgemm kernel nt
13330
13331	movq	ARG1, %r10
13332	movq	ARG2, %r11
13333	movq	ARG3, %r12
13334	movq	ARG4, %r13 // ldb
13335	sall	$3, %r13d
13336
13337#if MACRO_LEVEL>=2
13338	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13339#else
13340	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13341#endif
13342
13343
13344	// call inner blender_loader nn
13345
13346	movq	ARG5, %r10 // beta
13347	movq	ARG6, %r11 // C
13348	movq	ARG7, %r12 // ldc
13349	sall	$3, %r12d
13350
13351#if MACRO_LEVEL>=1
13352	INNER_SCALE_M1B_4X4_LIB
13353#else
13354	CALL(inner_scale_m1b_4x4_lib)
13355#endif
13356
13357
13358	// solve
13359
13360	movq	ARG10, %r10  // E
13361	movq	ARG11, %r11 // lde
13362	sall	$3, %r11d
13363	movq	ARG12, %r12  // inv_diag_E
13364
13365#if MACRO_LEVEL>=1
13366	INNER_EDGE_DTRSM_RLT_INV_4X4_LIB
13367#else
13368	CALL(inner_edge_dtrsm_rlt_inv_4x4_lib)
13369#endif
13370
13371
13372	// store
13373
13374	movq	ARG8, %r10 // D
13375	movq	ARG9, %r11 // ldd
13376	sall	$3, %r11d
13377
13378#if MACRO_LEVEL>=1
13379	INNER_STORE_4X4_LIB
13380#else
13381	CALL(inner_store_4x4_lib)
13382#endif
13383
13384
13385	EPILOGUE
13386
13387	ret
13388
13389	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib4cccc)
13390
13391
13392
13393
13394
13395//                                             1      2          3          4        5             6          7        8          9        10         11       12                  13      14
13396// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1);
13397
13398	.p2align 4,,15
13399	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc)
13400
13401	PROLOGUE
13402
13403	// zero accumulation registers
13404
13405	ZERO_ACC
13406
13407
13408	// call inner dgemm kernel nt
13409
13410	movq	ARG1, %r10
13411	movq	ARG2, %r11
13412	movq	ARG3, %r12
13413	movq	ARG4, %r13 // ldb
13414	sall	$3, %r13d
13415
13416
13417	movq	ARG14, %r14  // n1
13418	cmpl	$1, %r14d
13419	jg		100f
13420
13421#if MACRO_LEVEL>=2
13422	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
13423#else
13424	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
13425#endif
13426
13427	jmp		103f
13428
13429100:
13430
13431	movq	ARG14, %r14  // n1
13432	cmpl	$2, %r14d
13433	jg		101f
13434
13435#if MACRO_LEVEL>=2
13436	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
13437#else
13438	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
13439#endif
13440
13441	jmp		103f
13442
13443101:
13444
13445	movq	ARG14, %r14  // n1
13446	cmpl	$3, %r14d
13447	jg		102f
13448
13449#if MACRO_LEVEL>=2
13450	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
13451#else
13452	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
13453#endif
13454
13455	jmp		103f
13456
13457102:
13458
13459#if MACRO_LEVEL>=2
13460	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13461#else
13462	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13463#endif
13464
13465103:
13466
13467
13468	// call inner blender_loader nn
13469
13470	movq	ARG5, %r10 // beta
13471	movq	ARG6, %r11 // C
13472	movq	ARG7, %r12 // ldc
13473	sall	$3, %r12d
13474	movq	ARG13, %r13 // m1
13475	movq	ARG14, %r14 // n1
13476
13477#if MACRO_LEVEL>=1
13478	INNER_SCALE_M1B_4X4_VS_LIB
13479#else
13480	CALL(inner_scale_m1b_4x4_vs_lib)
13481#endif
13482
13483
13484	// solve
13485
13486	movq	ARG10, %r10  // E
13487	movq	ARG11, %r11 // lde
13488	sall	$3, %r11d
13489	movq	ARG12, %r12  // inv_diag_E
13490	movq	ARG14, %r13 // n1
13491
13492#if MACRO_LEVEL>=1
13493	INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB
13494#else
13495	CALL(inner_edge_dtrsm_rlt_inv_4x4_vs_lib)
13496#endif
13497
13498
13499	// store
13500
13501	movq	ARG8, %r10 // D
13502	movq	ARG9, %r11 // ldd
13503	sall	$3, %r11d
13504	movq	ARG13, %r12 // m1
13505	movq	ARG14, %r13 // n1
13506
13507#if MACRO_LEVEL>=1
13508	INNER_STORE_4X4_VS_LIB
13509#else
13510	CALL(inner_store_4x4_vs_lib)
13511#endif
13512
13513
13514	EPILOGUE
13515
13516	ret
13517
13518	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc)
13519
13520
13521
13522
13523
13524//                                         1      2          3          4             5          6        7          8        9
13525// void kernel_dtrsm_nt_rl_one_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E);
13526
13527	.p2align 4,,15
13528	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib44cc4)
13529
13530	PROLOGUE
13531
13532	// zero accumulation registers
13533
13534	ZERO_ACC
13535
13536
13537	// call inner dgemm kernel nt
13538
13539	movq	ARG1, %r10
13540	movq	ARG2, %r11
13541	movq	ARG3, %r12
13542
13543#if MACRO_LEVEL>=2
13544	INNER_KERNEL_DGEMM_NT_4X4_LIB4
13545#else
13546	CALL(inner_kernel_dgemm_nt_4x4_lib4)
13547#endif
13548
13549
13550	// call inner blender_loader nn
13551
13552	movq	ARG4, %r10 // beta
13553	movq	ARG5, %r11 // C
13554	movq	ARG6, %r12 // ldc
13555	sall	$3, %r12d
13556
13557#if MACRO_LEVEL>=1
13558	INNER_BLEND_SCALE_M1B_4X4_LIB
13559#else
13560	CALL(inner_blend_scale_m1b_4x4_lib)
13561#endif
13562
13563
13564	// solve
13565
13566	movq	ARG9, %r10  // E
13567
13568#if MACRO_LEVEL>=1
13569	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
13570#else
13571	CALL(inner_edge_dtrsm_rlt_one_4x4_lib4)
13572#endif
13573
13574
13575	// store
13576
13577	movq	ARG7, %r10 // D
13578	movq	ARG8, %r11 // ldd
13579	sall	$3, %r11d
13580
13581#if MACRO_LEVEL>=1
13582	INNER_STORE_4X4_LIB
13583#else
13584	CALL(inner_store_4x4_lib)
13585#endif
13586
13587
13588	EPILOGUE
13589
13590	ret
13591
13592	FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib44cc4)
13593
13594
13595
13596
13597
13598//                                             1      2          3          4             5          6        7          8        9          10      11
13599// void kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1);
13600
13601	.p2align 4,,15
13602	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4)
13603
13604	PROLOGUE
13605
13606	// zero accumulation registers
13607
13608	ZERO_ACC
13609
13610
13611	// call inner dgemm kernel nt
13612
13613	movq	ARG1, %r10
13614	movq	ARG2, %r11
13615	movq	ARG3, %r12
13616
13617#if MACRO_LEVEL>=2
13618	INNER_KERNEL_DGEMM_NT_4X4_LIB4
13619#else
13620	CALL(inner_kernel_dgemm_nt_4x4_lib4)
13621#endif
13622
13623
13624	// call inner blender_loader nn
13625
13626	movq	ARG4, %r10 // beta
13627	movq	ARG5, %r11 // C
13628	movq	ARG6, %r12 // ldc
13629	sall	$3, %r12d
13630	movq	ARG10, %r13 // m1
13631	movq	ARG11, %r14 // n1
13632
13633#if MACRO_LEVEL>=1
13634	INNER_BLEND_SCALE_M1B_4X4_VS_LIB
13635#else
13636	CALL(inner_blend_scale_m1b_4x4_vs_lib)
13637#endif
13638
13639
13640	// solve
13641
13642	movq	ARG9, %r10  // E
13643
13644#if MACRO_LEVEL>=1
13645	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4
13646#else
13647	CALL(inner_edge_dtrsm_rlt_one_4x4_lib4)
13648#endif
13649
13650
13651	// store
13652
13653	movq	ARG7, %r10 // D
13654	movq	ARG8, %r11 // ldd
13655	sall	$3, %r11d
13656	movq	ARG10, %r12 // m1
13657	movq	ARG11, %r13 // n1
13658
13659#if MACRO_LEVEL>=1
13660	INNER_STORE_4X4_VS_LIB
13661#else
13662	CALL(inner_store_4x4_vs_lib)
13663#endif
13664
13665
13666	EPILOGUE
13667
13668	ret
13669
13670	FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4)
13671
13672
13673
13674
13675
13676
13677//                                         1      2          3          4        5             6          7          8          9
13678// void kernel_dtrsm_nt_rl_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde);
13679
13680	.p2align 4,,15
13681	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib4c44c)
13682
13683	PROLOGUE
13684
13685	// zero accumulation registers
13686
13687	ZERO_ACC
13688
13689
13690	// call inner dgemm kernel nt
13691
13692	movq	ARG1, %r10 // kmax
13693	movq	ARG2, %r11 // A
13694	movq	ARG3, %r12 // B
13695	movq	ARG4, %r13 // ldb
13696	sall	$3, %r13d
13697
13698#if MACRO_LEVEL>=2
13699	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13700#else
13701	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13702#endif
13703
13704
13705	// call inner blender_loader nn
13706
13707	movq	ARG5, %r10 // beta
13708	movq	ARG6, %r11 // C
13709
13710#if MACRO_LEVEL>=1
13711	INNER_SCALE_M1B_4X4_LIB4
13712#else
13713	CALL(inner_scale_m1b_4x4_lib4)
13714#endif
13715
13716
13717	// solve
13718
13719	movq	ARG8, %r10  // E
13720	movq	ARG9, %r11 // lde
13721	sall	$3, %r11d
13722
13723#if MACRO_LEVEL>=1
13724	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB
13725#else
13726	CALL(inner_edge_dtrsm_rlt_one_4x4_lib)
13727#endif
13728
13729
13730	// store
13731
13732	movq	ARG7, %r10 // D
13733
13734#if MACRO_LEVEL>=1
13735	INNER_STORE_4X4_LIB4
13736#else
13737	CALL(inner_store_4x4_lib4)
13738#endif
13739
13740
13741	EPILOGUE
13742
13743	ret
13744
13745	FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib4c44c)
13746
13747
13748
13749
13750
13751//                                            1      2          3          4        5             6          7          8          9        10      11
13752// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1);
13753
13754	.p2align 4,,15
13755	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c)
13756
13757	PROLOGUE
13758
13759	// zero accumulation registers
13760
13761	ZERO_ACC
13762
13763
13764	// call inner dgemm kernel nt
13765
13766	movq	ARG1, %r10 // kmax
13767	movq	ARG2, %r11 // A
13768	movq	ARG3, %r12 // B
13769	movq	ARG4, %r13 // ldb
13770	sall	$3, %r13d
13771
13772	movq	ARG11, %r14  // n1
13773	cmpl	$1, %r14d
13774	jg		100f
13775
13776#if MACRO_LEVEL>=2
13777	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
13778#else
13779	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
13780#endif
13781
13782	jmp		103f
13783
13784100:
13785
13786	movq	ARG11, %r14  // n1
13787	cmpl	$2, %r14d
13788	jg		101f
13789
13790#if MACRO_LEVEL>=2
13791	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
13792#else
13793	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
13794#endif
13795
13796	jmp		103f
13797
13798101:
13799
13800	movq	ARG11, %r14  // n1
13801	cmpl	$3, %r14d
13802	jg		102f
13803
13804#if MACRO_LEVEL>=2
13805	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
13806#else
13807	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
13808#endif
13809
13810	jmp		103f
13811
13812102:
13813
13814#if MACRO_LEVEL>=2
13815	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13816#else
13817	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13818#endif
13819
13820103:
13821
13822
13823	// call inner blender_loader nn
13824
13825	movq	ARG5, %r10 // beta
13826	movq	ARG6, %r11 // C
13827
13828#if MACRO_LEVEL>=1
13829	INNER_SCALE_M1B_4X4_LIB4
13830#else
13831	CALL(inner_scale_m1b_4x4_lib4)
13832#endif
13833
13834
13835	// solve
13836
13837	movq	ARG8, %r10  // E
13838	movq	ARG9, %r11 // lde
13839	sall	$3, %r11d
13840	movq	ARG11, %r12 // n1
13841
13842#if MACRO_LEVEL>=1
13843	INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB
13844#else
13845	CALL(inner_edge_dtrsm_rlt_one_4x4_vs_lib)
13846#endif
13847
13848
13849	// store
13850
13851	movq	ARG7, %r10 // D
13852	movq	ARG10, %r11 // m1
13853	movq	ARG11, %r12 // n1
13854
13855#if MACRO_LEVEL>=1
13856	INNER_STORE_4X4_VS_LIB4
13857#else
13858	CALL(inner_store_4x4_vs_lib4)
13859#endif
13860
13861
13862	EPILOGUE
13863
13864	ret
13865
13866	FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c)
13867
13868
13869
13870
13871
13872//                                         1      2          3          4        5             6          7        8          9        10         11
13873// void kernel_dtrsm_nt_rl_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde);
13874
13875	.p2align 4,,15
13876	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib4cccc)
13877
13878	PROLOGUE
13879
13880	// zero accumulation registers
13881
13882	ZERO_ACC
13883
13884
13885	// call inner dgemm kernel nt
13886
13887	movq	ARG1, %r10
13888	movq	ARG2, %r11
13889	movq	ARG3, %r12
13890	movq	ARG4, %r13 // ldb
13891	sall	$3, %r13d
13892
13893#if MACRO_LEVEL>=2
13894	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
13895#else
13896	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
13897#endif
13898
13899
13900	// call inner blender_loader nn
13901
13902	movq	ARG5, %r10 // beta
13903	movq	ARG6, %r11 // C
13904	movq	ARG7, %r12 // ldc
13905	sall	$3, %r12d
13906
13907#if MACRO_LEVEL>=1
13908	INNER_SCALE_M1B_4X4_LIB
13909#else
13910	CALL(inner_scale_m1b_4x4_lib)
13911#endif
13912
13913
13914	// solve
13915
13916	movq	ARG10, %r10  // E
13917	movq	ARG11, %r11 // lde
13918	sall	$3, %r11d
13919
13920#if MACRO_LEVEL>=1
13921	INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB
13922#else
13923	CALL(inner_edge_dtrsm_rlt_one_4x4_lib)
13924#endif
13925
13926
13927	// store
13928
13929	movq	ARG8, %r10 // D
13930	movq	ARG9, %r11 // ldd
13931	sall	$3, %r11d
13932
13933#if MACRO_LEVEL>=1
13934	INNER_STORE_4X4_LIB
13935#else
13936	CALL(inner_store_4x4_lib)
13937#endif
13938
13939
13940	EPILOGUE
13941
13942	ret
13943
13944	FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib4cccc)
13945
13946
13947
13948
13949
13950//                                             1      2          3          4        5             6          7        8          9        10         11       12      13
13951// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1);
13952
13953	.p2align 4,,15
13954	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc)
13955
13956	PROLOGUE
13957
13958	// zero accumulation registers
13959
13960	ZERO_ACC
13961
13962
13963	// call inner dgemm kernel nt
13964
13965	movq	ARG1, %r10
13966	movq	ARG2, %r11
13967	movq	ARG3, %r12
13968	movq	ARG4, %r13 // ldb
13969	sall	$3, %r13d
13970
13971
13972	movq	ARG13, %r14  // n1
13973	cmpl	$1, %r14d
13974	jg		100f
13975
13976#if MACRO_LEVEL>=2
13977	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
13978#else
13979	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
13980#endif
13981
13982	jmp		103f
13983
13984100:
13985
13986	movq	ARG13, %r14  // n1
13987	cmpl	$2, %r14d
13988	jg		101f
13989
13990#if MACRO_LEVEL>=2
13991	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
13992#else
13993	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
13994#endif
13995
13996	jmp		103f
13997
13998101:
13999
14000	movq	ARG13, %r14  // n1
14001	cmpl	$3, %r14d
14002	jg		102f
14003
14004#if MACRO_LEVEL>=2
14005	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
14006#else
14007	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
14008#endif
14009
14010	jmp		103f
14011
14012102:
14013
14014#if MACRO_LEVEL>=2
14015	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
14016#else
14017	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
14018#endif
14019
14020103:
14021
14022
14023	// call inner blender_loader nn
14024
14025	movq	ARG5, %r10 // beta
14026	movq	ARG6, %r11 // C
14027	movq	ARG7, %r12 // ldc
14028	sall	$3, %r12d
14029	movq	ARG12, %r13 // m1
14030	movq	ARG13, %r14 // n1
14031
14032#if MACRO_LEVEL>=1
14033	INNER_SCALE_M1B_4X4_VS_LIB
14034#else
14035	CALL(inner_scale_m1b_4x4_vs_lib)
14036#endif
14037
14038
14039	// solve
14040
14041	movq	ARG10, %r10  // E
14042	movq	ARG11, %r11 // lde
14043	sall	$3, %r11d
14044	movq	ARG13, %r12 // n1
14045
14046#if MACRO_LEVEL>=1
14047	INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB
14048#else
14049	CALL(inner_edge_dtrsm_rlt_one_4x4_vs_lib)
14050#endif
14051
14052
14053	// store
14054
14055	movq	ARG8, %r10 // D
14056	movq	ARG9, %r11 // ldd
14057	sall	$3, %r11d
14058	movq	ARG12, %r12 // m1
14059	movq	ARG13, %r13 // n1
14060
14061#if MACRO_LEVEL>=1
14062	INNER_STORE_4X4_VS_LIB
14063#else
14064	CALL(inner_store_4x4_vs_lib)
14065#endif
14066
14067
14068	EPILOGUE
14069
14070	ret
14071
14072	FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc)
14073
14074
14075
14076
14077
14078//                                         1      2          3          4        5             6          7          8          9        10
14079// void kernel_dtrsm_nn_ru_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E);
14080
14081	.p2align 4,,15
14082	GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_lib4c44c)
14083
14084	PROLOGUE
14085
14086	// zero accumulation registers
14087
14088	ZERO_ACC
14089
14090
14091	// call inner dgemm kernel nt
14092
14093	movq	ARG1, %r10 // kmax
14094	movq	ARG2, %r11 // A
14095	movq	ARG3, %r12 // B
14096	movq	ARG4, %r13 // ldb
14097	sall	$3, %r13d
14098
14099#if MACRO_LEVEL>=2
14100	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14101#else
14102	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14103#endif
14104
14105
14106	// call inner blender_loader nn
14107
14108	movq	ARG5, %r10 // beta
14109	movq	ARG6, %r11 // C
14110
14111#if MACRO_LEVEL>=1
14112	INNER_SCALE_M1B_4X4_LIB4
14113#else
14114	CALL(inner_scale_m1b_4x4_lib4)
14115#endif
14116
14117
14118	// solve
14119
14120	movq	ARG8, %r10  // E
14121	movq	ARG9, %r11 // lde
14122	sall	$3, %r11d
14123	movq	ARG10, %r12  // inv_diag_E
14124
14125#if MACRO_LEVEL>=1
14126	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB
14127#else
14128	CALL(inner_edge_dtrsm_run_inv_4x4_lib)
14129#endif
14130
14131
14132	// store
14133
14134	movq	ARG7, %r10 // D
14135
14136#if MACRO_LEVEL>=1
14137	INNER_STORE_4X4_LIB4
14138#else
14139	CALL(inner_store_4x4_lib4)
14140#endif
14141
14142
14143	EPILOGUE
14144
14145	ret
14146
14147	FUN_END(kernel_dtrsm_nn_ru_inv_4x4_lib4c44c)
14148
14149
14150
14151
14152
14153//                                            1      2          3          4        5             6          7          8          9        10                  11      12
14154// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1);
14155
14156	.p2align 4,,15
14157	GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c)
14158
14159	PROLOGUE
14160
14161	// zero accumulation registers
14162
14163	ZERO_ACC
14164
14165
14166	// call inner dgemm kernel nt
14167
14168	movq	ARG1, %r10 // kmax
14169	movq	ARG2, %r11 // A
14170	movq	ARG3, %r12 // B
14171	movq	ARG4, %r13 // ldb
14172	sall	$3, %r13d
14173
14174	movq	ARG12, %r14  // n1
14175	cmpl	$1, %r14d
14176	jg		100f
14177
14178#if MACRO_LEVEL>=2
14179	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
14180#else
14181	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
14182#endif
14183
14184	jmp		103f
14185
14186100:
14187
14188	movq	ARG12, %r14  // n1
14189	cmpl	$2, %r14d
14190	jg		101f
14191
14192#if MACRO_LEVEL>=2
14193	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
14194#else
14195	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
14196#endif
14197
14198	jmp		103f
14199
14200101:
14201
14202	movq	ARG12, %r14  // n1
14203	cmpl	$3, %r14d
14204	jg		102f
14205
14206#if MACRO_LEVEL>=2
14207	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
14208#else
14209	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
14210#endif
14211
14212	jmp		103f
14213
14214102:
14215
14216#if MACRO_LEVEL>=2
14217	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14218#else
14219	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14220#endif
14221
14222103:
14223
14224
14225	// call inner blender_loader nn
14226
14227	movq	ARG5, %r10 // beta
14228	movq	ARG6, %r11 // C
14229
14230#if MACRO_LEVEL>=1
14231	INNER_SCALE_M1B_4X4_LIB4
14232#else
14233	CALL(inner_scale_m1b_4x4_lib4)
14234#endif
14235
14236
14237	// solve
14238
14239	movq	ARG8, %r10  // E
14240	movq	ARG9, %r11 // lde
14241	sall	$3, %r11d
14242	movq	ARG10, %r12  // inv_diag_E
14243	movq	ARG12, %r13  // n1
14244
14245#if MACRO_LEVEL>=1
14246	INNER_EDGE_DTRSM_RUN_INV_4X4_VS_LIB
14247#else
14248	CALL(inner_edge_dtrsm_run_inv_4x4_vs_lib)
14249#endif
14250
14251
14252	// store
14253
14254	movq	ARG7, %r10 // D
14255	movq	ARG11, %r11  // m1
14256	movq	ARG12, %r12  // n1
14257
14258#if MACRO_LEVEL>=1
14259	INNER_STORE_4X4_VS_LIB4
14260#else
14261	CALL(inner_store_4x4_vs_lib4)
14262#endif
14263
14264
14265	EPILOGUE
14266
14267	ret
14268
14269	FUN_END(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c)
14270
14271
14272
14273
14274
14275//                                         1      2          3          4        5             6          7        8          9        10         11       12
14276// void kernel_dtrsm_nn_ru_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E);
14277
14278	.p2align 4,,15
14279	GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_lib4cccc)
14280
14281	PROLOGUE
14282
14283	// zero accumulation registers
14284
14285	ZERO_ACC
14286
14287
14288	// call inner dgemm kernel nt
14289
14290	movq	ARG1, %r10 // kmax
14291	movq	ARG2, %r11 // A
14292	movq	ARG3, %r12 // B
14293	movq	ARG4, %r13 // ldb
14294	sall	$3, %r13d
14295
14296#if MACRO_LEVEL>=2
14297	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14298#else
14299	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14300#endif
14301
14302
14303	// call inner blender_loader nn
14304
14305	movq	ARG5, %r10 // beta
14306	movq	ARG6, %r11 // C
14307	movq	ARG7, %r12 // ldc
14308	sall	$3, %r12d
14309
14310#if MACRO_LEVEL>=1
14311	INNER_SCALE_M1B_4X4_LIB
14312#else
14313	CALL(inner_scale_m1b_4x4_lib)
14314#endif
14315
14316
14317	// solve
14318
14319	movq	ARG10, %r10  // E
14320	movq	ARG11, %r11 // lde
14321	sall	$3, %r11d
14322	movq	ARG12, %r12  // inv_diag_E
14323
14324#if MACRO_LEVEL>=1
14325	INNER_EDGE_DTRSM_RUN_INV_4X4_LIB
14326#else
14327	CALL(inner_edge_dtrsm_run_inv_4x4_lib)
14328#endif
14329
14330
14331	// store
14332
14333	movq	ARG8, %r10 // D
14334	movq	ARG9, %r11 // ldd
14335	sall	$3, %r11d
14336
14337#if MACRO_LEVEL>=1
14338	INNER_STORE_4X4_LIB
14339#else
14340	CALL(inner_store_4x4_lib)
14341#endif
14342
14343
14344	EPILOGUE
14345
14346	ret
14347
14348	FUN_END(kernel_dtrsm_nn_ru_inv_4x4_lib4cccc)
14349
14350
14351
14352
14353
14354//                                             1      2          3          4        5             6          7        8          9        10         11       12                  13      14
14355// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1);
14356
14357	.p2align 4,,15
14358	GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc)
14359
14360	PROLOGUE
14361
14362	// zero accumulation registers
14363
14364	ZERO_ACC
14365
14366
14367	// call inner dgemm kernel nt
14368
14369	movq	ARG1, %r10 // kmax
14370	movq	ARG2, %r11 // A
14371	movq	ARG3, %r12 // B
14372	movq	ARG4, %r13 // ldb
14373	sall	$3, %r13d
14374
14375	movq	ARG14, %r14  // n1
14376	cmpl	$1, %r14d
14377	jg		100f
14378
14379#if MACRO_LEVEL>=2
14380	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
14381#else
14382	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
14383#endif
14384
14385	jmp		103f
14386
14387100:
14388
14389	movq	ARG14, %r14  // n1
14390	cmpl	$2, %r14d
14391	jg		101f
14392
14393#if MACRO_LEVEL>=2
14394	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
14395#else
14396	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
14397#endif
14398
14399	jmp		103f
14400
14401101:
14402
14403	movq	ARG14, %r14  // n1
14404	cmpl	$3, %r14d
14405	jg		102f
14406
14407#if MACRO_LEVEL>=2
14408	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
14409#else
14410	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
14411#endif
14412
14413	jmp		103f
14414
14415102:
14416
14417#if MACRO_LEVEL>=2
14418	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14419#else
14420	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14421#endif
14422
14423103:
14424
14425
14426	// call inner blender_loader nn
14427
14428	movq	ARG5, %r10 // beta
14429	movq	ARG6, %r11 // C
14430	movq	ARG7, %r12 // ldc
14431	sall	$3, %r12d
14432	movq	ARG13, %r13 // m1
14433	movq	ARG14, %r14 // n1
14434
14435#if MACRO_LEVEL>=1
14436	INNER_SCALE_M1B_4X4_VS_LIB
14437#else
14438	CALL(inner_scale_m1b_4x4_vs_lib)
14439#endif
14440
14441
14442	// solve
14443
14444	movq	ARG10, %r10  // E
14445	movq	ARG11, %r11 // lde
14446	sall	$3, %r11d
14447	movq	ARG12, %r12  // inv_diag_E
14448	movq	ARG14, %r13 // n1
14449
14450#if MACRO_LEVEL>=1
14451	INNER_EDGE_DTRSM_RUN_INV_4X4_VS_LIB
14452#else
14453	CALL(inner_edge_dtrsm_run_inv_4x4_vs_lib)
14454#endif
14455
14456
14457	// store
14458
14459	movq	ARG8, %r10 // D
14460	movq	ARG9, %r11 // ldd
14461	sall	$3, %r11d
14462	movq	ARG13, %r12 // m1
14463	movq	ARG14, %r13 // n1
14464
14465#if MACRO_LEVEL>=1
14466	INNER_STORE_4X4_VS_LIB
14467#else
14468	CALL(inner_store_4x4_vs_lib)
14469#endif
14470
14471
14472	EPILOGUE
14473
14474	ret
14475
14476	FUN_END(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc)
14477
14478
14479
14480
14481
14482//                                         1      2          3          4        5             6          7          8          9
14483// void kernel_dtrsm_nn_ru_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde);
14484
14485	.p2align 4,,15
14486	GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_lib4c44c)
14487
14488	PROLOGUE
14489
14490	// zero accumulation registers
14491
14492	ZERO_ACC
14493
14494
14495	// call inner dgemm kernel nt
14496
14497	movq	ARG1, %r10 // kmax
14498	movq	ARG2, %r11 // A
14499	movq	ARG3, %r12 // B
14500	movq	ARG4, %r13 // ldb
14501	sall	$3, %r13d
14502
14503#if MACRO_LEVEL>=2
14504	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14505#else
14506	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14507#endif
14508
14509
14510	// call inner blender_loader nn
14511
14512	movq	ARG5, %r10 // beta
14513	movq	ARG6, %r11 // C
14514
14515#if MACRO_LEVEL>=1
14516	INNER_SCALE_M1B_4X4_LIB4
14517#else
14518	CALL(inner_scale_m1b_4x4_lib4)
14519#endif
14520
14521
14522	// solve
14523
14524	movq	ARG8, %r10  // E
14525	movq	ARG9, %r11 // lde
14526	sall	$3, %r11d
14527
14528#if MACRO_LEVEL>=1
14529	INNER_EDGE_DTRSM_RUN_ONE_4X4_LIB
14530#else
14531	CALL(inner_edge_dtrsm_run_one_4x4_lib)
14532#endif
14533
14534
14535	// store
14536
14537	movq	ARG7, %r10 // D
14538
14539#if MACRO_LEVEL>=1
14540	INNER_STORE_4X4_LIB4
14541#else
14542	CALL(inner_store_4x4_lib4)
14543#endif
14544
14545
14546	EPILOGUE
14547
14548	ret
14549
14550	FUN_END(kernel_dtrsm_nn_ru_one_4x4_lib4c44c)
14551
14552
14553
14554
14555
14556//                                            1      2          3          4        5             6          7          8          9        10      11
14557// void kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1);
14558
14559	.p2align 4,,15
14560	GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c)
14561
14562	PROLOGUE
14563
14564	// zero accumulation registers
14565
14566	ZERO_ACC
14567
14568
14569	// call inner dgemm kernel nt
14570
14571	movq	ARG1, %r10 // kmax
14572	movq	ARG2, %r11 // A
14573	movq	ARG3, %r12 // B
14574	movq	ARG4, %r13 // ldb
14575	sall	$3, %r13d
14576
14577	movq	ARG11, %r14  // n1
14578	cmpl	$1, %r14d
14579	jg		100f
14580
14581#if MACRO_LEVEL>=2
14582	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
14583#else
14584	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
14585#endif
14586
14587	jmp		103f
14588
14589100:
14590
14591	movq	ARG11, %r14  // n1
14592	cmpl	$2, %r14d
14593	jg		101f
14594
14595#if MACRO_LEVEL>=2
14596	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
14597#else
14598	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
14599#endif
14600
14601	jmp		103f
14602
14603101:
14604
14605	movq	ARG11, %r14  // n1
14606	cmpl	$3, %r14d
14607	jg		102f
14608
14609#if MACRO_LEVEL>=2
14610	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
14611#else
14612	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
14613#endif
14614
14615	jmp		103f
14616
14617102:
14618
14619#if MACRO_LEVEL>=2
14620	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14621#else
14622	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14623#endif
14624
14625103:
14626
14627
14628	// call inner blender_loader nn
14629
14630	movq	ARG5, %r10 // beta
14631	movq	ARG6, %r11 // C
14632
14633#if MACRO_LEVEL>=1
14634	INNER_SCALE_M1B_4X4_LIB4
14635#else
14636	CALL(inner_scale_m1b_4x4_lib4)
14637#endif
14638
14639
14640	// solve
14641
14642	movq	ARG8, %r10  // E
14643	movq	ARG9, %r11 // lde
14644	sall	$3, %r11d
14645	movq	ARG11, %r12  // n1
14646
14647#if MACRO_LEVEL>=1
14648	INNER_EDGE_DTRSM_RUN_ONE_4X4_VS_LIB
14649#else
14650	CALL(inner_edge_dtrsm_run_one_4x4_vs_lib)
14651#endif
14652
14653
14654	// store
14655
14656	movq	ARG7, %r10 // D
14657	movq	ARG10, %r11  // m1
14658	movq	ARG11, %r12  // n1
14659
14660#if MACRO_LEVEL>=1
14661	INNER_STORE_4X4_VS_LIB4
14662#else
14663	CALL(inner_store_4x4_vs_lib4)
14664#endif
14665
14666
14667	EPILOGUE
14668
14669	ret
14670
14671	FUN_END(kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c)
14672
14673
14674
14675
14676
14677//                                         1      2          3          4        5             6          7        8          9        10         11
14678// void kernel_dtrsm_nn_ru_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde);
14679
14680	.p2align 4,,15
14681	GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_lib4cccc)
14682
14683	PROLOGUE
14684
14685	// zero accumulation registers
14686
14687	ZERO_ACC
14688
14689
14690	// call inner dgemm kernel nt
14691
14692	movq	ARG1, %r10 // kmax
14693	movq	ARG2, %r11 // A
14694	movq	ARG3, %r12 // B
14695	movq	ARG4, %r13 // ldb
14696	sall	$3, %r13d
14697
14698#if MACRO_LEVEL>=2
14699	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14700#else
14701	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14702#endif
14703
14704
14705	// call inner blender_loader nn
14706
14707	movq	ARG5, %r10 // beta
14708	movq	ARG6, %r11 // C
14709	movq	ARG7, %r12 // ldc
14710	sall	$3, %r12d
14711
14712#if MACRO_LEVEL>=1
14713	INNER_SCALE_M1B_4X4_LIB
14714#else
14715	CALL(inner_scale_m1b_4x4_lib)
14716#endif
14717
14718
14719	// solve
14720
14721	movq	ARG10, %r10  // E
14722	movq	ARG11, %r11 // lde
14723	sall	$3, %r11d
14724
14725#if MACRO_LEVEL>=1
14726	INNER_EDGE_DTRSM_RUN_ONE_4X4_LIB
14727#else
14728	CALL(inner_edge_dtrsm_run_one_4x4_lib)
14729#endif
14730
14731
14732	// store
14733
14734	movq	ARG8, %r10 // D
14735	movq	ARG9, %r11 // ldd
14736	sall	$3, %r11d
14737
14738#if MACRO_LEVEL>=1
14739	INNER_STORE_4X4_LIB
14740#else
14741	CALL(inner_store_4x4_lib)
14742#endif
14743
14744
14745	EPILOGUE
14746
14747	ret
14748
14749	FUN_END(kernel_dtrsm_nn_ru_one_4x4_lib4cccc)
14750
14751
14752
14753
14754
14755//                                             1      2          3          4        5             6          7        8          9        10         11       12      13
14756// void kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1);
14757
14758	.p2align 4,,15
14759	GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc)
14760
14761	PROLOGUE
14762
14763	// zero accumulation registers
14764
14765	ZERO_ACC
14766
14767
14768	// call inner dgemm kernel nt
14769
14770	movq	ARG1, %r10 // kmax
14771	movq	ARG2, %r11 // A
14772	movq	ARG3, %r12 // B
14773	movq	ARG4, %r13 // ldb
14774	sall	$3, %r13d
14775
14776	movq	ARG13, %r14  // n1
14777	cmpl	$1, %r14d
14778	jg		100f
14779
14780#if MACRO_LEVEL>=2
14781	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
14782#else
14783	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
14784#endif
14785
14786	jmp		103f
14787
14788100:
14789
14790	movq	ARG13, %r14  // n1
14791	cmpl	$2, %r14d
14792	jg		101f
14793
14794#if MACRO_LEVEL>=2
14795	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
14796#else
14797	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
14798#endif
14799
14800	jmp		103f
14801
14802101:
14803
14804	movq	ARG13, %r14  // n1
14805	cmpl	$3, %r14d
14806	jg		102f
14807
14808#if MACRO_LEVEL>=2
14809	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
14810#else
14811	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
14812#endif
14813
14814	jmp		103f
14815
14816102:
14817
14818#if MACRO_LEVEL>=2
14819	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
14820#else
14821	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
14822#endif
14823
14824103:
14825
14826
14827	// call inner blender_loader nn
14828
14829	movq	ARG5, %r10 // beta
14830	movq	ARG6, %r11 // C
14831	movq	ARG7, %r12 // ldc
14832	sall	$3, %r12d
14833	movq	ARG12, %r13 // m1
14834	movq	ARG13, %r14 // n1
14835
14836#if MACRO_LEVEL>=1
14837	INNER_SCALE_M1B_4X4_VS_LIB
14838#else
14839	CALL(inner_scale_m1b_4x4_vs_lib)
14840#endif
14841
14842
14843	// solve
14844
14845	movq	ARG10, %r10  // E
14846	movq	ARG11, %r11 // lde
14847	sall	$3, %r11d
14848	movq	ARG13, %r12 // n1
14849
14850#if MACRO_LEVEL>=1
14851	INNER_EDGE_DTRSM_RUN_ONE_4X4_VS_LIB
14852#else
14853	CALL(inner_edge_dtrsm_run_one_4x4_vs_lib)
14854#endif
14855
14856
14857	// store
14858
14859	movq	ARG8, %r10 // D
14860	movq	ARG9, %r11 // ldd
14861	sall	$3, %r11d
14862	movq	ARG12, %r12 // m1
14863	movq	ARG13, %r13 // n1
14864
14865#if MACRO_LEVEL>=1
14866	INNER_STORE_4X4_VS_LIB
14867#else
14868	CALL(inner_store_4x4_vs_lib)
14869#endif
14870
14871
14872	EPILOGUE
14873
14874	ret
14875
14876	FUN_END(kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc)
14877
14878
14879
14880
14881
14882//                                         1      2          3          4             5          6        7          8        9          10
14883// void kernel_dtrsm_nt_ru_inv_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E);
14884
14885	.p2align 4,,15
14886	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_lib44cc4)
14887
14888	PROLOGUE
14889
14890	// zero accumulation registers
14891
14892	ZERO_ACC
14893
14894
14895	// call inner dgemm kernel nt
14896
14897	movq	ARG1, %r10 // kmax
14898	movq	ARG2, %r11 // A
14899	movq	ARG3, %r12 // B
14900
14901#if MACRO_LEVEL>=2
14902	INNER_KERNEL_DGEMM_NT_4X4_LIB4
14903#else
14904	CALL(inner_kernel_dgemm_nt_4x4_lib4)
14905#endif
14906
14907
14908	// call inner blender_loader nn
14909
14910	movq	ARG4, %r10 // beta
14911	movq	ARG5, %r11 // C
14912	movq	ARG6, %r12 // ldc
14913	sall	$3, %r12d
14914
14915#if MACRO_LEVEL>=1
14916	INNER_BLEND_SCALE_M1B_4X4_LIB
14917#else
14918	CALL(inner_blend_scale_m1b_4x4_lib)
14919#endif
14920
14921
14922	// solve
14923
14924	movq	ARG9, %r10  // E
14925	movq	ARG10, %r11 // inv_diag_E
14926
14927#if MACRO_LEVEL>=1
14928	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
14929#else
14930	CALL(inner_edge_dtrsm_rut_inv_4x4_lib4)
14931#endif
14932
14933
14934	// store
14935
14936	movq	ARG7, %r10 // D
14937	movq	ARG8, %r11 // ldd
14938	sall	$3, %r11d
14939
14940#if MACRO_LEVEL>=1
14941	INNER_STORE_4X4_LIB
14942#else
14943	CALL(inner_store_4x4_lib)
14944#endif
14945
14946
14947	EPILOGUE
14948
14949	ret
14950
14951	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_lib44cc4)
14952
14953
14954
14955
14956
14957//                                             1      2          3          4             5          6        7          8        9          10                  11      12
14958// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E, int m1, in1 n1);
14959
14960	.p2align 4,,15
14961	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4)
14962
14963	PROLOGUE
14964
14965	// zero accumulation registers
14966
14967	ZERO_ACC
14968
14969
14970	// call inner dgemm kernel nt
14971
14972	movq	ARG1, %r10 // kmax
14973	movq	ARG2, %r11 // A
14974	movq	ARG3, %r12 // B
14975
14976#if MACRO_LEVEL>=2
14977	INNER_KERNEL_DGEMM_NT_4X4_LIB4
14978#else
14979	CALL(inner_kernel_dgemm_nt_4x4_lib4)
14980#endif
14981
14982
14983	// call inner blender_loader nn
14984
14985	movq	ARG4, %r10 // beta
14986	movq	ARG5, %r11 // C
14987	movq	ARG6, %r12 // ldc
14988	sall	$3, %r12d
14989	movq	ARG11, %r13 // m1
14990	movq	ARG12, %r14 // n1
14991
14992#if MACRO_LEVEL>=1
14993	INNER_BLEND_SCALE_M1B_4X4_VS_LIB
14994#else
14995	CALL(inner_blend_scale_m1b_4x4_vs_lib)
14996#endif
14997
14998
14999	// solve
15000
15001	movq	ARG9, %r10  // E
15002	movq	ARG10, %r11 // inv_diag_E
15003	movq	ARG12, %r12 // n1
15004
15005#if MACRO_LEVEL>=1
15006	INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4
15007#else
15008	CALL(inner_edge_dtrsm_rut_inv_4x4_vs_lib4)
15009#endif
15010
15011
15012	// store
15013
15014	movq	ARG7, %r10 // D
15015	movq	ARG8, %r11 // ldd
15016	sall	$3, %r11d
15017	movq	ARG11, %r12 // m1
15018	movq	ARG12, %r13 // n1
15019
15020#if MACRO_LEVEL>=1
15021	INNER_STORE_4X4_VS_LIB
15022#else
15023	CALL(inner_store_4x4_vs_lib)
15024#endif
15025
15026
15027	EPILOGUE
15028
15029	ret
15030
15031	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4)
15032
15033
15034
15035
15036
15037//                                         1      2          3          4        5             6          7          8          9        10
15038// void kernel_dtrsm_nt_ru_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E);
15039
15040	.p2align 4,,15
15041	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_lib4c44c)
15042
15043	PROLOGUE
15044
15045	// zero accumulation registers
15046
15047	ZERO_ACC
15048
15049
15050	// call inner dgemm kernel nt
15051
15052	movq	ARG1, %r10 // kmax
15053	movq	ARG2, %r11 // A
15054	movq	ARG3, %r12 // B
15055	movq	ARG4, %r13 // ldb
15056	sall	$3, %r13d
15057
15058#if MACRO_LEVEL>=2
15059	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15060#else
15061	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15062#endif
15063
15064
15065	// call inner blender_loader nn
15066
15067	movq	ARG5, %r10 // beta
15068	movq	ARG6, %r11 // C
15069
15070#if MACRO_LEVEL>=1
15071	INNER_SCALE_M1B_4X4_LIB4
15072#else
15073	CALL(inner_scale_m1b_4x4_lib4)
15074#endif
15075
15076
15077	// solve
15078
15079	movq	ARG8, %r10  // E
15080	movq	ARG9, %r11 // lde
15081	sall	$3, %r11d
15082	movq	ARG10, %r12  // inv_diag_E
15083
15084#if MACRO_LEVEL>=1
15085	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB
15086#else
15087	CALL(inner_edge_dtrsm_rut_inv_4x4_lib)
15088#endif
15089
15090
15091	// store
15092
15093	movq	ARG7, %r10 // D
15094
15095#if MACRO_LEVEL>=1
15096	INNER_STORE_4X4_LIB4
15097#else
15098	CALL(inner_store_4x4_lib4)
15099#endif
15100
15101
15102	EPILOGUE
15103
15104	ret
15105
15106	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_lib4c44c)
15107
15108
15109
15110
15111
15112//                                            1      2          3          4        5             6          7          8          9        10                  11      12
15113// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1);
15114
15115	.p2align 4,,15
15116	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c)
15117
15118	PROLOGUE
15119
15120	// zero accumulation registers
15121
15122	ZERO_ACC
15123
15124
15125	// call inner dgemm kernel nt
15126
15127	movq	ARG1, %r10 // kmax
15128	movq	ARG2, %r11 // A
15129	movq	ARG3, %r12 // B
15130	movq	ARG4, %r13 // ldb
15131	sall	$3, %r13d
15132
15133	movq	ARG12, %r14  // n1
15134	cmpl	$1, %r14d
15135	jg		100f
15136
15137#if MACRO_LEVEL>=2
15138	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
15139#else
15140	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
15141#endif
15142
15143	jmp		103f
15144
15145100:
15146
15147	movq	ARG12, %r14  // n1
15148	cmpl	$2, %r14d
15149	jg		101f
15150
15151#if MACRO_LEVEL>=2
15152	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
15153#else
15154	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
15155#endif
15156
15157	jmp		103f
15158
15159101:
15160
15161	movq	ARG12, %r14  // n1
15162	cmpl	$3, %r14d
15163	jg		102f
15164
15165#if MACRO_LEVEL>=2
15166	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
15167#else
15168	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
15169#endif
15170
15171	jmp		103f
15172
15173102:
15174
15175#if MACRO_LEVEL>=2
15176	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15177#else
15178	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15179#endif
15180
15181103:
15182
15183
15184	// call inner blender_loader nn
15185
15186	movq	ARG5, %r10 // beta
15187	movq	ARG6, %r11 // C
15188
15189#if MACRO_LEVEL>=1
15190	INNER_SCALE_M1B_4X4_LIB4
15191#else
15192	CALL(inner_scale_m1b_4x4_lib4)
15193#endif
15194
15195
15196	// solve
15197
15198	movq	ARG8, %r10  // E
15199	movq	ARG9, %r11 // lde
15200	sall	$3, %r11d
15201	movq	ARG10, %r12  // inv_diag_E
15202	movq	ARG12, %r13  // n1
15203
15204#if MACRO_LEVEL>=1
15205	INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB
15206#else
15207	CALL(inner_edge_dtrsm_rut_inv_4x4_vs_lib)
15208#endif
15209
15210
15211	// store
15212
15213	movq	ARG7, %r10 // D
15214	movq	ARG11, %r11  // m1
15215	movq	ARG12, %r12  // n1
15216
15217#if MACRO_LEVEL>=1
15218	INNER_STORE_4X4_VS_LIB4
15219#else
15220	CALL(inner_store_4x4_vs_lib4)
15221#endif
15222
15223
15224	EPILOGUE
15225
15226	ret
15227
15228	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c)
15229
15230
15231
15232
15233//                                         1      2          3          4        5             6          7        8          9        10         11       12
15234// void kernel_dtrsm_nt_ru_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E);
15235
15236	.p2align 4,,15
15237	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_lib4cccc)
15238
15239	PROLOGUE
15240
15241	// zero accumulation registers
15242
15243	ZERO_ACC
15244
15245
15246	// call inner dgemm kernel nt
15247
15248	movq	ARG1, %r10 // kmax
15249	movq	ARG2, %r11 // A
15250	movq	ARG3, %r12 // B
15251	movq	ARG4, %r13 // ldb
15252	sall	$3, %r13d
15253
15254#if MACRO_LEVEL>=2
15255	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15256#else
15257	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15258#endif
15259
15260
15261	// call inner blender_loader nn
15262
15263	movq	ARG5, %r10 // beta
15264	movq	ARG6, %r11 // C
15265	movq	ARG7, %r12 // ldc
15266	sall	$3, %r12d
15267
15268#if MACRO_LEVEL>=1
15269	INNER_SCALE_M1B_4X4_LIB
15270#else
15271	CALL(inner_scale_m1b_4x4_lib)
15272#endif
15273
15274
15275	// solve
15276
15277	movq	ARG10, %r10  // E
15278	movq	ARG11, %r11 // lde
15279	sall	$3, %r11d
15280	movq	ARG12, %r12  // inv_diag_E
15281
15282#if MACRO_LEVEL>=1
15283	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB
15284#else
15285	CALL(inner_edge_dtrsm_rut_inv_4x4_lib)
15286#endif
15287
15288
15289	// store
15290
15291	movq	ARG8, %r10 // D
15292	movq	ARG9, %r11 // ldd
15293	sall	$3, %r11d
15294
15295#if MACRO_LEVEL>=1
15296	INNER_STORE_4X4_LIB
15297#else
15298	CALL(inner_store_4x4_lib)
15299#endif
15300
15301
15302	EPILOGUE
15303
15304	ret
15305
15306	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_lib4cccc)
15307
15308
15309
15310
15311
15312//                                             1      2          3          4        5             6          7        8          9        10         11       12                  13      14
15313// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1);
15314
15315	.p2align 4,,15
15316	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc)
15317
15318	PROLOGUE
15319
15320	// zero accumulation registers
15321
15322	ZERO_ACC
15323
15324
15325	// call inner dgemm kernel nt
15326
15327	movq	ARG1, %r10 // kmax
15328	movq	ARG2, %r11 // A
15329	movq	ARG3, %r12 // B
15330	movq	ARG4, %r13 // ldb
15331	sall	$3, %r13d
15332
15333	movq	ARG14, %r14  // n1
15334	cmpl	$1, %r14d
15335	jg		100f
15336
15337#if MACRO_LEVEL>=2
15338	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
15339#else
15340	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
15341#endif
15342
15343	jmp		103f
15344
15345100:
15346
15347	movq	ARG14, %r14  // n1
15348	cmpl	$2, %r14d
15349	jg		101f
15350
15351#if MACRO_LEVEL>=2
15352	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
15353#else
15354	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
15355#endif
15356
15357	jmp		103f
15358
15359101:
15360
15361	movq	ARG14, %r14  // n1
15362	cmpl	$3, %r14d
15363	jg		102f
15364
15365#if MACRO_LEVEL>=2
15366	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
15367#else
15368	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
15369#endif
15370
15371	jmp		103f
15372
15373102:
15374
15375#if MACRO_LEVEL>=2
15376	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15377#else
15378	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15379#endif
15380
15381103:
15382
15383
15384	// call inner blender_loader nn
15385
15386	movq	ARG5, %r10 // beta
15387	movq	ARG6, %r11 // C
15388	movq	ARG7, %r12 // ldc
15389	sall	$3, %r12d
15390	movq	ARG13, %r13 // m1
15391	movq	ARG14, %r14 // n1
15392
15393#if MACRO_LEVEL>=1
15394	INNER_SCALE_M1B_4X4_VS_LIB
15395#else
15396	CALL(inner_scale_m1b_4x4_vs_lib)
15397#endif
15398
15399
15400	// solve
15401
15402	movq	ARG10, %r10  // E
15403	movq	ARG11, %r11 // lde
15404	sall	$3, %r11d
15405	movq	ARG12, %r12  // inv_diag_E
15406	movq	ARG14, %r13 // n1
15407
15408#if MACRO_LEVEL>=1
15409	INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB
15410#else
15411	CALL(inner_edge_dtrsm_rut_inv_4x4_vs_lib)
15412#endif
15413
15414
15415	// store
15416
15417	movq	ARG8, %r10 // D
15418	movq	ARG9, %r11 // ldd
15419	sall	$3, %r11d
15420	movq	ARG13, %r12 // m1
15421	movq	ARG14, %r13 // n1
15422
15423#if MACRO_LEVEL>=1
15424	INNER_STORE_4X4_VS_LIB
15425#else
15426	CALL(inner_store_4x4_vs_lib)
15427#endif
15428
15429
15430	EPILOGUE
15431
15432	ret
15433
15434	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc)
15435
15436
15437
15438
15439
15440//                                         1      2          3          4             5          6        7          8        9
15441// void kernel_dtrsm_nt_ru_one_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E);
15442
15443	.p2align 4,,15
15444	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_lib44cc4)
15445
15446	PROLOGUE
15447
15448	// zero accumulation registers
15449
15450	ZERO_ACC
15451
15452
15453	// call inner dgemm kernel nt
15454
15455	movq	ARG1, %r10 // kmax
15456	movq	ARG2, %r11 // A
15457	movq	ARG3, %r12 // B
15458
15459#if MACRO_LEVEL>=2
15460	INNER_KERNEL_DGEMM_NT_4X4_LIB4
15461#else
15462	CALL(inner_kernel_dgemm_nt_4x4_lib4)
15463#endif
15464
15465
15466	// call inner blender_loader nn
15467
15468	movq	ARG4, %r10 // beta
15469	movq	ARG5, %r11 // C
15470	movq	ARG6, %r12 // ldc
15471	sall	$3, %r12d
15472
15473#if MACRO_LEVEL>=1
15474	INNER_BLEND_SCALE_M1B_4X4_LIB
15475#else
15476	CALL(inner_blend_scale_m1b_4x4_lib)
15477#endif
15478
15479
15480	// solve
15481
15482	movq	ARG9, %r10  // E
15483
15484#if MACRO_LEVEL>=1
15485	INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB4
15486#else
15487	CALL(inner_edge_dtrsm_rut_one_4x4_lib4)
15488#endif
15489
15490
15491	// store
15492
15493	movq	ARG7, %r10 // D
15494	movq	ARG8, %r11 // ldd
15495	sall	$3, %r11d
15496
15497#if MACRO_LEVEL>=1
15498	INNER_STORE_4X4_LIB
15499#else
15500	CALL(inner_store_4x4_lib)
15501#endif
15502
15503
15504	EPILOGUE
15505
15506	ret
15507
15508	FUN_END(kernel_dtrsm_nt_ru_one_4x4_lib44cc4)
15509
15510
15511
15512
15513
15514//                                             1      2          3          4             5          6        7          8        9          10      11
15515// void kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, in1 n1);
15516
15517	.p2align 4,,15
15518	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4)
15519
15520	PROLOGUE
15521
15522	// zero accumulation registers
15523
15524	ZERO_ACC
15525
15526
15527	// call inner dgemm kernel nt
15528
15529	movq	ARG1, %r10 // kmax
15530	movq	ARG2, %r11 // A
15531	movq	ARG3, %r12 // B
15532
15533#if MACRO_LEVEL>=2
15534	INNER_KERNEL_DGEMM_NT_4X4_LIB4
15535#else
15536	CALL(inner_kernel_dgemm_nt_4x4_lib4)
15537#endif
15538
15539
15540	// call inner blender_loader nn
15541
15542	movq	ARG4, %r10 // beta
15543	movq	ARG5, %r11 // C
15544	movq	ARG6, %r12 // ldc
15545	sall	$3, %r12d
15546	movq	ARG10, %r13 // m1
15547	movq	ARG11, %r14 // n1
15548
15549#if MACRO_LEVEL>=1
15550	INNER_BLEND_SCALE_M1B_4X4_VS_LIB
15551#else
15552	CALL(inner_blend_scale_m1b_4x4_vs_lib)
15553#endif
15554
15555
15556	// solve
15557
15558	movq	ARG9, %r10  // E
15559	movq	ARG11, %r11 // n1
15560
15561#if MACRO_LEVEL>=1
15562	INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB4
15563#else
15564	CALL(inner_edge_dtrsm_rut_one_4x4_vs_lib4)
15565#endif
15566
15567
15568	// store
15569
15570	movq	ARG7, %r10 // D
15571	movq	ARG8, %r11 // ldd
15572	sall	$3, %r11d
15573	movq	ARG10, %r12 // m1
15574	movq	ARG11, %r13 // n1
15575
15576#if MACRO_LEVEL>=1
15577	INNER_STORE_4X4_VS_LIB
15578#else
15579	CALL(inner_store_4x4_vs_lib)
15580#endif
15581
15582
15583	EPILOGUE
15584
15585	ret
15586
15587	FUN_END(kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4)
15588
15589
15590
15591
15592
15593//                                         1      2          3          4        5             6          7          8          9
15594// void kernel_dtrsm_nt_ru_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde);
15595
15596	.p2align 4,,15
15597	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_lib4c44c)
15598
15599	PROLOGUE
15600
15601	// zero accumulation registers
15602
15603	ZERO_ACC
15604
15605
15606	// call inner dgemm kernel nt
15607
15608	movq	ARG1, %r10 // kmax
15609	movq	ARG2, %r11 // A
15610	movq	ARG3, %r12 // B
15611	movq	ARG4, %r13 // ldb
15612	sall	$3, %r13d
15613
15614#if MACRO_LEVEL>=2
15615	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15616#else
15617	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15618#endif
15619
15620
15621	// call inner blender_loader nn
15622
15623	movq	ARG5, %r10 // beta
15624	movq	ARG6, %r11 // C
15625
15626#if MACRO_LEVEL>=1
15627	INNER_SCALE_M1B_4X4_LIB4
15628#else
15629	CALL(inner_scale_m1b_4x4_lib4)
15630#endif
15631
15632
15633	// solve
15634
15635	movq	ARG8, %r10  // E
15636	movq	ARG9, %r11 // lde
15637	sall	$3, %r11d
15638
15639#if MACRO_LEVEL>=1
15640	INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB
15641#else
15642	CALL(inner_edge_dtrsm_rut_one_4x4_lib)
15643#endif
15644
15645
15646	// store
15647
15648	movq	ARG7, %r10 // D
15649
15650#if MACRO_LEVEL>=1
15651	INNER_STORE_4X4_LIB4
15652#else
15653	CALL(inner_store_4x4_lib4)
15654#endif
15655
15656
15657	EPILOGUE
15658
15659	ret
15660
15661	FUN_END(kernel_dtrsm_nt_ru_one_4x4_lib4c44c)
15662
15663
15664
15665
15666
15667//                                            1      2          3          4        5             6          7          8          9        10      11
15668// void kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1);
15669
15670	.p2align 4,,15
15671	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c)
15672
15673	PROLOGUE
15674
15675	// zero accumulation registers
15676
15677	ZERO_ACC
15678
15679
15680	// call inner dgemm kernel nt
15681
15682	movq	ARG1, %r10 // kmax
15683	movq	ARG2, %r11 // A
15684	movq	ARG3, %r12 // B
15685	movq	ARG4, %r13 // ldb
15686	sall	$3, %r13d
15687
15688	movq	ARG11, %r14  // n1
15689	cmpl	$1, %r14d
15690	jg		100f
15691
15692#if MACRO_LEVEL>=2
15693	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
15694#else
15695	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
15696#endif
15697
15698	jmp		103f
15699
15700100:
15701
15702	movq	ARG11, %r14  // n1
15703	cmpl	$2, %r14d
15704	jg		101f
15705
15706#if MACRO_LEVEL>=2
15707	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
15708#else
15709	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
15710#endif
15711
15712	jmp		103f
15713
15714101:
15715
15716	movq	ARG11, %r14  // n1
15717	cmpl	$3, %r14d
15718	jg		102f
15719
15720#if MACRO_LEVEL>=2
15721	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
15722#else
15723	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
15724#endif
15725
15726	jmp		103f
15727
15728102:
15729
15730#if MACRO_LEVEL>=2
15731	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15732#else
15733	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15734#endif
15735
15736103:
15737
15738
15739	// call inner blender_loader nn
15740
15741	movq	ARG5, %r10 // beta
15742	movq	ARG6, %r11 // C
15743
15744#if MACRO_LEVEL>=1
15745	INNER_SCALE_M1B_4X4_LIB4
15746#else
15747	CALL(inner_scale_m1b_4x4_lib4)
15748#endif
15749
15750
15751	// solve
15752
15753	movq	ARG8, %r10  // E
15754	movq	ARG9, %r11 // lde
15755	sall	$3, %r11d
15756	movq	ARG11, %r12  // n1
15757
15758#if MACRO_LEVEL>=1
15759	INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB
15760#else
15761	CALL(inner_edge_dtrsm_rut_one_4x4_vs_lib)
15762#endif
15763
15764
15765	// store
15766
15767	movq	ARG7, %r10 // D
15768	movq	ARG10, %r11  // m1
15769	movq	ARG11, %r12  // n1
15770
15771#if MACRO_LEVEL>=1
15772	INNER_STORE_4X4_VS_LIB4
15773#else
15774	CALL(inner_store_4x4_vs_lib4)
15775#endif
15776
15777
15778	EPILOGUE
15779
15780	ret
15781
15782	FUN_END(kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c)
15783
15784
15785
15786
15787
15788//                                         1      2          3          4        5             6          7        8          9        10         11
15789// void kernel_dtrsm_nt_ru_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde);
15790
15791	.p2align 4,,15
15792	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_lib4cccc)
15793
15794	PROLOGUE
15795
15796	// zero accumulation registers
15797
15798	ZERO_ACC
15799
15800
15801	// call inner dgemm kernel nt
15802
15803	movq	ARG1, %r10 // kmax
15804	movq	ARG2, %r11 // A
15805	movq	ARG3, %r12 // B
15806	movq	ARG4, %r13 // ldb
15807	sall	$3, %r13d
15808
15809#if MACRO_LEVEL>=2
15810	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15811#else
15812	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15813#endif
15814
15815
15816	// call inner blender_loader nn
15817
15818	movq	ARG5, %r10 // beta
15819	movq	ARG6, %r11 // C
15820	movq	ARG7, %r12 // ldc
15821	sall	$3, %r12d
15822
15823#if MACRO_LEVEL>=1
15824	INNER_SCALE_M1B_4X4_LIB
15825#else
15826	CALL(inner_scale_m1b_4x4_lib)
15827#endif
15828
15829
15830	// solve
15831
15832	movq	ARG10, %r10  // E
15833	movq	ARG11, %r11 // lde
15834	sall	$3, %r11d
15835
15836#if MACRO_LEVEL>=1
15837	INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB
15838#else
15839	CALL(inner_edge_dtrsm_rut_one_4x4_lib)
15840#endif
15841
15842
15843	// store
15844
15845	movq	ARG8, %r10 // D
15846	movq	ARG9, %r11 // ldd
15847	sall	$3, %r11d
15848
15849#if MACRO_LEVEL>=1
15850	INNER_STORE_4X4_LIB
15851#else
15852	CALL(inner_store_4x4_lib)
15853#endif
15854
15855
15856	EPILOGUE
15857
15858	ret
15859
15860	FUN_END(kernel_dtrsm_nt_ru_one_4x4_lib4cccc)
15861
15862
15863
15864
15865
15866//                                             1      2          3          4        5             6          7        8          9        10         11       12      13
15867// void kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1);
15868
15869	.p2align 4,,15
15870	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc)
15871
15872	PROLOGUE
15873
15874	// zero accumulation registers
15875
15876	ZERO_ACC
15877
15878
15879	// call inner dgemm kernel nt
15880
15881	movq	ARG1, %r10 // kmax
15882	movq	ARG2, %r11 // A
15883	movq	ARG3, %r12 // B
15884	movq	ARG4, %r13 // ldb
15885	sall	$3, %r13d
15886
15887	movq	ARG13, %r14  // n1
15888	cmpl	$1, %r14d
15889	jg		100f
15890
15891#if MACRO_LEVEL>=2
15892	INNER_KERNEL_DGEMM_NT_4X1_LIB4C
15893#else
15894	CALL(inner_kernel_dgemm_nt_4x1_lib4c)
15895#endif
15896
15897	jmp		103f
15898
15899100:
15900
15901	movq	ARG13, %r14  // n1
15902	cmpl	$2, %r14d
15903	jg		101f
15904
15905#if MACRO_LEVEL>=2
15906	INNER_KERNEL_DGEMM_NT_4X2_LIB4C
15907#else
15908	CALL(inner_kernel_dgemm_nt_4x2_lib4c)
15909#endif
15910
15911	jmp		103f
15912
15913101:
15914
15915	movq	ARG13, %r14  // n1
15916	cmpl	$3, %r14d
15917	jg		102f
15918
15919#if MACRO_LEVEL>=2
15920	INNER_KERNEL_DGEMM_NT_4X3_LIB4C
15921#else
15922	CALL(inner_kernel_dgemm_nt_4x3_lib4c)
15923#endif
15924
15925	jmp		103f
15926
15927102:
15928
15929#if MACRO_LEVEL>=2
15930	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
15931#else
15932	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
15933#endif
15934
15935103:
15936
15937
15938	// call inner blender_loader nn
15939
15940	movq	ARG5, %r10 // beta
15941	movq	ARG6, %r11 // C
15942	movq	ARG7, %r12 // ldc
15943	sall	$3, %r12d
15944	movq	ARG12, %r13 // m1
15945	movq	ARG13, %r14 // n1
15946
15947#if MACRO_LEVEL>=1
15948	INNER_SCALE_M1B_4X4_VS_LIB
15949#else
15950	CALL(inner_scale_m1b_4x4_vs_lib)
15951#endif
15952
15953
15954	// solve
15955
15956	movq	ARG10, %r10  // E
15957	movq	ARG11, %r11 // lde
15958	sall	$3, %r11d
15959	movq	ARG13, %r12 // n1
15960
15961#if MACRO_LEVEL>=1
15962	INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB
15963#else
15964	CALL(inner_edge_dtrsm_rut_one_4x4_vs_lib)
15965#endif
15966
15967
15968	// store
15969
15970	movq	ARG8, %r10 // D
15971	movq	ARG9, %r11 // ldd
15972	sall	$3, %r11d
15973	movq	ARG12, %r12 // m1
15974	movq	ARG13, %r13 // n1
15975
15976#if MACRO_LEVEL>=1
15977	INNER_STORE_4X4_VS_LIB
15978#else
15979	CALL(inner_store_4x4_vs_lib)
15980#endif
15981
15982
15983	EPILOGUE
15984
15985	ret
15986
15987	FUN_END(kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc)
15988
15989
15990
15991
15992
15993//                                         1      2          3          4        5             6          7        8          9        10         11
15994// void kernel_dtrsm_nn_ll_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde);
15995
15996	.p2align 4,,15
15997	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)
15998
15999	PROLOGUE
16000
16001	// zero accumulation registers
16002
16003	ZERO_ACC
16004
16005
16006	// call inner dgemm kernel nt
16007
16008	movq	ARG1, %r10 // k
16009	movq	ARG2, %r11 // A
16010	movq	ARG3, %r12 // B
16011	movq	ARG4, %r13 // ldb
16012	sall	$3, %r13d // ldb*sizeof(double)
16013
16014#if MACRO_LEVEL>=2
16015	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
16016#else
16017	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
16018#endif
16019
16020
16021	// call inner blender_loader nn
16022
16023	movq	ARG5, %r10 // beta
16024	movq	ARG6, %r11 // C
16025	movq	ARG7, %r12 // ldc
16026	sall	$3, %r12d
16027
16028#if MACRO_LEVEL>=1
16029	INNER_SCALE_M1B_4X4_LIB
16030#else
16031	CALL(inner_scale_m1b_4x4_lib)
16032#endif
16033
16034
16035	// solve
16036
16037	movq	ARG10, %r10  // E
16038	movq	ARG11, %r11 // lde
16039	sall	$3, %r11d
16040
16041#if MACRO_LEVEL>=1
16042	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB
16043#else
16044	CALL(inner_edge_dtrsm_lln_one_4x4_lib)
16045#endif
16046
16047
16048	// store
16049
16050	movq	ARG8, %r10 // D
16051	movq	ARG9, %r11 // ldd
16052	sall	$3, %r11d
16053
16054#if MACRO_LEVEL>=1
16055	INNER_STORE_4X4_LIB
16056#else
16057	CALL(inner_store_4x4_lib)
16058#endif
16059
16060
16061	EPILOGUE
16062
16063	ret
16064
16065	FUN_END(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)
16066
16067
16068
16069
16070
16071//                                             1      2          3          4        5             6          7        8          9        10         11       12      13
16072// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1);
16073
16074	.p2align 4,,15
16075	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)
16076
16077	PROLOGUE
16078
16079	// zero accumulation registers
16080
16081	ZERO_ACC
16082
16083
16084	// call inner dgemm kernel nt
16085
16086	movq	ARG1, %r10
16087	movq	ARG2, %r11
16088	movq	ARG3, %r12
16089	movq	ARG4, %r13 // ldb
16090	sall	$3, %r13d
16091
16092
16093	movq	ARG14, %r14  // n1
16094	cmpl	$1, %r14d
16095	jg		100f
16096
16097#if MACRO_LEVEL>=2
16098	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
16099#else
16100	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
16101#endif
16102
16103	jmp		103f
16104
16105100:
16106
16107	movq	ARG14, %r14  // n1
16108	cmpl	$2, %r14d
16109	jg		101f
16110
16111#if MACRO_LEVEL>=2
16112	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
16113#else
16114	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
16115#endif
16116
16117	jmp		103f
16118
16119101:
16120
16121	movq	ARG14, %r14  // n1
16122	cmpl	$3, %r14d
16123	jg		102f
16124
16125#if MACRO_LEVEL>=2
16126	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
16127#else
16128	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
16129#endif
16130
16131	jmp		103f
16132
16133102:
16134
16135#if MACRO_LEVEL>=2
16136	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
16137#else
16138	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
16139#endif
16140
16141103:
16142
16143
16144	// call inner blender_loader nn
16145
16146	movq	ARG5, %r10 // beta
16147	movq	ARG6, %r11 // C
16148	movq	ARG7, %r12 // ldc
16149	sall	$3, %r12d
16150	movq	ARG12, %r13 // m1
16151	movq	ARG13, %r14 // n1
16152
16153#if MACRO_LEVEL>=1
16154	INNER_SCALE_M1B_4X4_VS_LIB
16155#else
16156	CALL(inner_scale_m1b_4x4_vs_lib)
16157#endif
16158
16159
16160	// solve
16161
16162	movq	ARG10, %r10  // E
16163	movq	ARG11, %r11 // lde
16164	sall	$3, %r11d
16165
16166#if MACRO_LEVEL>=1
16167	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB
16168#else
16169	CALL(inner_edge_dtrsm_lln_one_4x4_lib)
16170#endif
16171
16172
16173	// store
16174
16175	movq	ARG8, %r10 // D
16176	movq	ARG9, %r11 // ldd
16177	sall	$3, %r11d
16178	movq	ARG12, %r12 // m1
16179	movq	ARG13, %r13 // n1
16180
16181#if MACRO_LEVEL>=1
16182	INNER_STORE_4X4_VS_LIB
16183#else
16184	CALL(inner_store_4x4_vs_lib)
16185#endif
16186
16187
16188	EPILOGUE
16189
16190	ret
16191
16192	FUN_END(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)
16193
16194
16195
16196
16197
16198//                                   1      2          3        4          5          6        7          8        9
16199// void kernel_dgetrf_nn_4x4_lib4ccc(int k, double *A, double *B, int a, double *C, int ldc, double *D, int ldd, double *inv_diag_D);
16200
16201	.p2align 4,,15
16202	GLOB_FUN_START(kernel_dgetrf_nn_4x4_lib4ccc)
16203
16204	PROLOGUE
16205
16206	// zero accumulation registers
16207
16208	ZERO_ACC
16209
16210
16211	// call inner dgemm kernel nt
16212
16213	movq	ARG1, %r10 // k
16214	movq	ARG2, %r11  // A
16215	movq	ARG3, %r12  // B
16216	movq	ARG4, %r13  // ldb
16217	sall	$3, %r13d
16218
16219#if MACRO_LEVEL>=2
16220	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
16221#else
16222	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
16223#endif
16224
16225
16226	// call inner blender_loader nn
16227
16228	movq	ARG5, %r10 // C
16229	movq	ARG6, %r11 // ldc
16230	sall	$3, %r11d
16231
16232#if MACRO_LEVEL>=1
16233	INNER_SCALE_M11_4X4_LIB
16234#else
16235	CALL(inner_scale_m11_4x4_lib)
16236#endif
16237
16238
16239	// factorization
16240
16241	movq	ARG9, %r10  // inv_diag_D
16242
16243#if MACRO_LEVEL>=1
16244	INNER_EDGE_DGETRF_4X4_LIB4
16245#else
16246	CALL(inner_edge_dgetrf_4x4_lib4)
16247#endif
16248
16249
16250	// store
16251
16252	movq	ARG7, %r10 // D
16253	movq	ARG8, %r11 // ldd
16254	sall	$3, %r11d
16255
16256#if MACRO_LEVEL>=1
16257	INNER_STORE_4X4_LIB
16258#else
16259	CALL(inner_store_4x4_lib)
16260#endif
16261
16262
16263	EPILOGUE
16264
16265	ret
16266
16267	FUN_END(kernel_dgetrf_nn_4x4_lib4ccc)
16268
16269
16270
16271
16272
16273//                                   1      2          3        4          5          6        7          8        9                      10      11
16274// void kernel_dgetrf_nn_4x4_vs_lib4ccc(int k, double *A, double *B, int ldb, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1);
16275
16276	.p2align 4,,15
16277	GLOB_FUN_START(kernel_dgetrf_nn_4x4_vs_lib4ccc)
16278
16279	PROLOGUE
16280
16281	// zero accumulation registers
16282
16283	ZERO_ACC
16284
16285
16286	// call inner dgemm kernel nt
16287
16288	movq	ARG1, %r10 // k
16289	movq	ARG2, %r11  // A
16290	movq	ARG3, %r12  // B
16291	movq	ARG4, %r13  // ldb
16292	sall	$3, %r13d
16293
16294	movq	ARG11, %r14  // m1
16295	cmpl	$1, %r14d
16296	jg		100f
16297
16298#if MACRO_LEVEL>=2
16299	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
16300#else
16301	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
16302#endif
16303
16304	jmp		103f
16305
16306100:
16307
16308	movq	ARG11, %r14  // m1
16309	cmpl	$2, %r14d
16310	jg		101f
16311
16312#if MACRO_LEVEL>=2
16313	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
16314#else
16315	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
16316#endif
16317
16318	jmp		103f
16319
16320101:
16321
16322	movq	ARG11, %r14  // m1
16323	cmpl	$3, %r14d
16324	jg		102f
16325
16326#if MACRO_LEVEL>=2
16327	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
16328#else
16329	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
16330#endif
16331
16332	jmp		103f
16333
16334102:
16335
16336#if MACRO_LEVEL>=2
16337	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
16338#else
16339	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
16340#endif
16341
16342103:
16343
16344
16345	// call inner blender_loader nn
16346
16347	movq	ARG5, %r10 // C
16348	movq	ARG6, %r11 // ldc
16349	sall	$3, %r11d
16350	movq	ARG10, %r12 // m1
16351	movq	ARG11, %r13 // n1
16352
16353#if MACRO_LEVEL>=1
16354	INNER_SCALE_M11_4X4_VS_LIB
16355#else
16356	CALL(inner_scale_m11_4x4_vs_lib)
16357#endif
16358
16359
16360	// factorization
16361
16362	movq	ARG9, %r10  // inv_diag_D
16363
16364#if MACRO_LEVEL>=1
16365	INNER_EDGE_DGETRF_4X4_LIB4
16366#else
16367	CALL(inner_edge_dgetrf_4x4_lib4)
16368#endif
16369
16370
16371	// store
16372
16373	movq	ARG7, %r10 // D
16374	movq	ARG8, %r11 // ldd
16375	sall	$3, %r11d
16376	movq	ARG10, %r12 // m1
16377	movq	ARG11, %r13 // n1
16378
16379#if MACRO_LEVEL>=1
16380	INNER_STORE_4X4_VS_LIB
16381#else
16382	CALL(inner_store_4x4_vs_lib)
16383#endif
16384
16385
16386	EPILOGUE
16387
16388	ret
16389
16390	FUN_END(kernel_dgetrf_nn_4x4_vs_lib4ccc)
16391
16392
16393
16394
16395
16396
16397