/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
38
/* Double-complex GEMM/TRMM kernel, x86-64, AT&T syntax (preprocessed by cpp).
 * NOTE(review): stripped the line numbers that had been fused onto the start
 * of every source line; they made the file unassemblable.
 * Register roles and stack-slot layout are defined below; PROLOGUE, SIZE,
 * ZBASE_SHIFT, ARGn, NOBRANCH/BRANCH/ALIGN_* and PADDING come from common.h. */

#define ASSEMBLER
#include "common.h"

/* Incoming integer arguments (SysV order; Windows copies into these). */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define OLD_K	%rdx

/* Problem dimensions, kept in callee-saved registers across the kernel. */
#define M	%r13
#define N	%r14
#define K	%r15

/* Matrix base pointers and leading dimension of C (scaled to bytes below). */
#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%rbp

/* Loop-local pointers: I = row counter, AO/BO = current A/B panel pointers,
 * CO1/CO2 = current C column pointers, BB = B prefetch cursor. */
#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%rbx
#define CO2	%rdx
#define BB	%r12

#define PREA	%r10

#ifndef WINDOWS_ABI

#define STACKSIZE 128

/* Stack-passed arguments (addressed above our STACKSIZE frame). */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

/* Spill slots inside our frame. */
#define ALPHA_R	   48(%rsp)
#define ALPHA_I	   56(%rsp)
#define J	   64(%rsp)
#define OFFSET	   72(%rsp)
#define KK	   80(%rsp)
#define KKK	   88(%rsp)

#else

#define STACKSIZE 512

/* Windows x64: args 5+ live on the caller's stack. */
#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#define ALPHA_R	  224(%rsp)
#define ALPHA_I	  232(%rsp)
#define J	  240(%rsp)
#define OFFSET	  248(%rsp)
#define KK	  256(%rsp)
#define KKK	  264(%rsp)

#endif

#define PREFETCHSIZE  4
#define PREFETCH     prefetcht0

/* ADD1/ADD2 select add vs. subtract for the cross (imaginary) products,
 * depending on the conjugation variant (N/T/R/C of each operand). */
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1	  addpd
#define ADD2	  addpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1	  addpd
#define ADD2	  addpd
#elif  defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1	  addpd
#define ADD2	  addpd
#else
#define ADD1	  addpd
#define ADD2	  subpd
#endif
114
115	PROLOGUE
116	PROFCODE
117
118	subq	$STACKSIZE, %rsp
119
120	movq	%rbx,  0(%rsp)
121	movq	%rbp,  8(%rsp)
122	movq	%r12, 16(%rsp)
123	movq	%r13, 24(%rsp)
124	movq	%r14, 32(%rsp)
125	movq	%r15, 40(%rsp)
126
127#ifdef WINDOWS_ABI
128	movq	%rdi,    48(%rsp)
129	movq	%rsi,    56(%rsp)
130	movups	%xmm6,   64(%rsp)
131	movups	%xmm7,   80(%rsp)
132	movups	%xmm8,   96(%rsp)
133	movups	%xmm9,  112(%rsp)
134	movups	%xmm10, 128(%rsp)
135	movups	%xmm11, 144(%rsp)
136	movups	%xmm12, 160(%rsp)
137	movups	%xmm13, 176(%rsp)
138	movups	%xmm14, 192(%rsp)
139	movups	%xmm15, 208(%rsp)
140
141	movq	ARG1,      OLD_M
142	movq	ARG2,      OLD_N
143	movq	ARG3,      OLD_K
144	movq	OLD_A,     A
145	movq	OLD_B,     B
146	movq	OLD_C,     C
147	movq	OLD_LDC,   LDC
148#ifdef TRMMKERNEL
149	movq	OLD_OFFSET, %r11
150#endif
151	movaps	%xmm3, %xmm0
152	movsd	OLD_ALPHA_I, %xmm1
153#else
154	movq	OLD_LDC,   LDC
155#ifdef TRMMKERNEL
156	movq	OLD_OFFSET, %r11
157#endif
158
159#endif
160
161	movlps	 %xmm0, ALPHA_R
162	movlps	 %xmm1, ALPHA_I
163
164	subq	$-16 * SIZE, A
165	subq	$-16 * SIZE, B
166
167	movq	OLD_M, M
168	movq	OLD_N, N
169	movq	OLD_K, K
170
171	salq	$ZBASE_SHIFT, LDC
172
173#ifdef TRMMKERNEL
174	movq	%r11, OFFSET
175#ifndef LEFT
176	negq	%r11
177#endif
178	movq	%r11, KK
179#endif
180	testq	M, M
181	jle	.L999
182
183	movq	N,  J
184	sarq	$2, J
185	NOBRANCH
186	jle	.L20
187	ALIGN_4
188
.L01:
	/* Head of the N=4 outer loop: one pass computes M rows against four
	 * columns of B.  CO1 covers columns 0-1 (via LDC), CO2 columns 2-3. */
#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
	movq    %rax, KK
#endif

	movq	C, CO1
	leaq	(C, LDC, 2), CO2
	movq	A, AO

	/* BB = end of this 4-wide B panel; prefetched ahead in .L11. */
	movq	K, %rax
	salq	$ZBASE_SHIFT + 2, %rax
	leaq	(B, %rax), BB

	movq	M,  I
	ALIGN_4

.L11:
	prefetcht2	 -16 * SIZE(BB)
	subq		 $-8 * SIZE, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	B, BO

	/* TRMM: skip KK complex elements of A and 4*KK of B. */
	movq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	PADDING
	/* xmm1-4 carry products pending accumulation; clear before first use. */
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3
	xorps	%xmm4, %xmm4

	/* xmm8-15: eight accumulators = (direct, cross) pairs for 4 columns. */
	xorps	%xmm8,  %xmm8
	prefetcht0     1 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	xorps	%xmm10, %xmm10
	prefetcht0     3 * SIZE(CO1, LDC)
	xorps	%xmm11, %xmm11

	movaps	-16 * SIZE(AO), %xmm0

	xorps	%xmm12, %xmm12
	xorps	%xmm13, %xmm13
	prefetcht0     1 * SIZE(CO2)
	xorps	%xmm14, %xmm14
	xorps	%xmm15, %xmm15
	prefetcht0     3 * SIZE(CO2, LDC)

	/* rax = effective trip count (K, or the TRMM-trimmed KKK). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		/* one row of A per pass */
#else
	addq	$4, %rax		/* four columns of B per pass */
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax		/* unrolled 4x in .L12 */
	NOBRANCH
	jle	.L15
	ALIGN_3
265
.L12:
	/* Main inner loop, unrolled 4x over k.  Per k-step and per B column:
	 * xmm1 = b, xmm2 = pshufd(b) with the two doubles swapped, both
	 * multiplied by the A element (xmm0/xmm5); ADD1/ADD2 fold them into
	 * the direct/cross accumulator pair.  Loads are software-pipelined:
	 * each ADD consumes the product started one group earlier. */
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	ADD1	%xmm1, %xmm12
	movaps	-16 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2	/* swap (re,im) halves */
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm14
	movaps	-14 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	ADD1	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	movaps	-14 * SIZE(AO), %xmm5	/* next A element, second k-step */
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	ADD1	%xmm1, %xmm12
	movaps	 -8 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	ADD1	%xmm3, %xmm14
	movaps	 -6 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	ADD1	%xmm1, %xmm8
	movaps	 -4 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	 -2 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	movaps	-12 * SIZE(AO), %xmm0	/* A element, third k-step */
	mulpd	%xmm5, %xmm3
	mulpd	%xmm5, %xmm4

	ADD1	%xmm1, %xmm12
	movaps	  0 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm14
	movaps	  2 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	ADD1	%xmm1, %xmm8
	movaps	  4 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	  6 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	movaps	-10 * SIZE(AO), %xmm5	/* A element, fourth k-step */
	mulpd	%xmm0, %xmm4

	ADD1	%xmm1, %xmm12
	movaps	  8 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	mulpd	%xmm5, %xmm2

	ADD1	%xmm3, %xmm14
	movaps	 10 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	PADDING;
	mulpd	%xmm5, %xmm4

	ADD1	%xmm1, %xmm8
	movaps	 12 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm5, %xmm1
	PADDING;
	mulpd	%xmm5, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	 14 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm5, %xmm3
	movaps	 -8 * SIZE(AO), %xmm0	/* preload A for next iteration */
	mulpd	%xmm5, %xmm4

	/* Advance: 4 k-steps = 16 B values (4 cols) and 4 A values. */
	subq	$-32 * SIZE, BO
	subq	$-8 * SIZE, AO

	subq	$1, %rax
	BRANCH
	jg	.L12
	ALIGN_3
394
.L15:
	/* Load alpha broadcast into xmm6/xmm7 for the write-back stage. */
	movddup	ALPHA_R, %xmm6
	movddup	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L18
	ALIGN_3

.L16:
	/* Remainder loop: one k-step (4 columns of B) per iteration,
	 * same pipelined ADD-then-load pattern as .L12. */
	ADD1	%xmm1, %xmm12
	movaps	-16 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm13
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm14
	movaps	-14 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm15
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	ADD1	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L16
	ALIGN_3
447
.L18:
	/* Flush the last pending products, then reduce the (direct, cross)
	 * accumulator pairs into complex results, scale by alpha, and store. */
	ADD1	%xmm1, %xmm12
	ADD2	%xmm2, %xmm13
	ADD1	%xmm3, %xmm14
	ADD2	%xmm4, %xmm15

	/* Build a sign mask: xmm0 = all-ones, then keep only the qword sign
	 * bits; shufps below selects which lane(s) get negated. */
	pcmpeqb	%xmm0, %xmm0
	psllq	$63,   %xmm0

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
	shufps	$0x40, %xmm0, %xmm0

	xorps	%xmm0, %xmm8
	xorps	%xmm0, %xmm10
	xorps	%xmm0, %xmm12
	xorps	%xmm0, %xmm14
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0x04, %xmm0, %xmm0

	xorps	%xmm0, %xmm9
	xorps	%xmm0, %xmm11
	xorps	%xmm0, %xmm13
	xorps	%xmm0, %xmm15
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	shufps	$0x40, %xmm0, %xmm0

	xorps	%xmm0, %xmm9
	xorps	%xmm0, %xmm11
	xorps	%xmm0, %xmm13
	xorps	%xmm0, %xmm15
#endif

	/* Horizontal add pairs partial sums into (re, im) per column. */
	haddpd	%xmm9,  %xmm8
	haddpd	%xmm11, %xmm10
	haddpd	%xmm13, %xmm12
	haddpd	%xmm15, %xmm14

	/* Complex scale by alpha: result*alpha_r +/- swapped(result)*alpha_i. */
	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

	addsubpd	%xmm9,  %xmm8
	addsubpd	%xmm11, %xmm10
	addsubpd	%xmm13, %xmm12
	addsubpd	%xmm15, %xmm14

	/* Use the aligned-store path only if CO1 is 16-byte aligned. */
	testq	$15, CO1
	NOBRANCH
	jne	.L18x

#ifndef TRMMKERNEL
	/* GEMM: accumulate into existing C values. */
	movaps	(CO1), %xmm0
	movaps	(CO1, LDC), %xmm1
	movaps	(CO2), %xmm2
	movaps	(CO2, LDC), %xmm3

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm10
	addpd	%xmm2, %xmm12
	addpd	%xmm3, %xmm14
#endif

	movaps	%xmm8,  (CO1)
	movaps	%xmm10, (CO1, LDC)
	movaps	%xmm12, (CO2)
	movaps	%xmm14, (CO2, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance AO/BO past the untouched K-KKK tail. */
	movq	K, %rax
	subq	KKK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif

	/* Next row of C (one complex element). */
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L11

#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$4, KK
#endif

	/* Next 4-column panel. */
	leaq	(C, LDC, 4), C
	movq	BO, B

	subq	$1, J
	BRANCH
	jg	.L01
	jmp	.L20
	ALIGN_4
558
.L18x:
	/* Same write-back as the tail of .L18, but with unaligned loads and
	 * stores (movups) because CO1 is not 16-byte aligned. */
#ifndef TRMMKERNEL
	movups	(CO1), %xmm0
	movups	(CO1, LDC), %xmm1
	movups	(CO2), %xmm2
	movups	(CO2, LDC), %xmm3

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm10
	addpd	%xmm2, %xmm12
	addpd	%xmm3, %xmm14
#endif

	movups	%xmm8,  (CO1)
	movups	%xmm10, (CO1, LDC)
	movups	%xmm12, (CO2)
	movups	%xmm14, (CO2, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif

	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L11

#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$4, KK
#endif

	leaq	(C, LDC, 4), C
	movq	BO, B

	subq	$1, J
	BRANCH
	jg	.L01
	ALIGN_4
607
.L20:
	/* N & 2 path: process two remaining columns of B/C. */
	testq	$2, N
	BRANCH
	jle	.L30

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
	movq    %rax, KK
#endif

	movq	C, CO1
	leaq	(C, LDC, 1), CO2
	movq	A, AO

	movq	M,  I
	ALIGN_4

.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	B, BO

	/* TRMM: skip KK elements of A and 2*KK of B. */
	movq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

	/* Clear pending-product regs and the 4 accumulators (2 columns). */
	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3
	xorps	%xmm4, %xmm4

	xorps	%xmm8,  %xmm8
	prefetcht0     1 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	prefetcht0     2 * SIZE(CO2)
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax		/* unrolled 4x in .L22 */
	NOBRANCH
	jle	.L25
	ALIGN_3
672
.L22:
	/* Inner loop for 2 columns, unrolled 4x over k; same pipelined
	 * multiply/swap/accumulate pattern as .L12 with half the columns. */
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	ADD1	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	-14 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-14 * SIZE(AO), %xmm0

	ADD1	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	-10 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-12 * SIZE(AO), %xmm0

	ADD1	%xmm1, %xmm8
	movaps	 -8 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	 -6 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-10 * SIZE(AO), %xmm0

	ADD1	%xmm1, %xmm8
	movaps	 -4 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	 -2 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	 -8 * SIZE(AO), %xmm0

	/* 4 k-steps = 4 A values, 8 B values (2 columns). */
	subq	$-8  * SIZE, AO
	subq	$-16 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L22
	ALIGN_3
747
.L25:
	/* K % 4 remainder for the 2-column path. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	BRANCH
	je	.L28
	ALIGN_3

.L26:
	ADD1	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm3, %xmm10
	movaps	-14 * SIZE(BO), %xmm3
	ADD2	%xmm4, %xmm11
	pshufd	$0x4e, %xmm3, %xmm4
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm4

	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$4 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L26
	ALIGN_3
783
.L28:
	/* Flush pending products, apply conjugation sign mask, reduce,
	 * scale by alpha and write two C elements (one per column). */
	ADD1	%xmm1, %xmm8
	ADD2	%xmm2, %xmm9
	ADD1	%xmm3, %xmm10
	ADD2	%xmm4, %xmm11

	pcmpeqb	%xmm0, %xmm0
	psllq	$63,   %xmm0		/* qword sign bits only */

	movddup	ALPHA_R, %xmm2
	movddup	ALPHA_I, %xmm3

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
	shufps	$0x40, %xmm0, %xmm0

	xorps	%xmm0, %xmm8
	xorps	%xmm0, %xmm10
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0x04, %xmm0, %xmm0

	xorps	%xmm0, %xmm9
	xorps	%xmm0, %xmm11
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	shufps	$0x40, %xmm0, %xmm0

	xorps	%xmm0, %xmm9
	xorps	%xmm0, %xmm11
#endif

	haddpd	%xmm9,  %xmm8
	haddpd	%xmm11, %xmm10

	/* Complex alpha scaling, as in .L18. */
	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11

	mulpd	%xmm2, %xmm8
	mulpd	%xmm3, %xmm9
	mulpd	%xmm2, %xmm10
	mulpd	%xmm3, %xmm11

	addsubpd	%xmm9,  %xmm8
	addsubpd	%xmm11, %xmm10

#ifndef TRMMKERNEL
	/* GEMM: load C with scalar moves (no alignment assumption). */
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm10
#endif

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif

	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
	decq	I
	BRANCH
	jg	.L21

#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif

	leaq	(C, LDC, 2), C
	movq	BO, B
	ALIGN_4
869
.L30:
	/* N & 1 path: one remaining column of B/C. */
	testq	$1, N
	BRANCH
	jle	.L999

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
	movq    %rax, KK
#endif

	movq	C, CO1
	movq	A, AO

	movq	M,  I
	ALIGN_4

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	B, BO

	/* TRMM: skip KK complex elements in both panels. */
	movq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

	/* Two accumulator pairs (xmm8/9 and xmm10/11) so the unrolled loop
	 * can alternate; they are summed after .L32. */
	xorps	%xmm1, %xmm1
	movaps	-16 * SIZE(AO), %xmm0
	xorps	%xmm2, %xmm2

	xorps	%xmm8,  %xmm8
	prefetcht0     2 * SIZE(CO1)
	xorps	%xmm9,  %xmm9
	xorps	%xmm10, %xmm10
	xorps	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax		/* unrolled 4x in .L32 */
	NOBRANCH
	jle	.L35
	ALIGN_3
930
.L32:
	/* Single-column inner loop, unrolled 4x; alternates between the
	 * xmm8/9 and xmm10/11 accumulator pairs to break dependency chains. */
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)

	ADD1	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AO), %xmm0

	ADD1	%xmm1, %xmm10
	movaps	-14 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm11
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AO), %xmm0

	ADD1	%xmm1, %xmm8
	movaps	-12 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AO), %xmm0

	ADD1	%xmm1, %xmm10
	movaps	-10 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm11
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -8 * SIZE(AO), %xmm0

	subq	$-8 * SIZE, AO
	subq	$-8 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L32

	/* Merge the secondary accumulator pair into the primary one. */
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9
	ALIGN_3
976
.L35:
	/* K % 4 remainder for the single-column path. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax
	BRANCH
	je	.L38
	ALIGN_3

.L36:
	ADD1	%xmm1, %xmm8
	movaps	-16 * SIZE(BO), %xmm1
	ADD2	%xmm2, %xmm9
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO

	subq	$1, %rax
	BRANCH
	jg	.L36
	ALIGN_3
1004
.L38:
	/* Flush, apply conjugation sign mask, reduce, scale by alpha,
	 * and store the single C element for this row/column. */
	ADD1	%xmm1, %xmm8
	ADD2	%xmm2, %xmm9

	pcmpeqb	%xmm0, %xmm0
	psllq	$63,   %xmm0		/* qword sign bits only */

	movddup	ALPHA_R, %xmm2
	movddup	ALPHA_I, %xmm3

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
      defined(RR) || defined(RC) || defined(CR) || defined(CC)
	shufps	$0x40, %xmm0, %xmm0

	xorps	%xmm0, %xmm8
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0x04, %xmm0, %xmm0

	xorps	%xmm0, %xmm9
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	shufps	$0x40, %xmm0, %xmm0

	xorps	%xmm0, %xmm9
#endif

	haddpd	%xmm9,  %xmm8
	pshufd	$0x4e, %xmm8,  %xmm9

	mulpd	%xmm2, %xmm8
	mulpd	%xmm3, %xmm9

	addsubpd	%xmm9,  %xmm8

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0

	addpd	%xmm0, %xmm8
#endif

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif

	addq	$2 * SIZE, CO1
	decq	I
	BRANCH
	jg	.L31
	ALIGN_4
1066
.L999:
	/* Epilogue: restore all callee-saved registers and return. */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
1094