/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
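
/* Editorial note: this appears to be an x86-64 SSE2 matrix packing */
/* ("ncopy") kernel: it copies an M x N column-major matrix A with  */
/* leading dimension LDA into a contiguous buffer B in panels of 4, */
/* then 2, then 1 column, interleaving each panel's columns element */
/* by element.  A minimal C sketch of the intended data movement (a */
/* hypothetical reference model, not part of the original source):  */
/*                                                                  */
/*   void ncopy_sketch(long m, long n, const double *a, long lda,   */
/*                     double *b) {                                 */
/*       long j = 0;                                                */
/*       for (; j + 4 <= n; j += 4)      // 4-column panels         */
/*           for (long i = 0; i < m; i++)                           */
/*               for (long k = 0; k < 4; k++)                       */
/*                   *b++ = a[i + (j + k) * lda];                   */
/*       for (; j + 2 <= n; j += 2)      // 2-column remainder      */
/*           for (long i = 0; i < m; i++)                           */
/*               for (long k = 0; k < 2; k++)                       */
/*                   *b++ = a[i + (j + k) * lda];                   */
/*       if (j < n)                      // final single column     */
/*           for (long i = 0; i < m; i++)                           */
/*               *b++ = a[i + j * lda];                             */
/*   }                                                              */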

#define ASSEMBLER
#include "common.h"

#if defined(PENTIUM4) || defined(GENERIC)
#define PREFETCHSIZE	16
#define PREFETCH      prefetcht0
#define PREFETCHW     prefetcht0
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCHSIZE	16
#define PREFETCH      prefetcht0
#define PREFETCHW     prefetcht0
#endif

#ifdef ATOM
#define PREFETCHSIZE	16
#define PREFETCH      prefetcht0
#define PREFETCHW     prefetcht0
#endif

#ifdef NANO
#define PREFETCHSIZE	16
#define PREFETCH      prefetcht0
#define PREFETCHW     prefetcht0
#endif

#ifdef OPTERON
#define PREFETCHSIZE	16
#define PREFETCH      prefetch
#define PREFETCHW     prefetchw
#endif

#ifdef GENERIC
#define PREFETCHSIZE	16
#define PREFETCH      prefetcht0
#define PREFETCHW     prefetcht0
#endif

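/* The blocks above select a prefetch instruction and distance per  */
/* microarchitecture; only OPTERON uses prefetch/prefetchw, all     */
/* other targets fall back to prefetcht0 for both reads and writes. */
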
#ifndef WINDOWS_ABI

#define M	ARG1	/* rdi */
#define N	ARG2	/* rsi */
#define A	ARG3	/* rdx */
#define LDA	ARG4	/* rcx */
#define B	ARG5	/* r8  */

#define I	%r9

#else

#define STACKSIZE 256

#define M	ARG1	/* rcx */
#define N	ARG2	/* rdx */
#define A	ARG3	/* r8  */
#define LDA	ARG4	/* r9  */
#define OLD_B		40 + 32 + STACKSIZE(%rsp)

#define B	%r14
#define I	%r15

#endif

#define J	%r10
#define AO1	%r11
#define AO2	%r12
#define MM	%r13

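/* J counts column panels, I counts row blocks within a panel,      */
/* AO1/AO2 walk the source columns, and MM holds M minus the one    */
/* row peeled off below when A starts on an odd element boundary.   */
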
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	pushq	%r15
	pushq	%r14
#endif
	pushq	%r13
	pushq	%r12

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	movups	%xmm6,    0(%rsp)
	movups	%xmm7,   16(%rsp)

	movq	OLD_B,     B
#endif

	leaq	(,LDA, SIZE), LDA
	subq	$-16 * SIZE, B

	movq	M, MM
	leaq	-1(M), %rax
	testq	$SIZE, A
	cmovne	%rax, MM

	testq	$SIZE, LDA
	jne	.L50

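	/* Aligned-LDA path: LDA is an even multiple of SIZE here, so
	   every column shares A's alignment and aligned loads (movapd)
	   can be used throughout.  Odd multiples were dispatched to the
	   shufpd-based path at .L50 above. */
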
	movq	N,  J
	sarq	$2, J
	jle	.L20
	ALIGN_4

.L11:
	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	testq	$SIZE, A
	je	.L12

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

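	/* Main 8-rows-per-iteration transpose of a 4-column panel:
	   unpcklpd/unpckhpd pairs interleave two columns per register,
	   so each store writes two consecutive elements of one packed
	   row of B. */
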
.L12:
	movq	MM, I
	sarq	$3, I
	jle	.L14
	ALIGN_4

.L13:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	0 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2),      %xmm2
	movapd	0 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)
	movapd	%xmm4,  -12 * SIZE(B)
	movapd	%xmm6,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA)
#endif

	movapd	2 * SIZE(AO1),      %xmm0
	movapd	2 * SIZE(AO1, LDA), %xmm1
	movapd	2 * SIZE(AO2),      %xmm2
	movapd	2 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  8) * SIZE(B)
#endif

	movapd	%xmm0,  -8 * SIZE(B)
	movapd	%xmm2,  -6 * SIZE(B)
	movapd	%xmm4,  -4 * SIZE(B)
	movapd	%xmm6,  -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2)
#endif

	movapd	4 * SIZE(AO1),      %xmm0
	movapd	4 * SIZE(AO1, LDA), %xmm1
	movapd	4 * SIZE(AO2),      %xmm2
	movapd	4 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 16) * SIZE(B)
#endif

	movapd	%xmm0,   0 * SIZE(B)
	movapd	%xmm2,   2 * SIZE(B)
	movapd	%xmm4,   4 * SIZE(B)
	movapd	%xmm6,   6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA)
#endif

	movapd	6 * SIZE(AO1),      %xmm0
	movapd	6 * SIZE(AO1, LDA), %xmm1
	movapd	6 * SIZE(AO2),      %xmm2
	movapd	6 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 24) * SIZE(B)
#endif

	movapd	%xmm0,   8 * SIZE(B)
	movapd	%xmm2,  10 * SIZE(B)
	movapd	%xmm4,  12 * SIZE(B)
	movapd	%xmm6,  14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L13
	ALIGN_4

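	/* Row remainders for the 4-column panel: 4 rows, then 2, then
	   a final single row. */
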
.L14:
	testq	$4, MM
	jle	.L16

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	0 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2),      %xmm2
	movapd	0 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)
	movapd	%xmm4,  -12 * SIZE(B)
	movapd	%xmm6,  -10 * SIZE(B)

	movapd	2 * SIZE(AO1),      %xmm0
	movapd	2 * SIZE(AO1, LDA), %xmm1
	movapd	2 * SIZE(AO2),      %xmm2
	movapd	2 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0,  -8 * SIZE(B)
	movapd	%xmm2,  -6 * SIZE(B)
	movapd	%xmm4,  -4 * SIZE(B)
	movapd	%xmm6,  -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L16:
	testq	$2, MM
	jle	.L18

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	0 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2),      %xmm2
	movapd	0 * SIZE(AO2, LDA), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)
	movapd	%xmm4,  -12 * SIZE(B)
	movapd	%xmm6,  -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L18:
	testq	$1, MM
	jle	.L19

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)
	subq	$-4 * SIZE, B
	ALIGN_4

.L19:
	decq	J
	jg	.L11
	ALIGN_4

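	/* 2-column panel (N & 2): the same transpose at half width. */
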
.L20:
	testq	$2, N
	jle	.L30

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	testq	$SIZE, A
	je	.L22

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO2),      %xmm1

	unpcklpd %xmm1, %xmm0

	movapd	%xmm0,  -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L22:
	movq	MM, I
	sarq	$3, I
	jle	.L24
	ALIGN_4

.L23:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	0 * SIZE(AO2),      %xmm1
	movapd	2 * SIZE(AO1),      %xmm2
	movapd	2 * SIZE(AO2),      %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm4,  -14 * SIZE(B)
	movapd	%xmm2,  -12 * SIZE(B)
	movapd	%xmm6,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2)
#endif

	movapd	4 * SIZE(AO1), %xmm0
	movapd	4 * SIZE(AO2), %xmm1
	movapd	6 * SIZE(AO1), %xmm2
	movapd	6 * SIZE(AO2), %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  8) * SIZE(B)
#endif

	movapd	%xmm0,  -8 * SIZE(B)
	movapd	%xmm4,  -6 * SIZE(B)
	movapd	%xmm2,  -4 * SIZE(B)
	movapd	%xmm6,  -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L23
	ALIGN_4

.L24:
	testq	$4, MM
	jle	.L26

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	0 * SIZE(AO2),      %xmm1
	movapd	2 * SIZE(AO1),      %xmm2
	movapd	2 * SIZE(AO2),      %xmm3

	movapd	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4

	movapd	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm6

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm4,  -14 * SIZE(B)
	movapd	%xmm2,  -12 * SIZE(B)
	movapd	%xmm6,  -10 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L26:
	testq	$2, MM
	jle	.L28

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	0 * SIZE(AO2),      %xmm1

	movapd	 %xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L28:
	testq	$1, MM
	jle	.L30

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO2),      %xmm1

	unpcklpd %xmm1, %xmm0

	movapd	%xmm0,  -16 * SIZE(B)
	subq	$-2 * SIZE, B
	ALIGN_4

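	/* Final single column (N & 1): a straight copy, with a shifted
	   (shufpd) variant at .L35 when A is misaligned. */
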
.L30:
	testq	$1, N
	jle	.L999

	movq	A, AO1

	testq	$SIZE, A
	jne	.L35

	movq	MM, I
	sarq	$3, I
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	2 * SIZE(AO1),      %xmm1
	movapd	4 * SIZE(AO1),      %xmm2
	movapd	6 * SIZE(AO1),      %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm1,  -14 * SIZE(B)
	movapd	%xmm2,  -12 * SIZE(B)
	movapd	%xmm3,  -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L31
	ALIGN_4

.L32:
	testq	$4, MM
	jle	.L33

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	2 * SIZE(AO1),      %xmm1

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm1, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L33:
	testq	$2, MM
	jle	.L34

	movapd	0 * SIZE(AO1),      %xmm0

	movapd	%xmm0,  -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L34:
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1),      %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

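	/* Misaligned single column: xmm0 carries the previous vector's
	   high element across iterations, and shufpd $1 stitches each
	   aligned pair back into source order. */
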
.L35:
	movapd	-1 * SIZE(AO1),      %xmm0

	movq	MM, I
	sarq	$3, I
	jle	.L37
	ALIGN_4

.L36:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	movapd	1 * SIZE(AO1),      %xmm1
	movapd	3 * SIZE(AO1),      %xmm2
	movapd	5 * SIZE(AO1),      %xmm3
	movapd	7 * SIZE(AO1),      %xmm4

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm1,  -14 * SIZE(B)
	movapd	%xmm2,  -12 * SIZE(B)
	movapd	%xmm3,  -10 * SIZE(B)

	movapd	%xmm4, %xmm0

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L36
	ALIGN_4

.L37:
	testq	$4, MM
	jle	.L38

	movapd	1 * SIZE(AO1),      %xmm1
	movapd	3 * SIZE(AO1),      %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm1,  -14 * SIZE(B)

	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L38:
	testq	$2, MM
	jle	.L39

	movapd	1 * SIZE(AO1),      %xmm1

	shufpd	$1, %xmm1, %xmm0

	movapd	%xmm0,  -16 * SIZE(B)

	movapd	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L39:
	testq	$1, MM
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

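	/* Odd-LDA path: consecutive columns have opposite 16-byte
	   alignment, so the odd columns are read with a one-element
	   offset and merged via movsd/shufpd instead of unpcklpd. */
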
.L50:
	movq	N,  J
	sarq	$2, J
	jle	.L60
	ALIGN_4

.L51:
	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	testq	$SIZE, A
	je	.L52

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L52:
	movapd	-1 * SIZE(AO1, LDA), %xmm5
	movapd	-1 * SIZE(AO2, LDA), %xmm7

	movq	MM, I
	sarq	$3, I
	jle	.L54
	ALIGN_4

.L53:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	1 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2),      %xmm2
	movapd	1 * SIZE(AO2, LDA), %xmm3

	movsd	 %xmm0, %xmm5
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm5,  -16 * SIZE(B)
	movapd	%xmm7,  -14 * SIZE(B)
	movapd	%xmm0,  -12 * SIZE(B)
	movapd	%xmm2,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA)
#endif

	movapd	2 * SIZE(AO1),      %xmm0
	movapd	3 * SIZE(AO1, LDA), %xmm5
	movapd	2 * SIZE(AO2),      %xmm2
	movapd	3 * SIZE(AO2, LDA), %xmm7

	movsd	 %xmm0, %xmm1
	movsd	 %xmm2, %xmm3
	shufpd	 $1, %xmm5, %xmm0
	shufpd	 $1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  8) * SIZE(B)
#endif

	movapd	%xmm1,  -8 * SIZE(B)
	movapd	%xmm3,  -6 * SIZE(B)
	movapd	%xmm0,  -4 * SIZE(B)
	movapd	%xmm2,  -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2)
#endif

	movapd	4 * SIZE(AO1),      %xmm0
	movapd	5 * SIZE(AO1, LDA), %xmm1
	movapd	4 * SIZE(AO2),      %xmm2
	movapd	5 * SIZE(AO2, LDA), %xmm3

	movsd	 %xmm0, %xmm5
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 16) * SIZE(B)
#endif

	movapd	%xmm5,    0 * SIZE(B)
	movapd	%xmm7,    2 * SIZE(B)
	movapd	%xmm0,    4 * SIZE(B)
	movapd	%xmm2,    6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA)
#endif

	movapd	6 * SIZE(AO1),      %xmm0
	movapd	7 * SIZE(AO1, LDA), %xmm5
	movapd	6 * SIZE(AO2),      %xmm2
	movapd	7 * SIZE(AO2, LDA), %xmm7

	movsd	 %xmm0, %xmm1
	movsd	 %xmm2, %xmm3
	shufpd	 $1, %xmm5, %xmm0
	shufpd	 $1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 + 24) * SIZE(B)
#endif

	movapd	%xmm1,   8 * SIZE(B)
	movapd	%xmm3,  10 * SIZE(B)
	movapd	%xmm0,  12 * SIZE(B)
	movapd	%xmm2,  14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L53
	ALIGN_4

.L54:
	testq	$4, MM
	jle	.L56

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	1 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2),      %xmm2
	movapd	1 * SIZE(AO2, LDA), %xmm3

	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm3, %xmm2

	movapd	%xmm5,  -16 * SIZE(B)
	movapd	%xmm7,  -14 * SIZE(B)
	movapd	%xmm0,  -12 * SIZE(B)
	movapd	%xmm2,  -10 * SIZE(B)

	movapd	2 * SIZE(AO1),      %xmm0
	movapd	3 * SIZE(AO1, LDA), %xmm5
	movapd	2 * SIZE(AO2),      %xmm2
	movapd	3 * SIZE(AO2, LDA), %xmm7

	movsd	 %xmm0, %xmm1
	shufpd	 $1, %xmm5, %xmm0
	movsd	 %xmm2, %xmm3
	shufpd	 $1, %xmm7, %xmm2

	movapd	%xmm1,  -8 * SIZE(B)
	movapd	%xmm3,  -6 * SIZE(B)
	movapd	%xmm0,  -4 * SIZE(B)
	movapd	%xmm2,  -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L56:
	testq	$2, MM
	jle	.L58

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	1 * SIZE(AO1, LDA), %xmm1
	movapd	0 * SIZE(AO2),      %xmm2
	movapd	1 * SIZE(AO2, LDA), %xmm3

	movsd	 %xmm0, %xmm5
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2

	movapd	%xmm5,  -16 * SIZE(B)
	movapd	%xmm7,  -14 * SIZE(B)
	movapd	%xmm0,  -12 * SIZE(B)
	movapd	%xmm2,  -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L58:
	testq	$1, MM
	jle	.L59

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)
	subq	$-4 * SIZE, B
	ALIGN_4

.L59:
	decq	J
	jg	.L51
	ALIGN_4

.L60:
	testq	$2, N
	jle	.L70

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	testq	$SIZE, A
	je	.L62

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movapd	%xmm0,  -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L62:
	movapd	-1 * SIZE(AO2), %xmm5

	movq	MM, I
	sarq	$3, I
	jle	.L64
	ALIGN_4

.L63:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1), %xmm0
	movapd	1 * SIZE(AO2), %xmm1
	movapd	2 * SIZE(AO1), %xmm2
	movapd	3 * SIZE(AO2), %xmm3

	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm1
	shufpd	 $1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm5,  -16 * SIZE(B)
	movapd	%xmm0,  -14 * SIZE(B)
	movapd	%xmm1,  -12 * SIZE(B)
	movapd	%xmm2,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2)
#endif

	movapd	4 * SIZE(AO1), %xmm0
	movapd	5 * SIZE(AO2), %xmm1
	movapd	6 * SIZE(AO1), %xmm2
	movapd	7 * SIZE(AO2), %xmm5

	movsd	 %xmm0, %xmm3
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm1
	shufpd	 $1, %xmm5, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  8) * SIZE(B)
#endif

	movapd	%xmm3,   -8 * SIZE(B)
	movapd	%xmm0,   -6 * SIZE(B)
	movapd	%xmm1,   -4 * SIZE(B)
	movapd	%xmm2,   -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L63
	ALIGN_4

.L64:
	testq	$4, MM
	jle	.L66

	movapd	0 * SIZE(AO1), %xmm0
	movapd	1 * SIZE(AO2), %xmm1
	movapd	2 * SIZE(AO1), %xmm2
	movapd	3 * SIZE(AO2), %xmm3

	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm1
	shufpd	 $1, %xmm3, %xmm2

	movapd	%xmm5,  -16 * SIZE(B)
	movapd	%xmm0,  -14 * SIZE(B)
	movapd	%xmm1,  -12 * SIZE(B)
	movapd	%xmm2,  -10 * SIZE(B)

	movaps	%xmm3, %xmm5

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L66:
	testq	$2, MM
	jle	.L68

	movapd	0 * SIZE(AO1), %xmm0
	movapd	1 * SIZE(AO2), %xmm1

	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0

	movapd	%xmm5,  -16 * SIZE(B)
	movapd	%xmm0,  -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L68:
	testq	$1, MM
	jle	.L70

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO2),      %xmm1

	unpcklpd %xmm1, %xmm0

	movapd	%xmm0,  -16 * SIZE(B)
	subq	$-2 * SIZE, B
	ALIGN_4

.L70:
	testq	$1, N
	jle	.L999

	movq	A, AO1

	testq	$SIZE, A
	jne	.L75

	movq	MM, I
	sarq	$3, I
	jle	.L72
	ALIGN_4

.L71:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	2 * SIZE(AO1),      %xmm2
	movapd	4 * SIZE(AO1),      %xmm4
	movapd	6 * SIZE(AO1),      %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm2,  -14 * SIZE(B)
	movapd	%xmm4,  -12 * SIZE(B)
	movapd	%xmm6,  -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L71
	ALIGN_4

.L72:
	testq	$4, MM
	jle	.L73

	movapd	0 * SIZE(AO1),      %xmm0
	movapd	2 * SIZE(AO1),      %xmm2

	movapd	%xmm0, -16 * SIZE(B)
	movapd	%xmm2, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L73:
	testq	$2, MM
	jle	.L74

	movapd	0 * SIZE(AO1),      %xmm0

	movapd	%xmm0,  -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L74:
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1),      %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	jmp	.L999
	ALIGN_4

.L75:
	movapd	-1 * SIZE(AO1),      %xmm0

	movq	MM, I
	sarq	$3, I
	jle	.L77
	ALIGN_4

.L76:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	movapd	1 * SIZE(AO1),      %xmm1
	movapd	3 * SIZE(AO1),      %xmm2
	movapd	5 * SIZE(AO1),      %xmm3
	movapd	7 * SIZE(AO1),      %xmm4

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 4 +  0) * SIZE(B)
#endif

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm1,  -14 * SIZE(B)
	movapd	%xmm2,  -12 * SIZE(B)
	movapd	%xmm3,  -10 * SIZE(B)

	movapd	%xmm4, %xmm0

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L76
	ALIGN_4

.L77:
	testq	$4, MM
	jle	.L78

	movapd	1 * SIZE(AO1),      %xmm1
	movapd	3 * SIZE(AO1),      %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movapd	%xmm0,  -16 * SIZE(B)
	movapd	%xmm1,  -14 * SIZE(B)

	movapd	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L78:
	testq	$2, MM
	jle	.L79

	movapd	1 * SIZE(AO1),      %xmm1

	shufpd	$1, %xmm1, %xmm0

	movapd	%xmm0,  -16 * SIZE(B)

	movapd	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L79:
	testq	$1, MM
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	ALIGN_4

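	/* Epilogue: restore the Windows-ABI xmm saves and the
	   callee-saved integer registers pushed in the prologue,
	   then return. */
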
.L999:
#ifdef WINDOWS_ABI
	movups	  0(%rsp), %xmm6
	movups	 16(%rsp), %xmm7

	addq	$STACKSIZE, %rsp
#endif

	popq	%r12
	popq	%r13

#ifdef WINDOWS_ABI
	popq	%r14
	popq	%r15
#endif
	ret

	EPILOGUE
