1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Software-prefetch tuning per target core: PREFETCHSIZE is the
   look-ahead distance in elements, PREFETCH is used on the source (A)
   stream and PREFETCHW on the destination (B) stream.  When neither
   macro block matches, no prefetches are emitted at all (every use
   below is guarded by #ifdef PREFETCH / #ifdef PREFETCHW).  */
#ifdef NEHALEM
#define PREFETCHSIZE	12
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

#ifdef SANDYBRIDGE
#define PREFETCHSIZE	12
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#endif

/* MOVAPS is used for (possibly) overridable 16-byte loads of A; the
   code only issues it on addresses it has proven 16-byte aligned.  */
#ifndef MOVAPS
#define MOVAPS	movaps
#endif

#ifndef WINDOWS_ABI

/* System V AMD64: all five arguments arrive in registers.  */
#define M	ARG1	/* rdi */
#define N	ARG2	/* rsi */
#define A	ARG3	/* rdx */
#define LDA	ARG4	/* rcx */
#define B	ARG5	/* r8  */

#define AO1	%r9		/* current column pointer, columns 0..3 of a panel */
#define AO2	%r10		/* current column pointer, columns 4..7 of a panel */
#define LDA3	%r11		/* 3 * LDA (bytes), for base+LDA3 addressing */
#define J	%r12		/* outer (column-panel) loop counter */
#define MM	%r13		/* row count used by the aligned inner loops */

#else

#define STACKSIZE 128

/* Microsoft x64: first four args in registers, fifth on the stack.
   OLD_B = ret addr + shadow space (40) + four pushes (32) + local
   STACKSIZE — presumably laid out by PROLOGUE; verify against common.h.  */
#define M	ARG1	/* rcx */
#define N	ARG2	/* rdx */
#define A	ARG3	/* r8  */
#define LDA	ARG4	/* r9  */
#define OLD_B		40 + 32 + STACKSIZE(%rsp)

#define B	%r15

#define AO1	%r10
#define AO2	%r11
#define LDA3	%r12
#define J	%r13
#define MM	%r14

#endif
93
	/* Entry: pack an M x N column-major matrix A (leading dimension
	   LDA, element size SIZE bytes) into contiguous panels in B,
	   interleaving up to 8 columns row-by-row.  Registers r12/r13
	   (and r14/r15 + xmm6-xmm12 on Windows, which treats them as
	   callee-saved) are preserved here.  */
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	pushq	%r15
	pushq	%r14
#endif
	pushq	%r13
	pushq	%r12

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	/* xmm6-xmm12 are callee-saved in the Microsoft x64 ABI.  */
	movups	%xmm6,    0(%rsp)
	movups	%xmm7,   16(%rsp)
	movups	%xmm8,   32(%rsp)
	movups	%xmm9,   48(%rsp)
	movups	%xmm10,  64(%rsp)
	movups	%xmm11,  80(%rsp)
	movups	%xmm12,  96(%rsp)

	/* Fifth argument (B) is passed on the stack on Windows.  */
	movq	OLD_B,     B
#endif

	/* Scale LDA to bytes and precompute 3*LDA for addressing.  */
	leaq	(,LDA, SIZE), LDA
	leaq	(LDA, LDA, 2), LDA3
	/* Bias B by +16 elements so inner loops can store at
	   -16*SIZE(B) .. and use shorter displacements.  */
	subq	$-16 * SIZE, B

	/* MM = M, minus 1 when A is not 16-byte aligned: the aligned
	   loops below peel one leading row in that case.  */
	movq	M, MM
	leaq	-1(M), %rax
	testq	$SIZE, A
	cmovne	%rax, MM

	/* Odd LDA (in elements) means columns alternate alignment;
	   that needs the shifted-load path at .L50.  */
	testq	$SIZE, LDA
	jne	.L50

	/* Main path: process N in panels of 8 columns.  */
	movq	N,  J
	sarq	$3, J
	jle	.L20
	ALIGN_4
134
.L11:
	/* One panel of 8 columns: AO1 -> columns 0..3 (via LDA, 2*LDA,
	   LDA3 offsets), AO2 -> columns 4..7.  Advance A past the panel.  */
	movq	A, AO1
	leaq	(A, LDA, 4), AO2
	leaq	(A, LDA, 8), A

	/* LDA is even here, so A and AO1/AO2 share alignment parity.
	   If misaligned, peel the first row with scalar loads so the
	   main loop can use aligned 16-byte loads.  */
	testq	$SIZE, A
	je	.L12

	movsd	0 * SIZE(AO1),         %xmm0
	movsd	0 * SIZE(AO1, LDA),    %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3),   %xmm3

	movsd	0 * SIZE(AO2),         %xmm4
	movsd	0 * SIZE(AO2, LDA),    %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3),   %xmm7

	/* Pair up the 8 column values of this row for 16-byte stores.  */
	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_3

.L12:
	/* Main loop over MM rows, 8 rows per iteration.  */
	movq	MM, I
	sarq	$3, I
	jle	.L14
	ALIGN_4

.L13:
	/* Rows 0..1 of the 8 columns: load 2 rows per column, transpose
	   pairs with unpcklpd (row 0) / unpckhpd (row 1), store two
	   8-element output rows.  Same pattern repeats for rows 2..7.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1),         %xmm0
	MOVAPS	0 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	0 * SIZE(AO1, LDA3),   %xmm3

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA)
#endif

	MOVAPS	0 * SIZE(AO2),         %xmm4
	MOVAPS	0 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	0 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  8) * SIZE(B)
#endif

	movaps	%xmm8,   -8 * SIZE(B)
	movaps	%xmm9,   -6 * SIZE(B)
	movaps	%xmm10,  -4 * SIZE(B)
	movaps	%xmm11,  -2 * SIZE(B)

	/* Rows 2..3.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA, 2)
#endif

	MOVAPS	2 * SIZE(AO1),         %xmm0
	MOVAPS	2 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	2 * SIZE(AO1, LDA3),   %xmm3

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA3)
#endif

	MOVAPS	2 * SIZE(AO2),         %xmm4
	MOVAPS	2 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	2 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif

	movaps	%xmm0,    0 * SIZE(B)
	movaps	%xmm2,    2 * SIZE(B)
	movaps	%xmm4,    4 * SIZE(B)
	movaps	%xmm6,    6 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif

	movaps	%xmm8,    8 * SIZE(B)
	movaps	%xmm9,   10 * SIZE(B)
	movaps	%xmm10,  12 * SIZE(B)
	movaps	%xmm11,  14 * SIZE(B)

	/* Rows 4..5.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2)
#endif

	MOVAPS	4 * SIZE(AO1),         %xmm0
	MOVAPS	4 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	4 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	4 * SIZE(AO1, LDA3),   %xmm3

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA)
#endif

	MOVAPS	4 * SIZE(AO2),         %xmm4
	MOVAPS	4 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	4 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	4 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 32) * SIZE(B)
#endif

	movaps	%xmm0,   16 * SIZE(B)
	movaps	%xmm2,   18 * SIZE(B)
	movaps	%xmm4,   20 * SIZE(B)
	movaps	%xmm6,   22 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 40) * SIZE(B)
#endif

	movaps	%xmm8,   24 * SIZE(B)
	movaps	%xmm9,   26 * SIZE(B)
	movaps	%xmm10,  28 * SIZE(B)
	movaps	%xmm11,  30 * SIZE(B)

	/* Rows 6..7.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA, 2)
#endif

	MOVAPS	6 * SIZE(AO1),         %xmm0
	MOVAPS	6 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	6 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	6 * SIZE(AO1, LDA3),   %xmm3

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA3)
#endif

	MOVAPS	6 * SIZE(AO2),         %xmm4
	MOVAPS	6 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	6 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	6 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 48) * SIZE(B)
#endif

	movaps	%xmm0,   32 * SIZE(B)
	movaps	%xmm2,   34 * SIZE(B)
	movaps	%xmm4,   36 * SIZE(B)
	movaps	%xmm6,   38 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 56) * SIZE(B)
#endif

	movaps	%xmm8,   40 * SIZE(B)
	movaps	%xmm9,   42 * SIZE(B)
	movaps	%xmm10,  44 * SIZE(B)
	movaps	%xmm11,  46 * SIZE(B)

	/* 8 rows consumed from each column; 64 elements written to B.  */
	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-64 * SIZE, B

	decq	I
	jg	.L13
	ALIGN_4

.L14:
	/* Remainder: 4 rows.  */
	testq	$4, MM
	jle	.L16

	MOVAPS	0 * SIZE(AO1),         %xmm0
	MOVAPS	0 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	0 * SIZE(AO1, LDA3),   %xmm3

	MOVAPS	0 * SIZE(AO2),         %xmm4
	MOVAPS	0 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	0 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

	movaps	%xmm8,   -8 * SIZE(B)
	movaps	%xmm9,   -6 * SIZE(B)
	movaps	%xmm10,  -4 * SIZE(B)
	movaps	%xmm11,  -2 * SIZE(B)

	MOVAPS	2 * SIZE(AO1),         %xmm0
	MOVAPS	2 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	2 * SIZE(AO1, LDA3),   %xmm3

	MOVAPS	2 * SIZE(AO2),         %xmm4
	MOVAPS	2 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	2 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0,    0 * SIZE(B)
	movaps	%xmm2,    2 * SIZE(B)
	movaps	%xmm4,    4 * SIZE(B)
	movaps	%xmm6,    6 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

	movaps	%xmm8,    8 * SIZE(B)
	movaps	%xmm9,   10 * SIZE(B)
	movaps	%xmm10,  12 * SIZE(B)
	movaps	%xmm11,  14 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-32 * SIZE, B
	ALIGN_4

.L16:
	/* Remainder: 2 rows.  */
	testq	$2, MM
	jle	.L18

	MOVAPS	0 * SIZE(AO1),         %xmm0
	MOVAPS	0 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	0 * SIZE(AO1, LDA3),   %xmm3

	MOVAPS	0 * SIZE(AO2),         %xmm4
	MOVAPS	0 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	0 * SIZE(AO2, LDA3),   %xmm7

	movaps	 %xmm0, %xmm8
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm9
	unpcklpd %xmm3, %xmm2

	movaps	 %xmm4, %xmm10
	unpcklpd %xmm5, %xmm4
	movaps	 %xmm6, %xmm11
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	unpckhpd %xmm1, %xmm8
	unpckhpd %xmm3, %xmm9
	unpckhpd %xmm5, %xmm10
	unpckhpd %xmm7, %xmm11

	movaps	%xmm8,   -8 * SIZE(B)
	movaps	%xmm9,   -6 * SIZE(B)
	movaps	%xmm10,  -4 * SIZE(B)
	movaps	%xmm11,  -2 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L18:
	/* Remainder: final row (scalar loads).  */
	testq	$1, MM
	jle	.L19

	movsd	0 * SIZE(AO1),         %xmm0
	movsd	0 * SIZE(AO1, LDA),    %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3),   %xmm3

	movsd	0 * SIZE(AO2),         %xmm4
	movsd	0 * SIZE(AO2, LDA),    %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3),   %xmm7

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	subq	$-8 * SIZE, B
	ALIGN_4

.L19:
	/* Next 8-column panel.  */
	decq	J
	jg	.L11
	ALIGN_4
541
.L20:
	/* Remainder panel of 4 columns (N & 4).  AO1 -> columns 0..1,
	   AO2 -> columns 2..3; same align-peel / 8-4-2-1 row structure
	   as the 8-column path.  */
	testq	$4, N
	jle	.L30

	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	/* Peel one row if A is misaligned.  */
	testq	$SIZE, A
	je	.L22

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L22:
	movq	MM, I
	sarq	$3, I
	jle	.L24
	ALIGN_4

.L23:
	/* 8 rows per iteration, two rows of 4 columns at a time.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2),      %xmm2
	MOVAPS	0 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1, LDA)
#endif

	MOVAPS	2 * SIZE(AO1),      %xmm0
	MOVAPS	2 * SIZE(AO1, LDA), %xmm1
	MOVAPS	2 * SIZE(AO2),      %xmm2
	MOVAPS	2 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  8) * SIZE(B)
#endif

	movaps	%xmm0,  -8 * SIZE(B)
	movaps	%xmm2,  -6 * SIZE(B)
	movaps	%xmm4,  -4 * SIZE(B)
	movaps	%xmm6,  -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2)
#endif

	MOVAPS	4 * SIZE(AO1),      %xmm0
	MOVAPS	4 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO2),      %xmm2
	MOVAPS	4 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif

	movaps	%xmm0,   0 * SIZE(B)
	movaps	%xmm2,   2 * SIZE(B)
	movaps	%xmm4,   4 * SIZE(B)
	movaps	%xmm6,   6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2, LDA)
#endif

	MOVAPS	6 * SIZE(AO1),      %xmm0
	MOVAPS	6 * SIZE(AO1, LDA), %xmm1
	MOVAPS	6 * SIZE(AO2),      %xmm2
	MOVAPS	6 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif

	movaps	%xmm0,   8 * SIZE(B)
	movaps	%xmm2,  10 * SIZE(B)
	movaps	%xmm4,  12 * SIZE(B)
	movaps	%xmm6,  14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L23
	ALIGN_4

.L24:
	/* Remainder: 4 rows.  */
	testq	$4, MM
	jle	.L26

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2),      %xmm2
	MOVAPS	0 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	MOVAPS	2 * SIZE(AO1),      %xmm0
	MOVAPS	2 * SIZE(AO1, LDA), %xmm1
	MOVAPS	2 * SIZE(AO2),      %xmm2
	MOVAPS	2 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0,  -8 * SIZE(B)
	movaps	%xmm2,  -6 * SIZE(B)
	movaps	%xmm4,  -4 * SIZE(B)
	movaps	%xmm6,  -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4

.L26:
	/* Remainder: 2 rows.  */
	testq	$2, MM
	jle	.L28

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	0 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2),      %xmm2
	MOVAPS	0 * SIZE(AO2, LDA), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L28:
	/* Remainder: final row.  */
	testq	$1, MM
	jle	.L30

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	subq	$-4 * SIZE, B
	ALIGN_4
776
.L30:
	/* Remainder panel of 2 columns (N & 2).  */
	testq	$2, N
	jle	.L40

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	/* Peel one row if A is misaligned.  */
	testq	$SIZE, A
	je	.L32

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO2),      %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0,  -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L32:
	movq	MM, I
	sarq	$3, I
	jle	.L34
	ALIGN_4

.L33:
	/* 8 rows per iteration: transpose 2x2 tiles of the two columns.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	0 * SIZE(AO2),      %xmm1
	MOVAPS	2 * SIZE(AO1),      %xmm2
	MOVAPS	2 * SIZE(AO2),      %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm4,  -14 * SIZE(B)
	movaps	%xmm2,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO2)
#endif

	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	4 * SIZE(AO2), %xmm1
	MOVAPS	6 * SIZE(AO1), %xmm2
	MOVAPS	6 * SIZE(AO2), %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm1, %xmm4
	unpckhpd %xmm3, %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  8) * SIZE(B)
#endif

	movaps	%xmm0,  -8 * SIZE(B)
	movaps	%xmm4,  -6 * SIZE(B)
	movaps	%xmm2,  -4 * SIZE(B)
	movaps	%xmm6,  -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L33
	ALIGN_4

.L34:
	/* Remainder: 4 rows.  */
	testq	$4, MM
	jle	.L36

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	0 * SIZE(AO2),      %xmm1
	MOVAPS	2 * SIZE(AO1),      %xmm2
	MOVAPS	2 * SIZE(AO2),      %xmm3

	movaps	 %xmm0, %xmm4
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm4

	movaps	 %xmm2, %xmm6
	unpcklpd %xmm3, %xmm2
	unpckhpd %xmm3, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm4,  -14 * SIZE(B)
	movaps	%xmm2,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4

.L36:
	/* Remainder: 2 rows.  */
	testq	$2, MM
	jle	.L38

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	0 * SIZE(AO2),      %xmm1

	movaps	 %xmm0, %xmm2
	unpcklpd %xmm1, %xmm0
	unpckhpd %xmm1, %xmm2

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L38:
	/* Remainder: final row.  */
	testq	$1, MM
	jle	.L40

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO2),      %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0,  -16 * SIZE(B)
	subq	$-2 * SIZE, B
	ALIGN_4
925
.L40:
	/* Remainder panel of 1 column (N & 1): straight copy of the
	   column into B.  Misaligned A takes the shifted-load path .L45.  */
	testq	$1, N
	jle	.L999

	movq	A, AO1

	testq	$SIZE, A
	jne	.L45

	/* Aligned copy, 8 elements per iteration.  */
	movq	MM, I
	sarq	$3, I
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 8 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	2 * SIZE(AO1),      %xmm1
	MOVAPS	4 * SIZE(AO1),      %xmm2
	MOVAPS	6 * SIZE(AO1),      %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm1,  -14 * SIZE(B)
	movaps	%xmm2,  -12 * SIZE(B)
	movaps	%xmm3,  -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L41
	ALIGN_4

.L42:
	/* Remainder: 4 elements.  */
	testq	$4, MM
	jle	.L43

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	2 * SIZE(AO1),      %xmm1

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm1, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L43:
	/* Remainder: 2 elements.  */
	testq	$2, MM
	jle	.L44

	MOVAPS	0 * SIZE(AO1),      %xmm0

	movaps	%xmm0,  -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L44:
	/* Remainder: last element, then done.  */
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1),      %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	jmp	.L999
	ALIGN_4
1001
.L45:
	/* Last column (N odd) with A misaligned by one element: read
	   16-byte-aligned pairs starting one element before AO1 and
	   splice consecutive loads with shufpd $1 (take high half of
	   the previous load, low half of the next).  xmm0 always holds
	   the most recent aligned pair; its high element is the next
	   value to emit.  This path copies all M elements (MM is not
	   used here because no leading row was peeled).  */
	MOVAPS	-1 * SIZE(AO1),      %xmm0

	movq	M, I
	sarq	$3, I
	/* BUGFIX: was "jle .L46", which fell into the 8-element loop
	   even when I == 0 (M < 8), copying 8 nonexistent rows and then
	   re-processing the real remainder below.  Skip to the
	   remainder handling instead, matching every other section's
	   loop guard (.L42/.L14/.L24/.L34/.L54).  */
	jle	.L47
	ALIGN_4

.L46:
	/* Copy 8 elements per iteration via four spliced pairs.  */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 8 * SIZE(AO1)
#endif

	MOVAPS	1 * SIZE(AO1),      %xmm1
	MOVAPS	3 * SIZE(AO1),      %xmm2
	MOVAPS	5 * SIZE(AO1),      %xmm3
	MOVAPS	7 * SIZE(AO1),      %xmm4

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm1,  -14 * SIZE(B)
	movaps	%xmm2,  -12 * SIZE(B)
	movaps	%xmm3,  -10 * SIZE(B)

	/* Carry the last aligned pair into the next iteration.  */
	movaps	%xmm4, %xmm0

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L46
	ALIGN_4

.L47:
	/* Remainder: 4 elements.  */
	testq	$4, M
	jle	.L48

	MOVAPS	1 * SIZE(AO1),      %xmm1
	MOVAPS	3 * SIZE(AO1),      %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm1,  -14 * SIZE(B)

	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, B
	ALIGN_4

.L48:
	/* Remainder: 2 elements.  */
	testq	$2, M
	jle	.L49

	MOVAPS	1 * SIZE(AO1),      %xmm1

	shufpd	$1, %xmm1, %xmm0

	movaps	%xmm0,  -16 * SIZE(B)

	movaps	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L49:
	/* Remainder: last element lives in the high half of xmm0.  */
	testq	$1, M
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	jmp	.L999
	ALIGN_4
1087
1088.L50:
1089	movq	N,  J
1090	sarq	$3, J
1091	jle	.L60
1092	ALIGN_4
1093
1094.L51:
1095	movq	A, AO1
1096	leaq	(A, LDA, 4), AO2
1097	leaq	(A, LDA, 8), A
1098
1099	testq	$SIZE, A
1100	je	.L52
1101
1102	movsd	0 * SIZE(AO1),         %xmm0
1103	movsd	0 * SIZE(AO1, LDA),    %xmm1
1104	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
1105	movsd	0 * SIZE(AO1, LDA3),   %xmm3
1106	movsd	0 * SIZE(AO2),         %xmm4
1107	movsd	0 * SIZE(AO2, LDA),    %xmm5
1108	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
1109	movsd	0 * SIZE(AO2, LDA3),   %xmm7
1110
1111	unpcklpd %xmm1, %xmm0
1112	unpcklpd %xmm3, %xmm2
1113	unpcklpd %xmm5, %xmm4
1114	unpcklpd %xmm7, %xmm6
1115
1116	movaps	%xmm0,  -16 * SIZE(B)
1117	movaps	%xmm2,  -14 * SIZE(B)
1118	movaps	%xmm4,  -12 * SIZE(B)
1119	movaps	%xmm6,  -10 * SIZE(B)
1120
1121	addq	$1 * SIZE, AO1
1122	addq	$1 * SIZE, AO2
1123	subq	$-8 * SIZE, B
1124	ALIGN_3
1125
1126.L52:
1127	MOVAPS	-1 * SIZE(AO1, LDA),  %xmm9
1128	MOVAPS	-1 * SIZE(AO1, LDA3), %xmm10
1129	MOVAPS	-1 * SIZE(AO2, LDA),  %xmm11
1130	MOVAPS	-1 * SIZE(AO2, LDA3), %xmm12
1131
1132	movq	MM, I
1133	sarq	$3, I
1134	jle	.L54
1135	ALIGN_4
1136
1137.L53:
1138#ifdef PREFETCH
1139	PREFETCH	PREFETCHSIZE * SIZE(AO1)
1140#endif
1141
1142	MOVAPS	0 * SIZE(AO1),         %xmm0
1143	MOVAPS	1 * SIZE(AO1, LDA),    %xmm1
1144	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
1145	MOVAPS	1 * SIZE(AO1, LDA3),   %xmm3
1146
1147#ifdef PREFETCH
1148	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA)
1149#endif
1150
1151	MOVAPS	0 * SIZE(AO2),         %xmm4
1152	MOVAPS	1 * SIZE(AO2, LDA),    %xmm5
1153	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
1154	MOVAPS	1 * SIZE(AO2, LDA3),   %xmm7
1155
1156	movsd	 %xmm0, %xmm9
1157	movsd	 %xmm2, %xmm10
1158	movsd	 %xmm4, %xmm11
1159	movsd	 %xmm6, %xmm12
1160
1161#ifdef PREFETCHW
1162	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
1163#endif
1164
1165	movaps	%xmm9,  -16 * SIZE(B)
1166	movaps	%xmm10, -14 * SIZE(B)
1167	movaps	%xmm11, -12 * SIZE(B)
1168	movaps	%xmm12, -10 * SIZE(B)
1169
1170	shufpd	 $1, %xmm1, %xmm0
1171	shufpd	 $1, %xmm3, %xmm2
1172	shufpd	 $1, %xmm5, %xmm4
1173	shufpd	 $1, %xmm7, %xmm6
1174
1175#ifdef PREFETCHW
1176	PREFETCHW	(PREFETCHSIZE * 8 +  8) * SIZE(B)
1177#endif
1178
1179	movaps	%xmm0,   -8 * SIZE(B)
1180	movaps	%xmm2,   -6 * SIZE(B)
1181	movaps	%xmm4,   -4 * SIZE(B)
1182	movaps	%xmm6,   -2 * SIZE(B)
1183
1184#ifdef PREFETCH
1185	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA, 2)
1186#endif
1187
1188	MOVAPS	2 * SIZE(AO1),         %xmm0
1189	MOVAPS	3 * SIZE(AO1, LDA),    %xmm9
1190	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
1191	MOVAPS	3 * SIZE(AO1, LDA3),   %xmm10
1192
1193#ifdef PREFETCH
1194	PREFETCH	PREFETCHSIZE * SIZE(AO1, LDA3)
1195#endif
1196
1197	MOVAPS	2 * SIZE(AO2),         %xmm4
1198	MOVAPS	3 * SIZE(AO2, LDA),    %xmm11
1199	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
1200	MOVAPS	3 * SIZE(AO2, LDA3),   %xmm12
1201
1202	movsd	 %xmm0, %xmm1
1203	movsd	 %xmm2, %xmm3
1204	movsd	 %xmm4, %xmm5
1205	movsd	 %xmm6, %xmm7
1206
1207#ifdef PREFETCHW
1208	PREFETCHW	(PREFETCHSIZE * 8 + 16) * SIZE(B)
1209#endif
1210
1211	movaps	%xmm1,    0 * SIZE(B)
1212	movaps	%xmm3,    2 * SIZE(B)
1213	movaps	%xmm5,    4 * SIZE(B)
1214	movaps	%xmm7,    6 * SIZE(B)
1215
1216	shufpd	 $1, %xmm9,  %xmm0
1217	shufpd	 $1, %xmm10, %xmm2
1218	shufpd	 $1, %xmm11, %xmm4
1219	shufpd	 $1, %xmm12, %xmm6
1220
1221#ifdef PREFETCHW
1222	PREFETCHW	(PREFETCHSIZE * 8 + 24) * SIZE(B)
1223#endif
1224
1225	movaps	%xmm0,    8 * SIZE(B)
1226	movaps	%xmm2,   10 * SIZE(B)
1227	movaps	%xmm4,   12 * SIZE(B)
1228	movaps	%xmm6,   14 * SIZE(B)
1229
1230#ifdef PREFETCH
1231	PREFETCH	PREFETCHSIZE * SIZE(AO2)
1232#endif
1233
1234	MOVAPS	4 * SIZE(AO1),         %xmm0
1235	MOVAPS	5 * SIZE(AO1, LDA),    %xmm1
1236	MOVAPS	4 * SIZE(AO1, LDA, 2), %xmm2
1237	MOVAPS	5 * SIZE(AO1, LDA3),   %xmm3
1238
1239#ifdef PREFETCH
1240	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA)
1241#endif
1242
1243	MOVAPS	4 * SIZE(AO2),         %xmm4
1244	MOVAPS	5 * SIZE(AO2, LDA),    %xmm5
1245	MOVAPS	4 * SIZE(AO2, LDA, 2), %xmm6
1246	MOVAPS	5 * SIZE(AO2, LDA3),   %xmm7
1247
1248	movsd	 %xmm0, %xmm9
1249	movsd	 %xmm2, %xmm10
1250	movsd	 %xmm4, %xmm11
1251	movsd	 %xmm6, %xmm12
1252
1253#ifdef PREFETCHW
1254	PREFETCHW	(PREFETCHSIZE * 8 + 32) * SIZE(B)
1255#endif
1256
1257	movaps	%xmm9,   16 * SIZE(B)
1258	movaps	%xmm10,  18 * SIZE(B)
1259	movaps	%xmm11,  20 * SIZE(B)
1260	movaps	%xmm12,  22 * SIZE(B)
1261
1262	shufpd	 $1, %xmm1, %xmm0
1263	shufpd	 $1, %xmm3, %xmm2
1264	shufpd	 $1, %xmm5, %xmm4
1265	shufpd	 $1, %xmm7, %xmm6
1266
1267#ifdef PREFETCHW
1268	PREFETCHW	(PREFETCHSIZE * 4 +  8) * SIZE(B)
1269#endif
1270
1271	movaps	%xmm0,   24 * SIZE(B)
1272	movaps	%xmm2,   26 * SIZE(B)
1273	movaps	%xmm4,   28 * SIZE(B)
1274	movaps	%xmm6,   30 * SIZE(B)
1275
1276#ifdef PREFETCH
1277	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA, 2)
1278#endif
1279
1280	MOVAPS	6 * SIZE(AO1),         %xmm0
1281	MOVAPS	7 * SIZE(AO1, LDA),    %xmm9
1282	MOVAPS	6 * SIZE(AO1, LDA, 2), %xmm2
1283	MOVAPS	7 * SIZE(AO1, LDA3),   %xmm10
1284
1285#ifdef PREFETCH
1286	PREFETCH	PREFETCHSIZE * SIZE(AO2, LDA3)
1287#endif
1288
1289	MOVAPS	6 * SIZE(AO2),         %xmm4
1290	MOVAPS	7 * SIZE(AO2, LDA),    %xmm11
1291	MOVAPS	6 * SIZE(AO2, LDA, 2), %xmm6
1292	MOVAPS	7 * SIZE(AO2, LDA3),   %xmm12
1293
1294	movsd	 %xmm0, %xmm1
1295	movsd	 %xmm2, %xmm3
1296	movsd	 %xmm4, %xmm5
1297	movsd	 %xmm6, %xmm7
1298
1299#ifdef PREFETCHW
1300	PREFETCHW	(PREFETCHSIZE * 8 + 40) * SIZE(B)
1301#endif
1302
1303	movaps	%xmm1,   32 * SIZE(B)
1304	movaps	%xmm3,   34 * SIZE(B)
1305	movaps	%xmm5,   36 * SIZE(B)
1306	movaps	%xmm7,   38 * SIZE(B)
1307
1308	shufpd	 $1, %xmm9,  %xmm0
1309	shufpd	 $1, %xmm10, %xmm2
1310	shufpd	 $1, %xmm11, %xmm4
1311	shufpd	 $1, %xmm12, %xmm6
1312
1313#ifdef PREFETCHW
1314	PREFETCHW	(PREFETCHSIZE * 8 + 48) * SIZE(B)
1315#endif
1316	movaps	%xmm0,   40 * SIZE(B)
1317	movaps	%xmm2,   42 * SIZE(B)
1318	movaps	%xmm4,   44 * SIZE(B)
1319	movaps	%xmm6,   46 * SIZE(B)
1320
1321	addq	$8 * SIZE, AO1
1322	addq	$8 * SIZE, AO2
1323	subq	$-64 * SIZE, B
1324
1325	decq	I
1326	jg	.L53
1327	ALIGN_4
1328
.L54:
/* M & 4 remainder of the current 8-column panel: copy four rows.
   Column pointers are AO1, AO1+LDA, AO1+2*LDA, AO1+LDA3 and AO2,
   AO2+LDA, AO2+2*LDA, AO2+LDA3.  Odd columns are read at offset 1;
   xmm9..xmm12 enter holding each odd column's elements [-1..0] (primed
   by the preceding code), so their high halves supply row 0.  Every 8
   consecutive elements stored to B form one row across the 8 columns. */
	testq	$4, MM
	jle	.L56

	MOVAPS	0 * SIZE(AO1),         %xmm0
	MOVAPS	1 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	1 * SIZE(AO1, LDA3),   %xmm3
	MOVAPS	0 * SIZE(AO2),         %xmm4
	MOVAPS	1 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	1 * SIZE(AO2, LDA3),   %xmm7

	/* Register-register movsd keeps the destination's high quadword:
	   xmm9 = { even_col[0], odd_col[0] }, etc.  -> row 0 pairs. */
	movsd	 %xmm0, %xmm9
	movsd	 %xmm2, %xmm10
	movsd	 %xmm4, %xmm11
	movsd	 %xmm6, %xmm12

	movaps	%xmm9,  -16 * SIZE(B)
	movaps	%xmm10, -14 * SIZE(B)
	movaps	%xmm11, -12 * SIZE(B)
	movaps	%xmm12, -10 * SIZE(B)

	/* shufpd $1 = { dst.high, src.low }: pairs the even column's
	   element 1 with the odd column's element 1 -> row 1 pairs. */
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2
	shufpd	 $1, %xmm5, %xmm4
	shufpd	 $1, %xmm7, %xmm6

	movaps	%xmm0,   -8 * SIZE(B)
	movaps	%xmm2,   -6 * SIZE(B)
	movaps	%xmm4,   -4 * SIZE(B)
	movaps	%xmm6,   -2 * SIZE(B)

	/* Rows 2 and 3: same pattern at source offsets 2/3.  The odd-column
	   loads land in xmm9..xmm12 so their high halves remain valid
	   carries for the following remainder sections. */
	MOVAPS	2 * SIZE(AO1),         %xmm0
	MOVAPS	3 * SIZE(AO1, LDA),    %xmm9
	MOVAPS	2 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	3 * SIZE(AO1, LDA3),   %xmm10
	MOVAPS	2 * SIZE(AO2),         %xmm4
	MOVAPS	3 * SIZE(AO2, LDA),    %xmm11
	MOVAPS	2 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	3 * SIZE(AO2, LDA3),   %xmm12

	movsd	 %xmm0, %xmm1
	movsd	 %xmm2, %xmm3
	movsd	 %xmm4, %xmm5
	movsd	 %xmm6, %xmm7

	movaps	%xmm1,    0 * SIZE(B)
	movaps	%xmm3,    2 * SIZE(B)
	movaps	%xmm5,    4 * SIZE(B)
	movaps	%xmm7,    6 * SIZE(B)

	shufpd	 $1, %xmm9,  %xmm0
	shufpd	 $1, %xmm10, %xmm2
	shufpd	 $1, %xmm11, %xmm4
	shufpd	 $1, %xmm12, %xmm6

	movaps	%xmm0,    8 * SIZE(B)
	movaps	%xmm2,   10 * SIZE(B)
	movaps	%xmm4,   12 * SIZE(B)
	movaps	%xmm6,   14 * SIZE(B)

	/* 4 rows x 8 columns = 32 elements written. */
	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-32 * SIZE, B
	ALIGN_4
1395
.L56:
/* M & 2 remainder of the 8-column panel: copy two rows (16 elements).
   Same merge scheme as .L54: movsd builds row 0 from the carry registers
   xmm9..xmm12 (odd-column high halves), shufpd $1 builds row 1 from the
   offset-0/offset-1 pair loads. */
	testq	$2, MM
	jle	.L58

	MOVAPS	0 * SIZE(AO1),         %xmm0
	MOVAPS	1 * SIZE(AO1, LDA),    %xmm1
	MOVAPS	0 * SIZE(AO1, LDA, 2), %xmm2
	MOVAPS	1 * SIZE(AO1, LDA3),   %xmm3
	MOVAPS	0 * SIZE(AO2),         %xmm4
	MOVAPS	1 * SIZE(AO2, LDA),    %xmm5
	MOVAPS	0 * SIZE(AO2, LDA, 2), %xmm6
	MOVAPS	1 * SIZE(AO2, LDA3),   %xmm7

	/* Row 0: { even_col[0], odd_col[0] } per pair. */
	movsd	 %xmm0, %xmm9
	movsd	 %xmm2, %xmm10
	movsd	 %xmm4, %xmm11
	movsd	 %xmm6, %xmm12

	movaps	%xmm9,  -16 * SIZE(B)
	movaps	%xmm10, -14 * SIZE(B)
	movaps	%xmm11, -12 * SIZE(B)
	movaps	%xmm12, -10 * SIZE(B)

	/* Row 1: { even_col[1], odd_col[1] } per pair. */
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2
	shufpd	 $1, %xmm5, %xmm4
	shufpd	 $1, %xmm7, %xmm6

	movaps	%xmm0,   -8 * SIZE(B)
	movaps	%xmm2,   -6 * SIZE(B)
	movaps	%xmm4,   -4 * SIZE(B)
	movaps	%xmm6,   -2 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4
1433
.L58:
/* M & 1 remainder: copy the final row of the 8-column panel.  Scalar
   movsd loads avoid any alignment concern; unpcklpd packs two columns
   per 16-byte store. */
	testq	$1, MM
	jle	.L59

	movsd	0 * SIZE(AO1),         %xmm0
	movsd	0 * SIZE(AO1, LDA),    %xmm1
	movsd	0 * SIZE(AO1, LDA, 2), %xmm2
	movsd	0 * SIZE(AO1, LDA3),   %xmm3
	movsd	0 * SIZE(AO2),         %xmm4
	movsd	0 * SIZE(AO2, LDA),    %xmm5
	movsd	0 * SIZE(AO2, LDA, 2), %xmm6
	movsd	0 * SIZE(AO2, LDA3),   %xmm7

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	/* No AO1/AO2 advance needed: the panel is finished. */
	subq	$-8 * SIZE, B
	ALIGN_4

.L59:
	/* Loop back (to .L51, earlier in the file) for the next 8-column
	   panel; J counts the remaining panels. */
	decq	J
	jg	.L51
	ALIGN_4
1464
.L60:
/* N & 4: pack a panel of four columns (AO1, AO1+LDA, AO2 = A + 2*LDA,
   AO2+LDA) into B, four elements of B per source row. */
	testq	$4, N
	jle	.L70

	movq	A, AO1
	leaq	(A, LDA, 2), AO2
	leaq	(A, LDA, 4), A

	/* If the data is not 16-byte aligned, peel one row first.  The test
	   uses the already-advanced A, but a 4-column advance is a multiple
	   of 2*SIZE bytes, so the tested bit equals AO1's alignment bit. */
	testq	$SIZE, A
	je	.L62

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_3

.L62:
	/* Prime the carry registers with elements [-1..0] of the two odd
	   columns; their high halves feed row 0 of the first iteration. */
	movaps	-1 * SIZE(AO1, LDA), %xmm5
	movaps	-1 * SIZE(AO2, LDA), %xmm7

	movq	MM, I
	sarq	$3, I
	jle	.L64
	ALIGN_4
1500
.L63:
/* Main loop of the 4-column panel: 8 rows per iteration, 32 elements
   written to B.  Even columns are loaded at even offsets, odd columns at
   odd offsets; movsd merges the carry (previous high half) for the even
   row, shufpd $1 = { dst.high, src.low } forms the odd row.  xmm5/xmm7
   always hold the odd columns' trailing pair so the next stanza (and the
   next iteration) can reuse their high halves. */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2),      %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	/* Rows 0/1. */
	movsd	 %xmm0, %xmm5
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm5,  -16 * SIZE(B)
	movaps	%xmm7,  -14 * SIZE(B)
	movaps	%xmm0,  -12 * SIZE(B)
	movaps	%xmm2,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO1, LDA)
#endif

	/* Rows 2/3; the offset-3 loads refresh the carries xmm5/xmm7. */
	MOVAPS	2 * SIZE(AO1),      %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2),      %xmm2
	MOVAPS	3 * SIZE(AO2, LDA), %xmm7

	movsd	 %xmm0, %xmm1
	movsd	 %xmm2, %xmm3
	shufpd	 $1, %xmm5, %xmm0
	shufpd	 $1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  8) * SIZE(B)
#endif

	movaps	%xmm1,  -8 * SIZE(B)
	movaps	%xmm3,  -6 * SIZE(B)
	movaps	%xmm0,  -4 * SIZE(B)
	movaps	%xmm2,  -2 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2)
#endif

	/* Rows 4/5. */
	MOVAPS	4 * SIZE(AO1),      %xmm0
	MOVAPS	5 * SIZE(AO1, LDA), %xmm1
	MOVAPS	4 * SIZE(AO2),      %xmm2
	MOVAPS	5 * SIZE(AO2, LDA), %xmm3

	movsd	 %xmm0, %xmm5
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 16) * SIZE(B)
#endif

	movaps	%xmm5,    0 * SIZE(B)
	movaps	%xmm7,    2 * SIZE(B)
	movaps	%xmm0,    4 * SIZE(B)
	movaps	%xmm2,    6 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 2 * SIZE(AO2, LDA)
#endif

	/* Rows 6/7; the offset-7 loads become elements [-1..0] once AO1/AO2
	   are advanced below, i.e. valid carries for the next iteration. */
	MOVAPS	6 * SIZE(AO1),      %xmm0
	MOVAPS	7 * SIZE(AO1, LDA), %xmm5
	MOVAPS	6 * SIZE(AO2),      %xmm2
	MOVAPS	7 * SIZE(AO2, LDA), %xmm7

	movsd	 %xmm0, %xmm1
	movsd	 %xmm2, %xmm3
	shufpd	 $1, %xmm5, %xmm0
	shufpd	 $1, %xmm7, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 + 24) * SIZE(B)
#endif

	movaps	%xmm1,   8 * SIZE(B)
	movaps	%xmm3,  10 * SIZE(B)
	movaps	%xmm0,  12 * SIZE(B)
	movaps	%xmm2,  14 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-32 * SIZE, B

	decq	I
	jg	.L63
	ALIGN_4
1601
.L64:
/* M & 4 remainder of the 4-column panel: copy four rows (16 elements).
   Same carry/merge scheme as .L63; the offset-3 odd-column loads leave
   xmm5/xmm7 as valid carries for .L66 after the 4-element advance. */
	testq	$4, MM
	jle	.L66

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2),      %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	/* Rows 0/1. */
	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm3, %xmm2

	movaps	%xmm5,  -16 * SIZE(B)
	movaps	%xmm7,  -14 * SIZE(B)
	movaps	%xmm0,  -12 * SIZE(B)
	movaps	%xmm2,  -10 * SIZE(B)

	/* Rows 2/3; refreshes the carries. */
	MOVAPS	2 * SIZE(AO1),      %xmm0
	MOVAPS	3 * SIZE(AO1, LDA), %xmm5
	MOVAPS	2 * SIZE(AO2),      %xmm2
	MOVAPS	3 * SIZE(AO2, LDA), %xmm7

	movsd	 %xmm0, %xmm1
	shufpd	 $1, %xmm5, %xmm0
	movsd	 %xmm2, %xmm3
	shufpd	 $1, %xmm7, %xmm2

	movaps	%xmm1,  -8 * SIZE(B)
	movaps	%xmm3,  -6 * SIZE(B)
	movaps	%xmm0,  -4 * SIZE(B)
	movaps	%xmm2,  -2 * SIZE(B)

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-16 * SIZE, B
	ALIGN_4
1640
.L66:
/* M & 2 remainder of the 4-column panel: copy two rows (8 elements). */
	testq	$2, MM
	jle	.L68

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	1 * SIZE(AO1, LDA), %xmm1
	MOVAPS	0 * SIZE(AO2),      %xmm2
	MOVAPS	1 * SIZE(AO2, LDA), %xmm3

	/* Row 0 from the carries, row 1 from the pair loads. */
	movsd	 %xmm0, %xmm5
	movsd	 %xmm2, %xmm7
	shufpd	 $1, %xmm1, %xmm0
	shufpd	 $1, %xmm3, %xmm2

	movaps	%xmm5,  -16 * SIZE(B)
	movaps	%xmm7,  -14 * SIZE(B)
	movaps	%xmm0,  -12 * SIZE(B)
	movaps	%xmm2,  -10 * SIZE(B)

	/* No carry refresh needed: .L68 uses scalar loads only. */
	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4
1664
.L68:
/* M & 1 remainder of the 4-column panel: copy the last row via scalar
   loads and unpcklpd packing. */
	testq	$1, MM
	jle	.L70

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO1, LDA), %xmm1
	movsd	0 * SIZE(AO2),      %xmm2
	movsd	0 * SIZE(AO2, LDA), %xmm3

	unpcklpd %xmm1, %xmm0
	unpcklpd %xmm3, %xmm2

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	subq	$-4 * SIZE, B
	ALIGN_4
1681
.L70:
/* N & 2: pack a panel of two columns (AO1, AO2 = A + LDA) into B, two
   elements of B per source row. */
	testq	$2, N
	jle	.L80

	movq	A, AO1
	leaq	(A, LDA), AO2
	leaq	(A, LDA, 2), A

	/* Alignment peel, as in .L60; the 2-column advance of A is a
	   multiple of 2*SIZE bytes, so the tested bit matches AO1's. */
	testq	$SIZE, A
	je	.L72

	movsd	0 * SIZE(AO1), %xmm0
	movsd	0 * SIZE(AO2), %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0,  -16 * SIZE(B)

	addq	$1 * SIZE, AO1
	addq	$1 * SIZE, AO2
	subq	$-2 * SIZE, B
	ALIGN_3

.L72:
	/* Prime the carry register with AO2's elements [-1..0]. */
	MOVAPS	-1 * SIZE(AO2), %xmm5

	movq	MM, I
	sarq	$3, I
	jle	.L74
	ALIGN_4
1712
.L73:
/* Main loop of the 2-column panel: 8 rows per iteration, 16 elements
   written to B.  AO1 is read at even offsets, AO2 at odd offsets; movsd
   merges the carry for even rows, shufpd $1 = { dst.high, src.low }
   builds odd rows.  The offset-7 AO2 load (xmm5) becomes the next
   iteration's carry after the 8-element advance. */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	3 * SIZE(AO2), %xmm3

	/* Rows 0..3. */
	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm1
	shufpd	 $1, %xmm3, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm5,  -16 * SIZE(B)
	movaps	%xmm0,  -14 * SIZE(B)
	movaps	%xmm1,  -12 * SIZE(B)
	movaps	%xmm2,  -10 * SIZE(B)

#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 4 * SIZE(AO2)
#endif

	/* Rows 4..7; xmm3 still holds AO2[3..4] and supplies row 4's AO2
	   element via the movsd merge into xmm0's pair. */
	MOVAPS	4 * SIZE(AO1), %xmm0
	MOVAPS	5 * SIZE(AO2), %xmm1
	MOVAPS	6 * SIZE(AO1), %xmm2
	MOVAPS	7 * SIZE(AO2), %xmm5

	movsd	 %xmm0, %xmm3
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm1
	shufpd	 $1, %xmm5, %xmm2

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  8) * SIZE(B)
#endif

	movaps	%xmm3,   -8 * SIZE(B)
	movaps	%xmm0,   -6 * SIZE(B)
	movaps	%xmm1,   -4 * SIZE(B)
	movaps	%xmm2,   -2 * SIZE(B)

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	subq	$-16 * SIZE, B

	decq	I
	jg	.L73
	ALIGN_4
1767
.L74:
/* M & 4 remainder of the 2-column panel: copy four rows (8 elements). */
	testq	$4, MM
	jle	.L76

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1
	MOVAPS	2 * SIZE(AO1), %xmm2
	MOVAPS	3 * SIZE(AO2), %xmm3

	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0
	movsd	 %xmm2, %xmm1
	shufpd	 $1, %xmm3, %xmm2

	movaps	%xmm5,  -16 * SIZE(B)
	movaps	%xmm0,  -14 * SIZE(B)
	movaps	%xmm1,  -12 * SIZE(B)
	movaps	%xmm2,  -10 * SIZE(B)

	/* xmm3 = AO2[3..4]; after the 4-element advance it is AO2[-1..0],
	   the carry expected by .L76. */
	movaps	%xmm3, %xmm5

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	subq	$-8 * SIZE, B
	ALIGN_4
1793
.L76:
/* M & 2 remainder of the 2-column panel: copy two rows (4 elements). */
	testq	$2, MM
	jle	.L78

	MOVAPS	0 * SIZE(AO1), %xmm0
	MOVAPS	1 * SIZE(AO2), %xmm1

	/* Row 0 = { AO1[0], carry high = AO2[0] }; row 1 via shufpd. */
	movsd	 %xmm0, %xmm5
	shufpd	 $1, %xmm1, %xmm0

	movaps	%xmm5,  -16 * SIZE(B)
	movaps	%xmm0,  -14 * SIZE(B)

	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	subq	$-4 * SIZE, B
	ALIGN_4

.L78:
/* M & 1 remainder: last row of the 2-column panel, scalar loads. */
	testq	$1, MM
	jle	.L80

	movsd	0 * SIZE(AO1),      %xmm0
	movsd	0 * SIZE(AO2),      %xmm1

	unpcklpd %xmm1, %xmm0

	movaps	%xmm0,  -16 * SIZE(B)
	subq	$-2 * SIZE, B
	ALIGN_4
1824
.L80:
/* N & 1: copy the last single column straight into B. */
	testq	$1, N
	jle	.L999

	movq	A, AO1

	/* A misaligned start is handled by the shifted-load path at .L85. */
	testq	$SIZE, A
	jne	.L85

	movq	MM, I
	sarq	$3, I
	jle	.L82
	ALIGN_4
1838
.L81:
/* Aligned single-column main loop: straight copy, 8 elements per
   iteration. */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 8 * SIZE(AO1)
#endif

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	2 * SIZE(AO1),      %xmm2
	MOVAPS	4 * SIZE(AO1),      %xmm4
	MOVAPS	6 * SIZE(AO1),      %xmm6

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm2,  -14 * SIZE(B)
	movaps	%xmm4,  -12 * SIZE(B)
	movaps	%xmm6,  -10 * SIZE(B)

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L81
	ALIGN_4
1864
.L82:
/* Aligned single column, M & 4 remainder: copy four elements. */
	testq	$4, MM
	jle	.L83

	MOVAPS	0 * SIZE(AO1),      %xmm0
	MOVAPS	2 * SIZE(AO1),      %xmm2

	movaps	%xmm0, -16 * SIZE(B)
	movaps	%xmm2, -14 * SIZE(B)

	addq	$4 * SIZE, AO1
	subq	$-4 * SIZE, B
	ALIGN_4

.L83:
/* M & 2 remainder: copy one pair. */
	testq	$2, MM
	jle	.L84

	MOVAPS	0 * SIZE(AO1),      %xmm0

	movaps	%xmm0,  -16 * SIZE(B)

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L84:
/* M & 1 remainder: final scalar element, then straight to the epilogue
   (no further pointer updates are needed). */
	testq	$1, MM
	jle	.L999

	movsd	0 * SIZE(AO1),      %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	jmp	.L999
	ALIGN_4
1900
.L85:
/* Single remaining column with A not 16-byte aligned: read aligned
   pairs starting one element early, then shift each pair left by one
   element (shufpd $1 = { dst.high, src.low }) to rebuild the column on
   its original element boundary.  xmm0 carries the previous pair's high
   element across stanzas and iterations. */
	MOVAPS	-1 * SIZE(AO1),      %xmm0

	movq	M, I
	sarq	$3, I
	/* BUGFIX: this branch previously targeted .L86 -- the loop head
	   itself -- so with fewer than 8 rows remaining the guard was a
	   no-op and one full 8-row iteration still executed, copying past
	   the valid range and leaving AO1/B mis-advanced for the remainder
	   code.  Skip the loop entirely, matching the guards of the other
	   main loops (.L63 -> .L64, .L73 -> .L74, .L81 -> .L82). */
	jle	.L87
	ALIGN_4

.L86:
/* Misaligned single-column main loop: 8 elements per iteration. */
#ifdef PREFETCH
	PREFETCH	PREFETCHSIZE * 8 * SIZE(AO1)
#endif

	MOVAPS	1 * SIZE(AO1),      %xmm1
	MOVAPS	3 * SIZE(AO1),      %xmm2
	MOVAPS	5 * SIZE(AO1),      %xmm3
	MOVAPS	7 * SIZE(AO1),      %xmm4

	/* Realign: each result pairs the previous load's high element with
	   the next load's low element. */
	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1
	shufpd	$1, %xmm3, %xmm2
	shufpd	$1, %xmm4, %xmm3

#ifdef PREFETCHW
	PREFETCHW	(PREFETCHSIZE * 8 +  0) * SIZE(B)
#endif

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm1,  -14 * SIZE(B)
	movaps	%xmm2,  -12 * SIZE(B)
	movaps	%xmm3,  -10 * SIZE(B)

	/* Carry the trailing pair into the next iteration. */
	movaps	%xmm4, %xmm0

	addq	$8 * SIZE, AO1
	subq	$-8 * SIZE, B

	decq	I
	jg	.L86
	ALIGN_4
1941
.L87:
/* Misaligned single column, M & 4 remainder: shift-copy four elements.
   xmm0 enters holding the carry pair (elements [-1..0]). */
	testq	$4, M
	jle	.L88

	MOVAPS	1 * SIZE(AO1),      %xmm1
	MOVAPS	3 * SIZE(AO1),      %xmm2

	shufpd	$1, %xmm1, %xmm0
	shufpd	$1, %xmm2, %xmm1

	movaps	%xmm0,  -16 * SIZE(B)
	movaps	%xmm1,  -14 * SIZE(B)

	/* Refresh the carry for .L88. */
	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, AO1
	/* Equivalent to the subq $-4*SIZE form used elsewhere. */
	addq	$4 * SIZE, B
	ALIGN_4

.L88:
/* M & 2 remainder: shift-copy one pair. */
	testq	$2, M
	jle	.L89

	MOVAPS	1 * SIZE(AO1),      %xmm1

	shufpd	$1, %xmm1, %xmm0

	movaps	%xmm0,  -16 * SIZE(B)

	movaps	%xmm1, %xmm0

	addq	$2 * SIZE, AO1
	subq	$-2 * SIZE, B
	ALIGN_4

.L89:
/* M & 1 remainder: the final element is the carry's high half; swap it
   into the low half and store it as a scalar. */
	testq	$1, M
	jle	.L999

	shufpd	$1, %xmm0, %xmm0

	movlpd	%xmm0,  -16 * SIZE(B)
	ALIGN_4
1985
.L999:
/* Epilogue: restore callee-saved registers and return.  On Windows x64,
   xmm6-xmm15 are callee-saved, so xmm6-xmm12 (clobbered above) were
   spilled to the stack by the prologue and are reloaded here. */
#ifdef WINDOWS_ABI
	movups	  0(%rsp), %xmm6
	movups	 16(%rsp), %xmm7
	movups	 32(%rsp), %xmm8
	movups	 48(%rsp), %xmm9
	movups	 64(%rsp), %xmm10
	movups	 80(%rsp), %xmm11
	movups	 96(%rsp), %xmm12

	addq	$STACKSIZE, %rsp
#endif

	/* r12/r13 are callee-saved in both ABIs; r14/r15 were only pushed
	   by the Windows prologue. */
	popq	%r12
	popq	%r13

#ifdef WINDOWS_ABI
	popq	%r14
	popq	%r15
#endif
	ret

	EPILOGUE
2009