/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

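/* Stack layout: the prologue reserves ARGS bytes of scratch and then
   pushes ebp/edi/esi/ebx (STACK = 16 bytes), so an argument that sat
   at 4(%esp) on entry is read below as 4 + STACK + ARGS(%esp).  The
   J/BX/KK/KKK locals live in the ARGS scratch area at STACK(%esp).  */
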
#define STACK	16
#define ARGS	16

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	24 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define OLD_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define OLD_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define BX	 4 + STACK(%esp)
#define KK	 8 + STACK(%esp)
#define KKK	12 + STACK(%esp)

#define B	%edi
#define LDC	%ebp
#define AO	%edx
#define BO	%ecx
#define CO	%esi
#define	I	%ebx

#define movsd  movlps
#define movapd movups
#define movlpd movlps
#define movhpd movhps
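
/* The aligned/scalar double moves are aliased to unaligned
   single-precision encodings (movups/movlps/movhps), presumably
   because they move the same bits, save an encoding prefix, and
   never fault when A, B, or C is not 16-byte aligned.  Note the
   kernel must therefore not rely on the upper-half zeroing that a
   true movsd load would perform.                                    */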

#define PREFETCH     prefetch
#define PREFETCHSIZE  (8 *  7 + 0)

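/* ADD1 accumulates the a_re * b partial products and ADD2 the
   a_im * b partial products.  Choosing addpd or subpd for each gives
   the four sign patterns required by the plain/conjugated operand
   combinations (NN/NT/TN/TT vs. the R*/C* variants on either side);
   the addsubpd in the write-back then folds the two accumulators
   into a correctly signed complex product.                          */
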
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1	addpd
#define ADD2	addpd
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1	addpd
#define ADD2	subpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1	subpd
#define ADD2	addpd
#else
#define ADD1	subpd
#define ADD2	subpd
#endif

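/* KERNEL1..KERNEL8 are one k-iteration each of the software-pipelined
   1x2 complex micro-kernel: per step AO advances by one complex
   element (2 doubles) and BO by two (4 doubles).  xmm4/xmm6 collect
   the ADD1 sums for the two C columns and xmm5/xmm7 the ADD2 sums.
   Two interleaved A streams are used: xmm0 feeds iterations 1-4 and
   xmm3 iterations 5-8 of each group, each preloading one group
   ahead, which shortens the dependency chains.  The `address`
   parameter is unused (every call site passes 16 * 0); it appears
   to be vestigial.                                                  */
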
#define KERNEL1(address) \
	mulpd	%xmm0, %xmm1; \
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO, %eax, 2); \
	mulpd	-14 * SIZE(BO, %eax, 4), %xmm0; \
	ADD1	%xmm1, %xmm4; \
	movapd	-12 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm0, %xmm6; \
	movddup	-15 * SIZE(AO, %eax, 2), %xmm0; \
	mulpd	%xmm0, %xmm2; \
	mulpd	-14 * SIZE(BO, %eax, 4), %xmm0; \
	ADD2	%xmm0, %xmm7; \
	movddup	-14 * SIZE(AO, %eax, 2), %xmm0

#define KERNEL2(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm0, %xmm1; \
	mulpd	-10 * SIZE(BO, %eax, 4), %xmm0; \
	ADD1	%xmm1, %xmm4; \
	movapd	 -8 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm0, %xmm6; \
	movddup	-13 * SIZE(AO, %eax, 2), %xmm0; \
	mulpd	%xmm0, %xmm2; \
	mulpd	-10 * SIZE(BO, %eax, 4), %xmm0; \
	ADD2	%xmm0, %xmm7; \
	movddup	-12 * SIZE(AO, %eax, 2), %xmm0

#define KERNEL3(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm0, %xmm1; \
	mulpd	 -6 * SIZE(BO, %eax, 4), %xmm0; \
	ADD1	%xmm1, %xmm4; \
	movapd	 -4 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm0, %xmm6; \
	movddup	-11 * SIZE(AO, %eax, 2), %xmm0; \
	mulpd	%xmm0, %xmm2; \
	mulpd	 -6 * SIZE(BO, %eax, 4), %xmm0; \
	ADD2	%xmm0, %xmm7; \
	movddup	-10 * SIZE(AO, %eax, 2), %xmm0

#define KERNEL4(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm0, %xmm1; \
	mulpd	 -2 * SIZE(BO, %eax, 4), %xmm0; \
	ADD1	%xmm1, %xmm4; \
	movapd	          (BO, %eax, 4), %xmm1; \
	ADD1	%xmm0, %xmm6; \
	movddup	 -9 * SIZE(AO, %eax, 2), %xmm0; \
	mulpd	%xmm0, %xmm2; \
	mulpd	 -2 * SIZE(BO, %eax, 4), %xmm0; \
	ADD2	%xmm0, %xmm7; \
	movddup	          (AO, %eax, 2), %xmm0

#define KERNEL5(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm3, %xmm1; \
	mulpd	  2 * SIZE(BO, %eax, 4), %xmm3; \
	ADD1	%xmm1, %xmm4; \
	movapd	  4 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm3, %xmm6; \
	movddup	 -7 * SIZE(AO, %eax, 2), %xmm3; \
	mulpd	%xmm3, %xmm2; \
	mulpd	  2 * SIZE(BO, %eax, 4), %xmm3; \
	ADD2	%xmm3, %xmm7; \
	movddup	 -6 * SIZE(AO, %eax, 2), %xmm3

#define KERNEL6(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm3, %xmm1; \
	mulpd	  6 * SIZE(BO, %eax, 4), %xmm3; \
	ADD1	%xmm1, %xmm4; \
	movapd	  8 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm3, %xmm6; \
	movddup	 -5 * SIZE(AO, %eax, 2), %xmm3; \
	mulpd	%xmm3, %xmm2; \
	mulpd	  6 * SIZE(BO, %eax, 4), %xmm3; \
	ADD2	%xmm3, %xmm7; \
	movddup	 -4 * SIZE(AO, %eax, 2), %xmm3

#define KERNEL7(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm3, %xmm1; \
	mulpd	 10 * SIZE(BO, %eax, 4), %xmm3; \
	ADD1	%xmm1, %xmm4; \
	movapd	 12 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm3, %xmm6; \
	movddup	 -3 * SIZE(AO, %eax, 2), %xmm3; \
	mulpd	%xmm3, %xmm2; \
	mulpd	 10 * SIZE(BO, %eax, 4), %xmm3; \
	ADD2	%xmm3, %xmm7; \
	movddup	 -2 * SIZE(AO, %eax, 2), %xmm3

#define KERNEL8(address) \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2; \
	mulpd	%xmm3, %xmm1; \
	mulpd	 14 * SIZE(BO, %eax, 4), %xmm3; \
	ADD1	%xmm1, %xmm4; \
	movapd	 16 * SIZE(BO, %eax, 4), %xmm1; \
	ADD1	%xmm3, %xmm6; \
	movddup	 -1 * SIZE(AO, %eax, 2), %xmm3; \
	mulpd	%xmm3, %xmm2; \
	mulpd	 14 * SIZE(BO, %eax, 4), %xmm3; \
	ADD2	%xmm3, %xmm7; \
	movddup	  8 * SIZE(AO, %eax, 2), %xmm3; \
	ADD2	%xmm2, %xmm5; \
	movapd	%xmm1, %xmm2

	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	OLD_B,   B
	movl	OLD_LDC, LDC

#ifdef TRMMKERNEL
	movl	OFFSET, %eax

#ifndef LEFT
	negl	%eax
#endif

	movl	%eax, KK
#endif

	subl	$-16 * SIZE, A		# bias A by +16 elements for negative-offset addressing
	subl	$-16 * SIZE, B		# likewise for B

	sall	$ZBASE_SHIFT, LDC	# ldc: complex elements -> bytes

	movl	N,  %eax
	sarl	$1, %eax
	movl	%eax, J			# j = n >> 1 (column pairs)
	jle	.L100
	ALIGN_4

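/* .L01: outer loop over pairs of columns of C (j = n / 2).          */
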
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	leal	GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax
	movl	%eax, BX		# prefetch cursor into the packed B panel

	movl	C, CO
	movl	A, AO
	movl	M,  I
	testl	I,  I
	jle	.L100
	ALIGN_4

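/* .L10: loop over the rows of C; each pass computes one 1x2 block.  */
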
.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BO
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 4), BO
#endif

	movl	BX, %eax

	prefetcht2  0 * SIZE(%eax)	# pull upcoming B panel data into cache

	subl	$-8 * SIZE, BX		# advance the prefetch cursor

	movddup	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm4, %xmm4
	movddup	 -8 * SIZE(AO), %xmm3
	pxor	%xmm5, %xmm5

	prefetchw 1 * SIZE(CO)
	pxor	%xmm6, %xmm6
	prefetchw 1 * SIZE(CO, LDC)
	pxor	%xmm7, %xmm7
	movapd	%xmm1, %xmm2

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		# mr = 1
#else
	addl	$2, %eax		# nr = 2
#endif
	movl	%eax, KKK
#endif

	andl	$-8, %eax		# round k down to a multiple of 8

	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 2), AO	# point past the unrolled region
	leal	(BO, %eax, 4), BO
	negl	%eax			# %eax counts up toward zero
	NOBRANCH
	je	.L15
	ALIGN_3

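/* .L12: main k-loop, unrolled 8 ways; the 8-kernel group is repeated
   eight times with an exit test after each, so the %eax-relative
   displacements stay small while each backward branch covers up to
   64 iterations.                                                    */
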
.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	addl	$8 * SIZE, %eax
	BRANCH
	jl	.L12
	ALIGN_3

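/* .L15/.L16: handle the k % 8 leftover iterations one at a time.    */
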
.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# k & 7 leftover iterations
	BRANCH
	je .L14

	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 4), BO
	negl	%eax
	ALIGN_4

.L16:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO, %eax, 4), %xmm0
	ADD1	%xmm1, %xmm4
	movapd	-12 * SIZE(BO, %eax, 4), %xmm1
	ADD1	%xmm0, %xmm6
	movddup	-15 * SIZE(AO, %eax, 2), %xmm0
	mulpd	%xmm0, %xmm2
	mulpd	-14 * SIZE(BO, %eax, 4), %xmm0
	ADD2	%xmm0, %xmm7
	movddup	-14 * SIZE(AO, %eax, 2), %xmm0
	ADD2	%xmm2, %xmm5
	movapd	%xmm1, %xmm2

	addl	$SIZE, %eax
	jl	.L16
	ALIGN_4

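/* .L14: write-back.  Fold the ADD2 sums into the ADD1 sums with the
   conjugation-dependent addsubpd, scale by alpha = (ALPHA_R, ALPHA_I)
   via the shuffle/addsubpd pair, and (except for TRMM) accumulate
   into the existing C values.                                       */
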
.L14:
#ifndef TRMMKERNEL
	movupd	0 * SIZE(CO), %xmm0
	movupd	0 * SIZE(CO, LDC), %xmm1
#endif

	movddup	ALPHA_R, %xmm2
	movddup	ALPHA_I, %xmm3

	SHUFPD_1 %xmm5, %xmm5
	SHUFPD_1 %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
	addsubpd %xmm5,  %xmm4
	addsubpd %xmm7,  %xmm6

	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7
#else
	addsubpd %xmm4,  %xmm5
	addsubpd %xmm6,  %xmm7

	movapd	 %xmm5,  %xmm4
	pshufd	$0x4e, %xmm5,  %xmm5
	movapd	 %xmm7,  %xmm6
	pshufd	$0x4e, %xmm7,  %xmm7
#endif

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm5
	mulpd	%xmm2, %xmm6
	mulpd	%xmm3, %xmm7

	addsubpd %xmm5, %xmm4
	addsubpd %xmm7, %xmm6

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm6
#endif

	movlpd	%xmm4, 0 * SIZE(CO)
	movhpd	%xmm4, 1 * SIZE(CO)
	movlpd	%xmm6, 0 * SIZE(CO, LDC)
	movhpd	%xmm6, 1 * SIZE(CO, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, CO		# coffset += 2 * SIZE (one complex element)
	decl	I			# i --
	jg	.L10
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	movl	BO, B

	leal	(, LDC, 2), %eax
	addl	%eax, C			# c += 2 * ldc
	decl	J			# j --
	jg	.L01
	ALIGN_4

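/* .L100: tail for odd N: one remaining column, handled by a 1x1
   version of the same kernel.                                       */
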
.L100:
	movl	N, %eax
	andl	$1, %eax
	jle	.L500
	ALIGN_4

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, CO
	movl	A, AO

	movl	M,    I
	testl	I,    I
	jle	.L500
	ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BO
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(B,  %eax, 2), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	pxor	%xmm4, %xmm4
	movddup	-15 * SIZE(AO), %xmm1
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	prefetchw 1 * SIZE(CO)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax		# mr = nr = 1 here, so LEFT and !LEFT add the same amount
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4

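/* .L111: k-loop for the single-column kernel, unrolled 8 ways.
   xmm0 carries the duplicated real parts of A and xmm1 the
   duplicated imaginary parts; xmm4/xmm5 and xmm6/xmm7 accumulate
   alternate iterations and are summed in the write-back.            */
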
.L111:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)

	mulpd	-16 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm4
	movddup	-14 * SIZE(AO), %xmm0
	mulpd	-16 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm5
	movddup	-13 * SIZE(AO), %xmm1

	mulpd	-14 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm6
	movddup	-12 * SIZE(AO), %xmm0
	mulpd	-14 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm7
	movddup	-11 * SIZE(AO), %xmm1

	mulpd	-12 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm4
	movddup	-10 * SIZE(AO), %xmm0
	mulpd	-12 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm5
	movddup	 -9 * SIZE(AO), %xmm1

	mulpd	-10 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm6
	movddup	 -8 * SIZE(AO), %xmm0
	mulpd	-10 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm7
	movddup	 -7 * SIZE(AO), %xmm1

	mulpd	 -8 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm4
	movddup	 -6 * SIZE(AO), %xmm0
	mulpd	 -8 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm5
	movddup	 -5 * SIZE(AO), %xmm1

	mulpd	 -6 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm6
	movddup	 -4 * SIZE(AO), %xmm0
	mulpd	 -6 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm7
	movddup	 -3 * SIZE(AO), %xmm1

	mulpd	 -4 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm4
	movddup	 -2 * SIZE(AO), %xmm0
	mulpd	 -4 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm5
	movddup	 -1 * SIZE(AO), %xmm1

	mulpd	 -2 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm6
	movddup	  0 * SIZE(AO), %xmm0
	mulpd	 -2 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm7
	movddup	  1 * SIZE(AO), %xmm1

	subl   $-16 * SIZE, AO
	subl   $-16 * SIZE, BO
	decl   %eax
	jne    .L111
	ALIGN_4

.L112:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# k & 7 leftover iterations
	BRANCH
	je .L114
	ALIGN_4

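/* .L113: leftover k iterations for the single column.               */
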
.L113:
	mulpd	-16 * SIZE(BO), %xmm0
	ADD1	%xmm0, %xmm4
	movddup	-14 * SIZE(AO), %xmm0
	mulpd	-16 * SIZE(BO), %xmm1
	ADD2	%xmm1, %xmm5
	movddup	-13 * SIZE(AO), %xmm1

	addl	$2 * SIZE, AO
	addl	$2 * SIZE, BO
	decl	%eax
	jg	.L113
	ALIGN_4

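/* .L114: write-back for the single column, with the same alpha
   scaling and conjugation handling as the two-column path.          */
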
.L114:
#ifndef TRMMKERNEL
	movupd	0 * SIZE(CO), %xmm0
#endif

	movddup	ALPHA_R, %xmm2
	movddup	ALPHA_I, %xmm3

	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
	addsubpd %xmm5,  %xmm4
	pshufd	$0x4e, %xmm4, %xmm5
#else
	addsubpd %xmm4,  %xmm5
	movapd	 %xmm5,  %xmm4
	pshufd	$0x4e, %xmm5,  %xmm5
#endif

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm5

	addsubpd %xmm5, %xmm4

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
#endif

	movlpd	%xmm4, 0 * SIZE(CO)
	movhpd	%xmm4, 1 * SIZE(CO)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AO, %eax, 2), AO
	leal	(BO, %eax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, CO		# coffset += 2 * SIZE (one complex element)
	decl	I			# i --
	jg	.L110
	ALIGN_4

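/* .L500: epilogue -- restore callee-saved registers and return.     */
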
.L500:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp

	ret

	EPILOGUE
