/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define M	%rdi
#define N	%rsi
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%r12
#define BO	%r13
#define	CO1	%r14
#define CO2	%r15
#define BB	%rbp

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#else

#define STACKSIZE 256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#endif

#define ALPHA_R	  0(%rsp)
#define ALPHA_I	 16(%rsp)
#define J	 32(%rsp)
#define OFFSET	 40(%rsp)
#define KK	 48(%rsp)
#define KKK	 56(%rsp)
#define BUFFER	128(%rsp)
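
/* ALPHA_R/ALPHA_I, the J/OFFSET/KK/KKK slots and BUFFER all live in the
   local scratch area carved out of the stack in the prologue below;
   BUFFER receives the packed copy of B built by the .L02/.L04 and
   .L42/.L44 loops. */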

#define PREFETCH     prefetcht0
#define PREFETCHSIZE  320

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADDSUB	addps
#else
#define ADDSUB	subps
#endif
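
/*
 * ADDSUB selects how the "imaginary" partial products are accumulated:
 * addps when B is not conjugated, subps when it is.  The KERNELn macros
 * below use the usual SSE3 movsldup/movshdup scheme: BO holds each
 * complex element of B duplicated as {re, im, re, im}, so movsldup
 * broadcasts its real part and movshdup its imaginary part across all
 * four lanes.  A hedged C sketch of one such step for the
 * non-conjugated case (the helper name and the split accumulators are
 * illustrative only, not names used by this file):
 *
 *   #include <pmmintrin.h>   // SSE3: _mm_moveldup_ps / _mm_movehdup_ps
 *
 *   // a    = {a0.re, a0.im, a1.re, a1.im}  (two complex floats of A)
 *   // bdup = {b.re,  b.im,  b.re,  b.im }  (one packed element of B)
 *   static inline void cmadd(__m128 a, __m128 bdup,
 *                            __m128 *acc_r, __m128 *acc_i)
 *   {
 *       __m128 br = _mm_moveldup_ps(bdup);  // {b.re, b.re, b.re, b.re}
 *       __m128 bi = _mm_movehdup_ps(bdup);  // {b.im, b.im, b.im, b.im}
 *       *acc_r = _mm_add_ps(*acc_r, _mm_mul_ps(a, br));  // addps path
 *       *acc_i = _mm_add_ps(*acc_i, _mm_mul_ps(a, bi));  // ADDSUB path
 *   }
 *
 * The two accumulators are only merged into complex products at the
 * very end (shufps $0xb1 + addsubps in the .L18/.L28/... tails below).
 */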

#define KERNEL1(address) \
	mulps	%xmm8, %xmm9; \
	PREFETCH  (PREFETCHSIZE +  0) * SIZE + (address) * 2 * SIZE(AO); \
	addps	%xmm9, %xmm0; \
	movshdup  0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm1; \
	movsldup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm2; \
	movshdup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	  4 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm3; \
	movsldup  0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL2(address) \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm4; \
	movshdup  0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm5; \
	movsldup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm6; \
	movshdup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	  8 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm7; \
	movsldup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL3(address) \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm0; \
	movshdup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm1; \
	movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm2; \
	movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	 12 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm3; \
	movsldup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL4(address) \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm4; \
	movshdup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm5; \
	movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm6; \
	movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	 64 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm7; \
	movsldup 64 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL5(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm0; \
	movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm1; \
	movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm2; \
	movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 20 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm3; \
	movsldup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL6(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm4; \
	movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm5; \
	movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm6; \
	movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 24 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm7; \
	movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL7(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm0; \
	movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm1; \
	movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm2; \
	movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 28 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm3; \
	movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL8(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm4; \
	movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm5; \
	movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm6; \
	movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 80 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm7; \
	movsldup 80 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL9(address) \
	mulps	%xmm12, %xmm13; \
	PREFETCH  (PREFETCHSIZE + 32) * SIZE + (address) * 2 * SIZE(AO); \
	addps	%xmm13, %xmm0; \
	movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm1; \
	movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm2; \
	movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 36 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm3; \
	movsldup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL10(address) \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm4; \
	movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm5; \
	movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm6; \
	movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 40 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm7; \
	movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL11(address) \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm0; \
	movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm1; \
	movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm2; \
	movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 44 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm3; \
	movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL12(address) \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm4; \
	movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm5; \
	movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm6; \
	movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 96 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm7; \
	movsldup 96 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL13(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm0; \
	movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm1; \
	movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm2; \
	movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	 52 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm3; \
	movsldup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL14(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm4; \
	movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm5; \
	movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm6; \
	movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm7; \
	movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL15(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm0; \
	movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm1; \
	movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm2; \
	movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	 60 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm3; \
	movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL16(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm4; \
	movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm5; \
	movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm6; \
	movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	112 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm7; \
	movsldup 112 * SIZE + (address) * 2 * SIZE(BO), %xmm15
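
/* KERNEL1..KERNEL16 cover eight unrolled k-iterations of the 4x2 block:
   each odd/even pair handles one k-iteration (four complex elements of
   A, split two per macro across xmm8/xmm10/xmm12/xmm14) against both
   packed B columns, accumulating into xmm0..xmm3 and xmm4..xmm7. */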

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      M
	movq	ARG2,      N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm4
#endif
	movaps	%xmm3,       %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#else
	movq	72(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	80(%rsp), %xmm4
#endif

#endif

	movq	%rsp, %rbx	# save old stack
	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	pxor	%xmm15, %xmm15
	cmpeqps	%xmm15, %xmm15
	pslld	$31, %xmm15	# Generate mask
	pxor	%xmm2, %xmm2

	shufps	$0, %xmm0, %xmm0
	movaps	 %xmm0,  0 + ALPHA_R

	movss	 %xmm1,  4 + ALPHA_I
	movss	 %xmm1, 12 + ALPHA_I
	xorps	 %xmm15, %xmm1
	movss	 %xmm1,  0 + ALPHA_I
	movss	 %xmm1,  8 + ALPHA_I
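
/* xmm15 now holds a per-lane sign mask (0x80000000), used above to
   negate alternate copies of alpha_i: the scratch slots end up as
   ALPHA_R = {ar, ar, ar, ar} and ALPHA_I = {-ai, ai, -ai, ai}, the two
   vectors the store tails need to form alpha * c lane-wise. */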

#ifdef TRMMKERNEL
	movsd	%xmm4, OFFSET
	movsd	%xmm4, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	salq	$ZBASE_SHIFT, LDC
	movq	N,  J
	sarq	$1, J		# j = (n >> 1)
	jle	.L40
	ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
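/* Each complex element of B is duplicated ({re, im} -> {re, im, re, im})
   so the kernels can broadcast it with movsldup/movshdup.  A hedged C
   sketch of this movddup packing, assuming SIZE == 4 (float) and a
   16-byte aligned destination (pack_b is illustrative, not a name used
   by this file):

     #include <pmmintrin.h>

     static void pack_b(const float *b, float *bo, long n)
     {
         for (long i = 0; i < n; i++) {
             // movddup: load one complex float pair, duplicate it ...
             __m128d d = _mm_loaddup_pd((const double *)(b + 2 * i));
             _mm_store_pd((double *)(bo + 4 * i), d);   // ... and store
         }
     }
*/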
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_4

.L02:
	movddup	  0 * SIZE(B), %xmm0
	movddup	  2 * SIZE(B), %xmm1
	movddup	  4 * SIZE(B), %xmm2
	movddup	  6 * SIZE(B), %xmm3
	movddup	  8 * SIZE(B), %xmm4
	movddup	 10 * SIZE(B), %xmm5
	movddup	 12 * SIZE(B), %xmm6
	movddup	 14 * SIZE(B), %xmm7

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)
	movaps	%xmm4, 16 * SIZE(BO)
	movaps	%xmm5, 20 * SIZE(BO)
	movaps	%xmm6, 24 * SIZE(BO)
	movaps	%xmm7, 28 * SIZE(BO)

	prefetcht1	128 * SIZE(BO)
	prefetcht0	112 * SIZE(B)

	addq	$16 * SIZE, B
	addq	$32 * SIZE, BO
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L10
	ALIGN_4

.L04:
	movddup	 0 * SIZE(B), %xmm0
	movddup	 2 * SIZE(B), %xmm1

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)

	addq	$4 * SIZE, B
	addq	$8 * SIZE, BO
	decq	%rax
	jne	.L04
	ALIGN_4

.L10:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	leaq	112 * SIZE(B), BB

	movq	M,  I
	sarq	$2, I		# i = (m >> 2)
	jle	.L20
	ALIGN_4

.L11:
	prefetcht0	 0 * SIZE(BB)
	subq	   $-8 * SIZE, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	  0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movaps	 16 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movaps	 32 * SIZE(AO), %xmm12
	pxor	%xmm2, %xmm2
	movaps	 48 * SIZE(AO), %xmm14
	pxor	%xmm3, %xmm3

	movsldup  0 * SIZE(BO), %xmm9
	pxor	%xmm4, %xmm4
	movsldup 16 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5
	movsldup 32 * SIZE(BO), %xmm13
	pxor	%xmm6, %xmm6
	movsldup 48 * SIZE(BO), %xmm15
	pxor	%xmm7, %xmm7

	prefetchnta     8 * SIZE(CO1)
	prefetchnta     8 * SIZE(CO2)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
#if 1
	andq	$-8, %rax
	salq	$4, %rax
	je	.L15
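
/* Computed unroll: %rax carries 16 units per k-iteration (128 per
   8-iteration block), so up to eight KERNEL1..16 blocks run before the
   pointers are bumped; the cmpq $128 * n tests bail out early and .L12
   then steps AO/BO past whatever the partial pass consumed. */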

.L1X:
	KERNEL1 (32  *  0)
	KERNEL2 (32  *  0)
	KERNEL3 (32  *  0)
	KERNEL4 (32  *  0)
	KERNEL5 (32  *  0)
	KERNEL6 (32  *  0)
	KERNEL7 (32  *  0)
	KERNEL8 (32  *  0)
	KERNEL9 (32  *  0)
	KERNEL10(32  *  0)
	KERNEL11(32  *  0)
	KERNEL12(32  *  0)
	KERNEL13(32  *  0)
	KERNEL14(32  *  0)
	KERNEL15(32  *  0)
	KERNEL16(32  *  0)
	cmpq	$128 *  1, %rax
	jle	.L12
	KERNEL1 (32  *  1)
	KERNEL2 (32  *  1)
	KERNEL3 (32  *  1)
	KERNEL4 (32  *  1)
	KERNEL5 (32  *  1)
	KERNEL6 (32  *  1)
	KERNEL7 (32  *  1)
	KERNEL8 (32  *  1)
	KERNEL9 (32  *  1)
	KERNEL10(32  *  1)
	KERNEL11(32  *  1)
	KERNEL12(32  *  1)
	KERNEL13(32  *  1)
	KERNEL14(32  *  1)
	KERNEL15(32  *  1)
	KERNEL16(32  *  1)
	cmpq	$128 *  2, %rax
	jle	.L12
	KERNEL1 (32  *  2)
	KERNEL2 (32  *  2)
	KERNEL3 (32  *  2)
	KERNEL4 (32  *  2)
	KERNEL5 (32  *  2)
	KERNEL6 (32  *  2)
	KERNEL7 (32  *  2)
	KERNEL8 (32  *  2)
	KERNEL9 (32  *  2)
	KERNEL10(32  *  2)
	KERNEL11(32  *  2)
	KERNEL12(32  *  2)
	KERNEL13(32  *  2)
	KERNEL14(32  *  2)
	KERNEL15(32  *  2)
	KERNEL16(32  *  2)
	cmpq	$128 *  3, %rax
	jle	.L12
	KERNEL1 (32  *  3)
	KERNEL2 (32  *  3)
	KERNEL3 (32  *  3)
	KERNEL4 (32  *  3)
	KERNEL5 (32  *  3)
	KERNEL6 (32  *  3)
	KERNEL7 (32  *  3)
	KERNEL8 (32  *  3)
	KERNEL9 (32  *  3)
	KERNEL10(32  *  3)
	KERNEL11(32  *  3)
	KERNEL12(32  *  3)
	KERNEL13(32  *  3)
	KERNEL14(32  *  3)
	KERNEL15(32  *  3)
	KERNEL16(32  *  3)
	cmpq	$128 *  4, %rax
	jle	.L12
	KERNEL1 (32  *  4)
	KERNEL2 (32  *  4)
	KERNEL3 (32  *  4)
	KERNEL4 (32  *  4)
	KERNEL5 (32  *  4)
	KERNEL6 (32  *  4)
	KERNEL7 (32  *  4)
	KERNEL8 (32  *  4)
	KERNEL9 (32  *  4)
	KERNEL10(32  *  4)
	KERNEL11(32  *  4)
	KERNEL12(32  *  4)
	KERNEL13(32  *  4)
	KERNEL14(32  *  4)
	KERNEL15(32  *  4)
	KERNEL16(32  *  4)
	cmpq	$128 *  5, %rax
	jle	.L12
	KERNEL1 (32  *  5)
	KERNEL2 (32  *  5)
	KERNEL3 (32  *  5)
	KERNEL4 (32  *  5)
	KERNEL5 (32  *  5)
	KERNEL6 (32  *  5)
	KERNEL7 (32  *  5)
	KERNEL8 (32  *  5)
	KERNEL9 (32  *  5)
	KERNEL10(32  *  5)
	KERNEL11(32  *  5)
	KERNEL12(32  *  5)
	KERNEL13(32  *  5)
	KERNEL14(32  *  5)
	KERNEL15(32  *  5)
	KERNEL16(32  *  5)
	cmpq	$128 *  6, %rax
	jle	.L12
	KERNEL1 (32  *  6)
	KERNEL2 (32  *  6)
	KERNEL3 (32  *  6)
	KERNEL4 (32  *  6)
	KERNEL5 (32  *  6)
	KERNEL6 (32  *  6)
	KERNEL7 (32  *  6)
	KERNEL8 (32  *  6)
	KERNEL9 (32  *  6)
	KERNEL10(32  *  6)
	KERNEL11(32  *  6)
	KERNEL12(32  *  6)
	KERNEL13(32  *  6)
	KERNEL14(32  *  6)
	KERNEL15(32  *  6)
	KERNEL16(32  *  6)
	cmpq	$128 *  7, %rax
	jle	.L12
	KERNEL1 (32  *  7)
	KERNEL2 (32  *  7)
	KERNEL3 (32  *  7)
	KERNEL4 (32  *  7)
	KERNEL5 (32  *  7)
	KERNEL6 (32  *  7)
	KERNEL7 (32  *  7)
	KERNEL8 (32  *  7)
	KERNEL9 (32  *  7)
	KERNEL10(32  *  7)
	KERNEL11(32  *  7)
	KERNEL12(32  *  7)
	KERNEL13(32  *  7)
	KERNEL14(32  *  7)
	KERNEL15(32  *  7)
	KERNEL16(32  *  7)

	addq	$64 * 8  * SIZE, AO
	addq	$64 * 8  * SIZE, BO
	subq	$128 * 8, %rax
	jg	.L1X

.L12:
	leaq	(AO, %rax, 2), AO	# AO += 2 * %rax (bytes read by the partial pass)
	leaq	(BO, %rax, 2), BO	# BO += 2 * %rax (bytes read by the partial pass)
#else
	sarq	$3, %rax
	je	.L15
	ALIGN_4

.L12:
	KERNEL1 (32  *  0)
	KERNEL2 (32  *  0)
	KERNEL3 (32  *  0)
	KERNEL4 (32  *  0)
	KERNEL5 (32  *  0)
	KERNEL6 (32  *  0)
	KERNEL7 (32  *  0)
	KERNEL8 (32  *  0)
	KERNEL9 (32  *  0)
	KERNEL10(32  *  0)
	KERNEL11(32  *  0)
	KERNEL12(32  *  0)
	KERNEL13(32  *  0)
	KERNEL14(32  *  0)
	KERNEL15(32  *  0)
	KERNEL16(32  *  0)

	addq   $64 * SIZE, AO
	addq   $64 * SIZE, BO
	decq   %rax
	jne    .L12
#endif
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L18
	ALIGN_4

.L16:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm4
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm5
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm6
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm7
	movsldup  8 * SIZE(BO), %xmm9

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jg	.L16
	ALIGN_4

.L18:
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7

	addsubps	%xmm1, %xmm0
	addsubps	%xmm3, %xmm2
	addsubps	%xmm5, %xmm4
	addsubps	%xmm7, %xmm6

	movaps	%xmm0, %xmm1
	movaps	%xmm2, %xmm3
	movaps	%xmm4, %xmm5
	movaps	%xmm6, %xmm7

	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2
	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6
#else
	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2
	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6

	addsubps	%xmm0, %xmm1
	addsubps	%xmm2, %xmm3
	addsubps	%xmm4, %xmm5
	addsubps	%xmm6, %xmm7

	movaps	%xmm1, %xmm0
	movaps	%xmm3, %xmm2
	movaps	%xmm5, %xmm4
	movaps	%xmm7, %xmm6

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7
#endif
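
/* In the first branch above (non-conjugated A), xmm1/xmm3/xmm5/xmm7 now
   hold the accumulated complex results in {re, im} order and
   xmm0/xmm2/xmm4/xmm6 the element-swapped {im, re} copies (the other
   branch mirrors this with the roles exchanged); the multiplies below
   then form alpha * c = {ar*re - ai*im, ar*im + ai*re} from
   ALPHA_R = {ar, ar, ar, ar} and ALPHA_I = {-ai, ai, -ai, ai}. */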

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0
	mulps	%xmm14, %xmm3
	mulps	%xmm15, %xmm2

	mulps	%xmm14, %xmm5
	mulps	%xmm15, %xmm4
	mulps	%xmm14, %xmm7
	mulps	%xmm15, %xmm6

	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	shufps	$0xe4, %xmm8,  %xmm8
	shufps	$0xe4, %xmm9,  %xmm9
	shufps	$0xe4, %xmm10, %xmm10
	shufps	$0xe4, %xmm11, %xmm11

	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	4 * SIZE(CO1), %xmm10
	movhps	6 * SIZE(CO1), %xmm10

	movsd	0 * SIZE(CO2), %xmm9
	movhps	2 * SIZE(CO2), %xmm9
	movsd	4 * SIZE(CO2), %xmm11
	movhps	6 * SIZE(CO2), %xmm11

	addps	%xmm8,  %xmm0
	addps	%xmm9,  %xmm2
	addps	%xmm10, %xmm4
	addps	%xmm11, %xmm6
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movsd	%xmm4, 4 * SIZE(CO1)
	movhps	%xmm4, 6 * SIZE(CO1)

	movsd	%xmm2, 0 * SIZE(CO2)
	movhps	%xmm2, 2 * SIZE(CO2)
	movsd	%xmm6, 4 * SIZE(CO2)
	movhps	%xmm6, 6 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$8 * SIZE, CO1		# coffset1 += 8
	addq	$8 * SIZE, CO2		# coffset2 += 8
	decq	I			# i --
	jg	.L11
	ALIGN_4

.L20:
	testq	$2, M
	je	.L30

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	  0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movaps	 16 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1

	movsldup  0 * SIZE(BO), %xmm9
	pxor	%xmm2, %xmm2
	movsldup 16 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3
	movsldup 32 * SIZE(BO), %xmm13
	movsldup 48 * SIZE(BO), %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L25
	ALIGN_4

.L22:
	mulps	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup 64 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movshdup 16 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	ADDSUB	%xmm11, %xmm1
	movsldup 20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movshdup 20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movaps	 12 * SIZE(AO), %xmm8
	ADDSUB	%xmm11, %xmm3
	movsldup 24 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movshdup 24 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	ADDSUB	%xmm11, %xmm1
	movsldup 28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movshdup 28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movaps	 32 * SIZE(AO), %xmm8
	ADDSUB	%xmm11, %xmm3
	movsldup 80 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movshdup 32 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	ADDSUB	%xmm13, %xmm1
	movsldup 36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movshdup 36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movaps	 20 * SIZE(AO), %xmm10
	ADDSUB	%xmm13, %xmm3
	movsldup 40 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movshdup 40 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	ADDSUB	%xmm13, %xmm1
	movsldup 44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movshdup 44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movaps	 24 * SIZE(AO), %xmm10
	ADDSUB	%xmm13, %xmm3
	movsldup 96 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movshdup 48 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	ADDSUB	%xmm15, %xmm1
	movsldup 52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movshdup 52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movaps	 28 * SIZE(AO), %xmm10
	ADDSUB	%xmm15, %xmm3
	movsldup 56 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movshdup 56 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	ADDSUB	%xmm15, %xmm1
	movsldup 60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movshdup 60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movaps	 48 * SIZE(AO), %xmm10
	ADDSUB	%xmm15, %xmm3
	movsldup 112 * SIZE(BO), %xmm15

	addq   $32 * SIZE, AO
	addq   $64 * SIZE, BO

	decq   %rax
	jne    .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L28
	ALIGN_4

.L26:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup  8 * SIZE(BO), %xmm9

	addq	$ 4 * SIZE, AO
	addq	$ 8 * SIZE, BO
	decq	%rax
	jg	.L26
	ALIGN_4

.L28:
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3

	addsubps	%xmm1, %xmm0
	addsubps	%xmm3, %xmm2

	movaps	%xmm0, %xmm1
	movaps	%xmm2, %xmm3

	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2
#else
	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2

	addsubps	%xmm0, %xmm1
	addsubps	%xmm2, %xmm3

	movaps	%xmm1, %xmm0
	movaps	%xmm3, %xmm2

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0
	mulps	%xmm14, %xmm3
	mulps	%xmm15, %xmm2

	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	shufps	$0xe4, %xmm8,  %xmm8
	shufps	$0xe4, %xmm10, %xmm10

	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movhps	2 * SIZE(CO2), %xmm10

	addps	%xmm8,  %xmm0
	addps	%xmm10, %xmm2
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movsd	%xmm2, 0 * SIZE(CO2)
	movhps	%xmm2, 2 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4

.L30:
	testq	$1, M
	je	.L39

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movddup	  0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	  8 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movsd     0 * SIZE(BO), %xmm9
	pxor	%xmm2, %xmm2
	movsd    16 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3
	movsd    32 * SIZE(BO), %xmm13
	movsd    48 * SIZE(BO), %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L35
	ALIGN_4

.L32:
	shufps	$0x50, %xmm9, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd     4 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	  2 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd     8 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd    12 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	  4 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd    64 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movsd    20 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	movddup	  6 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movsd    24 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movsd    28 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	movddup	 16 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movsd    80 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movsd    36 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	movddup	 10 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movsd    40 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movsd    44 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	movddup	 12 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movsd    96 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movsd    52 * SIZE(BO), %xmm15
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	movddup	 14 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movsd    56 * SIZE(BO), %xmm15
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movsd    60 * SIZE(BO), %xmm15
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	movddup	 24 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movsd   112 * SIZE(BO), %xmm15

	addq   $16 * SIZE, AO
	addq   $64 * SIZE, BO

	decq   %rax
	jne    .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L38
	ALIGN_4

.L36:
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd     4 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	  2 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd     8 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jg	.L36
	ALIGN_4

.L38:
	movaps	%xmm0, %xmm6
	movlhps	%xmm1, %xmm0
	movhlps %xmm6, %xmm1

#if  defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
	cmpeqps	%xmm7, %xmm7
	pslld	$31,   %xmm7
	xorps	%xmm7, %xmm1
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0xb1, %xmm1, %xmm1

	addsubps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1

	shufps	$0xb1, %xmm0, %xmm0
#else
	shufps	$0xb1, %xmm0, %xmm0

	addsubps	%xmm0, %xmm1

	movaps	%xmm1, %xmm0

	shufps	$0xb1, %xmm1, %xmm1
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0

	addps	%xmm1, %xmm0

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhps	0 * SIZE(CO2), %xmm8

	addps	%xmm8, %xmm0
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif

	leaq	(C, LDC, 2), C		# c += 2 * ldc
	decq	J			# j --
	jg	.L01
	ALIGN_4

.L40:
	testq	$1, N
	je	.L999
	ALIGN_4

.L41:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$3, %rax
	jle	.L43
	ALIGN_4

.L42:
	movddup	 0 * SIZE(B), %xmm0
	movddup	 2 * SIZE(B), %xmm1
	movddup	 4 * SIZE(B), %xmm2
	movddup	 6 * SIZE(B), %xmm3
	movddup	 8 * SIZE(B), %xmm4
	movddup	10 * SIZE(B), %xmm5
	movddup	12 * SIZE(B), %xmm6
	movddup	14 * SIZE(B), %xmm7

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)
	movaps	%xmm4, 16 * SIZE(BO)
	movaps	%xmm5, 20 * SIZE(BO)
	movaps	%xmm6, 24 * SIZE(BO)
	movaps	%xmm7, 28 * SIZE(BO)

	prefetcht1	128 * SIZE(BO)
	prefetcht0	112 * SIZE(B)

	addq	$16 * SIZE, B
	addq	$32 * SIZE, BO
	decq	%rax
	jne	.L42
	ALIGN_4

.L43:
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L50
	ALIGN_4

.L44:
	movddup	 0 * SIZE(B), %xmm0

	movaps	%xmm0,  0 * SIZE(BO)

	addq	$2 * SIZE, B
	addq	$4 * SIZE, BO
	decq	%rax
	jne	.L44
	ALIGN_4

.L50:
	movq	C, CO1			# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$2, I		# i = (m >> 2)
	jle	.L60
	ALIGN_4

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

	movaps	  0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movaps	 16 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movaps	 32 * SIZE(AO), %xmm12
	pxor	%xmm4, %xmm4
	movaps	 48 * SIZE(AO), %xmm14
	pxor	%xmm5, %xmm5

	movsldup  0 * SIZE(BO), %xmm9
	movsldup 16 * SIZE(BO), %xmm11

	prefetchnta     4 * SIZE(CO1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L55
	ALIGN_4

.L52:
	mulps	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm4
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm5
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 12 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm4
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 64 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm5
	movsldup  8 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm0
	movshdup  8 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	movaps	 20 * SIZE(AO), %xmm10
	ADDSUB	%xmm9, %xmm1
	movsldup  8 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm4
	movshdup  8 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	movaps	 24 * SIZE(AO), %xmm10
	ADDSUB	%xmm9, %xmm5
	movsldup 12 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm0
	movshdup 12 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	movaps	 28 * SIZE(AO), %xmm10
	ADDSUB	%xmm9, %xmm1
	movsldup 12 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm4
	movshdup 12 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	movaps	 80 * SIZE(AO), %xmm10
	ADDSUB	%xmm9, %xmm5
	movsldup 32 * SIZE(BO), %xmm9
	mulps	%xmm12, %xmm11
	PREFETCH  (PREFETCHSIZE + 32) * SIZE(AO)
	addps	%xmm11, %xmm0
	movshdup 16 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	movaps	 36 * SIZE(AO), %xmm12
	ADDSUB	%xmm11, %xmm1
	movsldup 16 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	addps	%xmm11, %xmm4
	movshdup 16 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	movaps	 40 * SIZE(AO), %xmm12
	ADDSUB	%xmm11, %xmm5
	movsldup 20 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	addps	%xmm11, %xmm0
	movshdup 20 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	movaps	 44 * SIZE(AO), %xmm12
	ADDSUB	%xmm11, %xmm1
	movsldup 20 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	addps	%xmm11, %xmm4
	movshdup 20 * SIZE(BO), %xmm11
	mulps	%xmm12, %xmm11
	movaps	 96 * SIZE(AO), %xmm12
	ADDSUB	%xmm11, %xmm5
	movsldup 24 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	addps	%xmm11, %xmm0
	movshdup 24 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	movaps	 52 * SIZE(AO), %xmm14
	ADDSUB	%xmm11, %xmm1
	movsldup 24 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	addps	%xmm11, %xmm4
	movshdup 24 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	movaps	 56 * SIZE(AO), %xmm14
	ADDSUB	%xmm11, %xmm5
	movsldup 28 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	addps	%xmm11, %xmm0
	movshdup 28 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	movaps	 60 * SIZE(AO), %xmm14
	ADDSUB	%xmm11, %xmm1
	movsldup 28 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	addps	%xmm11, %xmm4
	movshdup 28 * SIZE(BO), %xmm11
	mulps	%xmm14, %xmm11
	movaps	112 * SIZE(AO), %xmm14
	ADDSUB	%xmm11, %xmm5
	movsldup 48 * SIZE(BO), %xmm11

	addq   $64 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L58
	ALIGN_4

.L56:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm4
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm5
	movsldup  4 * SIZE(BO), %xmm9

	addq	$ 8 * SIZE, AO
	addq	$ 4 * SIZE, BO
	decq	%rax
	jg	.L56
	ALIGN_4

.L58:
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm5, %xmm5

	addsubps	%xmm1, %xmm0
	addsubps	%xmm5, %xmm4

	movaps	%xmm0, %xmm1
	movaps	%xmm4, %xmm5

	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm4, %xmm4
#else
	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm4, %xmm4

	addsubps	%xmm0, %xmm1
	addsubps	%xmm4, %xmm5

	movaps	%xmm1, %xmm0
	movaps	%xmm5, %xmm4

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm5, %xmm5
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0
	mulps	%xmm14, %xmm5
	mulps	%xmm15, %xmm4

	addps	%xmm1, %xmm0
	addps	%xmm5, %xmm4

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	4 * SIZE(CO1), %xmm9
	movhps	6 * SIZE(CO1), %xmm9

	addps	%xmm8, %xmm0
	addps	%xmm9, %xmm4
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movsd	%xmm4, 4 * SIZE(CO1)
	movhps	%xmm4, 6 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	decq	I			# i --
	jg	.L51
	ALIGN_4

.L60:
	testq	$2, M
	je	.L70

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

	movaps	  0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movsldup  0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movaps	 16 * SIZE(AO), %xmm10
	movsldup 16 * SIZE(BO), %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L65
	ALIGN_4

.L62:
	mulps	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 12 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 32 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup 32 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm0
	movshdup 16 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	movaps	 20 * SIZE(AO), %xmm10
	ADDSUB	%xmm11, %xmm1
	movsldup 20 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm0
	movshdup 20 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	movaps	 24 * SIZE(AO), %xmm10
	ADDSUB	%xmm11, %xmm1
	movsldup 24 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm0
	movshdup 24 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	movaps	 28 * SIZE(AO), %xmm10
	ADDSUB	%xmm11, %xmm1
	movsldup 28 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm0
	movshdup 28 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	movaps	 48 * SIZE(AO), %xmm10
	ADDSUB	%xmm11, %xmm1
	movsldup 48 * SIZE(BO), %xmm11

	addq   $32 * SIZE, AO
	addq   $32 * SIZE, BO

	decq   %rax
	jne    .L62
	ALIGN_4

.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	  4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L66
	ALIGN_4

.L68:
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0xb1, %xmm1, %xmm1
	addsubps	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	shufps	$0xb1, %xmm0, %xmm0
#else
	shufps	$0xb1, %xmm0, %xmm0
	addsubps	%xmm0, %xmm1
	movaps	%xmm1, %xmm0
	shufps	$0xb1, %xmm1, %xmm1
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0
	addps	%xmm1, %xmm0

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8

	addps	%xmm8, %xmm0
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,  8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	addq	$4 * SIZE, CO1		# coffset += 4
	ALIGN_4

.L70:
	testq	$1, M
	je	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

	movddup	  0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movsd     0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	  8 * SIZE(AO), %xmm10
	movsd    16 * SIZE(BO), %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L75
	ALIGN_4

.L72:
	shufps	$0x50, %xmm9, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulps	%xmm8, %xmm9
	movddup	  2 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm0
	movsd     4 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	  4 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd     8 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	  6 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm0
	movsd    12 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	 16 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd    32 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm10, %xmm11
	movddup	 10 * SIZE(AO), %xmm10
	addps	%xmm11, %xmm0
	movsd    20 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm10, %xmm11
	movddup	 12 * SIZE(AO), %xmm10
	addps	%xmm11, %xmm1
	movsd    24 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm10, %xmm11
	movddup	 14 * SIZE(AO), %xmm10
	addps	%xmm11, %xmm0
	movsd    28 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm10, %xmm11
	movddup	 24 * SIZE(AO), %xmm10
	addps	%xmm11, %xmm1
	movsd    48 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L72
	ALIGN_4

.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 7)
	BRANCH
	je .L78
	ALIGN_4

.L76:
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	  2 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm0
	movsd     4 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO
	addq	$4 * SIZE, BO
	decq	%rax
	jg	.L76
	ALIGN_4

.L78:
	addps	%xmm1, %xmm0

	movhlps	%xmm0, %xmm1

#if  defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
	cmpeqps	%xmm7, %xmm7
	pslld	$31,   %xmm7
	xorps	%xmm7, %xmm1
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0xb1, %xmm1, %xmm1

	addsubps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1

	shufps	$0xb1, %xmm0, %xmm0
#else
	shufps	$0xb1, %xmm0, %xmm0

	addsubps	%xmm0, %xmm1

	movaps	%xmm1, %xmm0

	shufps	$0xb1, %xmm1, %xmm1
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0

	addps	%xmm1, %xmm0

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	addps	%xmm8, %xmm0
#endif
	movsd	%xmm0, 0 * SIZE(CO1)
	ALIGN_4

.L999:
	movq	%rbx, %rsp

	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
