1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* Register map.  M and N are copied into callee-saved registers early
   because %rdi/%rsi (the SysV argument registers) are reused below as
   the AO/BO panel pointers inside the kernels. */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

/* SysV AMD64: the 7th/8th arguments arrive on the caller's stack,
   addressed above our STACKSIZE frame. */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

/* Windows x64: stack arguments start after the 32-byte shadow space
   (offset 40 = shadow space + return address). */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/* Locals carved out of the 4 KB-aligned scratch frame set up below;
   BUFFER holds the packed/duplicated copy of the B panel. */
#define ALPHA	  0(%rsp)
#define J	 16(%rsp)
#define OFFSET	 24(%rsp)
#define KK	 32(%rsp)
#define KKK	 40(%rsp)
#define BUFFER	256(%rsp)

#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)
/* On OPTERON the ps/lps forms are used as drop-in aliases; they move
   exactly the same bytes as the pd forms. */
#define movsd	movlps
#define movapd	movaps
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 13 + 4)
#define movapd	movaps
#endif
83
#ifndef GENERIC
/*
 * KERNEL1..KERNEL8 together form one 8-deep software-pipelined slice of
 * the 4x4 DGEMM inner loop: %xmm8..%xmm15 hold the 4x4 accumulator
 * block, A values rotate through %xmm0/%xmm2/%xmm4/%xmm6 and B values
 * through %xmm1/%xmm3/%xmm5/%xmm7.  %rax is a negative, SIZE-scaled
 * counter so (AO,%rax,4) / (BO,%rax,8) walk the packed panels.
 *
 * Consistency fix: the first B reload here used `movaps` while every
 * sibling macro (and the GENERIC twin of this macro) uses `movapd`.
 * Both move the same 16 aligned bytes, so behavior is unchanged; using
 * `movapd` also lets the OPTERON `#define movapd movaps` remapping
 * apply uniformly.
 */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
99
/* KERNEL2..KERNEL8 (non-GENERIC): the remaining stages of the pipelined
   4x4 update.  Each macro multiplies one A register pair against four
   duplicated B column values, accumulating into %xmm8..%xmm15, while
   preloading the operands needed by later stages.  Offsets advance so
   that the eight macros together consume 8 k-iterations. */
#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

/* KERNEL4 also issues the second A-panel prefetch of the group. */
#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
#else

/* GENERIC variants of KERNEL1..KERNEL8: identical arithmetic and the
   same register rotation, but with plain (AO)/(BO) addressing — the
   loop that uses these advances AO/BO explicitly instead of indexing
   with %rax (see the GENERIC branch around .L12 below). */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#endif
315
	PROLOGUE
	PROFCODE

	/* Spill all callee-saved integer registers into our frame. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Windows x64 also treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Re-map Windows argument registers onto the SysV-style names. */
	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	movaps	%xmm3, %xmm0	/* alpha (4th arg) arrives in xmm3 here */

#else
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif

#endif

	EMMS			/* the packing loops below use %mm0-%mm7 */

	movq	%rsp, %rbx	# save old stack
	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

	/* Bias A by +16*SIZE so kernel offsets can start at -16*SIZE. */
	subq	$-16 * SIZE, A

	unpcklpd %xmm0, %xmm0	/* broadcast alpha to both lanes */
	movapd	 %xmm0, ALPHA

	leaq	(, LDC, SIZE), LDC	/* LDC: elements -> bytes */

#ifdef TRMMKERNEL
	movsd	%xmm12, OFFSET
	movsd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif
	movq	N,  J
	sarq	$2, J		# j = (n >> 2)
	jle	.L40
	ALIGN_3

.L01:
/* Copying to Sub Buffer */
	leaq	16 * SIZE + BUFFER, BO
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_3
406
407
#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)

/* .L02: pack 4 k-iterations (16 B values) per pass into BUFFER,
   duplicating every value into two adjacent slots so the kernels can
   multiply with full-width mulpd against an A pair. */
.L02:
	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	%mm0,  -16 * SIZE(BO)
	movq	%mm0,  -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1,  -14 * SIZE(BO)
	movq	%mm1,  -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2,  -12 * SIZE(BO)
	movq	%mm2,  -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3,  -10 * SIZE(BO)
	movq	%mm3,   -9 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  0)  * SIZE(BO)

	movq	 4 * SIZE(B), %mm4
	movq	%mm4,   -8 * SIZE(BO)
	movq	%mm4,   -7 * SIZE(BO)
	movq	 5 * SIZE(B), %mm5
	movq	%mm5,   -6 * SIZE(BO)
	movq	%mm5,   -5 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  8)  * SIZE(BO)

	movq	 6 * SIZE(B), %mm6
	movq	%mm6,   -4 * SIZE(BO)
	movq	%mm6,   -3 * SIZE(BO)
	movq	 7 * SIZE(B), %mm7
	movq	%mm7,   -2 * SIZE(BO)
	movq	%mm7,   -1 * SIZE(BO)

	PREFETCH	 (RPREFETCHSIZE +  8)  * SIZE(B)

	movq	 8 * SIZE(B), %mm0
	movq	%mm0,   0 * SIZE(BO)
	movq	%mm0,   1 * SIZE(BO)
	movq	 9 * SIZE(B), %mm1
	movq	%mm1,   2 * SIZE(BO)
	movq	%mm1,   3 * SIZE(BO)

	movq	10 * SIZE(B), %mm2
	movq	%mm2,   4 * SIZE(BO)
	movq	%mm2,   5 * SIZE(BO)
	movq	11 * SIZE(B), %mm3
	movq	%mm3,   6 * SIZE(BO)
	movq	%mm3,   7 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 16)  * SIZE(BO)

	movq	12 * SIZE(B), %mm4
	movq	%mm4,   8 * SIZE(BO)
	movq	%mm4,   9 * SIZE(BO)
	movq	13 * SIZE(B), %mm5
	movq	%mm5,  10 * SIZE(BO)
	movq	%mm5,  11 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 24)  * SIZE(BO)

	movq	14 * SIZE(B), %mm6
	movq	%mm6,  12 * SIZE(BO)
	movq	%mm6,  13 * SIZE(BO)
	movq	15 * SIZE(B), %mm7
	movq	%mm7,  14 * SIZE(BO)
	movq	%mm7,  15 * SIZE(BO)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, B

	subq	$1, %rax
	jne	.L02
	ALIGN_3

/* .L03/.L04: pack the remaining (K & 3) iterations, one at a time. */
.L03:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L10
	ALIGN_3

.L04:
	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	addq	$4 * SIZE, B
	addq	$8 * SIZE, BO
	subq	$1, %rax
	jne	.L04
	ALIGN_3
514
.L10:
	movq	A, AO		# aoffset = a

	/* BB prefetches ahead into the *next* (unpacked) B panel. */
	leaq	 (RPREFETCHSIZE +  0)  * SIZE(B), BB

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L20
	ALIGN_3

/* .L11: head of the 4x4 micro-kernel for one block of 4 columns of C. */
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	/* TRMM: skip the first KK iterations of both panels. */
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* Preload the pipeline and clear the 4x4 accumulator block. */
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm8, %xmm8
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	pxor	%xmm9, %xmm9
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-12 * SIZE(BO), %xmm5
	pxor	%xmm10, %xmm10
	movapd	-10 * SIZE(AO), %xmm6
	movapd	 -8 * SIZE(BO), %xmm7
	pxor	%xmm11, %xmm11

	/* Warm the four C rows we are about to write. */
	PREFETCHW      3 * SIZE(CO1)
	pxor	%xmm12, %xmm12
	PREFETCHW      7 * SIZE(CO2)
	pxor	%xmm13, %xmm13
	PREFETCHW      3 * SIZE(CO1, LDC, 2)
	pxor	%xmm14, %xmm14
	PREFETCHW      7 * SIZE(CO2, LDC, 2)
	pxor	%xmm15, %xmm15

	PREFETCH	 0  * SIZE(BB)

	/* Effective trip count: K for GEMM, K-KK or KK+4 for TRMM. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax	/* MR == 4 */
#else
	addq	$4, %rax	/* NR == 4 */
#endif
	movq	%rax, KKK
#endif

#ifndef GENERIC
	/* Run multiples of 8 through the unrolled loop; %rax becomes a
	   negative byte-scaled index counting up toward zero. */
	andq	$-8, %rax

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_3
589
/* .L12: fully unrolled main loop — each KERNEL1..KERNEL8 group covers
   8 k-iterations; eight groups (64 iterations' worth of code) sit
   between back-edges, with an exit test after every group so any
   multiple of 8 terminates without over-running the panels. */
.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	BRANCH
	jl	.L12
	ALIGN_3
767
/* .L15: handle the (k & 4) leftover group after the unrolled loop. */
.L15:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	testq	$4, %rax
	je .L16
	xorq	%rax, %rax	/* zero index: panels already point at the tail */
	ALIGN_3

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$32 * SIZE, BO
	addq	$16 * SIZE, AO
	ALIGN_3

#else
	/* GENERIC: simple counted loop, 8 k-iterations per pass,
	   advancing AO/BO explicitly. */
	sarq	$2, %rax
	NOBRANCH
	jle	.L16
	ALIGN_3

.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L12
#endif

/* .L16/.L17: scalar-pipelined loop for the final (k & 3) iterations. */
.L16:
	movapd	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	je .L19

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	ALIGN_3

.L17:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm0
	addpd	%xmm1, %xmm10
	movapd	-16 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm12
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm13
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm2
	addpd	%xmm1, %xmm14
	movapd	 -8 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm2, %xmm15
	movapd	-10 * SIZE(AO, %rax, 4), %xmm2

	addq	$SIZE, %rax
	jl	.L17
	ALIGN_3
861
/* .L19: scale the 4x4 accumulator block by alpha and write back to C
   (adding the existing C values unless this is a TRMM kernel). */
.L19:
	PREFETCH	 8  * SIZE(BB)
	subq		 $-12 * SIZE, BB

#ifndef TRMMKERNEL
	/* Load current C for columns 0 and 1 (unaligned, element-wise). */
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1

	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
	movsd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	/* xmm7 still holds broadcast alpha (loaded at .L16). */
	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm7, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm7, %xmm14
	mulpd	%xmm7, %xmm15

#ifndef TRMMKERNEL
	/* Load current C for columns 2 and 3 (xmm7 is free now). */
	movlpd	0 * SIZE(CO1, LDC, 2), %xmm4
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
	movlpd	2 * SIZE(CO1, LDC, 2), %xmm5
	movhpd	3 * SIZE(CO1, LDC, 2), %xmm5

	movlpd	0 * SIZE(CO2, LDC, 2), %xmm6
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
	movlpd	2 * SIZE(CO2, LDC, 2), %xmm7
	movhpd	3 * SIZE(CO2, LDC, 2), %xmm7

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm12
	addpd	%xmm2, %xmm9
	addpd	%xmm3, %xmm13
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

	movlpd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movlpd	%xmm13, 2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)

#ifndef TRMMKERNEL
	addpd	%xmm4, %xmm10
	addpd	%xmm5, %xmm14
	addpd	%xmm6, %xmm11
	addpd	%xmm7, %xmm15
#endif

	movlpd	%xmm10, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
	movlpd	%xmm14, 2 * SIZE(CO1, LDC, 2)
	movhpd	%xmm14, 3 * SIZE(CO1, LDC, 2)

	movlpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)
	movlpd	%xmm15, 2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15, 3 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance the panel pointers past the untouched iterations. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	BRANCH
	jg	.L11
	ALIGN_3
951
/* .L20: dispatch the M-remainder — first the 2-row case, then 1-row. */
.L20:
	testq	$3, M
	je	.L39

	testq	$2, M
	je	.L30
	ALIGN_3

/* .L21: 2x4 micro-kernel setup (two rows of A against four columns). */
.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip the first KK iterations (A advances by 2/iter). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* Preload and clear the 2x4 accumulators (xmm8..xmm11). */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	16 * SIZE(BO), %xmm5
	movapd	24 * SIZE(BO), %xmm7

	/* Effective trip count, as in the 4x4 path (MR == 2 here). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L25
	ALIGN_3
1004
/* .L22: 2x4 inner loop, unrolled 8x over k; B registers xmm1/3/5/7 are
   rotated so each is reloaded well before its next use. */
.L22:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	movapd	 2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm8
	movapd	18 * SIZE(BO), %xmm5
	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm9
	movapd	20 * SIZE(BO), %xmm5
	mulpd	%xmm0, %xmm5
	mulpd	22 * SIZE(BO), %xmm0
	addpd	%xmm5, %xmm10
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm7
	addpd	%xmm7, %xmm8
	movapd	26 * SIZE(BO), %xmm7
	mulpd	%xmm0, %xmm7
	addpd	%xmm7, %xmm9
	movapd	28 * SIZE(BO), %xmm7
	mulpd	%xmm0, %xmm7
	mulpd	30 * SIZE(BO), %xmm0
	addpd	%xmm7, %xmm10
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	34 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	36 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	38 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	64 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	-6 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	42 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	44 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	46 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	72 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	-4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm5
	addpd	%xmm5, %xmm8
	movapd	50 * SIZE(BO), %xmm5
	mulpd	%xmm2, %xmm5
	addpd	%xmm5, %xmm9
	movapd	52 * SIZE(BO), %xmm5
	mulpd	%xmm2, %xmm5
	mulpd	54 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm10
	movapd	80 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm11
	movapd	-2 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	addpd	%xmm7, %xmm8
	movapd	58 * SIZE(BO), %xmm7
	mulpd	%xmm2, %xmm7
	addpd	%xmm7, %xmm9
	movapd	60 * SIZE(BO), %xmm7
	mulpd	%xmm2, %xmm7
	mulpd	62 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm10
	movapd	88 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm11
	movapd	 8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO
	addq   $64 * SIZE, BO
	decq   %rax
	jne    .L22
	ALIGN_3
1117
/* .L25/.L26: 2x4 tail — one k-iteration per pass for (k & 7). */
.L25:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L29
	ALIGN_3

.L26:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  8 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L26
	ALIGN_3
1149
/* .L29: scale the 2x4 result by alpha and write two elements to each
   of the four C columns (accumulating into C unless TRMM). */
.L29:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2

	movlpd	0 * SIZE(CO1, LDC, 2), %xmm4
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
	movlpd	0 * SIZE(CO2, LDC, 2), %xmm6
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
#endif
	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm10
	mulpd	%xmm7, %xmm11

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm4, %xmm10
	addpd	%xmm6, %xmm11
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movlpd	%xmm10, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
	movlpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance panel pointers past the remaining iterations. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 4
	addq	$2 * SIZE, CO2		# coffset += 4
	ALIGN_3
1199
/* .L30: last M-remainder — the single-row (1x4, scalar) case. */
.L30:
	testq	$1, M
	je	.L39
	ALIGN_3

/* .L31: 1x4 micro-kernel setup; works in scalar doubles (mulsd/addsd)
   so it reads only one of each duplicated pair in BUFFER. */
.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip the first KK iterations (A advances by 1/iter). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* Preload and clear the 1x4 accumulators (xmm8..xmm11). */
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movsd	16 * SIZE(BO), %xmm5
	movsd	24 * SIZE(BO), %xmm7

	/* Effective trip count, as in the other paths (MR == 1 here). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L35
	ALIGN_3
1249
1250.L32:
1251	mulsd	%xmm0, %xmm1
1252	addsd	%xmm1, %xmm8
1253	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
1254	movsd	 2 * SIZE(BO), %xmm1
1255	mulsd	%xmm0, %xmm1
1256	addsd	%xmm1, %xmm9
1257	movsd	 4 * SIZE(BO), %xmm1
1258	mulsd	%xmm0, %xmm1
1259	mulsd	 6 * SIZE(BO), %xmm0
1260	addsd	%xmm1, %xmm10
1261	movsd	32 * SIZE(BO), %xmm1
1262	addsd	%xmm0, %xmm11
1263	movsd	-15 * SIZE(AO), %xmm0
1264
1265	mulsd	%xmm0, %xmm3
1266	addsd	%xmm3, %xmm8
1267	movsd	10 * SIZE(BO), %xmm3
1268	mulsd	%xmm0, %xmm3
1269	addsd	%xmm3, %xmm9
1270	movsd	12 * SIZE(BO), %xmm3
1271	mulsd	%xmm0, %xmm3
1272	mulsd	14 * SIZE(BO), %xmm0
1273	addsd	%xmm3, %xmm10
1274	movsd	40 * SIZE(BO), %xmm3
1275	addsd	%xmm0, %xmm11
1276	movsd	-14 * SIZE(AO), %xmm0
1277
1278	mulsd	%xmm0, %xmm5
1279	addsd	%xmm5, %xmm8
1280	movsd	18 * SIZE(BO), %xmm5
1281	mulsd	%xmm0, %xmm5
1282	addsd	%xmm5, %xmm9
1283	movsd	20 * SIZE(BO), %xmm5
1284	mulsd	%xmm0, %xmm5
1285	mulsd	22 * SIZE(BO), %xmm0
1286	addsd	%xmm5, %xmm10
1287	movsd	48 * SIZE(BO), %xmm5
1288	addsd	%xmm0, %xmm11
1289	movsd	-13 * SIZE(AO), %xmm0
1290
1291	mulsd	%xmm0, %xmm7
1292	addsd	%xmm7, %xmm8
1293	movsd	26 * SIZE(BO), %xmm7
1294	mulsd	%xmm0, %xmm7
1295	addsd	%xmm7, %xmm9
1296	movsd	28 * SIZE(BO), %xmm7
1297	mulsd	%xmm0, %xmm7
1298	mulsd	30 * SIZE(BO), %xmm0
1299	addsd	%xmm7, %xmm10
1300	movsd	56 * SIZE(BO), %xmm7
1301	addsd	%xmm0, %xmm11
1302	movsd	-12 * SIZE(AO), %xmm0
1303
1304	mulsd	%xmm0, %xmm1
1305	addsd	%xmm1, %xmm8
1306	movsd	34 * SIZE(BO), %xmm1
1307	mulsd	%xmm0, %xmm1
1308	addsd	%xmm1, %xmm9
1309	movsd	36 * SIZE(BO), %xmm1
1310	mulsd	%xmm0, %xmm1
1311	mulsd	38 * SIZE(BO), %xmm0
1312	addsd	%xmm1, %xmm10
1313	movsd	64 * SIZE(BO), %xmm1
1314	addsd	%xmm0, %xmm11
1315	movsd	-11 * SIZE(AO), %xmm0
1316
1317	mulsd	%xmm0, %xmm3
1318	addsd	%xmm3, %xmm8
1319	movsd	42 * SIZE(BO), %xmm3
1320	mulsd	%xmm0, %xmm3
1321	addsd	%xmm3, %xmm9
1322	movsd	44 * SIZE(BO), %xmm3
1323	mulsd	%xmm0, %xmm3
1324	mulsd	46 * SIZE(BO), %xmm0
1325	addsd	%xmm3, %xmm10
1326	movsd	72 * SIZE(BO), %xmm3
1327	addsd	%xmm0, %xmm11
1328	movsd	-10 * SIZE(AO), %xmm0
1329
1330	mulsd	%xmm0, %xmm5
1331	addsd	%xmm5, %xmm8
1332	movsd	50 * SIZE(BO), %xmm5
1333	mulsd	%xmm0, %xmm5
1334	addsd	%xmm5, %xmm9
1335	movsd	52 * SIZE(BO), %xmm5
1336	mulsd	%xmm0, %xmm5
1337	mulsd	54 * SIZE(BO), %xmm0
1338	addsd	%xmm5, %xmm10
1339	movsd	80 * SIZE(BO), %xmm5
1340	addsd	%xmm0, %xmm11
1341	movsd	-9 * SIZE(AO), %xmm0
1342
1343	mulsd	%xmm0, %xmm7
1344	addsd	%xmm7, %xmm8
1345	movsd	58 * SIZE(BO), %xmm7
1346	mulsd	%xmm0, %xmm7
1347	addsd	%xmm7, %xmm9
1348	movsd	60 * SIZE(BO), %xmm7
1349	mulsd	%xmm0, %xmm7
1350	mulsd	62 * SIZE(BO), %xmm0
1351	addsd	%xmm7, %xmm10
1352	movsd	88 * SIZE(BO), %xmm7
1353	addsd	%xmm0, %xmm11
1354	movsd	-8 * SIZE(AO), %xmm0
1355
1356	addq   $ 8 * SIZE, AO
1357	addq   $64 * SIZE, BO
1358	decq   %rax
1359	jne    .L32
1360	ALIGN_3
1361
1362.L35:
1363#ifndef TRMMKERNEL
1364	movq	K, %rax
1365#else
1366	movq	KKK, %rax
1367#endif
1368	movsd	ALPHA, %xmm7
1369	andq	$7, %rax		# if (k & 1)
1370	BRANCH
1371	je .L38
1372	ALIGN_3
1373
1374.L36:
1375	mulsd	%xmm0, %xmm1
1376	addsd	%xmm1, %xmm8
1377	movsd	 2 * SIZE(BO), %xmm1
1378	mulsd	%xmm0, %xmm1
1379	addsd	%xmm1, %xmm9
1380	movsd	 4 * SIZE(BO), %xmm1
1381	mulsd	%xmm0, %xmm1
1382	mulsd	 6 * SIZE(BO), %xmm0
1383	addsd	%xmm1, %xmm10
1384	movsd	 8 * SIZE(BO), %xmm1
1385	addsd	%xmm0, %xmm11
1386	movsd	-15 * SIZE(AO), %xmm0
1387
1388	addq	$1 * SIZE, AO		# aoffset  += 4
1389	addq	$8 * SIZE, BO		# boffset1 += 8
1390	decq	%rax
1391	jg	.L36
1392	ALIGN_3
1393
1394.L38:
1395#ifndef TRMMKERNEL
1396	movsd	0 * SIZE(CO1), %xmm0
1397	movsd	0 * SIZE(CO2), %xmm2
1398	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
1399	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
1400#endif
1401
1402	mulsd	%xmm7, %xmm8
1403	mulsd	%xmm7, %xmm9
1404	mulsd	%xmm7, %xmm10
1405	mulsd	%xmm7, %xmm11
1406
1407#ifndef TRMMKERNEL
1408	addsd	%xmm0,  %xmm8
1409	addsd	%xmm2, %xmm9
1410	addsd	%xmm4, %xmm10
1411	addsd	%xmm6, %xmm11
1412#endif
1413
1414	movsd	%xmm8, 0 * SIZE(CO1)
1415	movsd	%xmm9, 0 * SIZE(CO2)
1416	movsd	%xmm10, 0 * SIZE(CO1, LDC, 2)
1417	movsd	%xmm11, 0 * SIZE(CO2, LDC, 2)
1418
1419#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1420    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1421	movq	K, %rax
1422	subq	KKK, %rax
1423	leaq	(,%rax, SIZE), %rax
1424	leaq	(AO, %rax, 1), AO
1425	leaq	(BO, %rax, 8), BO
1426#endif
1427
1428#if defined(TRMMKERNEL) && defined(LEFT)
1429	addq	$1, KK
1430#endif
1431	ALIGN_3
1432
/* Close one N=4 panel: for TRMM with the diagonal on the right, advance
   KK past the 4 columns of B just processed.  Use a 64-bit add: KK is
   written and read with movq/addq everywhere else in this file, so a
   32-bit addl would update only the low half of the slot. */
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$4, KK
#endif

	leaq	(C, LDC, 4), C		# c += 4 * ldc
	decq	J			# j --
	jg	.L01			# next group of 4 columns
	ALIGN_3
1442
/* ---- Handle the N%4 remainder: first the N&2 pair of columns. ---- */
.L40:
	testq	$3, N
	je	.L999			# N multiple of 4 -> done

	testq	$2, N
	je	.L80			# no pair of columns -> try single column
	ALIGN_4

.L41:
/* Copying to Sub Buffer: each B value is stored twice in BUFFER so the
   packed-pair (movapd) kernels below can multiply it against two rows. */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK		# reset KK for this panel
#endif

	movq	K, %rax
	sarq	$2, %rax		# 4 k-steps (8 doubles) per iteration
	jle	.L43
	ALIGN_3

.L42:
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

/* Store each value twice (duplicated for the packed kernels). */
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L42
	ALIGN_3

/* Copy the K & 3 leftover k-steps (2 doubles each). */
.L43:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L50
	ALIGN_3

.L44:
	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)
	movq	%mm1,  2 * SIZE(BO)
	movq	%mm1,  3 * SIZE(BO)

	addq	$2 * SIZE, B
	addq	$4 * SIZE, BO
	decq	%rax
	jne	.L44
	ALIGN_3

/* Set up the C column pointers and restart A for this 2-column panel. */
.L50:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L60
	ALIGN_3
1532
/* ---- 4x2 micro-kernel: 4 rows of A times the 2 buffered columns. ----
   xmm8/xmm12 accumulate column 1 (rows 0-1 / 2-3), xmm9/xmm13 column 2. */
.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
/* TRMM: skip the first KK k-steps (4 doubles per k in A and BUFFER). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13

	movapd	 0 * SIZE(AO), %xmm4
	movapd	16 * SIZE(BO), %xmm5
	movapd	 8 * SIZE(AO), %xmm6
	movapd	24 * SIZE(BO), %xmm7

	PREFETCHW      4 * SIZE(CO1)	# C lines will be written below
	PREFETCHW      4 * SIZE(CO2)

/* Trip count: K for GEMM; for TRMM, K-KK or KK(+4|+2) per side. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop handles 8 k-steps at a time
	je	.L55
	ALIGN_3

/* Main loop: 8 k-iterations unrolled, packed (2-wide) multiplies. */
.L52:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	16 * SIZE(AO), %xmm0

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	-4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	24 * SIZE(AO), %xmm2

	PREFETCH	(PREFETCHSIZE     + 16) * SIZE(AO)
	mulpd	%xmm4, %xmm5
	mulpd	18 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm8
	movapd	16 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm9
	movapd	 2 * SIZE(AO), %xmm4
	mulpd	%xmm4, %xmm5
	mulpd	18 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm12
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm13
	movapd	 4 * SIZE(AO), %xmm4

	mulpd	%xmm4, %xmm5
	mulpd	22 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm9
	movapd	 6 * SIZE(AO), %xmm4
	mulpd	%xmm4, %xmm5
	mulpd	22 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm12
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm13
	movapd	32 * SIZE(AO), %xmm4

	PREFETCH	(PREFETCHSIZE     + 24) * SIZE(AO)
	mulpd	%xmm6, %xmm7
	mulpd	26 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm8
	movapd	24 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	10 * SIZE(AO), %xmm6
	mulpd	%xmm6, %xmm7
	mulpd	26 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm12
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm13
	movapd	12 * SIZE(AO), %xmm6

	mulpd	%xmm6, %xmm7
	mulpd	30 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	14 * SIZE(AO), %xmm6
	mulpd	%xmm6, %xmm7
	mulpd	30 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm12
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm13
	movapd	40 * SIZE(AO), %xmm6

	addq   $32 * SIZE, AO		# 8 k-steps x 4 rows
	addq   $32 * SIZE, BO		# 8 k-steps x 4 buffered doubles
	decq   %rax
	jne    .L52
	ALIGN_3

/* Remainder loop: k & 7 leftover iterations, one k step each. */
.L55:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# remainder = k & 7
	BRANCH
	je .L59
	ALIGN_3

.L56:
	movapd	 0 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	movapd	 0 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm12
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L56
	ALIGN_3

/* Write back the 4x2 tile: C = alpha*A*B (+ C unless TRMM). */
.L59:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
	movsd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	mulpd	%xmm7, %xmm8		# scale by alpha
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm12
	mulpd	%xmm7, %xmm13

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8		# accumulate into existing C
	addpd	%xmm1,  %xmm12
	addpd	%xmm2, %xmm9
	addpd	%xmm3, %xmm13
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movsd	%xmm13, 2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)

/* TRMM bookkeeping: step A/B past the part of k not processed here. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK			# 4 more rows of A consumed
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L51
	ALIGN_3
1781
/* ---- 2x2 micro-kernel: M&2 leftover rows times the 2 buffered columns.
   Two partial accumulators per column (xmm8/xmm10, xmm9/xmm11) are
   merged at the end. ---- */
.L60:
	testq	$2, M
	je	.L70			# no pair of rows -> try single row
	ALIGN_3

.L61:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
/* TRMM: skip the first KK k-steps (2 doubles per k in A, 4 in BUFFER). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	16 * SIZE(BO), %xmm5
	movapd	24 * SIZE(BO), %xmm7

/* Trip count: K for GEMM; for TRMM, K-KK or KK+2 (both sides add 2). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop handles 8 k-steps at a time
	je	.L65
	ALIGN_3

/* Main loop: 8 k-iterations unrolled. */
.L62:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	 0 * SIZE(AO), %xmm0

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm5
	mulpd	18 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm5
	mulpd	22 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm10
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm11
	movapd	-4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	mulpd	26 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	mulpd	30 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm10
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm11
	movapd	 8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO		# 8 k-steps x 2 rows
	addq   $32 * SIZE, BO		# 8 k-steps x 4 buffered doubles
	decq   %rax
	jne    .L62
	ALIGN_3

/* Remainder loop: k & 7 leftover iterations. */
.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# remainder = k & 7
	BRANCH
	je .L69
	ALIGN_3

.L66:
	mulpd	%xmm0, %xmm1
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L66
	ALIGN_3

/* Merge partial accumulators, write back the 2x2 tile. */
.L69:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
#endif

	addpd	%xmm10, %xmm8		# fold the second accumulator pair
	addpd	%xmm11, %xmm9

	mulpd	%xmm7, %xmm8		# scale by alpha
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8		# accumulate into existing C
	addpd	%xmm2, %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)

/* TRMM bookkeeping: step A/B past the part of k not processed here. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK			# 2 more rows of A consumed
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_3
1963
/* ---- 1x2 micro-kernel: M&1 leftover row times the 2 buffered columns.
   xmm8/xmm10 and xmm9/xmm11 are partial accumulators merged at .L78. ---- */
.L70:
	testq	$1, M
	je	.L79			# no leftover row -> close this panel
	ALIGN_3

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
/* TRMM: skip the first KK k-steps (1 double per k in A, 4 in BUFFER). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movsd	16 * SIZE(BO), %xmm5
	movsd	24 * SIZE(BO), %xmm7

/* Trip count: K for GEMM; for TRMM, K-KK or KK(+1|+2) per side. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop handles 8 k-steps at a time
	je	.L75
	ALIGN_3

/* Main loop: 8 k-iterations unrolled (xmm0 works k 0-3, xmm2 k 4-7). */
.L72:
	mulsd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 4 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	32 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-14 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	mulsd	10 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm8
	movsd	12 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm9
	movsd	-13 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	mulsd	14 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	40 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0

	mulsd	%xmm2, %xmm5
	mulsd	18 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm8
	movsd	20 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm9
	movsd	-11 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm5
	mulsd	22 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm10
	movsd	48 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm11
	movsd	-10 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	26 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm8
	movsd	28 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm9
	movsd	-9 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	30 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm10
	movsd	56 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm11
	movsd	-4 * SIZE(AO), %xmm2

	addq   $ 8 * SIZE, AO		# 8 k-steps x 1 row
	addq   $32 * SIZE, BO		# 8 k-steps x 4 buffered doubles
	decq   %rax
	jne    .L72
	ALIGN_3

/* Remainder loop: k & 7 leftover iterations. */
.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# remainder = k & 7
	BRANCH
	je .L78
	ALIGN_3

.L76:
	mulsd	%xmm0, %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0
	movsd	 4 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L76
	ALIGN_3

/* Merge partial accumulators, write back the 1x2 tile. */
.L78:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
#endif

	addsd	%xmm10, %xmm8		# fold the second accumulator pair
	addsd	%xmm11, %xmm9

	mulsd	%xmm7, %xmm8		# scale by alpha
	mulsd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addsd	%xmm0,  %xmm8		# accumulate into existing C
	addsd	%xmm2, %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)

/* TRMM bookkeeping: step A/B past the part of k not processed here. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK			# one more row of A consumed
#endif
	ALIGN_3
2137
/* Close the N&2 panel: for TRMM with the diagonal on the right, advance
   KK past the 2 columns of B just processed.  Use a 64-bit add: KK is
   written and read with movq/addq everywhere else in this file, so a
   32-bit addl would update only the low half of the slot. */
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C		# c += 2 * ldc
	ALIGN_3
2144
/* ---- N&1 leftover column: duplicate B (1 value per k) into BUFFER. ---- */
.L80:
	testq	$1, N
	je	.L999			# no leftover column -> done
	ALIGN_4

.L81:
/* Copying to Sub Buffer: each B value stored twice for packed kernels. */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK		# reset KK for this panel
#endif

	movq	K, %rax
	sarq	$3, %rax		# 8 k-steps (8 doubles) per iteration
	jle	.L83
	ALIGN_3

.L82:
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

/* Store each value twice (duplicated for the packed kernels). */
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L82
	ALIGN_3

/* Copy the K & 7 leftover k-steps (1 double each). */
.L83:
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L90
	ALIGN_3

.L84:
	movq	 0 * SIZE(B), %mm0

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)

	addq	$1 * SIZE, B
	addq	$2 * SIZE, BO
	decq	%rax
	jne	.L84
	ALIGN_3

/* Set up the single C column pointer and restart A for this panel. */
.L90:
	movq	C, CO1			# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L100
	ALIGN_3
2227
/* ---- 4x1 micro-kernel: 4 rows of A times the single buffered column.
   xmm8/xmm10 hold rows 0-1, xmm9/xmm11 rows 2-3; merged at .L99. ---- */
.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
/* TRMM: skip the first KK k-steps (4 doubles per k in A, 2 in BUFFER). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	 0 * SIZE(AO), %xmm4
	movapd	 8 * SIZE(AO), %xmm6

	PREFETCHW      4 * SIZE(CO1)	# C line will be written below

/* Trip count: K for GEMM; for TRMM, K-KK or KK(+4|+1) per side. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop handles 8 k-steps at a time
	je	.L95
	ALIGN_3

/* Main loop: 8 k-iterations unrolled; B value is pre-duplicated, so one
   movapd of BO covers a pair of rows. */
.L92:
	mulpd	%xmm1, %xmm0
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	 2 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm0
	mulpd	-10 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm10
	movapd	 16 * SIZE(AO), %xmm0
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	 4 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-6 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm8
	movapd	-4 * SIZE(AO), %xmm2
	addpd	%xmm1, %xmm9
	movapd	 6 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-2 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm10
	movapd	24 * SIZE(AO), %xmm2
	PREFETCH	(PREFETCHSIZE     + 16) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	16 * SIZE(BO), %xmm1
	mulpd	%xmm3, %xmm4
	mulpd	 2 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm8
	movapd	 4 * SIZE(AO), %xmm4
	addpd	%xmm3, %xmm9
	movapd	10 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm4
	mulpd	 6 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm10
	movapd	32 * SIZE(AO), %xmm4
	PREFETCH	(PREFETCHSIZE     + 24) * SIZE(AO)
	addpd	%xmm3, %xmm11
	movapd	12 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	10 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm8
	movapd	12 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm9
	movapd	14 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	14 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm10
	movapd	40 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm11
	movapd	24 * SIZE(BO), %xmm3

	addq   $32 * SIZE, AO		# 8 k-steps x 4 rows
	addq   $16 * SIZE, BO		# 8 k-steps x 2 buffered doubles
	decq   %rax
	jne    .L92
	ALIGN_3

/* Remainder loop: k & 7 leftover iterations. */
.L95:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# remainder = k & 7
	BRANCH
	je .L99
	ALIGN_3

.L96:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	 2 * SIZE(BO), %xmm1

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L96
	ALIGN_3

/* Merge partial accumulators, write back the 4x1 tile. */
.L99:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
#endif

	addpd	%xmm10, %xmm8		# fold the second accumulator pair
	addpd	%xmm11, %xmm9

	mulpd	%xmm7, %xmm8		# scale by alpha
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8		# accumulate into existing C
	addpd	%xmm1,  %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 2 * SIZE(CO1)
	movhpd	%xmm9, 3 * SIZE(CO1)

/* TRMM bookkeeping: step A/B past the part of k not processed here. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK			# 4 more rows of A consumed
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L91
	ALIGN_3
2402
/* ---- 2x1 micro-kernel: M&2 leftover rows times the single column.
   Four partial accumulators (xmm8..xmm11) are folded at .L109. ---- */
.L100:
	testq	$2, M
	je	.L110			# no pair of rows -> try single row
	ALIGN_3

.L101:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
/* TRMM: skip the first KK k-steps (2 doubles per k in A and BUFFER). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	-8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

/* Trip count: K for GEMM; for TRMM, K-KK or KK(+2|+1) per side. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop handles 8 k-steps at a time
	je	.L105
	ALIGN_3

/* Main loop: 8 k-iterations unrolled.
   NOTE(review): the second PREFETCH repeats offset +0; the parallel
   kernels above use +8 at the same point -- confirm intended offset. */
.L102:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	 4 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm10
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm11
	movapd	 0 * SIZE(AO), %xmm0
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-4 * SIZE(AO), %xmm2
	mulpd	12 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm10
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm11
	movapd	 8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO		# 8 k-steps x 2 rows
	addq   $16 * SIZE, BO		# 8 k-steps x 2 buffered doubles
	decq   %rax
	jne    .L102
	ALIGN_3

/* Remainder loop: k & 7 leftover iterations. */
.L105:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# remainder = k & 7
	BRANCH
	je .L109
	ALIGN_3

.L106:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(AO), %xmm0
	movapd	  2 * SIZE(BO), %xmm1

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L106
	ALIGN_3

/* Fold all four accumulators, write back the 2x1 tile. */
.L109:
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8

	mulpd	%xmm7, %xmm8		# scale by alpha

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0

	addpd	%xmm0,  %xmm8		# accumulate into existing C
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	addq	$2 * SIZE, CO1		# coffset += 2

/* TRMM bookkeeping: step A/B past the part of k not processed here. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK			# 2 more rows of A consumed
#endif
	ALIGN_3
2541
/* ---- 1x1 micro-kernel: final leftover element, scalar dot product.
   Four partial accumulators (xmm8..xmm11) are folded at .L118. ---- */
.L110:
	testq	$1, M
	je	.L999			# nothing left -> function exit
	ALIGN_3

.L111:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
/* TRMM: skip the first KK k-steps (1 double per k in A, 2 in BUFFER). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

/* Trip count: K for GEMM; for TRMM, K-KK or KK+1 (both sides add 1). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop handles 8 k-steps at a time
	je	.L115
	ALIGN_3

/* Main loop: 8 k-iterations unrolled (xmm0 works k 0-3, xmm2 k 4-7). */
.L112:
	mulsd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	16 * SIZE(BO), %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm9
	movsd	-14 * SIZE(AO), %xmm0
	mulsd	 4 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm10
	movsd	-13 * SIZE(AO), %xmm0
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm11
	movsd	 -8 * SIZE(AO), %xmm0
	mulsd	%xmm2, %xmm3
	movsd	-11 * SIZE(AO), %xmm2
	addsd	%xmm3, %xmm8
	movsd	24 * SIZE(BO), %xmm3
	mulsd	10 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm9
	movsd	-10 * SIZE(AO), %xmm2
	mulsd	12 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm10
	movsd	-9 * SIZE(AO), %xmm2
	mulsd	14 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm11
	movsd	-4 * SIZE(AO), %xmm2

	addq   $ 8 * SIZE, AO		# 8 k-steps x 1 row
	addq   $16 * SIZE, BO		# 8 k-steps x 2 buffered doubles
	decq   %rax
	jne    .L112
	ALIGN_3

/* Remainder loop: k & 7 leftover iterations. */
.L115:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# remainder = k & 7
	BRANCH
	je .L118
	ALIGN_3

.L116:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 2 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L116
	ALIGN_3

/* Fold all four accumulators, write back the single element. */
.L118:
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9
	addsd	%xmm9, %xmm8

	mulsd	%xmm7, %xmm8		# scale by alpha
#ifndef TRMMKERNEL
	addsd	0 * SIZE(CO1), %xmm8	# accumulate into existing C
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	ALIGN_3
2659
/* Function exit: unwind the stack and restore callee-saved state. */
.L999:
	movq	%rbx, %rsp		# rbx presumably holds the saved stack
					# pointer from the prologue (outside
					# this view) -- TODO confirm

	EMMS				# clear MMX state (%mm0-7 used above)
					# before returning to FP code

/* Restore callee-saved GPRs pushed by the prologue. */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
/* Win64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE