/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

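/* Register assignments: M/N/K and A/B/C/LDC hold the GEMM arguments;
   AO and BO walk the current panel of A and the packed B buffer;
   CO1/CO2 point at the two columns of C being updated; I is the
   row-loop counter and BB is a read-ahead pointer into B. */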
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10
#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#else

#define STACKSIZE 256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#endif

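/* Local scratch, addressed from the aligned stack pointer: POSINV is
   the sign mask used for complex conjugation, ALPHA_R/ALPHA_I hold
   alpha in broadcast form, J is the column-loop counter, OFFSET/KK/KKK
   are the TRMM bookkeeping slots, and BUFFER is the aligned area into
   which B is packed. */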
#define POSINV	  0(%rsp)
#define ALPHA_R	 16(%rsp)
#define ALPHA_I	 32(%rsp)
#define J	 48(%rsp)
#define OFFSET	 56(%rsp)
#define KK	 64(%rsp)
#define KKK	 72(%rsp)
#define BUFFER	256(%rsp)

#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)

#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 5 + 4)

#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)
#endif

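/* KERNEL1..KERNEL8 each multiply one 2-double vector of A against
   four packed B operands and accumulate into four of %xmm8-%xmm15,
   reloading the next operands and issuing software prefetches along
   the way.  The default variants address AO/BO through %rax so the
   unrolled loop below can run without pointer updates; the GENERIC
   variants use plain displacements. */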
#ifndef GENERIC
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#endif

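/* Function prologue: allocate the fixed frame and save the
   callee-saved registers; the Windows ABI additionally saves
   %rdi/%rsi and %xmm6-%xmm15 and loads the stack-passed arguments. */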
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	movaps	%xmm3,       %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#else
	movq	72(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	80(%rsp), %xmm12
#endif

#endif

	EMMS

	movq	%rsp, %rbx	# save old stack
	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

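/* Build the constants used by the epilogues: ALPHA_R = { alpha_r,
   alpha_r }, ALPHA_I = { -alpha_i, alpha_i }, and POSINV =
   { +0.0, -0.0 }, the mask that flips the sign of one lane for the
   conjugated variants. */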
	pcmpeqb	%xmm7, %xmm7
	psllq	$63, %xmm7	# Generate mask
	pxor	%xmm10, %xmm10

	movlpd	 %xmm0, 0 + ALPHA_R
	movlpd	 %xmm0, 8 + ALPHA_R

	movlpd	 %xmm1, 8 + ALPHA_I
	xorpd	 %xmm7, %xmm1
	movlpd	 %xmm1, 0 + ALPHA_I

	movlpd	  %xmm10,  0 + POSINV
	movlpd	  %xmm7, 8 + POSINV

#ifdef TRMMKERNEL
	movlpd	%xmm12, OFFSET
	movlpd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	subq	$-16 * SIZE, A

	salq	$ZBASE_SHIFT, LDC

	movq	N,  J
	sarq	$1, J		# j = (n >> 1)
	jle	.L100
	ALIGN_4

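/* Outer loop: step through C two columns at a time (j = n >> 1). */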
.L01:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	leaq	16 * SIZE + BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_4

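/* Pack two columns of B into BUFFER through the MMX registers,
   storing every 8-byte element twice so the compute loops see each
   scalar pre-broadcast across a 16-byte vector; four k iterations
   per pass here, remainder handled in .L04. */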
.L02:
	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	%mm0,  -16 * SIZE(BO)
	movq	%mm0,  -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1,  -14 * SIZE(BO)
	movq	%mm1,  -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2,  -12 * SIZE(BO)
	movq	%mm2,  -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3,  -10 * SIZE(BO)
	movq	%mm3,   -9 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  0)  * SIZE(BO)

	movq	 4 * SIZE(B), %mm4
	movq	%mm4,   -8 * SIZE(BO)
	movq	%mm4,   -7 * SIZE(BO)
	movq	 5 * SIZE(B), %mm5
	movq	%mm5,   -6 * SIZE(BO)
	movq	%mm5,   -5 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  8)  * SIZE(BO)

	movq	 6 * SIZE(B), %mm6
	movq	%mm6,   -4 * SIZE(BO)
	movq	%mm6,   -3 * SIZE(BO)
	movq	 7 * SIZE(B), %mm7
	movq	%mm7,   -2 * SIZE(BO)
	movq	%mm7,   -1 * SIZE(BO)

	PREFETCH	 (RPREFETCHSIZE +  8)  * SIZE(B)

	movq	 8 * SIZE(B), %mm0
	movq	%mm0,   0 * SIZE(BO)
	movq	%mm0,   1 * SIZE(BO)
	movq	 9 * SIZE(B), %mm1
	movq	%mm1,   2 * SIZE(BO)
	movq	%mm1,   3 * SIZE(BO)

	movq	10 * SIZE(B), %mm2
	movq	%mm2,   4 * SIZE(BO)
	movq	%mm2,   5 * SIZE(BO)
	movq	11 * SIZE(B), %mm3
	movq	%mm3,   6 * SIZE(BO)
	movq	%mm3,   7 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 16)  * SIZE(BO)

	movq	12 * SIZE(B), %mm4
	movq	%mm4,   8 * SIZE(BO)
	movq	%mm4,   9 * SIZE(BO)
	movq	13 * SIZE(B), %mm5
	movq	%mm5,  10 * SIZE(BO)
	movq	%mm5,  11 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 24)  * SIZE(BO)

	movq	14 * SIZE(B), %mm6
	movq	%mm6,  12 * SIZE(BO)
	movq	%mm6,  13 * SIZE(BO)
	movq	15 * SIZE(B), %mm7
	movq	%mm7,  14 * SIZE(BO)
	movq	%mm7,  15 * SIZE(BO)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, B
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L05
	ALIGN_4

.L04:
	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	addq	$ 4 * SIZE, B
	addq	$ 8 * SIZE, BO

	decq	%rax
	jne	.L04
	ALIGN_4

.L05:
	movq	A, AO		# aoffset = a

	leaq	 (RPREFETCHSIZE +  0)  * SIZE(B), BB

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L30
	ALIGN_4

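/* 2x2 kernel: accumulate a 2x2 block of complex results in
   %xmm8-%xmm15, keeping the straight and lane-swapped partial
   products in separate registers until the epilogue at .L19. */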
.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm8, %xmm8
	PREFETCH	  0  * SIZE(BB)
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	pxor	%xmm9, %xmm9
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-12 * SIZE(BO), %xmm5
	pxor	%xmm10, %xmm10
	movapd	-10 * SIZE(AO), %xmm6
	movapd	 -8 * SIZE(BO), %xmm7
	pxor	%xmm11, %xmm11

	pxor	%xmm12, %xmm12
	PREFETCHW      3 * SIZE(CO1)
	pxor	%xmm13, %xmm13
	PREFETCHW      3 * SIZE(CO2)
	pxor	%xmm14, %xmm14
	pxor	%xmm15, %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
#ifndef GENERIC
	andq	$-8, %rax

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_3

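/* The k loop, unrolled 8 ways: AO/BO were advanced to the end of the
   panel above and %rax counts upward toward zero, so the KERNEL
   macros run without pointer updates; the NOBRANCH'd exit test every
   eight k steps keeps the fixed displacements in range. */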
.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	BRANCH
	jl	.L12
	ALIGN_3

.L15:
	PREFETCH	  8  * SIZE(BB)
	subq	 $-16 * SIZE, BB

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	testq	$4, %rax
	je .L16
	xorq	%rax, %rax
	ALIGN_3

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$32 * SIZE, BO
	addq	$16 * SIZE, AO
	ALIGN_3
#else
	sarq	$2, %rax
	NOBRANCH
	jle	.L16
	ALIGN_3

.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L12
#endif

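/* End of the unrolled loop: load the conjugation mask and alpha,
   then finish the remaining k & 3 iterations one at a time in .L17. */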
.L16:
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je .L19

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	ALIGN_3

.L17:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm0
	addpd	%xmm1, %xmm10
	movapd	-16 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm12
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm13
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm2
	addpd	%xmm1, %xmm14
	movapd	 -8 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm2, %xmm15
	movapd	-10 * SIZE(AO, %rax, 4), %xmm2

	addq	$SIZE, %rax
	jl	.L17
	ALIGN_3

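/* Writeback for the 2x2 block: swap the lanes of the odd
   accumulators, apply the conjugation signs via POSINV for the
   selected variant, combine the partial products, scale by alpha
   (ALPHA_R plus the lane-swapped product with ALPHA_I), and add the
   result to C unless this is a TRMM kernel. */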
.L19:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2

	movlpd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1
	movlpd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11
	SHUFPD_1 %xmm13, %xmm13
	SHUFPD_1 %xmm15, %xmm15

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
	xorpd	%xmm5, %xmm12
	xorpd	%xmm5, %xmm14
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2, %xmm12
	addpd	%xmm1,  %xmm10
	addpd	%xmm3, %xmm14
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

	movlpd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)
	movlpd	%xmm14, 2 * SIZE(CO2)
	movhpd	%xmm14, 3 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L10
	ALIGN_4

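/* Remaining row (m & 1): the same update for a 1x2 block of C. */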
.L30:
	testq	$1, M
	jle	.L99

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movapd	 -8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L44
	ALIGN_4

.L41:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 -6 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 -4 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 18 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 20 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	 22 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 32 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 26 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 28 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	 30 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 40 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 34 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 36 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	 38 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 48 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 42 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 44 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	 46 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 56 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	subq   $-16 * SIZE, AO
	addq   $64 * SIZE, BO
	decq   %rax
	jne    .L41
	ALIGN_4

.L44:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax		# if (k & 4)
	BRANCH
	jle .L45

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 -6 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 -4 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	 -8 * SIZE(AO), %xmm0

	addq   $ 8 * SIZE, AO
	addq   $32 * SIZE, BO
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L47
	ALIGN_4

.L46:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 -8 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jg	.L46
	ALIGN_4

.L47:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm11, %xmm10
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm10, %xmm11

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm1,  %xmm10
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif

	leaq	(C, LDC, 2), C		# c += 2 * ldc
	decq	J			# j --
	jg	.L01

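/* Last column (n & 1): pack a single column of B, then run the
   2x1 and 1x1 kernels over the rows. */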
.L100:
	testq	$1, N
	jle	.L999

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L103
	ALIGN_4

.L102:
	movlpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movlpd	 2 * SIZE(B), %xmm10
	movlpd	 3 * SIZE(B), %xmm11
	movlpd	 4 * SIZE(B), %xmm12
	movlpd	 5 * SIZE(B), %xmm13
	movlpd	 6 * SIZE(B), %xmm14
	movlpd	 7 * SIZE(B), %xmm15

	movlpd	%xmm8,  0 * SIZE(BO)
	movlpd	%xmm8,  1 * SIZE(BO)
	movlpd	%xmm9,  2 * SIZE(BO)
	movlpd	%xmm9,  3 * SIZE(BO)
	movlpd	%xmm10,  4 * SIZE(BO)
	movlpd	%xmm10,  5 * SIZE(BO)
	movlpd	%xmm11,  6 * SIZE(BO)
	movlpd	%xmm11,  7 * SIZE(BO)
	movlpd	%xmm12,  8 * SIZE(BO)
	movlpd	%xmm12,  9 * SIZE(BO)
	movlpd	%xmm13, 10 * SIZE(BO)
	movlpd	%xmm13, 11 * SIZE(BO)
	movlpd	%xmm14, 12 * SIZE(BO)
	movlpd	%xmm14, 13 * SIZE(BO)
	movlpd	%xmm15, 14 * SIZE(BO)
	movlpd	%xmm15, 15 * SIZE(BO)

	subq	$-16 * SIZE, BO
	addq	$ 8 * SIZE, B
	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9

	movlpd	%xmm8,  0 * SIZE(BO)
	movlpd	%xmm8,  1 * SIZE(BO)
	movlpd	%xmm9,  2 * SIZE(BO)
	movlpd	%xmm9,  3 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

.L105:
	movq	C, CO1		# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L130
	ALIGN_4

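/* 2x1 kernel: two rows of A against the single packed column of B,
   accumulating into %xmm8/%xmm9 and %xmm12/%xmm13. */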
.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	 -8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13
	PREFETCHW      3 * SIZE(CO1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	je	.L112

.L111:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm3
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	 -6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	  8 * SIZE(AO), %xmm2

	subq   $-16 * SIZE, AO
	subq   $-16 * SIZE, BO
	decq   %rax
	jne    .L111
	ALIGN_4

.L112:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L114

.L113:
	mulpd	%xmm0, %xmm1
	mulpd	 -14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 -16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	 -14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 -14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	 -12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	 -12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L113
	ALIGN_4

.L114:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm13, %xmm13

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm13
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm12
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm12, %xmm13

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13

	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2, %xmm12
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L110
	ALIGN_4

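/* 1x1 kernel: one complex element of A against the packed column. */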
.L130:
	testq	$1, M
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	movapd	 -8 * SIZE(AO), %xmm2
	movapd	 -8 * SIZE(BO), %xmm3

	pxor	%xmm8, %xmm8
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L144
	ALIGN_4

.L141:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	  2 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm8
	movapd	  4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	mulpd	  6 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	subq   $-16 * SIZE, AO
	subq   $-32 * SIZE, BO
	decq   %rax
	jne    .L141
	ALIGN_4


.L144:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax		# if (k & 4)
	BRANCH
	jle .L145

	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	addpd	%xmm0, %xmm11
	movapd	 -8 * SIZE(AO), %xmm0

	addq   $8   * SIZE, AO
	subq   $-16 * SIZE, BO
	ALIGN_4

.L145:
	movapd	POSINV, %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L148
	ALIGN_4

.L146:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L146
	ALIGN_4

.L148:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
#endif

	SHUFPD_1 %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
#else
	xorpd	%xmm5, %xmm8
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
#else
	addpd	%xmm9, %xmm8
#endif

	pshufd	$0x4e, %xmm8, %xmm9

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9

	addpd	%xmm9, %xmm8

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	ALIGN_4

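/* Restore the saved registers (plus %rdi/%rsi and the xmm registers
   on Windows), release the local frame and return. */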
.L999:
	movq	%rbx, %rsp
	EMMS

	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
