/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10
#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#else

#define STACKSIZE 256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#endif

#define POSINV	  0(%rsp)
#define ALPHA_R	 16(%rsp)
#define ALPHA_I	 32(%rsp)
#define J	 48(%rsp)
#define OFFSET	 56(%rsp)
#define KK	 64(%rsp)
#define KKK	 72(%rsp)
#define BUFFER	256(%rsp)

#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)

#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 5 + 4)

#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)
#endif
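
/* KERNEL1-KERNEL8 together perform four k iterations of the 2x2 complex
   update.  xmm0/xmm2/xmm4/xmm6 hold packed pairs of doubles loaded from A,
   xmm1/xmm3/xmm5/xmm7 hold duplicated B values from BUFFER, and the
   products accumulate in xmm8-xmm15; they are recombined into real and
   imaginary parts after the k loop.  The non-GENERIC variant indexes A and
   B through %rax so the unrolled loop can count up toward zero. */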

#ifndef GENERIC
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#endif

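/* Entry point.  On the SysV ABI m, n and k arrive in %rdi/%rsi/%rdx,
   alpha in %xmm0/%xmm1, and a, b, c in %rcx/%r8/%r9 with ldc on the
   stack; the WINDOWS_ABI path reloads its arguments from the caller's
   stack area above the local frame. */
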
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	movaps	%xmm3,       %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#else
	movq	72(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	80(%rsp), %xmm12
#endif

#endif

	EMMS

	movq	%rsp, %rbx	# save old stack
	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

	pcmpeqb	%xmm7, %xmm7
	psllq	$63, %xmm7	# Generate mask
	pxor	%xmm10, %xmm10

	movlpd	 %xmm0, 0 + ALPHA_R
	movlpd	 %xmm0, 8 + ALPHA_R

	movlpd	 %xmm1, 8 + ALPHA_I
	xorpd	 %xmm7, %xmm1
	movlpd	 %xmm1, 0 + ALPHA_I

	movlpd	  %xmm10,  0 + POSINV
	movlpd	  %xmm7, 8 + POSINV

#ifdef TRMMKERNEL
	movlpd	%xmm12, OFFSET
	movlpd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	subq	$-16 * SIZE, A

	salq	$ZBASE_SHIFT, LDC

	movq	N,  J
	sarq	$1, J		# j = (n >> 1)
	jle	.L100
	ALIGN_4

.L01:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	leaq	16 * SIZE + BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_4

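/* Copy two columns of B into BUFFER, duplicating every double into both
   halves of a 16-byte word so the kernel can use packed multiplies. */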
.L02:
	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	%mm0,  -16 * SIZE(BO)
	movq	%mm0,  -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1,  -14 * SIZE(BO)
	movq	%mm1,  -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2,  -12 * SIZE(BO)
	movq	%mm2,  -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3,  -10 * SIZE(BO)
	movq	%mm3,   -9 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  0)  * SIZE(BO)

	movq	 4 * SIZE(B), %mm4
	movq	%mm4,   -8 * SIZE(BO)
	movq	%mm4,   -7 * SIZE(BO)
	movq	 5 * SIZE(B), %mm5
	movq	%mm5,   -6 * SIZE(BO)
	movq	%mm5,   -5 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  8)  * SIZE(BO)

	movq	 6 * SIZE(B), %mm6
	movq	%mm6,   -4 * SIZE(BO)
	movq	%mm6,   -3 * SIZE(BO)
	movq	 7 * SIZE(B), %mm7
	movq	%mm7,   -2 * SIZE(BO)
	movq	%mm7,   -1 * SIZE(BO)

	PREFETCH	 (RPREFETCHSIZE +  8)  * SIZE(B)

	movq	 8 * SIZE(B), %mm0
	movq	%mm0,   0 * SIZE(BO)
	movq	%mm0,   1 * SIZE(BO)
	movq	 9 * SIZE(B), %mm1
	movq	%mm1,   2 * SIZE(BO)
	movq	%mm1,   3 * SIZE(BO)

	movq	10 * SIZE(B), %mm2
	movq	%mm2,   4 * SIZE(BO)
	movq	%mm2,   5 * SIZE(BO)
	movq	11 * SIZE(B), %mm3
	movq	%mm3,   6 * SIZE(BO)
	movq	%mm3,   7 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 16)  * SIZE(BO)

	movq	12 * SIZE(B), %mm4
	movq	%mm4,   8 * SIZE(BO)
	movq	%mm4,   9 * SIZE(BO)
	movq	13 * SIZE(B), %mm5
	movq	%mm5,  10 * SIZE(BO)
	movq	%mm5,  11 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 24)  * SIZE(BO)

	movq	14 * SIZE(B), %mm6
	movq	%mm6,  12 * SIZE(BO)
	movq	%mm6,  13 * SIZE(BO)
	movq	15 * SIZE(B), %mm7
	movq	%mm7,  14 * SIZE(BO)
	movq	%mm7,  15 * SIZE(BO)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, B
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L05
	ALIGN_4

.L04:
	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	addq	$ 4 * SIZE, B
	addq	$ 8 * SIZE, BO

	decq	%rax
	jne	.L04
	ALIGN_4

.L05:
	movq	A, AO		# aoffset = a

	leaq	 (RPREFETCHSIZE +  0)  * SIZE(B), BB

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L30
	ALIGN_4

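/* Main kernel: each iteration of .L10 computes a 2x2 block of complex
   elements of C. */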
.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm8, %xmm8
	PREFETCH	  0  * SIZE(BB)
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	pxor	%xmm9, %xmm9
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-12 * SIZE(BO), %xmm5
	pxor	%xmm10, %xmm10
	movapd	-10 * SIZE(AO), %xmm6
	movapd	 -8 * SIZE(BO), %xmm7
	pxor	%xmm11, %xmm11

	pxor	%xmm12, %xmm12
	PREFETCHW      3 * SIZE(CO1)
	pxor	%xmm13, %xmm13
	PREFETCHW      3 * SIZE(CO2)
	pxor	%xmm14, %xmm14
	pxor	%xmm15, %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
#ifndef GENERIC
	andq	$-8, %rax

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_3

.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	BRANCH
	jl	.L12
	ALIGN_3

.L15:
	PREFETCH	  8  * SIZE(BB)
	subq	 $-16 * SIZE, BB

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	testq	$4, %rax
	je .L16
	xorq	%rax, %rax
	ALIGN_3

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$32 * SIZE, BO
	addq	$16 * SIZE, AO
	ALIGN_3
#else
	sarq	$2, %rax
	NOBRANCH
	jle	.L16
	ALIGN_3

.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L12
#endif

.L16:
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je .L19

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	ALIGN_3

.L17:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm0
	addpd	%xmm1, %xmm10
	movapd	-16 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm12
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm13
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm2
	addpd	%xmm1, %xmm14
	movapd	 -8 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm2, %xmm15
	movapd	-10 * SIZE(AO, %rax, 4), %xmm2

	addq	$SIZE, %rax
	jl	.L17
	ALIGN_3

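/* Combine the accumulated products into complex results: swap the
   real/imaginary halves of the odd accumulators, flip signs through the
   POSINV mask according to the conjugation mode, scale by alpha, and add
   the previous contents of C (non-TRMM case). */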
.L19:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2

	movlpd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1
	movlpd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11
	SHUFPD_1 %xmm13, %xmm13
	SHUFPD_1 %xmm15, %xmm15

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
	xorpd	%xmm5, %xmm12
	xorpd	%xmm5, %xmm14
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2, %xmm12
	addpd	%xmm1,  %xmm10
	addpd	%xmm3, %xmm14
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

	movlpd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)
	movlpd	%xmm14, 2 * SIZE(CO2)
	movhpd	%xmm14, 3 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L10
	ALIGN_4

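/* Remaining single row of A (m odd) against this column pair. */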
.L30:
	testq	$1, M
	jle	.L99

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movapd	 -8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L44
	ALIGN_4

.L41:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 -6 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 -4 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 18 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 20 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	 22 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 32 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 26 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 28 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	 30 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 40 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 34 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 36 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	 38 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 48 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 42 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 44 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	 46 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 56 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	subq   $-16 * SIZE, AO
	addq   $64 * SIZE, BO
	decq   %rax
	jne    .L41
	ALIGN_4

.L44:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax
	BRANCH
	jle .L45

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 -6 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 -4 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	 -8 * SIZE(AO), %xmm0

	addq   $ 8 * SIZE, AO
	addq   $32 * SIZE, BO
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L47
	ALIGN_4

.L46:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 -8 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jg	.L46
	ALIGN_4

.L47:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm11, %xmm10
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm10, %xmm11

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm1,  %xmm10
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif

	leaq	(C, LDC, 2), C		# c += 2 * ldc
	decq	J			# j --
	jg	.L01

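/* Remaining single column of B (n odd). */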
.L100:
	testq	$1, N
	jle	.L999

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L103
	ALIGN_4

.L102:
	movlpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movlpd	 2 * SIZE(B), %xmm10
	movlpd	 3 * SIZE(B), %xmm11
	movlpd	 4 * SIZE(B), %xmm12
	movlpd	 5 * SIZE(B), %xmm13
	movlpd	 6 * SIZE(B), %xmm14
	movlpd	 7 * SIZE(B), %xmm15

	movlpd	%xmm8,  0 * SIZE(BO)
	movlpd	%xmm8,  1 * SIZE(BO)
	movlpd	%xmm9,  2 * SIZE(BO)
	movlpd	%xmm9,  3 * SIZE(BO)
	movlpd	%xmm10,  4 * SIZE(BO)
	movlpd	%xmm10,  5 * SIZE(BO)
	movlpd	%xmm11,  6 * SIZE(BO)
	movlpd	%xmm11,  7 * SIZE(BO)
	movlpd	%xmm12,  8 * SIZE(BO)
	movlpd	%xmm12,  9 * SIZE(BO)
	movlpd	%xmm13, 10 * SIZE(BO)
	movlpd	%xmm13, 11 * SIZE(BO)
	movlpd	%xmm14, 12 * SIZE(BO)
	movlpd	%xmm14, 13 * SIZE(BO)
	movlpd	%xmm15, 14 * SIZE(BO)
	movlpd	%xmm15, 15 * SIZE(BO)

	subq	$-16 * SIZE, BO
	addq	$ 8 * SIZE, B
	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9

	movlpd	%xmm8,  0 * SIZE(BO)
	movlpd	%xmm8,  1 * SIZE(BO)
	movlpd	%xmm9,  2 * SIZE(BO)
	movlpd	%xmm9,  3 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

.L105:
	movq	C, CO1		# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L130
	ALIGN_4

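/* 2x1 kernel: two rows of A against the single remaining column of B. */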
.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	 -8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13
	PREFETCHW      3 * SIZE(CO1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	je	.L112

.L111:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm3
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	 -6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	  8 * SIZE(AO), %xmm2

	subq   $-16 * SIZE, AO
	subq   $-16 * SIZE, BO
	decq   %rax
	jne    .L111
	ALIGN_4

.L112:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L114

.L113:
	mulpd	%xmm0, %xmm1
	mulpd	 -14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 -16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	 -14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 -14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	 -12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	 -12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L113
	ALIGN_4

.L114:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm13, %xmm13

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm13
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm12
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12
#endif

	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm12, %xmm13

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13

	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2, %xmm12
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L110
	ALIGN_4

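/* Final 1x1 block: last row of A (m odd) against the single column. */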
.L130:
	testq	$1, M
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	movapd	 -8 * SIZE(AO), %xmm2
	movapd	 -8 * SIZE(BO), %xmm3

	pxor	%xmm8, %xmm8
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L144
	ALIGN_4

.L141:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	  2 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm8
	movapd	  4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	mulpd	  6 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	subq   $-16 * SIZE, AO
	subq   $-32 * SIZE, BO
	decq   %rax
	jne    .L141
	ALIGN_4


.L144:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax		# if (k & 4)
	BRANCH
	jle .L145

	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	addpd	%xmm0, %xmm11
	movapd	 -8 * SIZE(AO), %xmm0

	addq   $8   * SIZE, AO
	subq   $-16 * SIZE, BO
	ALIGN_4

.L145:
	movapd	POSINV, %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle .L148
	ALIGN_4

.L146:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L146
	ALIGN_4

.L148:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
#endif

	SHUFPD_1 %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
#else
	xorpd	%xmm5, %xmm8
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
#else
	addpd	%xmm9, %xmm8
#endif

	pshufd	$0x4e, %xmm8, %xmm9

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9

	addpd	%xmm9, %xmm8

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	ALIGN_4

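/* Restore the saved stack pointer and callee-saved state, then return. */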
.L999:
	movq	%rbx, %rsp
	EMMS

	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
