1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Single-precision GEMM/TRMM kernel, 8x4 unroll, AT&T syntax, x86-64.
   Uses FMA4 (vfmaddps) — NOTE(review): presumably targets AMD
   Bulldozer, inferred from the CORE_BULLDOZER guard below; confirm. */

/* Register assignments.  M/N arrive in %rdi/%rsi but those registers
   are reused as the running A/B pointers (AO/BO), so the dimensions
   are parked in callee-saved registers first. */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#else

#define STACKSIZE 256

/* Windows x64: stack-passed arguments, addressed relative to the
   post-prologue stack pointer (STACKSIZE already subtracted). */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/* Local frame layout, relative to the 4096-aligned %rsp set up below.
   BUFFER holds the broadcast-expanded copy of the current B panel. */
#define ALPHA	  0(%rsp)
#define J	 16(%rsp)
#define OFFSET	 24(%rsp)
#define KK	 32(%rsp)
#define KKK	 40(%rsp)
#define BUFFER	128(%rsp)

/* Prefetch distances, in elements. */
#define PREFETCH     prefetch
#define PREFETCHSIZE  (16 *  17 + 0)

#define RPREFETCHSIZE (16 *  4 + 0)
#define WPREFETCHSIZE (16 *  9 + 0)
87
/* KERNEL1..KERNEL8: the eight software-pipelined steps of the 8x4
   micro-kernel inner loop (k unrolled by 8).  vfmaddps a,b,c,d is
   FMA4: d = b*c + a.  xmm8..xmm15 are the eight C accumulators;
   A values flow through xmm0/xmm2/xmm4/xmm6/xmm7 and broadcast B
   values through xmm1/xmm3/xmm5.  AO is indexed by %rax*4, BO by
   %rax*8.  Loads run ahead of their use — statement order is part of
   the pipelining; do not reorder.  The xx argument is unused (kept
   only for call-site symmetry). */
#define KERNEL1(xx) \
	vfmaddps  %xmm8,%xmm1,%xmm0,%xmm8 ;\
	vmovaps	%xmm2, %xmm0 ;\
	vmovups -28 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\
	vmovups	-24 * SIZE(BO, %rax, 8), %xmm1 ;\
	vfmaddps %xmm9,%xmm3, %xmm0, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	-20 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm1, %xmm0, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\
	vfmaddps %xmm11,%xmm3, %xmm0, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
	vmovups	-24 * SIZE(AO, %rax, 4), %xmm0 ;\
	vmovups	-16 * SIZE(BO, %rax, 8), %xmm1 ;\
 	vmovups	-12 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm0, %xmm2
105
106
/* KERNEL2: pipelined step 2 of 8; ends by staging the step-3 A value
   (xmm4, loaded during .L11 setup / KERNEL8) into xmm2. */
#define KERNEL2(xx) \
	vfmaddps  %xmm8,%xmm1,%xmm0,%xmm8 ;\
	vmovaps	%xmm2, %xmm0 ;\
	vmovups -20 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\
	vmovups	-8 * SIZE(BO, %rax, 8), %xmm1 ;\
	vfmaddps %xmm9,%xmm3, %xmm0, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	-4 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm1, %xmm0, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\
	vfmaddps %xmm11,%xmm3, %xmm0, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
 	vmovups	4 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm4, %xmm2
122
123
124
/* KERNEL3: pipelined step 3 of 8; also prefetch-loads the step-5 B
   value into xmm1 well ahead of use (32*SIZE). */
#define KERNEL3(xx) \
	vfmaddps  %xmm8,%xmm5,%xmm4,%xmm8 ;\
	vmovups -12 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\
	vmovups	 32 * SIZE(BO, %rax, 8), %xmm1 ;\
	vmovups	  8 * SIZE(BO, %rax, 8), %xmm5 ;\
	vfmaddps %xmm9,%xmm3, %xmm4, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	 12 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm5, %xmm4, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\
	vfmaddps %xmm11,%xmm3, %xmm4, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
	vmovups	 -8 * SIZE(AO, %rax, 4), %xmm4 ;\
	vmovups	 16 * SIZE(BO, %rax, 8), %xmm5 ;\
 	vmovups	 20 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm4, %xmm2
142
/* KERNEL4: pipelined step 4 of 8; loads the step-5 A value into xmm6. */
#define KERNEL4(xx) \
	vfmaddps %xmm8,%xmm5, %xmm4, %xmm8 ;\
	vmovups -4 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\
	vmovups	 24 * SIZE(BO, %rax, 8), %xmm5 ;\
	vfmaddps %xmm9,%xmm3, %xmm4, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	 28 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm5, %xmm4, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\
	vmovups	 64 * SIZE(BO, %rax, 8), %xmm5 ;\
	vfmaddps %xmm11,%xmm3, %xmm4, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
	vmovups	  (AO, %rax, 4), %xmm6 ;\
 	vmovups	 36 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm6, %xmm2
159
/* KERNEL5: pipelined step 5 of 8; loads the step-7 A value into xmm7. */
#define KERNEL5(xx) \
	vfmaddps %xmm8,%xmm1, %xmm6, %xmm8 ;\
	vmovups 4 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\
	vmovups	 40 * SIZE(BO, %rax, 8), %xmm1 ;\
	vfmaddps %xmm9,%xmm3, %xmm6, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	 16 * SIZE(AO, %rax, 4), %xmm7 ;\
	vmovups	 44 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm1, %xmm6, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\
	vfmaddps %xmm11,%xmm3, %xmm6, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
	vmovups	  8 * SIZE(AO, %rax, 4), %xmm6 ;\
	vmovups	 48 * SIZE(BO, %rax, 8), %xmm1 ;\
 	vmovups	 52 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm6, %xmm2
177
/* KERNEL6: pipelined step 6 of 8; preloads the next round's first A
   value (xmm0, 32*SIZE ahead) used by KERNEL8/KERNEL1. */
#define KERNEL6(xx) \
	vfmaddps %xmm8,%xmm1, %xmm6, %xmm8 ;\
	vmovups 12 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\
	vmovups	 56 * SIZE(BO, %rax, 8), %xmm1 ;\
	vfmaddps %xmm9,%xmm3, %xmm6, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	 60 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm1, %xmm6, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\
	vfmaddps %xmm11,%xmm3, %xmm6, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
	vmovups	 32 * SIZE(AO, %rax, 4), %xmm0 ;\
 	vmovups	 68 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm7, %xmm2
193
/* KERNEL7: pipelined step 7 of 8.
   Fix: the final register copy was the legacy-SSE "movaps" while every
   other instruction in these macros is VEX-encoded ("vmovaps") — made
   consistent to avoid mixing legacy-SSE and VEX encodings (a potential
   SSE/AVX state-transition stall on some microarchitectures; the
   128-bit register-to-register copy semantics are identical here). */
#define KERNEL7(xx) \
	vfmaddps %xmm8,%xmm5, %xmm7, %xmm8 ;\
	vmovups 20 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\
	vmovups	 96 * SIZE(BO, %rax, 8), %xmm1 ;\
	vmovups	 72 * SIZE(BO, %rax, 8), %xmm5 ;\
	vfmaddps %xmm9,%xmm3, %xmm7, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\
	vmovups	 76 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm5, %xmm7, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\
	vfmaddps %xmm11,%xmm3, %xmm7, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
	vmovups	 24 * SIZE(AO, %rax, 4), %xmm7 ;\
	vmovups	 80 * SIZE(BO, %rax, 8), %xmm5 ;\
 	vmovups	 84 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm7, %xmm2
211
/* KERNEL8: pipelined step 8 of 8; finishes the round and advances the
   k index: %rax += 16*SIZE (AO is scaled by 4 and BO by 8, so this
   consumes 8 k-iterations of A and B per KERNEL1..8 round). */
#define KERNEL8(xx) \
	vfmaddps %xmm8,%xmm5, %xmm7, %xmm8 ;\
	vmovups 28 * SIZE(AO, %rax, 4),%xmm2 ;\
	vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\
	vmovups	 88 * SIZE(BO, %rax, 8), %xmm5 ;\
	vfmaddps %xmm9, %xmm3, %xmm7, %xmm9 ;\
	vfmaddps %xmm13,%xmm2, %xmm3,  %xmm13 ;\
	vmovups	 92 * SIZE(BO, %rax, 8), %xmm3 ;\
	vfmaddps %xmm10,%xmm5, %xmm7, %xmm10 ;\
	vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\
	vmovups	 48 * SIZE(AO, %rax, 4), %xmm4 ;\
	vmovups	128 * SIZE(BO, %rax, 8), %xmm5 ;\
	vfmaddps %xmm11,%xmm3, %xmm7, %xmm11 ;\
	vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\
 	vmovups	100 * SIZE(BO, %rax, 8), %xmm3 ;\
	vmovaps	%xmm0, %xmm2 ;\
	addq	$16 * SIZE, %rax
229
/* KERNEL_SUB1..4: plain-SSE (mulps/addps, no FMA4) variants of one
   k-step each; used for the 4-iteration k tail at .L15 before the
   final k%4 scalar-step loop (.L17). */
#define KERNEL_SUB1(xx) \
	mulps	%xmm1, %xmm0 ;\
	mulps	-28 * SIZE(AO, %rax, 4), %xmm1 ;\
	addps	%xmm0, %xmm8 ;\
	movaps	%xmm2, %xmm0 ;\
	addps	%xmm1, %xmm12 ;\
	movaps	-24 * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	-28 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm9 ;\
	movaps	%xmm0, %xmm2 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	-20 * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm1, %xmm0 ;\
	mulps	-28 * SIZE(AO, %rax, 4), %xmm1 ;\
	addps	%xmm0, %xmm10 ;\
	movaps	-24 * SIZE(AO, %rax, 4), %xmm0 ;\
	addps	%xmm1, %xmm14 ;\
	movaps	-16 * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	-28 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm11 ;\
	addps	%xmm3, %xmm15 ;\
 	movaps	-12 * SIZE(BO, %rax, 8), %xmm3 ;\
	movaps	%xmm0, %xmm2
255
/* KERNEL_SUB2: SSE tail step 2 of 4. */
#define KERNEL_SUB2(xx) \
	mulps	%xmm1, %xmm0 ;\
	mulps	-20 * SIZE(AO, %rax, 4), %xmm1 ;\
	addps	%xmm0, %xmm8 ;\
	movaps	%xmm2, %xmm0 ;\
	addps	%xmm1, %xmm12 ;\
	movaps	 -8 * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	-20 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm9 ;\
	movaps	%xmm0, %xmm2 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 -4 * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm1, %xmm0 ;\
	mulps	-20 * SIZE(AO, %rax, 4), %xmm1 ;\
	addps	%xmm0, %xmm10 ;\
	movaps	  (AO, %rax, 4), %xmm0 ;\
	addps	%xmm1, %xmm14 ;\
	movaps	 32 * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	-20 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm11 ;\
	addps	%xmm3, %xmm15 ;\
 	movaps	  4 * SIZE(BO, %rax, 8), %xmm3 ;\
	movaps	%xmm4, %xmm2
281
/* KERNEL_SUB3: SSE tail step 3 of 4. */
#define KERNEL_SUB3(xx) \
	mulps	%xmm5, %xmm4 ;\
	mulps	-12 * SIZE(AO, %rax, 4), %xmm5 ;\
	addps	%xmm4, %xmm8 ;\
	movaps	%xmm2, %xmm4 ;\
	addps	%xmm5, %xmm12 ;\
	movaps	  8 * SIZE(BO, %rax, 8), %xmm5 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	-12 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm9 ;\
	movaps	%xmm4, %xmm2 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 12 * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm5, %xmm4 ;\
	mulps	-12 * SIZE(AO, %rax, 4), %xmm5 ;\
	addps	%xmm4, %xmm10 ;\
	movaps	 -8 * SIZE(AO, %rax, 4), %xmm4 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 16 * SIZE(BO, %rax, 8), %xmm5 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	-12 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm11 ;\
	addps	%xmm3, %xmm15 ;\
 	movaps	 20 * SIZE(BO, %rax, 8), %xmm3 ;\
	movaps	%xmm4, %xmm2
307
/* KERNEL_SUB4: SSE tail step 4 of 4; ends with the next A value
   (xmm0, loaded by KERNEL_SUB2) staged in xmm2 for the .L17 loop. */
#define KERNEL_SUB4(xx) \
	mulps	%xmm5, %xmm4 ;\
	mulps	 -4 * SIZE(AO, %rax, 4), %xmm5 ;\
	addps	%xmm4, %xmm8 ;\
	movaps	%xmm2, %xmm4 ;\
	addps	%xmm5, %xmm12 ;\
	movaps	 24 * SIZE(BO, %rax, 8), %xmm5 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	 -4 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm9 ;\
	movaps	%xmm4, %xmm2 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 28 * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm5, %xmm4 ;\
	mulps	 -4 * SIZE(AO, %rax, 4), %xmm5 ;\
	addps	%xmm4, %xmm10 ;\
	addps	%xmm5, %xmm14 ;\
	mulps	%xmm3, %xmm2 ;\
	mulps	 -4 * SIZE(AO, %rax, 4), %xmm3 ;\
	addps	%xmm2, %xmm11 ;\
	addps	%xmm3, %xmm15 ;\
 	movaps	 36 * SIZE(BO, %rax, 8), %xmm3 ;\
	movaps	%xmm0, %xmm2
331
#if defined(OS_LINUX) && defined(CORE_BULLDOZER) && !defined(TRMMKERNEL)
	.align 32768
#endif
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	/* Save callee-saved GP registers (SysV: rbx, rbp, r12-r15). */
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Win64: rdi/rsi and xmm6-xmm15 are callee-saved too. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Move Win64 arguments into the register layout used below. */
	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	/* alpha arrives in xmm3 under the Windows ABI; the code below
	   expects it in xmm0. */
	movaps	%xmm3, %xmm0

#else
	movq	72(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	80(%rsp), %xmm12
#endif

#endif

	movq	%rsp, %rbx	# save old stack
	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

	/* Broadcast alpha across all four lanes and park it on the stack. */
	shufps	$0, %xmm0, %xmm0
	movaps	%xmm0, ALPHA

#ifdef TRMMKERNEL
	movsd	%xmm12, OFFSET
	movsd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	/* Bias A by +32 floats so the kernels can address it with small
	   negative displacements (-32*SIZE ...). */
	subq	$-32 * SIZE, A

	/* Convert LDC from an element count to a byte stride. */
	leaq	(, LDC, SIZE), LDC

	movq	N,  J
	sarq	$2, J		# j = (n >> 2)
	jle	.L50

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K,  %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_4
.L02:
	/* Expand one B panel into BUFFER, k unrolled by 4: each group of
	   four consecutive B values is splatted (pshufd broadcast) into a
	   full 4-lane vector, producing 16 vectors (64 floats) per pass.
	   Fix: the original issued the identical load
	   "movaps 0 * SIZE(B), %xmm3" twice back-to-back; the redundant
	   duplicate has been removed (same address, same destination —
	   the second load was dead work). */
	prefetcht0 192(B)
	prefetcht0 256(B)
	prefetcht0 192(BO)
	prefetcht0 256(BO)
	movaps	 0 * SIZE(B), %xmm3
	movaps	 4 * SIZE(B), %xmm7
	movaps	 8 * SIZE(B), %xmm11
	movaps	12 * SIZE(B), %xmm15

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm2
	pshufd	$0xff, %xmm3, %xmm3

	pshufd	$0x00, %xmm7, %xmm4
	pshufd	$0x55, %xmm7, %xmm5
	pshufd	$0xaa, %xmm7, %xmm6
	pshufd	$0xff, %xmm7, %xmm7

	movaps	%xmm0,   0 * SIZE(BO)
	movaps	%xmm1,   4 * SIZE(BO)
	movaps	%xmm2,   8 * SIZE(BO)
	movaps	%xmm3,  12 * SIZE(BO)
	movaps	%xmm4,  16 * SIZE(BO)
	movaps	%xmm5,  20 * SIZE(BO)
	movaps	%xmm6,  24 * SIZE(BO)
	movaps	%xmm7,  28 * SIZE(BO)

	pshufd	$0x00, %xmm11, %xmm0
	pshufd	$0x55, %xmm11, %xmm1
	pshufd	$0xaa, %xmm11, %xmm2
	pshufd	$0xff, %xmm11, %xmm3

	pshufd	$0x00, %xmm15, %xmm4
	pshufd	$0x55, %xmm15, %xmm5
	pshufd	$0xaa, %xmm15, %xmm6
	pshufd	$0xff, %xmm15, %xmm7

	movaps	%xmm0,  32 * SIZE(BO)
	movaps	%xmm1,  36 * SIZE(BO)
	movaps	%xmm2,  40 * SIZE(BO)
	movaps	%xmm3,  44 * SIZE(BO)
	movaps	%xmm4,  48 * SIZE(BO)
	movaps	%xmm5,  52 * SIZE(BO)
	movaps	%xmm6,  56 * SIZE(BO)
	movaps	%xmm7,  60 * SIZE(BO)

	addq	$16 * SIZE, B
	addq	$64 * SIZE, BO

	decq	%rax
	jne	.L02
	ALIGN_4
483
.L03:
	/* Handle the k%4 remainder of the B-panel expansion. */
	movq	K,  %rax
	andq	$3, %rax
	BRANCH
	jle	.L10
	ALIGN_4

.L04:
	movaps	 0 * SIZE(B), %xmm3

	/* Splat each of the four B values across a full vector. */
	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1
	pshufd	 $0xaa, %xmm3, %xmm2
	pshufd	 $0xff, %xmm3, %xmm3

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)

	addq	$ 4 * SIZE, B
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L04
	ALIGN_4

.L10:
	/* Start of the m loop for this 4-column block.  CO1/CO2 address
	   columns j and j+1; columns j+2/j+3 are reached via
	   (CO1,LDC,2) and (CO2,LDC,2). */
	movq	C, CO1
	leaq	(C, LDC, 1), CO2
	movq	A, AO


	movq	M,  I
	sarq	$3, I	# i = (m >> 3)
	jle	.L20
	ALIGN_4
520
.L11:
	/* 8x4 tile: compute an 8-row by 4-column block of C. */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	32 * SIZE + BUFFER, BO
#else
	/* TRMM offset: skip the first KK iterations of A and B. */
	leaq	32 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* Prime the software pipeline and clear the 8 accumulators. */
	movaps	-32 * SIZE(AO), %xmm0
	movaps	-32 * SIZE(BO), %xmm1
	xorps	%xmm8, %xmm8
 	movaps	-28 * SIZE(BO), %xmm3
	xorps	%xmm9, %xmm9
	movaps	-16 * SIZE(AO), %xmm4
	xorps	%xmm10, %xmm10
	movaps	  0 * SIZE(BO), %xmm5
	xorps	%xmm11, %xmm11


	xorps	%xmm12, %xmm12
	xorps	%xmm13, %xmm13
	xorps	%xmm14, %xmm14
	xorps	%xmm15, %xmm15
	movaps	%xmm0, %xmm2
	prefetcht0 (CO1)
	prefetcht0 (CO1,LDC, 2)
	prefetcht0 (CO2)
	prefetcht0 (CO2,LDC, 2)

	/* Effective trip count: K for GEMM; for TRMM it depends on the
	   LEFT/TRANSA combination (stored in KKK for the tail loops). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$8, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	/* Round k down to a multiple of 8 for the unrolled loop; %rax is
	   turned into a negative index that counts up toward zero. */
	andq	$-8, %rax

	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_3
580
.L12:
	/* Main k loop: up to 64 k-steps per pass (8 rounds of the 8-way
	   pipelined kernel macros), with an exit test after each round.
	   %rax runs negative toward zero; KERNEL8 advances it, so the
	   "je .L15" tests fire as soon as the rounded count is consumed. */
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	NOBRANCH
	je	.L15
	KERNEL1(32 *  0)
	KERNEL2(32 *  0)
	KERNEL3(32 *  0)
	KERNEL4(32 *  0)
	KERNEL5(32 *  0)
	KERNEL6(32 *  0)
	KERNEL7(32 *  0)
	KERNEL8(32 *  0)
	BRANCH
	jl	.L12
	ALIGN_4
663
.L15:

	movaps	ALPHA, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	/* k tail, part 1: a 4-step chunk handled by the SSE sub-kernels. */
	testq	$4, %rax
	je .L16
	xorq	%rax, %rax
	ALIGN_3

	KERNEL_SUB1(32 *  0)
	KERNEL_SUB2(32 *  0)
	KERNEL_SUB3(32 *  0)
	KERNEL_SUB4(32 *  0)

	addq	$32 * SIZE, AO
	addq	$64 * SIZE, BO
	ALIGN_3

.L16:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	je .L18

	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	ALIGN_4

.L17:
	/* k tail, part 2: single k-steps for the remaining k%4
	   iterations (same update as KERNEL_SUB1, one step per pass). */
	mulps	%xmm1, %xmm0
	mulps	-28 * SIZE(AO, %rax, 4), %xmm1
	addps	%xmm0, %xmm8
	movaps	%xmm2, %xmm0
	addps	%xmm1, %xmm12
	movaps	-24 * SIZE(BO, %rax, 8), %xmm1
	mulps	%xmm3, %xmm2
	mulps	-28 * SIZE(AO, %rax, 4), %xmm3
	addps	%xmm2, %xmm9
	movaps	%xmm0, %xmm2
	addps	%xmm3, %xmm13
	movaps	-20 * SIZE(BO, %rax, 8), %xmm3
	mulps	%xmm1, %xmm0
	mulps	-28 * SIZE(AO, %rax, 4), %xmm1
	addps	%xmm0, %xmm10
	movaps	-24 * SIZE(AO, %rax, 4), %xmm0
	addps	%xmm1, %xmm14
	movaps	-16 * SIZE(BO, %rax, 8), %xmm1
	mulps	%xmm3, %xmm2
	mulps	-28 * SIZE(AO, %rax, 4), %xmm3
	addps	%xmm2, %xmm11
	addps	%xmm3, %xmm15
 	movaps	-12 * SIZE(BO, %rax, 8), %xmm3
	movaps	%xmm0, %xmm2

	addq	$SIZE * 2, %rax
	jl	.L17
	ALIGN_4
731
.L18:
#ifndef TRMMKERNEL
	/* GEMM: C = alpha*acc + C, fused via FMA4 with a C memory
	   operand (xmm7 holds broadcast alpha). */
	vfmaddps 0 * SIZE(CO1),%xmm7, %xmm8, %xmm8
	vfmaddps 4 * SIZE(CO1),%xmm7, %xmm12, %xmm12
	vfmaddps 0 * SIZE(CO2),%xmm7, %xmm9, %xmm9
	vfmaddps 4 * SIZE(CO2),%xmm7, %xmm13, %xmm13
	vfmaddps 0 * SIZE(CO1, LDC, 2),%xmm7, %xmm10, %xmm10
	vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm7, %xmm14, %xmm14
	vfmaddps 0 * SIZE(CO2, LDC, 2),%xmm7, %xmm11, %xmm11
	vfmaddps 4 * SIZE(CO2, LDC, 2),%xmm7, %xmm15, %xmm15

#else

	/* TRMM: no accumulate from C — scale by alpha only. */
	vmulps	%xmm7, %xmm8, %xmm8
	vmulps	%xmm7, %xmm9, %xmm9
	vmulps	%xmm7, %xmm10, %xmm10
	vmulps	%xmm7, %xmm11, %xmm11

	vmulps	%xmm7, %xmm12,%xmm12
	vmulps	%xmm7, %xmm13,%xmm13
	vmulps	%xmm7, %xmm14,%xmm14
	vmulps	%xmm7, %xmm15,%xmm15

#endif


	vmovups	%xmm8,  0 * SIZE(CO1)
	vmovups	%xmm12, 4 * SIZE(CO1)
	vmovups	%xmm9,  0 * SIZE(CO2)
	vmovups	%xmm13, 4 * SIZE(CO2)
	vmovups	%xmm10, 0 * SIZE(CO1, LDC, 2)
	vmovups	%xmm14, 4 * SIZE(CO1, LDC, 2)
	vmovups	%xmm11, 0 * SIZE(CO2, LDC, 2)
	vmovups	%xmm15, 4 * SIZE(CO2, LDC, 2)
	prefetcht0 64(CO1)
	prefetcht0 64(CO1,LDC, 2)
	prefetcht0 64(CO2)
	prefetcht0 64(CO2,LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8
	decq	I			# i --
	jg	.L11
	ALIGN_4
790
.L20:
	/* m tail: 4-row by 4-column block. */
	testq	$4, M
	je	.L30

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	movaps	-32 * SIZE(AO), %xmm8
	movaps	-16 * SIZE(AO), %xmm10

	movaps	  0 * SIZE(BO), %xmm9
	movaps	 16 * SIZE(BO), %xmm11
	movaps	 32 * SIZE(BO), %xmm13
	movaps	 48 * SIZE(BO), %xmm15

	/* xmm0..xmm3 accumulate the four C columns for these 4 rows. */
	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
	/* Both branches add 4: the m-block (LEFT) and n-block widths
	   coincide here (MR == NR == 4). */
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L25
	ALIGN_4
839
.L22:
	/* 4-row k loop, unrolled by 8, plain SSE mul/add.  Each stanza is
	   one k-step: four broadcast B vectors multiply the same A vector
	   and accumulate into xmm0..xmm3. */
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movaps	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm1
	movaps	 8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	mulps	12 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm2
	movaps	64 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm3
	movaps	-28 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movaps	20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm1
	movaps	24 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	mulps	28 * SIZE(BO), %xmm8
	addps	%xmm11, %xmm2
	movaps	80 * SIZE(BO), %xmm11
	addps	%xmm8, %xmm3
	movaps	-24 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm13
	addps	%xmm13, %xmm0
	movaps	36 * SIZE(BO), %xmm13
	mulps	%xmm8, %xmm13
	addps	%xmm13, %xmm1
	movaps	40 * SIZE(BO), %xmm13
	mulps	%xmm8, %xmm13
	mulps	44 * SIZE(BO), %xmm8
	addps	%xmm13, %xmm2
	movaps	96 * SIZE(BO), %xmm13
	addps	%xmm8, %xmm3
	movaps	-20 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm15
	addps	%xmm15, %xmm0
	movaps	52 * SIZE(BO), %xmm15
	mulps	%xmm8, %xmm15
	addps	%xmm15, %xmm1
	movaps	56 * SIZE(BO), %xmm15
	mulps	%xmm8, %xmm15
	mulps	60 * SIZE(BO), %xmm8
	addps	%xmm15, %xmm2
	movaps	112 * SIZE(BO), %xmm15
	addps	%xmm8, %xmm3
	movaps	 0 * SIZE(AO), %xmm8

	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm0
	movaps	68 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm1
	movaps	72 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	mulps	76 * SIZE(BO), %xmm10
	addps	%xmm9, %xmm2
	movaps	128 * SIZE(BO), %xmm9
	addps	%xmm10, %xmm3
	movaps	-12 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm0
	movaps	84 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm1
	movaps	88 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	mulps	92 * SIZE(BO), %xmm10
	addps	%xmm11, %xmm2
	movaps	144 * SIZE(BO), %xmm11
	addps	%xmm10, %xmm3
	movaps	-8 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movaps	100 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm1
	movaps	104 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	mulps	108 * SIZE(BO), %xmm10
	addps	%xmm13, %xmm2
	movaps	160 * SIZE(BO), %xmm13
	addps	%xmm10, %xmm3
	movaps	-4 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movaps	116 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm1
	movaps	120 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	mulps	124 * SIZE(BO), %xmm10
	addps	%xmm15, %xmm2
	movaps	176 * SIZE(BO), %xmm15
	addps	%xmm10, %xmm3
	movaps	16 * SIZE(AO), %xmm10

	addq   $ 32 * SIZE, AO
	addq   $128 * SIZE, BO
	decq   %rax
	jne    .L22
	ALIGN_4
950
.L25:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# k % 8 remainder
	BRANCH
	je .L28
	ALIGN_4

.L26:
	/* One k-step per pass for the k%8 remainder. */
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movaps	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm1
	movaps	 8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	mulps	12 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm2
	movaps	16 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm3
	movaps	-28 * SIZE(AO), %xmm8

	addq	$ 4 * SIZE, AO		# aoffset  += 4
	addq	$16 * SIZE, BO		# boffset1 += 16
	decq	%rax
	jg	.L26
	ALIGN_4

.L28:
	/* Scale by alpha, then (GEMM only) accumulate into C. */
	mulps	%xmm15, %xmm0
	mulps	%xmm15, %xmm1
 	mulps	%xmm15, %xmm2
	mulps	%xmm15, %xmm3

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movhps	2 * SIZE(CO2), %xmm10

	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
	movhps	2 * SIZE(CO1, LDC, 2), %xmm12
	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
	movhps	2 * SIZE(CO2, LDC, 2), %xmm14

	addps	%xmm8,  %xmm0
	addps	%xmm10, %xmm1
	addps	%xmm12, %xmm2
	addps	%xmm14, %xmm3
#endif

	vmovups	%xmm0, 0 * SIZE(CO1)
	vmovups	%xmm1, 0 * SIZE(CO2)

	vmovups	%xmm2, 0 * SIZE(CO1, LDC, 2)
	vmovups	%xmm3, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4
1028
.L30:
	/* m tail: 2-row by 4-column block (movsd = 2-float loads). */
	testq	$2, M
	je	.L40

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

	movaps	-32 * SIZE(AO), %xmm8
	movaps	-24 * SIZE(AO), %xmm10

	movaps	 0 * SIZE(BO), %xmm9
	movaps	16 * SIZE(BO), %xmm11
	movaps	32 * SIZE(BO), %xmm13
	movaps	48 * SIZE(BO), %xmm15

	/* xmm0..xmm3 accumulate the four C columns for these 2 rows. */
	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L35
	ALIGN_4
1077
.L32:
	/* 2-row k loop, unrolled by 8.  Only the low halves of the
	   broadcast B vectors are reloaded (movsd); the multiplies still
	   operate on full registers, but just the low two lanes of the
	   accumulators are stored at .L38. */
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm1
	movsd	 8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movsd	12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movsd	-30 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm3
	movsd	64 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movsd	20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm1
	movsd	24 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movsd	28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movsd	-28 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm3
	movsd	80 * SIZE(BO), %xmm11

	mulps	%xmm8, %xmm13
	addps	%xmm13, %xmm0
	movsd	36 * SIZE(BO), %xmm13
	mulps	%xmm8, %xmm13
	addps	%xmm13, %xmm1
	movsd	40 * SIZE(BO), %xmm13
	mulps	%xmm8, %xmm13
	addps	%xmm13, %xmm2
	movsd	44 * SIZE(BO), %xmm13
	mulps	%xmm8, %xmm13
	movsd	-26 * SIZE(AO), %xmm8
	addps	%xmm13, %xmm3
	movsd	96 * SIZE(BO), %xmm13

	mulps	%xmm8, %xmm15
	addps	%xmm15, %xmm0
	movsd	52 * SIZE(BO), %xmm15
	mulps	%xmm8, %xmm15
	addps	%xmm15, %xmm1
	movsd	56 * SIZE(BO), %xmm15
	mulps	%xmm8, %xmm15
	addps	%xmm15, %xmm2
	movsd	60 * SIZE(BO), %xmm15
	mulps	%xmm8, %xmm15
	movsd	-16 * SIZE(AO), %xmm8
	addps	%xmm15, %xmm3
	movsd	112 * SIZE(BO), %xmm15

	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm0
	movsd	68 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm1
	movsd	72 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	addps	%xmm9, %xmm2
	movsd	76 * SIZE(BO), %xmm9
	mulps	%xmm10, %xmm9
	movsd	-22 * SIZE(AO), %xmm10
	addps	%xmm9, %xmm3
	movsd	128 * SIZE(BO), %xmm9

	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm0
	movsd	84 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm1
	movsd	88 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	addps	%xmm11, %xmm2
	movsd	92 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm11
	movsd	-20 * SIZE(AO), %xmm10
	addps	%xmm11, %xmm3
	movsd	144 * SIZE(BO), %xmm11

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movsd	100 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm1
	movsd	104 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movsd	108 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movsd	-18 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm3
	movsd	160 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movsd	116 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm1
	movsd	120 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movsd	124 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movsd	 -8 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm3
	movsd	176 * SIZE(BO), %xmm15

	addq   $ 16 * SIZE, AO
	addq   $128 * SIZE, BO
	decq   %rax
	jne    .L32
	ALIGN_4
1196
.L35:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# k % 8 remainder
	BRANCH
	je .L38
	ALIGN_4

.L36:
	/* One k-step per pass for the k%8 remainder (2 rows). */
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm1
	movsd	  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movsd	 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movsd	-30 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm3
	movsd	 16 * SIZE(BO), %xmm9

	addq	$ 2 * SIZE, AO		# aoffset  += 2
	addq	$16 * SIZE, BO		# boffset1 += 16
	decq	%rax
	jg	.L36
	ALIGN_4

.L38:
	/* Scale by alpha, then (GEMM only) accumulate the low two lanes
	   into each of the four C columns. */
	mulps	%xmm15, %xmm0
	mulps	%xmm15, %xmm1
 	mulps	%xmm15, %xmm2
	mulps	%xmm15, %xmm3

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
	movsd	0 * SIZE(CO2, LDC, 2), %xmm14

	addps	%xmm8,  %xmm0
	addps	%xmm10, %xmm1
	addps	%xmm12, %xmm2
	addps	%xmm14, %xmm3
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO2)
	movsd	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movsd	%xmm3, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4
1269
.L40:
/* M=1, N=4 tile: one row of A against four columns of B, fully scalar
 * (movss/mulss).  xmm0..xmm3 accumulate one column each; xmm8/xmm10 hold
 * single A values; xmm9/11/13/15 stream broadcast B values. */
	testq	$1, M
	je	.L49

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip the first KK iterations (4 bytes of A, 64 bytes of
	 * broadcast B per k-step — hence the doubled (BO,%rax,8) add). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   4), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
	leaq	(BO, %rax, 8), BO
#endif

	movss	-32 * SIZE(AO), %xmm8
	movss	-28 * SIZE(AO), %xmm10

	movss	 0 * SIZE(BO), %xmm9
	movss	16 * SIZE(BO), %xmm11
	movss	32 * SIZE(BO), %xmm13
	movss	48 * SIZE(BO), %xmm15

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	/* Trip count: K (plain GEMM) or the TRMM-specific KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# M-block width
#else
	addq	$4, %rax		# N-block width
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# 8x unrolled main loop
	je	.L45
	ALIGN_4

.L42:
	/* 8 k-steps per pass; A values for steps 0..7 are reloaded just
	 * before their last use to hide load latency. */
	mulss	%xmm8, %xmm9
	addss	%xmm9, %xmm0
	movss	 4 * SIZE(BO), %xmm9
	mulss	%xmm8, %xmm9
	addss	%xmm9, %xmm1
	movss	 8 * SIZE(BO), %xmm9
	mulss	%xmm8, %xmm9
	addss	%xmm9, %xmm2
	movss	12 * SIZE(BO), %xmm9
	mulss	%xmm8, %xmm9
	movss	-31 * SIZE(AO), %xmm8
	addss	%xmm9, %xmm3
	movss	64 * SIZE(BO), %xmm9

	mulss	%xmm8, %xmm11
	addss	%xmm11, %xmm0
	movss	20 * SIZE(BO), %xmm11
	mulss	%xmm8, %xmm11
	addss	%xmm11, %xmm1
	movss	24 * SIZE(BO), %xmm11
	mulss	%xmm8, %xmm11
	addss	%xmm11, %xmm2
	movss	28 * SIZE(BO), %xmm11
	mulss	%xmm8, %xmm11
	movss	-30 * SIZE(AO), %xmm8
	addss	%xmm11, %xmm3
	movss	80 * SIZE(BO), %xmm11

	mulss	%xmm8, %xmm13
	addss	%xmm13, %xmm0
	movss	36 * SIZE(BO), %xmm13
	mulss	%xmm8, %xmm13
	addss	%xmm13, %xmm1
	movss	40 * SIZE(BO), %xmm13
	mulss	%xmm8, %xmm13
	addss	%xmm13, %xmm2
	movss	44 * SIZE(BO), %xmm13
	mulss	%xmm8, %xmm13
	movss	-29 * SIZE(AO), %xmm8
	addss	%xmm13, %xmm3
	movss	96 * SIZE(BO), %xmm13

	mulss	%xmm8, %xmm15
	addss	%xmm15, %xmm0
	movss	52 * SIZE(BO), %xmm15
	mulss	%xmm8, %xmm15
	addss	%xmm15, %xmm1
	movss	56 * SIZE(BO), %xmm15
	mulss	%xmm8, %xmm15
	addss	%xmm15, %xmm2
	movss	60 * SIZE(BO), %xmm15
	mulss	%xmm8, %xmm15
	movss	-24 * SIZE(AO), %xmm8
	addss	%xmm15, %xmm3
	movss	112 * SIZE(BO), %xmm15

	mulss	%xmm10, %xmm9
	addss	%xmm9, %xmm0
	movss	68 * SIZE(BO), %xmm9
	mulss	%xmm10, %xmm9
	addss	%xmm9, %xmm1
	movss	72 * SIZE(BO), %xmm9
	mulss	%xmm10, %xmm9
	addss	%xmm9, %xmm2
	movss	76 * SIZE(BO), %xmm9
	mulss	%xmm10, %xmm9
	movss	-27 * SIZE(AO), %xmm10
	addss	%xmm9, %xmm3
	movss	128 * SIZE(BO), %xmm9

	mulss	%xmm10, %xmm11
	addss	%xmm11, %xmm0
	movss	84 * SIZE(BO), %xmm11
	mulss	%xmm10, %xmm11
	addss	%xmm11, %xmm1
	movss	88 * SIZE(BO), %xmm11
	mulss	%xmm10, %xmm11
	addss	%xmm11, %xmm2
	movss	92 * SIZE(BO), %xmm11
	mulss	%xmm10, %xmm11
	movss	-26 * SIZE(AO), %xmm10
	addss	%xmm11, %xmm3
	movss	144 * SIZE(BO), %xmm11

	mulss	%xmm10, %xmm13
	addss	%xmm13, %xmm0
	movss	100 * SIZE(BO), %xmm13
	mulss	%xmm10, %xmm13
	addss	%xmm13, %xmm1
	movss	104 * SIZE(BO), %xmm13
	mulss	%xmm10, %xmm13
	addss	%xmm13, %xmm2
	movss	108 * SIZE(BO), %xmm13
	mulss	%xmm10, %xmm13
	movss	-25 * SIZE(AO), %xmm10
	addss	%xmm13, %xmm3
	movss	160 * SIZE(BO), %xmm13

	mulss	%xmm10, %xmm15
	addss	%xmm15, %xmm0
	movss	116 * SIZE(BO), %xmm15
	mulss	%xmm10, %xmm15
	addss	%xmm15, %xmm1
	movss	120 * SIZE(BO), %xmm15
	mulss	%xmm10, %xmm15
	addss	%xmm15, %xmm2
	movss	124 * SIZE(BO), %xmm15
	mulss	%xmm10, %xmm15
	movss	-20 * SIZE(AO), %xmm10
	addss	%xmm15, %xmm3
	movss	176 * SIZE(BO), %xmm15

	addq   $  8 * SIZE, AO		# A: 8 k-steps * 1 row
	addq   $128 * SIZE, BO		# B: 8 k-steps * 4 cols * 4-wide broadcast
	decq   %rax
	jne    .L42
	ALIGN_4

.L45:
/* k-remainder (k & 7) for the 1x4 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L48
	ALIGN_4

.L46:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movss	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm1
	movss	 8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movss	12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-31 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm3
	movss	16 * SIZE(BO), %xmm9

	addq	$ 1 * SIZE, AO		# aoffset  += 1
	addq	$16 * SIZE, BO		# boffset1 += 16 (4 cols * 4-wide broadcast)
	decq	%rax
	jg	.L46
	ALIGN_4

.L48:
	/* alpha-scale, optional C accumulate, store one float per column. */
	mulss	%xmm15, %xmm0
	mulss	%xmm15, %xmm1
 	mulss	%xmm15, %xmm2
	mulss	%xmm15, %xmm3

#ifndef TRMMKERNEL
	movss	0 * SIZE(CO1), %xmm8
	movss	0 * SIZE(CO2), %xmm10
	movss	0 * SIZE(CO1, LDC, 2), %xmm12
	movss	0 * SIZE(CO2, LDC, 2), %xmm14

	addss	%xmm8,  %xmm0
	addss	%xmm10, %xmm1
	addss	%xmm12, %xmm2
	addss	%xmm14, %xmm3
#endif

	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm1, 0 * SIZE(CO2)
	movss	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movss	%xmm3, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: skip remaining K-KKK iterations (4 bytes A, 64 bytes B each). */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    4), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L49:
/* End of one N=4 column block: advance C by 4 columns and loop on J.
 * Falls through to the N&2 tail when the J loop is done. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$4, KK			# NOTE(review): 32-bit addl, elsewhere addq — confirm KK slot width
#endif
	leaq	(C, LDC, 4), C		# c += 4 * ldc
	decq	J			# j --
	jg	.L01

.L50:
	testq	$2, N
	je	.L100

.L51:
/* N=2 column pair: reset KK for the TRMM LEFT case. */
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer: broadcast each of the 2 B values per k-step
 * into a 4-wide lane in BUFFER (so the compute loops can use packed
 * mulps against 4 A values at once).  Main loop handles 4 k-steps. */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L53
	ALIGN_4

.L52:

	movaps	 0 * SIZE(B), %xmm3
	movaps	 4 * SIZE(B), %xmm7


	pshufd	$0x00, %xmm3, %xmm0	# splat k+0, col 0
	pshufd	$0x55, %xmm3, %xmm1	# splat k+0, col 1
	pshufd	$0xaa, %xmm3, %xmm2	# splat k+1, col 0
	pshufd	$0xff, %xmm3, %xmm3	# splat k+1, col 1


	pshufd	$0x00, %xmm7, %xmm4
	pshufd	$0x55, %xmm7, %xmm5
	pshufd	$0xaa, %xmm7, %xmm6
	pshufd	$0xff, %xmm7, %xmm7

	movaps	%xmm0,   0 * SIZE(BO)
	movaps	%xmm1,   4 * SIZE(BO)
	movaps	%xmm2,   8 * SIZE(BO)
	movaps	%xmm3,  12 * SIZE(BO)
	movaps	%xmm4,  16 * SIZE(BO)
	movaps	%xmm5,  20 * SIZE(BO)
	movaps	%xmm6,  24 * SIZE(BO)
	movaps	%xmm7,  28 * SIZE(BO)

	addq	$ 8 * SIZE, B		# 4 k-steps * 2 cols
	addq	$32 * SIZE, BO		# 4 k-steps * 2 cols * 4-wide

	decq	%rax
	jne	.L52
	ALIGN_4

.L53:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L60
	ALIGN_4

.L54:
	/* Remainder: one k-step = 2 B values. */
	movsd	 0 * SIZE(B), %xmm3

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1

	/* NOTE(review): the four xmm7 shuffles below look like dead leftovers
	 * from the 4-column copy loop — their results are never stored here,
	 * and xmm7 is stale from .L52.  Harmless but confirm before removing. */
	pshufd	$0x00, %xmm7, %xmm4
	pshufd	$0x55, %xmm7, %xmm5
	pshufd	$0xaa, %xmm7, %xmm6
	pshufd	$0xff, %xmm7, %xmm7

	movaps	%xmm0,   0 * SIZE(BO)
	movaps	%xmm1,   4 * SIZE(BO)

	addq	$ 2 * SIZE, B
	addq	$ 8 * SIZE, BO
	decq	%rax
	jne	.L54
	ALIGN_4

.L60:
/* N=2 row loop setup: two C column pointers, then iterate M in chunks of 8. */
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$3, I	# i = (m >> 3)
	jle	.L70
	ALIGN_4

.L61:
/* M=8, N=2 tile.  Accumulators: xmm0/xmm4 = columns 0/... rows 0-3,
 * xmm1/xmm5 = rows 4-7 halves (xmm0,xmm4 col0+col1 low rows, xmm1,xmm5
 * the second column — see the writeback at .L68 for the final mapping).
 * xmm8/10/12/14 hold packed A rows; xmm9/11/13/15 stream broadcast B. */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip KK iterations (32 bytes A, 32 bytes broadcast B each). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	-32 * SIZE(AO), %xmm8
	movaps	-16 * SIZE(AO), %xmm10
	movaps	  0 * SIZE(AO), %xmm12
	movaps	 16 * SIZE(AO), %xmm14

	movaps	 0 * SIZE(BO), %xmm9
	movaps	16 * SIZE(BO), %xmm11
	movaps	32 * SIZE(BO), %xmm13
	movaps	48 * SIZE(BO), %xmm15

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	prefetchw      4 * SIZE(CO1)	# prime the store targets
	xorps	%xmm4, %xmm4
	prefetchw      4 * SIZE(CO2)
	xorps	%xmm5, %xmm5

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$8, %rax		# M-block width
#else
	addq	$2, %rax		# N-block width
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# 8x unrolled main loop
	je	.L65
	ALIGN_4

.L62:
	/* 8 k-steps per pass; one memory-operand mulps per column saves a
	 * register reload for the second column. */
	mulps	%xmm8, %xmm9
	mulps	 4 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm0
	movaps	 0 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm1
	movaps	-28 * SIZE(AO), %xmm8
	mulps	%xmm8, %xmm9
	mulps	 4 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm4
	movaps	 8 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm5
	movaps	-24 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm9
	mulps	12 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm0
	movaps	 8 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm1
	movaps	-20 * SIZE(AO), %xmm8
	mulps	%xmm8, %xmm9
	mulps	12 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm4
	movaps	64 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm5
	movaps	32 * SIZE(AO), %xmm8

	mulps	%xmm10, %xmm11
	mulps	20 * SIZE(BO), %xmm10
	addps	%xmm11, %xmm0
	movaps	16 * SIZE(BO), %xmm11
	addps	%xmm10, %xmm1
	movaps	-12 * SIZE(AO), %xmm10
	mulps	%xmm10, %xmm11
	mulps	20 * SIZE(BO), %xmm10
	addps	%xmm11, %xmm4
	movaps	24 * SIZE(BO), %xmm11
	addps	%xmm10, %xmm5
	movaps	 -8 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm11
	mulps	28 * SIZE(BO), %xmm10
	addps	%xmm11, %xmm0
	movaps	24 * SIZE(BO), %xmm11
	addps	%xmm10, %xmm1
	movaps	-4 * SIZE(AO), %xmm10
	mulps	%xmm10, %xmm11
	mulps	28 * SIZE(BO), %xmm10
	addps	%xmm11, %xmm4
	movaps	80 * SIZE(BO), %xmm11
	addps	%xmm10, %xmm5
	movaps	48 * SIZE(AO), %xmm10

	mulps	%xmm12, %xmm13
	mulps	36 * SIZE(BO), %xmm12
	addps	%xmm13, %xmm0
	movaps	32 * SIZE(BO), %xmm13
	addps	%xmm12, %xmm1
	movaps	 4 * SIZE(AO), %xmm12
	mulps	%xmm12, %xmm13
	mulps	36 * SIZE(BO), %xmm12
	addps	%xmm13, %xmm4
	movaps	40 * SIZE(BO), %xmm13
	addps	%xmm12, %xmm5
	movaps	 8 * SIZE(AO), %xmm12

	mulps	%xmm12, %xmm13
	mulps	44 * SIZE(BO), %xmm12
	addps	%xmm13, %xmm0
	movaps	40 * SIZE(BO), %xmm13
	addps	%xmm12, %xmm1
	movaps	12 * SIZE(AO), %xmm12
	mulps	%xmm12, %xmm13
	mulps	44 * SIZE(BO), %xmm12
	addps	%xmm13, %xmm4
	movaps	96 * SIZE(BO), %xmm13
	addps	%xmm12, %xmm5
	movaps	64 * SIZE(AO), %xmm12

	mulps	%xmm14, %xmm15
	mulps	52 * SIZE(BO), %xmm14
	addps	%xmm15, %xmm0
	movaps	48 * SIZE(BO), %xmm15
	addps	%xmm14, %xmm1
	movaps	20 * SIZE(AO), %xmm14
	mulps	%xmm14, %xmm15
	mulps	52 * SIZE(BO), %xmm14
	addps	%xmm15, %xmm4
	movaps	56 * SIZE(BO), %xmm15
	addps	%xmm14, %xmm5
	movaps	24 * SIZE(AO), %xmm14

	mulps	%xmm14, %xmm15
	mulps	60 * SIZE(BO), %xmm14
	addps	%xmm15, %xmm0
	movaps	56 * SIZE(BO), %xmm15
	addps	%xmm14, %xmm1
	movaps	28 * SIZE(AO), %xmm14
	mulps	%xmm14, %xmm15
	mulps	60 * SIZE(BO), %xmm14
	addps	%xmm15, %xmm4
	movaps	112 * SIZE(BO), %xmm15
	addps	%xmm14, %xmm5
	movaps	80 * SIZE(AO), %xmm14

	addq   $64 * SIZE, AO		# A: 8 k-steps * 8 rows
	addq   $64 * SIZE, BO		# B: 8 k-steps * 2 cols * 4-wide broadcast
	decq   %rax
	jne    .L62
	ALIGN_4

.L65:
/* k-remainder (k & 7) for the 8x2 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulps	%xmm8, %xmm9
	mulps	 4 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm0
	movaps	 0 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm1
	movaps	-28 * SIZE(AO), %xmm8
	mulps	%xmm8, %xmm9
	mulps	 4 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm4
	movaps	 8 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm5
	movaps	-24 * SIZE(AO), %xmm8

	addq	$8 * SIZE, AO		# aoffset  += 8
	addq	$8 * SIZE, BO		# boffset1 += 8 (2 cols * 4-wide broadcast)
	decq	%rax
	jg	.L66
	ALIGN_4

.L68:
	/* Writeback: xmm0/xmm4 -> CO1 rows 0-7, xmm1/xmm5 -> CO2 rows 0-7. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	4 * SIZE(CO1), %xmm9
	movhps	6 * SIZE(CO1), %xmm9

	movsd	0 * SIZE(CO2), %xmm10
	movhps	2 * SIZE(CO2), %xmm10
	movsd	4 * SIZE(CO2), %xmm11
	movhps	6 * SIZE(CO2), %xmm11
#endif

	mulps	%xmm15, %xmm0
	mulps	%xmm15, %xmm4
	mulps	%xmm15, %xmm1
	mulps	%xmm15, %xmm5

#ifndef TRMMKERNEL
	addps	%xmm8,  %xmm0
	addps	%xmm9,  %xmm4
	addps	%xmm10, %xmm1
	addps	%xmm11, %xmm5
#endif

	/* NOTE(review): VEX-encoded stores amid legacy-SSE code; fine on
	 * AVX-capable parts, but this file otherwise uses plain SSE. */
	vmovups	%xmm0, 0 * SIZE(CO1)
	vmovups	%xmm4, 4 * SIZE(CO1)

	vmovups	%xmm1, 0 * SIZE(CO2)
	vmovups	%xmm5, 4 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: skip remaining K-KKK iterations (32 bytes A and B each). */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8
	decq	I			# i --
	jg	.L61
	ALIGN_4

.L70:
/* M=4, N=2 tile.  Two accumulator pairs (xmm0/xmm1 and xmm2/xmm3) are kept
 * per column and summed at .L78, halving the loop-carried dependency. */
	testq	$4, M
	je	.L80


#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip KK iterations (16 bytes A, 32 bytes broadcast B each). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	-32 * SIZE(AO), %xmm8
	movaps	-16 * SIZE(AO), %xmm10

	movaps	 0 * SIZE(BO), %xmm9
	movaps	16 * SIZE(BO), %xmm11
	movaps	32 * SIZE(BO), %xmm13
	movaps	48 * SIZE(BO), %xmm15

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax		# M-block width
#else
	addq	$2, %rax		# N-block width
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# 8x unrolled main loop
	je	.L75
	ALIGN_4

.L72:
	mulps	%xmm8, %xmm9

	mulps	 4 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm0
	movaps	 8 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm1
	movaps	-28 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm9
	mulps	12 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm2
	movaps	64 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm3
	movaps	-24 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm11
	mulps	20 * SIZE(BO), %xmm8
	addps	%xmm11, %xmm0
	movaps	24 * SIZE(BO), %xmm11
	addps	%xmm8, %xmm1
	movaps	-20 * SIZE(AO), %xmm8

	mulps	%xmm8, %xmm11
	mulps	28 * SIZE(BO), %xmm8
	addps	%xmm11, %xmm2
	movaps	80 * SIZE(BO), %xmm11
	addps	%xmm8, %xmm3
	movaps	 0 * SIZE(AO), %xmm8

	mulps	%xmm10, %xmm13
	mulps	36 * SIZE(BO), %xmm10
	addps	%xmm13, %xmm0
	movaps	40 * SIZE(BO), %xmm13
	addps	%xmm10, %xmm1
	movaps	-12 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm13
	mulps	44 * SIZE(BO), %xmm10
	addps	%xmm13, %xmm2
	movaps	96 * SIZE(BO), %xmm13
	addps	%xmm10, %xmm3
	movaps	 -8 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm15
	mulps	52 * SIZE(BO), %xmm10
	addps	%xmm15, %xmm0
	movaps	56 * SIZE(BO), %xmm15
	addps	%xmm10, %xmm1
	movaps	 -4 * SIZE(AO), %xmm10

	mulps	%xmm10, %xmm15
	mulps	60 * SIZE(BO), %xmm10
	addps	%xmm15, %xmm2
	movaps	112 * SIZE(BO), %xmm15
	addps	%xmm10, %xmm3
	movaps	16 * SIZE(AO), %xmm10

	addq   $32 * SIZE, AO		# A: 8 k-steps * 4 rows
	addq   $64 * SIZE, BO		# B: 8 k-steps * 2 cols * 4-wide broadcast
	decq   %rax
	jne    .L72
	ALIGN_4

.L75:
/* k-remainder (k & 7) for the 4x2 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulps	%xmm8, %xmm9
	mulps	 4 * SIZE(BO), %xmm8
	addps	%xmm9, %xmm0
	movaps	 8 * SIZE(BO), %xmm9
	addps	%xmm8, %xmm1
	movaps	-28 * SIZE(AO), %xmm8

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8 (2 cols * 4-wide broadcast)
	decq	%rax
	jg	.L76
	ALIGN_4

.L78:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movhps	2 * SIZE(CO2), %xmm10
#endif

	/* Fold the paired accumulators, scale, optionally add C, store. */
	addps	%xmm2, %xmm0
	addps	%xmm3, %xmm1

	mulps	%xmm15, %xmm0
	mulps	%xmm15, %xmm1

#ifndef TRMMKERNEL
	addps	%xmm8,  %xmm0
	addps	%xmm10, %xmm1
#endif

	vmovups	%xmm0, 0 * SIZE(CO1)
	vmovups	%xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: skip remaining K-KKK iterations (16 bytes A, 32 bytes B each). */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4

.L80:
/* M=2, N=2 tile.  movsd loads 2 A floats; xmm0/xmm2 and xmm1/xmm3 are
 * paired accumulators per column, folded at .L88. */
	testq	$2, M
	je	.L90

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip KK iterations (8 bytes A, 32 bytes broadcast B each). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	-32 * SIZE(AO), %xmm8
	movaps	-24 * SIZE(AO), %xmm10

	movaps	 0 * SIZE(BO), %xmm9
	movaps	16 * SIZE(BO), %xmm11
	movaps	32 * SIZE(BO), %xmm13
	movaps	48 * SIZE(BO), %xmm15

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax		# M-block width
#else
	addq	$2, %rax		# N-block width
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# 8x unrolled main loop
	je	.L85
	ALIGN_4

.L82:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movsd	-30 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd	 8 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movsd	12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movsd	-28 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm3
	movsd	64 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movsd	20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movsd	-26 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movsd	24 * SIZE(BO), %xmm11

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movsd	28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movsd	-16 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm3
	movsd	 80 * SIZE(BO), %xmm11

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movsd	36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movsd	-22 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movsd	40 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movsd	44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movsd	-20 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm3
	movsd	 96 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movsd	52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movsd	-18 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movsd	56 * SIZE(BO), %xmm15

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movsd	60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movsd	-8 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm3
	movsd	112 * SIZE(BO), %xmm15

	addq   $16 * SIZE, AO		# A: 8 k-steps * 2 rows
	addq   $64 * SIZE, BO		# B: 8 k-steps * 2 cols * 4-wide broadcast
	decq   %rax
	jne    .L82
	ALIGN_4

.L85:
/* k-remainder (k & 7) for the 2x2 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L88
	ALIGN_4

.L86:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movsd	-30 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd	  8 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$8 * SIZE, BO		# boffset1 += 8 (2 cols * 4-wide broadcast)
	decq	%rax
	jg	.L86
	ALIGN_4

.L88:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
#endif

	/* Fold the paired accumulators, scale, optionally add C, store. */
	addps	%xmm2, %xmm0
	addps	%xmm3, %xmm1

	mulps	%xmm15, %xmm0
	mulps	%xmm15, %xmm1

#ifndef TRMMKERNEL
	addps	%xmm8,  %xmm0
	addps	%xmm10, %xmm1
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: skip remaining K-KKK iterations (8 bytes A, 32 bytes B each). */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

.L90:
/* M=1, N=2 tile: scalar loads of A (movss), paired accumulators
 * xmm0/xmm2 and xmm1/xmm3 folded at .L98. */
	testq	$1, M
	je	.L99

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip KK iterations (4 bytes A, 32 bytes broadcast B each). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   4), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

	movss	-32 * SIZE(AO), %xmm8
	movss	-28 * SIZE(AO), %xmm10

	movss	 0 * SIZE(BO), %xmm9
	movss	16 * SIZE(BO), %xmm11
	movss	32 * SIZE(BO), %xmm13
	movss	48 * SIZE(BO), %xmm15

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# M-block width
#else
	addq	$2, %rax		# N-block width
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# 8x unrolled main loop
	je	.L95
	ALIGN_4

.L92:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movss	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-31 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movss	 8 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movss	12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-30 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm3
	movss	64 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movss	20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movss	-29 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movss	24 * SIZE(BO), %xmm11

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movss	28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movss	-24 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm3
	movss	 80 * SIZE(BO), %xmm11

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movss	36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movss	-27 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movss	40 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movss	44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movss	-26 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm3
	movss	 96 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movss	52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movss	-25 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movss	56 * SIZE(BO), %xmm15

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movss	60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movss	-20 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm3
	movss	112 * SIZE(BO), %xmm15

	addq   $ 8 * SIZE, AO		# A: 8 k-steps * 1 row
	addq   $64 * SIZE, BO		# B: 8 k-steps * 2 cols * 4-wide broadcast
	decq   %rax
	jne    .L92
	ALIGN_4

.L95:
/* k-remainder (k & 7) for the 1x2 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L98
	ALIGN_4

.L96:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movss	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-31 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movss	 8 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$8 * SIZE, BO		# boffset1 += 8 (2 cols * 4-wide broadcast)
	decq	%rax
	jg	.L96
	ALIGN_4

.L98:
#ifndef TRMMKERNEL
	movss	0 * SIZE(CO1), %xmm8
	movss	0 * SIZE(CO2), %xmm10
#endif

	/* Fold accumulator pairs, scale by alpha, optionally add C, store. */
	addss	%xmm2, %xmm0
	addss	%xmm3, %xmm1
	mulss	%xmm15, %xmm0
	mulss	%xmm15, %xmm1

#ifndef TRMMKERNEL
	addss	%xmm8,  %xmm0
	addss	%xmm10, %xmm1
#endif

	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: skip remaining K-KKK iterations (4 bytes A, 32 bytes B each). */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    4), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L99:
/* End of the N&2 column pair: advance C by 2 columns, fall into the
 * N&1 tail. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK			# NOTE(review): 32-bit addl, elsewhere addq — confirm KK slot width
#endif
	leaq	(C, LDC, 2), C		# c += 2 * ldc
	ALIGN_4


.L100:
	testq	$1, N
	je	.L999

.L101:
/* N=1 column: broadcast the single B value per k-step into a 4-wide lane
 * of BUFFER.  Main copy loop handles 8 k-steps. */
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$3, %rax
	jle	.L103
	ALIGN_4


.L102:

	movups	 0 * SIZE(B), %xmm3
	movups	 4 * SIZE(B), %xmm7


	pshufd	$0x00, %xmm3, %xmm0	# splat k+0
	pshufd	$0x55, %xmm3, %xmm1	# splat k+1
	pshufd	$0xaa, %xmm3, %xmm2	# splat k+2
	pshufd	$0xff, %xmm3, %xmm3	# splat k+3


	pshufd	$0x00, %xmm7, %xmm4	# splat k+4..k+7
	pshufd	$0x55, %xmm7, %xmm5
	pshufd	$0xaa, %xmm7, %xmm6
	pshufd	$0xff, %xmm7, %xmm7

	movaps	%xmm0,   0 * SIZE(BO)
	movaps	%xmm1,   4 * SIZE(BO)
	movaps	%xmm2,   8 * SIZE(BO)
	movaps	%xmm3,  12 * SIZE(BO)
	movaps	%xmm4,  16 * SIZE(BO)
	movaps	%xmm5,  20 * SIZE(BO)
	movaps	%xmm6,  24 * SIZE(BO)
	movaps	%xmm7,  28 * SIZE(BO)

	addq	$ 8 * SIZE, B		# 8 k-steps * 1 col
	addq	$32 * SIZE, BO		# 8 k-steps * 4-wide

	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L110
	ALIGN_4

.L104:
	/* Remainder: one B value per k-step. */
	movss	 0 * SIZE(B), %xmm3

	pshufd	$0x00, %xmm3, %xmm0

	movaps	%xmm0,   0 * SIZE(BO)

	addq	$ 1 * SIZE, B
	addq	$ 4 * SIZE, BO
	decq	%rax
	jne	.L104
	ALIGN_4

.L110:
/* N=1 row loop setup: single C pointer; iterate M in chunks of 8. */
	movq	C, CO1			# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$3, I	# i = (m >> 3)
	jle	.L120
	ALIGN_4

.L111:
/* M=8, N=1 tile.  xmm0 accumulates rows 0-3, xmm4 rows 4-7; A in
 * xmm8/10/12/14, broadcast B in xmm9/xmm11 (note the reversed mulps
 * operand order vs. the other tiles: B is the read-only source here). */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* TRMM: skip KK iterations (32 bytes A, 16 bytes broadcast B each). */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

	movaps	-32 * SIZE(AO), %xmm8
	movaps	-16 * SIZE(AO), %xmm10
	movaps	  0 * SIZE(AO), %xmm12
	movaps	 16 * SIZE(AO), %xmm14

	movaps	 0 * SIZE(BO), %xmm9
	movaps	16 * SIZE(BO), %xmm11
	movaps	32 * SIZE(BO), %xmm13
	movaps	48 * SIZE(BO), %xmm15

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	prefetchw      4 * SIZE(CO1)	# prime the store target
	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$8, %rax		# M-block width
#else
	addq	$1, %rax		# N-block width
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# 8x unrolled main loop
	je	.L115
	ALIGN_4

.L112:
	mulps	%xmm9, %xmm8

	mulps	-28 * SIZE(AO), %xmm9
	addps	%xmm8, %xmm0
	movaps	-24 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm4
	movaps	 4 * SIZE(BO), %xmm9

	mulps	%xmm9, %xmm8
	mulps	-20 * SIZE(AO), %xmm9
	addps	%xmm8, %xmm0
	movaps	 32 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm4
	movaps	 8 * SIZE(BO), %xmm9

	mulps	%xmm9, %xmm10
	mulps	-12 * SIZE(AO), %xmm9
	addps	%xmm10, %xmm0
	movaps	 -8 * SIZE(AO), %xmm10
	addps	%xmm9, %xmm4
	movaps	12 * SIZE(BO), %xmm9

	mulps	%xmm9, %xmm10
	mulps	 -4 * SIZE(AO), %xmm9
	addps	%xmm10, %xmm0
	movaps	 48 * SIZE(AO), %xmm10
	addps	%xmm9, %xmm4
	movaps	32 * SIZE(BO), %xmm9

	mulps	%xmm11, %xmm12
	mulps	 4 * SIZE(AO), %xmm11
	addps	%xmm12, %xmm0
	movaps	 8 * SIZE(AO), %xmm12
	addps	%xmm11, %xmm4
	movaps	20 * SIZE(BO), %xmm11

	mulps	%xmm11, %xmm12
	mulps	12 * SIZE(AO), %xmm11
	addps	%xmm12, %xmm0
	movaps	64 * SIZE(AO), %xmm12
	addps	%xmm11, %xmm4
	movaps	24 * SIZE(BO), %xmm11

	mulps	%xmm11, %xmm14
	mulps	20 * SIZE(AO), %xmm11
	addps	%xmm14, %xmm0
	movaps	24 * SIZE(AO), %xmm14
	addps	%xmm11, %xmm4
	movaps	28 * SIZE(BO), %xmm11

	mulps	%xmm11, %xmm14
	mulps	28 * SIZE(AO), %xmm11
	addps	%xmm14, %xmm0
	movaps	80 * SIZE(AO), %xmm14
	addps	%xmm11, %xmm4
	movaps	48 * SIZE(BO), %xmm11

	addq   $64 * SIZE, AO		# A: 8 k-steps * 8 rows
	addq   $32 * SIZE, BO		# B: 8 k-steps * 1 col * 4-wide broadcast
	decq   %rax
	jne    .L112
	ALIGN_4

.L115:
/* k-remainder (k & 7) for the 8x1 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# remainder iterations: k & 7
	BRANCH
	je .L118
	ALIGN_4

.L116:
	mulps	%xmm9, %xmm8
	mulps	-28 * SIZE(AO), %xmm9
	addps	%xmm8, %xmm0
	movaps	-24 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm4
	movaps	 4 * SIZE(BO), %xmm9

	addq	$8 * SIZE, AO		# aoffset  += 8
	addq	$4 * SIZE, BO		# boffset1 += 4 (1 col * 4-wide broadcast)
	decq	%rax
	jg	.L116
	ALIGN_4

.L118:
	/* Writeback: rows 0-3 from xmm0, rows 4-7 from xmm4, one column. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	4 * SIZE(CO1), %xmm9
	movhps	6 * SIZE(CO1), %xmm9
#endif

	mulps	%xmm15, %xmm0
	mulps	%xmm15, %xmm4
#ifndef TRMMKERNEL
	addps	%xmm8,  %xmm0
	addps	%xmm9,  %xmm4
#endif

	vmovups	%xmm0, 0 * SIZE(CO1)
	vmovups	%xmm4, 4 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: skip remaining K-KKK iterations (32 bytes A, 16 bytes B each). */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$8, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 8
	decq	I			# i --
	jg	.L111
	ALIGN_4

/* .L120: M & 4 — one 4-row strip, N=1.                               */
2658.L120:
2659	testq	$4, M
2660	je	.L130
2661
/* Position BO (and AO for the TRMM right-side case) at the KK        */
/* offset: rax = KK*8, so both AO and BO advance 4 floats per k       */
/* (rax*2 bytes each).                                                */
2662#if !defined(TRMMKERNEL) || \
2663	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2664	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2665
2666	leaq	BUFFER, BO
2667#else
2668	leaq	BUFFER, BO
2669	movq	KK, %rax
2670	leaq	(, %rax,   8), %rax
2671	leaq	(AO, %rax, 2), AO
2672	leaq	(BO, %rax, 2), BO
2673#endif
2674
/* preload the first A/B vectors; xmm0..xmm3 are partial accumulators */
/* folded together at .L128                                           */
2675	movaps	-32 * SIZE(AO), %xmm8
2676	movaps	-16 * SIZE(AO), %xmm10
2677
2678	movaps	 0 * SIZE(BO), %xmm9
2679	movaps	16 * SIZE(BO), %xmm11
2680
2681	xorps	%xmm0, %xmm0
2682	xorps	%xmm1, %xmm1
2683	xorps	%xmm2, %xmm2
2684	xorps	%xmm3, %xmm3
2685
/* Trip count, as in the M=8 case; LEFT widens by 4 here.             */
2686#ifndef TRMMKERNEL
2687	movq	K, %rax
2688#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2689	movq	K, %rax
2690	subq	KK, %rax
2691	movq	%rax, KKK
2692#else
2693	movq	KK, %rax
2694#ifdef LEFT
2695	addq	$4, %rax		# LEFT: M-block width (4)
2696#else
2697	addq	$1, %rax		# RIGHT: N width (1)
2698#endif
2699	movq	%rax, KKK
2700#endif
2701	sarq	$3, %rax		# unrolled 8x in k
2702	je	.L125
2703	ALIGN_4
2704
/* .L122: 8 k-steps per pass; 4 A floats x 4-wide replicated B per    */
/* step, spread across four accumulators to break dependency chains.  */
2705.L122:
2706	mulps	%xmm8, %xmm9
2707	movaps	-28 * SIZE(AO), %xmm8
2708	mulps	 4 * SIZE(BO), %xmm8
2709	addps	%xmm9, %xmm0
2710	movaps	32 * SIZE(BO), %xmm9
2711	addps	%xmm8, %xmm1
2712	movaps	-24 * SIZE(AO), %xmm8
2713	mulps	 8 * SIZE(BO), %xmm8
2714	addps	%xmm8, %xmm2
2715	movaps	-20 * SIZE(AO), %xmm8
2716	mulps	12 * SIZE(BO), %xmm8
2717	addps	%xmm8, %xmm3
2718	movaps	  0 * SIZE(AO), %xmm8
2719
2720	mulps	%xmm10, %xmm11
2721	movaps	-12 * SIZE(AO), %xmm10
2722	mulps	20 * SIZE(BO), %xmm10
2723	addps	%xmm11, %xmm0
2724	movaps	48 * SIZE(BO), %xmm11
2725	addps	%xmm10, %xmm1
2726	movaps	 -8 * SIZE(AO), %xmm10
2727	mulps	24 * SIZE(BO), %xmm10
2728	addps	%xmm10, %xmm2
2729	movaps	-4 * SIZE(AO), %xmm10
2730	mulps	28 * SIZE(BO), %xmm10
2731	addps	%xmm10, %xmm3
2732	movaps	16 * SIZE(AO), %xmm10
2733
/* 8 k-steps consumed: 32 A floats (4 per k), 32 B floats (4 per k) */
2734	addq   $32 * SIZE, AO
2735	addq   $32 * SIZE, BO
2736	decq   %rax
2737	jne    .L122
2738	ALIGN_4
2739
/* .L125/.L126: scalar tail for the remaining (K & 7) k-steps */
2740.L125:
2741#ifndef TRMMKERNEL
2742	movq	K, %rax
2743#else
2744	movq	KKK, %rax
2745#endif
2746	movaps	ALPHA, %xmm15
2747	andq	$7, %rax		# remainder = k & 7
2748	BRANCH
2749	je .L128
2750	ALIGN_4
2751
2752.L126:
2753	mulps	%xmm8, %xmm9
2754	movaps	-28 * SIZE(AO), %xmm8
2755	addps	%xmm9, %xmm0
2756	movaps	 4 * SIZE(BO), %xmm9
2757
2758	addq	$4 * SIZE, AO		# aoffset  += 4
2759	addq	$4 * SIZE, BO		# boffset1 += 4
2760	decq	%rax
2761	jg	.L126
2762	ALIGN_4
2763
/* .L128: fold the four partial accumulators, apply alpha, add C      */
/* (non-TRMM only), store 4 floats unaligned.                         */
2764.L128:
2765#ifndef TRMMKERNEL
2766	movsd	0 * SIZE(CO1), %xmm8
2767	movhps	2 * SIZE(CO1), %xmm8
2768#endif
2769
2770	addps	%xmm1, %xmm0
2771	addps	%xmm3, %xmm2
2772	addps	%xmm2, %xmm0
2773
2774	mulps	%xmm15, %xmm0
2775#ifndef TRMMKERNEL
2776	addps	%xmm8,  %xmm0
2777#endif
2778
2779	vmovups	%xmm0, 0 * SIZE(CO1)
2780
/* TRMM: skip the untouched k-range — AO and BO each advance          */
/* 4 floats per k ((K-KKK)*8*2 bytes).                                */
2781#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2782    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2783	movq	K, %rax
2784	subq	KKK, %rax
2785	leaq	(,%rax,    8), %rax
2786	leaq	(AO, %rax, 2), AO
2787	leaq	(BO, %rax, 2), BO
2788#endif
2789
2790#if defined(TRMMKERNEL) && defined(LEFT)
2791	addq	$4, KK			# advance KK by the M-block width
2792#endif
2793
2794	addq	$4 * SIZE, CO1		# coffset += 4
2795	ALIGN_4
2796
/* .L130: M & 2 — two remaining rows, N=1.  Uses movsd (2-float)      */
/* loads; only the low halves of the vectors carry data.              */
2797.L130:
2798	testq	$2, M
2799	je	.L140
2800
/* Position pointers at the KK offset for the TRMM right-side case:   */
/* rax = KK*8 bytes, so AO advances 2 floats/k, BO 4 floats/k.        */
2801#if !defined(TRMMKERNEL) || \
2802	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2803	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2804
2805	leaq	BUFFER, BO
2806#else
2807	leaq	BUFFER, BO
2808	movq	KK, %rax
2809	leaq	(, %rax,   8), %rax
2810	leaq	(AO, %rax, 1), AO
2811	leaq	(BO, %rax, 2), BO
2812#endif
2813
/* preload; xmm0/xmm1 are the two partial accumulators folded at .L138 */
2814	movaps	-32 * SIZE(AO), %xmm8
2815	movaps	-24 * SIZE(AO), %xmm10
2816
2817	movaps	 0 * SIZE(BO), %xmm9
2818	movaps	16 * SIZE(BO), %xmm11
2819
2820	xorps	%xmm0, %xmm0
2821	xorps	%xmm1, %xmm1
2822	xorps	%xmm2, %xmm2
2823	xorps	%xmm3, %xmm3
2824
/* Trip count, as in the wider cases; LEFT widens by 2 here.          */
2825#ifndef TRMMKERNEL
2826	movq	K, %rax
2827#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2828	movq	K, %rax
2829	subq	KK, %rax
2830	movq	%rax, KKK
2831#else
2832	movq	KK, %rax
2833#ifdef LEFT
2834	addq	$2, %rax		# LEFT: M-block width (2)
2835#else
2836	addq	$1, %rax		# RIGHT: N width (1)
2837#endif
2838	movq	%rax, KKK
2839#endif
2840	sarq	$3, %rax		# unrolled 8x in k
2841	je	.L135
2842	ALIGN_4
2843
/* .L132: 8 k-steps per pass, 2 A floats per step against the low     */
/* half of the replicated B vector; alternating xmm0/xmm1.            */
2844.L132:
2845	mulps	%xmm8, %xmm9
2846	movsd	-30 * SIZE(AO), %xmm8
2847	addps	%xmm9, %xmm0
2848	movsd	 4 * SIZE(BO), %xmm9
2849	mulps	%xmm8, %xmm9
2850	movsd	-28 * SIZE(AO), %xmm8
2851	addps	%xmm9, %xmm1
2852	movsd	 8 * SIZE(BO), %xmm9
2853
2854	mulps	%xmm8, %xmm9
2855	movsd	-26 * SIZE(AO), %xmm8
2856	addps	%xmm9, %xmm0
2857	movsd	12 * SIZE(BO), %xmm9
2858
2859	mulps	%xmm8, %xmm9
2860	movsd	-16 * SIZE(AO), %xmm8
2861	addps	%xmm9, %xmm1
2862	movsd	32 * SIZE(BO), %xmm9
2863
2864	mulps	%xmm10, %xmm11
2865	movsd	-22 * SIZE(AO), %xmm10
2866	addps	%xmm11, %xmm0
2867	movsd	20 * SIZE(BO), %xmm11
2868
2869	mulps	%xmm10, %xmm11
2870	movsd	-20 * SIZE(AO), %xmm10
2871	addps	%xmm11, %xmm1
2872	movsd	24 * SIZE(BO), %xmm11
2873
2874	mulps	%xmm10, %xmm11
2875	movsd	-18 * SIZE(AO), %xmm10
2876	addps	%xmm11, %xmm0
2877	movsd	28 * SIZE(BO), %xmm11
2878
2879	mulps	%xmm10, %xmm11
2880	movsd	 -8 * SIZE(AO), %xmm10
2881	addps	%xmm11, %xmm1
2882	movsd	48 * SIZE(BO), %xmm11
2883
/* 8 k-steps consumed: 16 A floats (2 per k), 32 B floats (4 per k) */
2884	addq   $16 * SIZE, AO
2885	addq   $32 * SIZE, BO
2886	decq   %rax
2887	jne    .L132
2888	ALIGN_4
2889
/* .L135/.L136: scalar tail for the remaining (K & 7) k-steps */
2890.L135:
2891#ifndef TRMMKERNEL
2892	movq	K, %rax
2893#else
2894	movq	KKK, %rax
2895#endif
2896	movaps	ALPHA, %xmm15
2897	andq	$7, %rax		# remainder = k & 7
2898	BRANCH
2899	je .L138
2900	ALIGN_4
2901
2902.L136:
2903	mulps	%xmm8, %xmm9
2904	movsd	-30 * SIZE(AO), %xmm8
2905	addps	%xmm9, %xmm0
2906	movsd	 4 * SIZE(BO), %xmm9
2907
2908	addq	$2 * SIZE, AO		# aoffset  += 2
2909	addq	$4 * SIZE, BO		# boffset1 += 4
2910	decq	%rax
2911	jg	.L136
2912	ALIGN_4
2913
/* .L138: fold partials, apply alpha, add the 2 C floats (non-TRMM),  */
/* store the low 8 bytes back.                                        */
2914.L138:
2915	addps	%xmm1,  %xmm0
2916	mulps	%xmm15, %xmm0
2917
2918#ifndef TRMMKERNEL
2919	movsd	0 * SIZE(CO1), %xmm8
2920	addps	%xmm8,  %xmm0
2921#endif
2922
2923	movsd	%xmm0, 0 * SIZE(CO1)
2924
/* TRMM: skip the untouched k-range — AO advances 2 floats/k,         */
/* BO 4 floats/k.                                                     */
2925#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2926    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2927	movq	K, %rax
2928	subq	KKK, %rax
2929	leaq	(,%rax,    8), %rax
2930	leaq	(AO, %rax, 1), AO
2931	leaq	(BO, %rax, 2), BO
2932#endif
2933
2934#if defined(TRMMKERNEL) && defined(LEFT)
2935	addq	$2, KK			# advance KK by the M-block width
2936#endif
2937
2938	addq	$2 * SIZE, CO1		# coffset += 2
2939	ALIGN_4
2940
/* .L140: M & 1 — final single row, N=1.  Pure scalar (movss/mulss);  */
/* falls through to the epilogue at .L999 when done, so no TRMM       */
/* pointer advance or KK update is needed after .L148.                */
2941.L140:
2942	testq	$1, M
2943	je	.L999
2944
/* Position pointers at the KK offset for the TRMM right-side case:   */
/* rax = KK*4 bytes, so AO advances 1 float/k, BO 4 floats/k.         */
2945#if !defined(TRMMKERNEL) || \
2946	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2947	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2948
2949	leaq	BUFFER, BO
2950#else
2951	leaq	BUFFER, BO
2952	movq	KK, %rax
2953	leaq	(, %rax,   4), %rax
2954	leaq	(AO, %rax, 1), AO
2955	leaq	(BO, %rax, 4), BO
2956#endif
2957
/* preload scalars; xmm0..xmm3 are partial sums folded at .L148 */
2958	movss	-32 * SIZE(AO), %xmm8
2959	movss	-28 * SIZE(AO), %xmm10
2960
2961	movss	 0 * SIZE(BO), %xmm9
2962	movss	16 * SIZE(BO), %xmm11
2963
2964	xorps	%xmm0, %xmm0
2965	xorps	%xmm1, %xmm1
2966	xorps	%xmm2, %xmm2
2967	xorps	%xmm3, %xmm3
2968
/* Trip count; here LEFT and RIGHT both widen by 1 (M width = N       */
/* width = 1), hence the identical branches.                          */
2969#ifndef TRMMKERNEL
2970	movq	K, %rax
2971#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2972	movq	K, %rax
2973	subq	KK, %rax
2974	movq	%rax, KKK
2975#else
2976	movq	KK, %rax
2977#ifdef LEFT
2978	addq	$1, %rax
2979#else
2980	addq	$1, %rax
2981#endif
2982	movq	%rax, KKK
2983#endif
2984	sarq	$3, %rax		# unrolled 8x in k
2985	je	.L145
2986	ALIGN_4
2987
/* .L142: 8 scalar k-steps per pass, spread across four accumulators; */
/* B is still stored 4-wide in BUFFER, so B strides by 4 per k.       */
2988.L142:
2989	mulss	%xmm8, %xmm9
2990	movss	-31 * SIZE(AO), %xmm8
2991	mulss	 4 * SIZE(BO), %xmm8
2992	addss	%xmm9, %xmm0
2993	movss	32 * SIZE(BO), %xmm9
2994	addss	%xmm8, %xmm1
2995	movss	-30 * SIZE(AO), %xmm8
2996	mulss	 8 * SIZE(BO), %xmm8
2997	addss	%xmm8, %xmm2
2998	movss	-29 * SIZE(AO), %xmm8
2999	mulss	12 * SIZE(BO), %xmm8
3000	addss	%xmm8, %xmm3
3001	movss	-24 * SIZE(AO), %xmm8
3002	mulss	%xmm10, %xmm11
3003	movss	-27 * SIZE(AO), %xmm10
3004	mulss	20 * SIZE(BO), %xmm10
3005	addss	%xmm11, %xmm0
3006	movss	48 * SIZE(BO), %xmm11
3007	addss	%xmm10, %xmm1
3008	movss	-26 * SIZE(AO), %xmm10
3009	mulss	24 * SIZE(BO), %xmm10
3010	addss	%xmm10, %xmm2
3011	movss	-25 * SIZE(AO), %xmm10
3012	mulss	28 * SIZE(BO), %xmm10
3013	addss	%xmm10, %xmm3
3014	movss	-20 * SIZE(AO), %xmm10
3015
/* 8 k-steps consumed: 8 A floats (1 per k), 32 B floats (4 per k) */
3016	addq   $ 8 * SIZE, AO
3017	addq   $32 * SIZE, BO
3018	decq   %rax
3019	jne    .L142
3020	ALIGN_4
3021
/* .L145/.L146: scalar tail for the remaining (K & 7) k-steps */
3022.L145:
3023#ifndef TRMMKERNEL
3024	movq	K, %rax
3025#else
3026	movq	KKK, %rax
3027#endif
3028	movss	ALPHA, %xmm15
3029	andq	$7, %rax		# remainder = k & 7
3030	BRANCH
3031	je .L148
3032	ALIGN_4
3033
3034.L146:
3035	mulss	%xmm8, %xmm9
3036	movss	-31 * SIZE(AO), %xmm8
3037	addss	%xmm9, %xmm0
3038	movss	 4 * SIZE(BO), %xmm9
3039
3040	addq	$1 * SIZE, AO		# aoffset  += 1
3041	addq	$4 * SIZE, BO		# boffset1 += 4
3042	decq	%rax
3043	jg	.L146
3044	ALIGN_4
3045
/* .L148: fold partials, apply alpha, add the single C float          */
/* (non-TRMM), store it back.                                         */
3046.L148:
3047	addss	%xmm1, %xmm0
3048	addss	%xmm3, %xmm2
3049	addss	%xmm2, %xmm0
3050
3051	mulss	%xmm15, %xmm0
3052
3053#ifndef TRMMKERNEL
3054	movss	0 * SIZE(CO1), %xmm8
3055	addss	%xmm8,  %xmm0
3056#endif
3057	movss	%xmm0, 0 * SIZE(CO1)
3058	ALIGN_4
3059
/* .L999: common epilogue.  %rbx holds the stack pointer saved by the */
/* prologue (outside this view — TODO confirm); restoring it first    */
/* makes the fixed offsets below valid regardless of any alignment    */
/* adjustment done at entry.                                          */
3060.L999:
3061	movq	%rbx, %rsp
/* restore the SysV callee-saved registers pushed by the prologue */
3062	movq	  0(%rsp), %rbx
3063	movq	  8(%rsp), %rbp
3064	movq	 16(%rsp), %r12
3065	movq	 24(%rsp), %r13
3066	movq	 32(%rsp), %r14
3067	movq	 40(%rsp), %r15
3068
/* Windows x64 additionally treats rdi/rsi and xmm6-xmm15 as          */
/* callee-saved, so they were spilled at entry and must be reloaded.  */
3069#ifdef WINDOWS_ABI
3070	movq	 48(%rsp), %rdi
3071	movq	 56(%rsp), %rsi
3072	movups	 64(%rsp), %xmm6
3073	movups	 80(%rsp), %xmm7
3074	movups	 96(%rsp), %xmm8
3075	movups	112(%rsp), %xmm9
3076	movups	128(%rsp), %xmm10
3077	movups	144(%rsp), %xmm11
3078	movups	160(%rsp), %xmm12
3079	movups	176(%rsp), %xmm13
3080	movups	192(%rsp), %xmm14
3081	movups	208(%rsp), %xmm15
3082#endif
3083
/* release the frame reserved by the prologue and return */
3084	addq	$STACKSIZE, %rsp
3085	ret
3086
3087	EPILOGUE
3088