/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
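
/* SSE3 kernel working on 4x4 blocks of double-precision data.  The
   LN/LT/RN/RT conditionals select which solve variant this file is built
   as; the #defines below fix the register roles used throughout. */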

#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%r12
#define BB	%rbp
#define	J	%rbx

#ifndef WINDOWS_ABI

#define STACKSIZE 96

#define OFFSET	 48(%rsp)
#define AORIG	 56(%rsp)
#define KK	 64(%rsp)
#define KKK	 72(%rsp)

#else

#define STACKSIZE 256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#define OFFSET	224(%rsp)
#define AORIG	232(%rsp)
#define KK	240(%rsp)
#define KKK	248(%rsp)

#endif

#define PREFETCH     prefetch
#define PREFETCHSIZE  (8 *  7 + 0)

#define movlpd	movsd
#define movapd	movups
#define movupd	movups
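
/* PREFETCHSIZE sets the software-prefetch lookahead (in elements) used in
   KERNEL1 below; movapd/movupd are mapped to movups (and movlpd to movsd),
   presumably to tolerate unaligned buffers on this target.

   KERNEL1..KERNEL8 each perform one k-iteration of the 4x4 update,
   unrolled eight times: xmm8..xmm15 hold the eight 2-wide accumulators,
   A values stay packed (movapd) while B values are broadcast (movddup),
   and KERNEL8 advances %rax past the 8 elements just consumed. */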

#define KERNEL1(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

#define KERNEL2(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	addpd	%xmm1, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
/**/	movddup	  (BO, %rax, 4), %xmm1 ;\
	movapd	%xmm4, %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL4(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
/***/	movapd	  (AO, %rax, 4), %xmm6 ;\
	addpd	%xmm4, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
	movddup	  8 * SIZE(BO, %rax, 4), %xmm5 ;\
	movapd	%xmm6, %xmm2

#define KERNEL5(xx) \
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm8 ;\
	movapd	%xmm2, %xmm6 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	  2 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
/**/	movapd	  8 * SIZE(AO, %rax, 4), %xmm7 ;\
	movapd	%xmm6, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	  3 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm10 ;\
	movapd	  4 * SIZE(AO, %rax, 4), %xmm6 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	  4 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  5 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm6, %xmm2

#define KERNEL6(xx) \
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm8 ;\
	movapd	%xmm2, %xmm6 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	  6 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm6, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	  7 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm10 ;\
/***/	movapd	 16 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  9 * SIZE(BO, %rax, 4), %xmm3 ;\
	movddup	 16 * SIZE(BO, %rax, 4), %xmm1 ;\
	movapd	%xmm7, %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	%xmm2, %xmm7 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 10 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm7, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 11 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm10 ;\
	movapd	 12 * SIZE(AO, %rax, 4), %xmm7 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 12 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 13 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm7, %xmm2

#define KERNEL8(xx) \
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	%xmm2, %xmm7 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 14 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm7, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 15 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
/**/	movapd	 24 * SIZE(AO, %rax, 4), %xmm4 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 17 * SIZE(BO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	movddup	 24 * SIZE(BO, %rax, 4), %xmm5 ;\
	movapd	%xmm0, %xmm2 ;\
	addq	$8 * SIZE, %rax

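/* KERNEL_SUB1..4: the same update as KERNEL1..4 without the long-range
   prefetching and pointer-advance scheduling; they handle the (k & 4)
   iterations left over after the unrolled-by-8 main loop. */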
#define KERNEL_SUB1(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

#define KERNEL_SUB2(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	  (AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	  (BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL_SUB3(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL_SUB4(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

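/* Entry: save the callee-saved GPRs (plus rdi/rsi and xmm6-xmm15 under
   WINDOWS_ABI), fetch the arguments, and bias A and B by 16 elements so
   the inner loops can address both sides of the pointers with signed
   displacements. */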
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,   (%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm12
#else
	movq	STACKSIZE +  8(%rsp), LDC
	movsd	STACKSIZE + 16(%rsp), %xmm12
#endif

	movq	OLD_M, M
	movq	OLD_N, N

	subq	$-16 * SIZE, A
	subq	$-16 * SIZE, B

	movsd	%xmm12, OFFSET
	movsd	%xmm12, KK

	leaq	(, LDC, SIZE), LDC

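/* LN and RT walk the matrices backwards, so advance A/C (LN) and B/C
   (RT) to their far ends up front. */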
#ifdef LN
       leaq	(, M, SIZE), %rax
       addq	%rax, C
       imulq	K, %rax
       addq	%rax, A
#endif

#ifdef RT
       leaq	(, N, SIZE), %rax
       imulq	K, %rax
       addq	%rax, B
       movq	N, %rax
       imulq	LDC, %rax
       addq	%rax, C
#endif

#ifdef RN
	negq	KK
#endif

#ifdef RT
       movq	N, %rax
       subq	OFFSET, %rax
       movq	%rax, KK
#endif

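/* Outer loop: process N in panels of four columns (J = N >> 2). */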
	movq	N,  J
	sarq	$2, J		# j = (n >> 2)
	jle	.L40

.L01:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       subq	%rax, B

       leaq	(, LDC, 4), %rax
       subq	%rax, C
#endif

	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
#ifndef RT
	leaq	(C, LDC, 4), C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

	movq	K, %rax
	salq	$BASE_SHIFT + 2, %rax
	leaq	(B, %rax), BB

#if defined(LT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

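/* Within a panel the rows are handled as an M&1 strip, then an M&2
   strip, then the main M/4 blocks. */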
	testq	$1, M
	je	.L20

#ifdef LN
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(BO, %rax, 4), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movddup	-14 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movddup	-15 * SIZE(AO), %xmm4
	pxor	%xmm10, %xmm10
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm11, %xmm11
	movapd	 -8 * SIZE(BO), %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	NOBRANCH
	je	.L36
	ALIGN_4

.L32:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm9
	movddup	-12 * SIZE(AO, %rax, 1), %xmm0
	mulpd	%xmm4, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 4), %xmm4
	addpd	%xmm1, %xmm10
	movapd	  (BO, %rax, 4), %xmm1
	addpd	%xmm4, %xmm11
	movddup	-11 * SIZE(AO, %rax, 1), %xmm4
	mulpd	%xmm2, %xmm3
	mulpd	 -6 * SIZE(BO, %rax, 4), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm9
	movddup	-13 * SIZE(AO, %rax, 1), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO, %rax, 4), %xmm2
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm11
	movddup	-10 * SIZE(AO, %rax, 1), %xmm2

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L32
	ALIGN_4

.L36:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder iterations (k & 3)
	je .L38

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	ALIGN_4

.L37:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm9
	movddup	-15 * SIZE(AO, %rax, 1), %xmm0

	addq	$SIZE, %rax
	jl	.L37
	ALIGN_4

.L38:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$4, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 4), BO
#endif

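/* Subtract the accumulated products from the stored right-hand side and
   back-substitute.  The diagonal entries are multiplied in directly, so
   they are presumably stored pre-inverted by the packing routines. */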
#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3

	subpd	%xmm8,  %xmm2
	subpd	%xmm9,  %xmm3
#else
	movapd	-16 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(AO), %xmm3

	subpd	%xmm8, %xmm2
	subpd	%xmm9, %xmm3
#endif

#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif

#ifdef RN
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
        unpckhpd %xmm1, %xmm1

	movsd	-16 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm2

	movsd	-15 * SIZE(BO), %xmm5
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	-14 * SIZE(BO), %xmm6
	mulsd	 %xmm2, %xmm6
	subsd	 %xmm6, %xmm3
	movsd	-13 * SIZE(BO), %xmm7
	mulsd	 %xmm2, %xmm7
	subsd	 %xmm7, %xmm1

	movsd	-11 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	-10 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	 -9 * SIZE(BO), %xmm6
	mulsd	 %xmm0, %xmm6
	subsd	 %xmm6, %xmm1

	movsd	 -6 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm3

	movsd	 -5 * SIZE(BO), %xmm5
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm1

	movsd	 -1 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm1

	unpcklpd %xmm0, %xmm2
	unpcklpd %xmm1, %xmm3
#endif

#ifdef RT
	movapd	%xmm2, %xmm0
        unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
        unpckhpd %xmm1, %xmm1

	movsd	 -1 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm1

	movsd	 -2 * SIZE(BO), %xmm5
	mulsd	 %xmm1, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	 -3 * SIZE(BO), %xmm6
	mulsd	 %xmm1, %xmm6
	subsd	 %xmm6, %xmm0
	movsd	 -4 * SIZE(BO), %xmm7
	mulsd	 %xmm1, %xmm7
	subsd	 %xmm7, %xmm2

	movsd	 -6 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm3

	movsd	 -7 * SIZE(BO), %xmm5
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 -8 * SIZE(BO), %xmm6
	mulsd	 %xmm3, %xmm6
	subsd	 %xmm6, %xmm2

	movsd	-11 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	-12 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2

	movsd	-16 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2
	unpcklpd %xmm1, %xmm3

#endif

#ifdef LN
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)
	movlpd	%xmm3,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
#else
	movlpd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)
	movlpd	%xmm3,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm2, -16 * SIZE(BO)
	movaps	%xmm3, -14 * SIZE(BO)
#else
	movaps	%xmm2, -16 * SIZE(AO)
	movaps	%xmm3, -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4

.L20:
	testq	$2, M
	je	.L30
	ALIGN_4

.L21:
#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-12 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movddup	-16 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movddup	-15 * SIZE(BO), %xmm5
	pxor	%xmm11, %xmm11
	movddup	 -8 * SIZE(BO), %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	NOBRANCH
	je	.L26
	ALIGN_4

.L22:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm9
	movddup	-13 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm0, %xmm5
	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm5, %xmm11
	movddup	-11 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm9
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movddup	  (BO, %rax, 4), %xmm1
	mulpd	%xmm0, %xmm5
	movapd	 -8 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm5, %xmm11
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm3
	mulpd	%xmm2, %xmm5
	addpd	%xmm5, %xmm9
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm10
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm3
	mulpd	%xmm2, %xmm5
	movapd	-10 * SIZE(AO, %rax, 2), %xmm2
	addpd	%xmm5, %xmm11
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm3
	mulpd	%xmm2, %xmm5
	addpd	%xmm5, %xmm9
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm10
	movddup	  8 * SIZE(BO, %rax, 4), %xmm3
	mulpd	%xmm2, %xmm5
	movapd	 -4 * SIZE(AO, %rax, 2), %xmm2
	addpd	%xmm5, %xmm11
	movddup	  1 * SIZE(BO, %rax, 4), %xmm5

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L22
	ALIGN_4

.L26:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder iterations (k & 3)
	je .L29

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	ALIGN_4

.L27:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm9
	movddup	-13 * SIZE(BO, %rax, 4), %xmm5
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm0, %xmm5
	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm5, %xmm11
	movddup	-11 * SIZE(BO, %rax, 4), %xmm5

	addq	$SIZE, %rax
	jl	.L27
	ALIGN_4

.L29:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$4, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
	movapd	 %xmm8,  %xmm0
	unpcklpd %xmm9,  %xmm8
	unpckhpd %xmm9,  %xmm0

	movapd	 %xmm10, %xmm2
	unpcklpd %xmm11, %xmm10
	unpckhpd %xmm11, %xmm2

	movapd	-16 * SIZE(BO), %xmm9
	movapd	-14 * SIZE(BO), %xmm11
	movapd	-12 * SIZE(BO), %xmm13
	movapd	-10 * SIZE(BO), %xmm15

	subpd	%xmm8,  %xmm9
	subpd	%xmm10, %xmm11
	subpd	%xmm0,  %xmm13
	subpd	%xmm2,  %xmm15
#else
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-10 * SIZE(AO), %xmm6

	subpd	%xmm8,  %xmm0
	subpd	%xmm9,  %xmm2
	subpd	%xmm10, %xmm4
	subpd	%xmm11, %xmm6
#endif

#ifdef LN
	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
	mulpd	 %xmm8, %xmm15

	movddup	-14 * SIZE(AO), %xmm10
	mulpd	 %xmm13, %xmm10
	subpd	 %xmm10, %xmm9
	movddup	-14 * SIZE(AO), %xmm10
	mulpd	 %xmm15, %xmm10
	subpd	 %xmm10, %xmm11

	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
	mulpd	 %xmm8, %xmm11
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
	mulpd	 %xmm8, %xmm11

	movddup	-15 * SIZE(AO), %xmm10
	mulpd	 %xmm9, %xmm10
	subpd	 %xmm10, %xmm13
	movddup	-15 * SIZE(AO), %xmm10
	mulpd	 %xmm11, %xmm10
	subpd	 %xmm10, %xmm15

	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
	mulpd	 %xmm8, %xmm15
#endif

#ifdef RN
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0

	movddup	-15 * SIZE(BO), %xmm9
	mulpd	 %xmm0, %xmm9
	subpd	 %xmm9, %xmm2
	movddup	-14 * SIZE(BO), %xmm10
	mulpd	 %xmm0, %xmm10
	subpd	 %xmm10, %xmm4
	movddup	-13 * SIZE(BO), %xmm11
	mulpd	 %xmm0, %xmm11
	subpd	 %xmm11, %xmm6

	movddup	-11 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
	movddup	-10 * SIZE(BO), %xmm9
	mulpd	 %xmm2, %xmm9
	subpd	 %xmm9, %xmm4
	movddup	 -9 * SIZE(BO), %xmm10
	mulpd	 %xmm2, %xmm10
	subpd	 %xmm10, %xmm6

	movddup	 -6 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm4

	movddup	 -5 * SIZE(BO), %xmm9
	mulpd	 %xmm4, %xmm9
	subpd	 %xmm9, %xmm6

	movddup	 -1 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm6
#endif

#ifdef RT
	movddup	 -1 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm6

	movddup	 -2 * SIZE(BO), %xmm9
	mulpd	 %xmm6, %xmm9
	subpd	 %xmm9, %xmm4
	movddup	 -3 * SIZE(BO), %xmm10
	mulpd	 %xmm6, %xmm10
	subpd	 %xmm10, %xmm2
	movddup	 -4 * SIZE(BO), %xmm11
	mulpd	 %xmm6, %xmm11
	subpd	 %xmm11, %xmm0

	movddup	 -6 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm4
	movddup	 -7 * SIZE(BO), %xmm9
	mulpd	 %xmm4, %xmm9
	subpd	 %xmm9, %xmm2
	movddup	 -8 * SIZE(BO), %xmm10
	mulpd	 %xmm4, %xmm10
	subpd	 %xmm10, %xmm0

	movddup	-11 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
	movddup	-12 * SIZE(BO), %xmm9
	mulpd	 %xmm2, %xmm9
	subpd	 %xmm9, %xmm0

	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm9,   0 * SIZE(CO1)
	movlpd	%xmm13,  1 * SIZE(CO1)

	movhpd	%xmm9,   0 * SIZE(CO2)
	movhpd	%xmm13,  1 * SIZE(CO2)

	movlpd	%xmm11,  0 * SIZE(CO1, LDC, 2)
	movlpd	%xmm15,  1 * SIZE(CO1, LDC, 2)

	movhpd	%xmm11,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15,  1 * SIZE(CO2, LDC, 2)
#else
	movlpd	%xmm0,  0 * SIZE(CO1)
	movhpd	%xmm0,  1 * SIZE(CO1)

	movlpd	%xmm2,  0 * SIZE(CO2)
	movhpd	%xmm2,  1 * SIZE(CO2)

	movlpd	%xmm4,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm4,  1 * SIZE(CO1, LDC, 2)

	movlpd	%xmm6,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm6,  1 * SIZE(CO2, LDC, 2)
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm9,  -16 * SIZE(BO)
	movaps	%xmm11, -14 * SIZE(BO)
	movaps	%xmm13, -12 * SIZE(BO)
	movaps	%xmm15, -10 * SIZE(BO)
#else
	movaps	%xmm0,  -16 * SIZE(AO)
	movaps	%xmm2,  -14 * SIZE(AO)
	movaps	%xmm4,  -12 * SIZE(AO)
	movaps	%xmm6,  -10 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       addq	%rax, AORIG
#endif
	ALIGN_4

.L30:
	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L39
	ALIGN_4

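/* Main 4x4 blocks (I = M >> 2). */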
.L11:
#ifdef LN
       movq	K, %rax
       salq	$2 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movddup	-16 * SIZE(BO), %xmm1
	pxor	%xmm8, %xmm8
	movddup	-15 * SIZE(BO), %xmm3
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm4
	pxor	%xmm10, %xmm10
	movddup	 -8 * SIZE(BO), %xmm5
	pxor	%xmm11, %xmm11

#ifndef LN
	prefetchw      3 * SIZE(CO1)
	pxor	%xmm12, %xmm12
	prefetchw      3 * SIZE(CO2)
	pxor	%xmm13, %xmm13
	prefetchw      3 * SIZE(CO1, LDC, 2)
	pxor	%xmm14, %xmm14
	prefetchw      3 * SIZE(CO2, LDC, 2)
	pxor	%xmm15, %xmm15
	movapd	%xmm0, %xmm2
#else
	prefetchw     -8 * SIZE(CO1)
	pxor	%xmm12, %xmm12
	prefetchw     -8 * SIZE(CO2)
	pxor	%xmm13, %xmm13
	prefetchw     -8 * SIZE(CO1, LDC, 2)
	pxor	%xmm14, %xmm14
	prefetchw     -8 * SIZE(CO2, LDC, 2)
	pxor	%xmm15, %xmm15
	movapd	%xmm0, %xmm2
#endif

	prefetch	 -10 * SIZE(BB)

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif

	andq	$-8, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_4

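/* Main k-loop: eight KERNEL invocations per pass over (k & ~7)
   iterations. */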
.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)
	BRANCH
	jl	.L12
	ALIGN_4

.L15:
	prefetch	  14 * SIZE(BB)
	subq		 $-16 * SIZE, BB

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	testq	$4, %rax
	je .L16
	xorq	%rax, %rax
	ALIGN_4

	KERNEL_SUB1(16 *  0)
	KERNEL_SUB2(16 *  0)
	KERNEL_SUB3(16 *  0)
	KERNEL_SUB4(16 *  0)

	subq	$-16 * SIZE, BO
	subq	$-16 * SIZE, AO
	ALIGN_4

.L16:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder iterations (k & 3)
	je .L19

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	ALIGN_4

.L17:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm8
	movapd	%xmm2, %xmm0
	addpd	%xmm1, %xmm12
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm3, %xmm2
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm9
	movapd	%xmm0, %xmm2
	addpd	%xmm3, %xmm13
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm10
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm14
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
	mulpd	%xmm3, %xmm2
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm11
	addpd	%xmm3, %xmm15
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3
	movapd	%xmm0, %xmm2

	addq	$SIZE, %rax
	jl	.L17
	ALIGN_4

.L19:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax
#else
	subq	$4, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm8, %xmm0
	unpcklpd %xmm9, %xmm8
	unpckhpd %xmm9, %xmm0

	movapd	%xmm10, %xmm2
	unpcklpd %xmm11, %xmm10
	unpckhpd %xmm11, %xmm2

	movapd	%xmm12, %xmm4
	unpcklpd %xmm13, %xmm12
	unpckhpd %xmm13, %xmm4

	movapd	%xmm14, %xmm6
	unpcklpd %xmm15, %xmm14
	unpckhpd %xmm15, %xmm6

	movapd	-16 * SIZE(BO), %xmm9
	movapd	-14 * SIZE(BO), %xmm11
	movapd	-12 * SIZE(BO), %xmm13
	movapd	-10 * SIZE(BO), %xmm15
	movapd	 -8 * SIZE(BO), %xmm1
	movapd	 -6 * SIZE(BO), %xmm3
	movapd	 -4 * SIZE(BO), %xmm5
	movapd	 -2 * SIZE(BO), %xmm7

	subpd	%xmm8,  %xmm9
	subpd	%xmm10, %xmm11
	subpd	%xmm0,  %xmm13
	subpd	%xmm2,  %xmm15
	subpd	%xmm12, %xmm1
	subpd	%xmm14, %xmm3
	subpd	%xmm4,  %xmm5
	subpd	%xmm6,  %xmm7
#else
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1
	movapd	-12 * SIZE(AO), %xmm2
	movapd	-10 * SIZE(AO), %xmm3

	movapd	 -8 * SIZE(AO), %xmm4
	movapd	 -6 * SIZE(AO), %xmm5
	movapd	 -4 * SIZE(AO), %xmm6
	movapd	 -2 * SIZE(AO), %xmm7

	subpd	%xmm8,  %xmm0
	subpd	%xmm12, %xmm1
	subpd	%xmm9,  %xmm2
	subpd	%xmm13, %xmm3
	subpd	%xmm10, %xmm4
	subpd	%xmm14, %xmm5
	subpd	%xmm11, %xmm6
	subpd	%xmm15, %xmm7
#endif

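/* Triangular solve on the 4x4 block: each variant walks the triangle in
   its own order, scaling by the (presumably pre-inverted) diagonal entry
   and eliminating it from the remaining rows/columns. */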
#ifdef LN
	movddup	 -1 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm8, %xmm7

	movddup	 -2 * SIZE(AO), %xmm10
	mulpd	 %xmm5, %xmm10
	subpd	 %xmm10, %xmm1
	movddup	 -2 * SIZE(AO), %xmm10
	mulpd	 %xmm7, %xmm10
	subpd	 %xmm10, %xmm3

	movddup	 -3 * SIZE(AO), %xmm12
	mulpd	 %xmm5, %xmm12
	subpd	 %xmm12, %xmm13
	movddup	 -3 * SIZE(AO), %xmm12
	mulpd	 %xmm7, %xmm12
	subpd	 %xmm12, %xmm15

	movddup	 -4 * SIZE(AO), %xmm14
	mulpd	 %xmm5, %xmm14
	subpd	 %xmm14, %xmm9
	movddup	 -4 * SIZE(AO), %xmm14
	mulpd	 %xmm7, %xmm14
	subpd	 %xmm14, %xmm11

	movddup	 -6 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm8, %xmm3

	movddup	 -7 * SIZE(AO), %xmm10
	mulpd	 %xmm1, %xmm10
	subpd	 %xmm10, %xmm13
	movddup	 -7 * SIZE(AO), %xmm10
	mulpd	 %xmm3, %xmm10
	subpd	 %xmm10, %xmm15

	movddup	 -8 * SIZE(AO), %xmm12
	mulpd	 %xmm1, %xmm12
	subpd	 %xmm12, %xmm9
	movddup	 -8 * SIZE(AO), %xmm12
	mulpd	 %xmm3, %xmm12
	subpd	 %xmm12, %xmm11

	movddup	-11 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
	mulpd	 %xmm8, %xmm15

	movddup	-12 * SIZE(AO), %xmm10
	mulpd	 %xmm13, %xmm10
	subpd	 %xmm10, %xmm9
	movddup	-12 * SIZE(AO), %xmm10
	mulpd	 %xmm15, %xmm10
	subpd	 %xmm10, %xmm11

	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
	mulpd	 %xmm8, %xmm11
#endif

#ifdef LT
	movddup -16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
	mulpd	 %xmm8, %xmm11

	movddup	-15 * SIZE(AO), %xmm10
	mulpd	 %xmm9, %xmm10
	subpd	 %xmm10, %xmm13

	movddup	-15 * SIZE(AO), %xmm10
	mulpd	 %xmm11, %xmm10
	subpd	 %xmm10, %xmm15

	movddup	-14 * SIZE(AO), %xmm12
	mulpd	 %xmm9, %xmm12
	subpd	 %xmm12, %xmm1
	movddup	-14 * SIZE(AO), %xmm12
	mulpd	 %xmm11, %xmm12
	subpd	 %xmm12, %xmm3

	movddup	-13 * SIZE(AO), %xmm14
	mulpd	 %xmm9, %xmm14
	subpd	 %xmm14, %xmm5
	movddup	-13 * SIZE(AO), %xmm14
	mulpd	 %xmm11, %xmm14
	subpd	 %xmm14, %xmm7

	movddup	-11 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
	mulpd	 %xmm8, %xmm15

	movddup	-10 * SIZE(AO), %xmm10
	mulpd	 %xmm13, %xmm10
	subpd	 %xmm10, %xmm1
	movddup	-10 * SIZE(AO), %xmm10
	mulpd	 %xmm15, %xmm10
	subpd	 %xmm10, %xmm3

	movddup	 -9 * SIZE(AO), %xmm12
	mulpd	 %xmm13, %xmm12
	subpd	 %xmm12, %xmm5
	movddup	 -9 * SIZE(AO), %xmm12
	mulpd	 %xmm15, %xmm12
	subpd	 %xmm12, %xmm7

	movddup	 -6 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
	mulpd	 %xmm8, %xmm3

	movddup	 -5 * SIZE(AO), %xmm10
	mulpd	 %xmm1, %xmm10
	subpd	 %xmm10, %xmm5
	movddup	 -5 * SIZE(AO), %xmm10
	mulpd	 %xmm3, %xmm10
	subpd	 %xmm10, %xmm7

	movddup	 -1 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm5
	mulpd	 %xmm8, %xmm7
#endif

#ifdef RN
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm1

	movddup	-15 * SIZE(BO), %xmm9
	mulpd	 %xmm0, %xmm9
	subpd	 %xmm9, %xmm2
	movddup	-15 * SIZE(BO), %xmm9
	mulpd	 %xmm1, %xmm9
	subpd	 %xmm9, %xmm3

	movddup	-14 * SIZE(BO), %xmm10
	mulpd	 %xmm0, %xmm10
	subpd	 %xmm10, %xmm4
	movddup	-14 * SIZE(BO), %xmm10
	mulpd	 %xmm1, %xmm10
	subpd	 %xmm10, %xmm5

	movddup	 -13 * SIZE(BO), %xmm11
	mulpd	 %xmm0, %xmm11
	subpd	 %xmm11, %xmm6
	movddup	 -13 * SIZE(BO), %xmm11
	mulpd	 %xmm1, %xmm11
	subpd	 %xmm11, %xmm7

	movddup	 -11 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm3

	movddup	 -10 * SIZE(BO), %xmm9
	mulpd	 %xmm2, %xmm9
	subpd	 %xmm9, %xmm4
	movddup	 -10 * SIZE(BO), %xmm9
	mulpd	 %xmm3, %xmm9
	subpd	 %xmm9, %xmm5

	movddup	  -9 * SIZE(BO), %xmm10
	mulpd	 %xmm2, %xmm10
	subpd	 %xmm10, %xmm6
	movddup	  -9 * SIZE(BO), %xmm10
	mulpd	 %xmm3, %xmm10
	subpd	 %xmm10, %xmm7

	movddup	 -6 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm4
	mulpd	 %xmm8, %xmm5

	movddup	 -5 * SIZE(BO), %xmm9
	mulpd	 %xmm4, %xmm9
	subpd	 %xmm9, %xmm6
	movddup	 -5 * SIZE(BO), %xmm9
	mulpd	 %xmm5, %xmm9
	subpd	 %xmm9, %xmm7

	movddup	 -1 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm6
	mulpd	 %xmm8, %xmm7
#endif

#ifdef RT
	movddup	 -1 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm6
	mulpd	 %xmm8, %xmm7

	movddup	 -2 * SIZE(BO), %xmm9
	mulpd	 %xmm6, %xmm9
	subpd	 %xmm9, %xmm4
	movddup	 -2 * SIZE(BO), %xmm9
	mulpd	 %xmm7, %xmm9
	subpd	 %xmm9, %xmm5

	movddup	 -3 * SIZE(BO), %xmm10
	mulpd	 %xmm6, %xmm10
	subpd	 %xmm10, %xmm2
	movddup	 -3 * SIZE(BO), %xmm10
	mulpd	 %xmm7, %xmm10
	subpd	 %xmm10, %xmm3

	movddup	 -4 * SIZE(BO), %xmm11
	mulpd	 %xmm6, %xmm11
	subpd	 %xmm11, %xmm0
	movddup	 -4 * SIZE(BO), %xmm11
	mulpd	 %xmm7, %xmm11
	subpd	 %xmm11, %xmm1

	movddup	 -6 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm4
	mulpd	 %xmm8, %xmm5

	movddup	 -7 * SIZE(BO), %xmm9
	mulpd	 %xmm4, %xmm9
	subpd	 %xmm9, %xmm2
	movddup	 -7 * SIZE(BO), %xmm9
	mulpd	 %xmm5, %xmm9
	subpd	 %xmm9, %xmm3

	movddup	 -8 * SIZE(BO), %xmm10
	mulpd	 %xmm4, %xmm10
	subpd	 %xmm10, %xmm0
	movddup	 -8 * SIZE(BO), %xmm10
	mulpd	 %xmm5, %xmm10
	subpd	 %xmm10, %xmm1

	movddup	-11 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm3

	movddup	-12 * SIZE(BO), %xmm9
	mulpd	 %xmm2, %xmm9
	subpd	 %xmm9, %xmm0
	movddup	-12 * SIZE(BO), %xmm9
	mulpd	 %xmm3, %xmm9
	subpd	 %xmm9, %xmm1

	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm1
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
	subq	$4 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm9,  0 * SIZE(CO1)
	movlpd	%xmm13, 1 * SIZE(CO1)
	movlpd	%xmm1,  2 * SIZE(CO1)
	movlpd	%xmm5,  3 * SIZE(CO1)

	movhpd	%xmm9,  0 * SIZE(CO2)
	movhpd	%xmm13, 1 * SIZE(CO2)
	movhpd	%xmm1,  2 * SIZE(CO2)
	movhpd	%xmm5,  3 * SIZE(CO2)

	movlpd	%xmm11, 0 * SIZE(CO1, LDC, 2)
	movlpd	%xmm15, 1 * SIZE(CO1, LDC, 2)
	movlpd	%xmm3,  2 * SIZE(CO1, LDC, 2)
	movlpd	%xmm7,  3 * SIZE(CO1, LDC, 2)

	movhpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15, 1 * SIZE(CO2, LDC, 2)
	movhpd	%xmm3,  2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm7,  3 * SIZE(CO2, LDC, 2)
#else
	movlpd	%xmm0,  0 * SIZE(CO1)
	movhpd	%xmm0,  1 * SIZE(CO1)
	movlpd	%xmm1,  2 * SIZE(CO1)
	movhpd	%xmm1,  3 * SIZE(CO1)

	movlpd	%xmm2,  0 * SIZE(CO2)
	movhpd	%xmm2,  1 * SIZE(CO2)
	movlpd	%xmm3,  2 * SIZE(CO2)
	movhpd	%xmm3,  3 * SIZE(CO2)

	movlpd	%xmm4,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm4,  1 * SIZE(CO1, LDC, 2)
	movlpd	%xmm5,  2 * SIZE(CO1, LDC, 2)
	movhpd	%xmm5,  3 * SIZE(CO1, LDC, 2)

	movlpd	%xmm6,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm6,  1 * SIZE(CO2, LDC, 2)
	movlpd	%xmm7,  2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm7,  3 * SIZE(CO2, LDC, 2)
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm9,  -16 * SIZE(BO)
	movaps	%xmm11, -14 * SIZE(BO)
	movaps	%xmm13, -12 * SIZE(BO)
	movaps	%xmm15, -10 * SIZE(BO)
	movaps	%xmm1,   -8 * SIZE(BO)
	movaps	%xmm3,   -6 * SIZE(BO)
	movaps	%xmm5,   -4 * SIZE(BO)
	movaps	%xmm7,   -2 * SIZE(BO)
#else
	movaps	%xmm0,  -16 * SIZE(AO)
	movaps	%xmm1,  -14 * SIZE(AO)
	movaps	%xmm2,  -12 * SIZE(AO)
	movaps	%xmm3,  -10 * SIZE(AO)
	movaps	%xmm4,   -8 * SIZE(AO)
	movaps	%xmm5,   -6 * SIZE(AO)
	movaps	%xmm6,   -4 * SIZE(AO)
	movaps	%xmm7,   -2 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$4, KK
#endif

#ifdef LT
	addq	$4, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$2 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L11
	ALIGN_4

.L39:
#ifdef LN
       leaq	(, K, SIZE), %rax
       leaq	(B, %rax, 4), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$4, KK
#endif

#ifdef RT
	subq	$4, KK
#endif

	decq	J			# j --
	jg	.L01
	ALIGN_4

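/* Remaining two columns (N & 2): same row-strip structure as the
   four-column panel, with half as many accumulators. */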
.L40:
	testq	$2, N
	je	.L80

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, B

       leaq	(, LDC, 2), %rax
       subq	%rax, C
#endif

	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
#ifndef RT
	leaq	(C, LDC, 2), C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#if defined(LT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	testq	$1, M
	je	.L60
	ALIGN_4

.L71:
#ifdef LN
       movq	K, %rax
       salq	$0 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$1 + BASE_SHIFT, %rax
	leaq	(BO, %rax, 1), BO
#endif

	movddup	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movddup	-15 * SIZE(AO), %xmm1
	pxor	%xmm9, %xmm9
	movddup	-14 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movddup	-13 * SIZE(AO), %xmm3
	pxor	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
	negq	%rax
	NOBRANCH
	je	.L76
	ALIGN_4

.L72:
	mulpd	-16 * SIZE(BO, %rax, 2), %xmm0
	addpd	%xmm0, %xmm8
	movddup	-12 * SIZE(AO, %rax, 1), %xmm0

	mulpd	-14 * SIZE(BO, %rax, 2), %xmm1
	addpd	%xmm1, %xmm9
	movddup	-11 * SIZE(AO, %rax, 1), %xmm1

	mulpd	-12 * SIZE(BO, %rax, 2), %xmm2
	addpd	%xmm2, %xmm10
	movddup	-10 * SIZE(AO, %rax, 1), %xmm2

	mulpd	-10 * SIZE(BO, %rax, 2), %xmm3
	addpd	%xmm3, %xmm11
	movddup	 -9 * SIZE(AO, %rax, 1), %xmm3

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L72
	ALIGN_4

.L76:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder iterations (k & 3)
	je .L78

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
	negq	%rax
	ALIGN_4

.L77:
	mulpd	-16 * SIZE(BO, %rax, 2), %xmm0
	addpd	%xmm0, %xmm8
	movddup	-15 * SIZE(AO, %rax, 1), %xmm0

	addq	$SIZE, %rax
	jl	.L77
	ALIGN_4

.L78:
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm2
#else
	movapd	-16 * SIZE(AO), %xmm2
#endif

	subpd	%xmm8,  %xmm2

#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AO), %xmm0

	mulpd	 %xmm0, %xmm2
#endif

#ifdef RN
	movapd	%xmm2,  %xmm0
        unpckhpd %xmm0, %xmm0

	mulsd	-16 * SIZE(BO), %xmm2
	movsd	-15 * SIZE(BO), %xmm4
	mulsd	 %xmm2, %xmm4
	subsd	 %xmm4, %xmm0

	mulsd	-13 * SIZE(BO), %xmm0
	unpcklpd %xmm0, %xmm2
#endif

#ifdef RT
	movapd	%xmm2,  %xmm0
        unpckhpd %xmm0, %xmm0

	mulsd	-13 * SIZE(BO), %xmm0

	movlpd	-14 * SIZE(BO), %xmm4
	mulsd	 %xmm0, %xmm4
	subsd	 %xmm4, %xmm2

	mulsd	-16 * SIZE(BO), %xmm2
	unpcklpd %xmm0, %xmm2
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

	movlpd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movaps	%xmm2, -16 * SIZE(BO)
#else
	movaps	%xmm2, -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L60:
	testq	$2, M
	je	.L70

#ifdef LN
       movq	K, %rax
       salq	$1 + BASE_SHIFT, %rax
       subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(BO, %rax, 2), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-12 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movddup	-16 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movddup	-15 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
	negq	%rax
	NOBRANCH
	je	.L66
	ALIGN_4

.L62:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm0, %xmm3
	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm3, %xmm9
	movddup	-13 * SIZE(BO, %rax, 2), %xmm3
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm10
	movddup	-12 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm0, %xmm3
	movapd	 -8 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm3, %xmm11
	movddup	-11 * SIZE(BO, %rax, 2), %xmm3
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movddup	-10 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm2, %xmm3
	movapd	-10 * SIZE(AO, %rax, 2), %xmm2
	addpd	%xmm3, %xmm9
	movddup	 -9 * SIZE(BO, %rax, 2), %xmm3
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm10
	movddup	 -8 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm2, %xmm3
	movapd	 -4 * SIZE(AO, %rax, 2), %xmm2
	addpd	%xmm3, %xmm11
	movddup	 -7 * SIZE(BO, %rax, 2), %xmm3

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L62
	ALIGN_4

.L66:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# remainder iterations (k & 3)
	je .L69

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
	negq	%rax
	ALIGN_4

.L67:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm0, %xmm3
	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm3, %xmm9
	movddup	-13 * SIZE(BO, %rax, 2), %xmm3

	addq	$SIZE, %rax
	jl	.L67
	ALIGN_4

.L69:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm8, %xmm0
	unpcklpd %xmm9, %xmm8
	unpckhpd %xmm9, %xmm0

	movapd	-16 * SIZE(BO), %xmm9
	movapd	-14 * SIZE(BO), %xmm13

	subpd	%xmm8,  %xmm9
	subpd	%xmm0,  %xmm13
#else
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm2

	subpd	%xmm8, %xmm0
	subpd	%xmm9, %xmm2
#endif


#ifdef LN
	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13

	movddup	-14 * SIZE(AO), %xmm10
	mulpd	 %xmm13, %xmm10
	subpd	 %xmm10, %xmm9

	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
#endif

#ifdef LT
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9

	movddup	-15 * SIZE(AO), %xmm10
	mulpd	 %xmm9, %xmm10
	subpd	 %xmm10, %xmm13

	movddup	-13 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
#endif

#ifdef RN
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0

	movddup	-15 * SIZE(BO), %xmm9
	mulpd	 %xmm0, %xmm9
	subpd	 %xmm9, %xmm2

	movddup	-13 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
#endif

#ifdef RT
	movddup	-13 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2

	movddup	-14 * SIZE(BO), %xmm9
	mulpd	 %xmm2, %xmm9
	subpd	 %xmm9, %xmm0

	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm9,   0 * SIZE(CO1)
	movlpd	%xmm13,  1 * SIZE(CO1)

	movhpd	%xmm9,   0 * SIZE(CO2)
	movhpd	%xmm13,  1 * SIZE(CO2)
#else
	movlpd	%xmm0,   0 * SIZE(CO1)
	movhpd	%xmm0,   1 * SIZE(CO1)

	movlpd	%xmm2,   0 * SIZE(CO2)
	movhpd	%xmm2,   1 * SIZE(CO2)
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm9,  -16 * SIZE(BO)
	movaps	%xmm13, -14 * SIZE(BO)
#else
	movaps	%xmm0,  -16 * SIZE(AO)
	movaps	%xmm2,  -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L70:
	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L79
	ALIGN_4

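/* 4x2 blocks (I = M >> 2) of the two-column panel. */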
2264.L51:
2265#ifdef LN
2266       movq	K, %rax
2267       salq	$2 + BASE_SHIFT, %rax
2268       subq	%rax, AORIG
2269#endif
2270
2271#if defined(LN) || defined(RT)
2272	movq	KK, %rax
2273	movq	AORIG, AO
2274	leaq	(, %rax, SIZE), %rax
2275	leaq	(AO, %rax, 4), AO
2276#endif
2277
2278	movq	B, BO
2279
2280#if defined(LN) || defined(RT)
2281	movq	KK, %rax
2282	leaq	(, %rax, SIZE), %rax
2283	leaq	(BO, %rax, 2), BO
2284#endif
2285
2286	movddup	-16 * SIZE(BO), %xmm1
2287	movddup	-15 * SIZE(BO), %xmm5
2288	pxor	%xmm8, %xmm8
2289	movddup	-12 * SIZE(BO), %xmm3
2290	pxor	%xmm9, %xmm9
2291	movapd	-16 * SIZE(AO), %xmm0
2292	pxor	%xmm12, %xmm12
2293	movapd	 -8 * SIZE(AO), %xmm4
2294	pxor	%xmm13, %xmm13
2295
2296#ifndef LN
2297	prefetchw      3 * SIZE(CO1)
2298	movapd	%xmm0, %xmm2
2299	prefetchw      3 * SIZE(CO2)
2300#else
2301	prefetchw     -8 * SIZE(CO1)
2302	movapd	%xmm0, %xmm2
2303	prefetchw     -8 * SIZE(CO2)
2304#endif
2305
2306
2307#if defined(LT) || defined(RN)
2308	movq	KK, %rax
2309#else
2310	movq	K, %rax
2311	subq	KK, %rax
2312#endif
2313	andq	$-4, %rax
2314	leaq	(, %rax, SIZE), %rax
2315	leaq	(AO, %rax, 4), AO
2316	leaq	(BO, %rax, 2), BO
2317	negq	%rax
2318	NOBRANCH
2319	je	.L56
2320	ALIGN_4
2321
2322.L52:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm12
	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm5, %xmm2
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm2, %xmm9
	addpd	%xmm5, %xmm13
	movddup	-13 * SIZE(BO, %rax, 2), %xmm5
	movapd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm0
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm8
	movapd	  (AO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm12
	movddup	 -8 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm5, %xmm2
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm2, %xmm9
	addpd	%xmm5, %xmm13
	movddup	-11 * SIZE(BO, %rax, 2), %xmm5
	movapd	%xmm4, %xmm2
	mulpd	%xmm3, %xmm4
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3
	addpd	%xmm4, %xmm8
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4
	addpd	%xmm3, %xmm12
	movddup	-10 * SIZE(BO, %rax, 2), %xmm3
	mulpd	%xmm5, %xmm2
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm2, %xmm9
	addpd	%xmm5, %xmm13
	movddup	 -9 * SIZE(BO, %rax, 2), %xmm5
	movapd	%xmm4, %xmm2
	mulpd	%xmm3, %xmm4
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3
	addpd	%xmm4, %xmm8
	movapd	  8 * SIZE(AO, %rax, 4), %xmm4
	addpd	%xmm3, %xmm12
	movddup	 -4 * SIZE(BO, %rax, 2), %xmm3
	mulpd	%xmm5, %xmm2
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm2, %xmm9
	addpd	%xmm5, %xmm13
	movddup	 -7 * SIZE(BO, %rax, 2), %xmm5
	movapd	%xmm0, %xmm2

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L52
	ALIGN_4

.L56:
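	# Drain the remaining k % 4 iterations one at a time in .L57.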
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je .L59

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
	negq	%rax
	ALIGN_4

.L57:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm12
	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
	mulpd	%xmm5, %xmm2
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm2, %xmm9
	addpd	%xmm5, %xmm13
	movddup	-13 * SIZE(BO, %rax, 2), %xmm5
	movapd	%xmm0, %xmm2

	addq	$SIZE, %rax
	jl	.L57
	ALIGN_4

.L59:
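	# Solve stage: subtract the accumulated block product from the
	# packed right-hand side, then run the triangular solve below.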
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm8, %xmm0
	unpcklpd %xmm9, %xmm8
	unpckhpd %xmm9, %xmm0

	movapd	%xmm12, %xmm4
	unpcklpd %xmm13, %xmm12
	unpckhpd %xmm13, %xmm4

	movapd	-16 * SIZE(BO), %xmm9
	movapd	-14 * SIZE(BO), %xmm13
	movapd	-12 * SIZE(BO), %xmm1
	movapd	-10 * SIZE(BO), %xmm5

	subpd	%xmm8,  %xmm9
	subpd	%xmm0,  %xmm13
	subpd	%xmm12, %xmm1
	subpd	%xmm4,  %xmm5
#else
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1
	movapd	-12 * SIZE(AO), %xmm2
	movapd	-10 * SIZE(AO), %xmm3

	subpd	%xmm8, %xmm0
	subpd	%xmm12, %xmm1
	subpd	%xmm9, %xmm2
	subpd	%xmm13, %xmm3
#endif

#ifdef LN
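	# Backward substitution with the 4x4 lower-triangular block of A,
	# last column first; the packed diagonal is expected to hold
	# reciprocals (inverted during packing), so only mulpd/subpd are
	# needed here.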
	movddup	 -1 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm5
	movddup	 -2 * SIZE(AO), %xmm10
	mulpd	 %xmm5, %xmm10
	subpd	 %xmm10, %xmm1
	movddup	 -3 * SIZE(AO), %xmm12
	mulpd	 %xmm5, %xmm12
	subpd	 %xmm12, %xmm13
	movddup	 -4 * SIZE(AO), %xmm14
	mulpd	 %xmm5, %xmm14
	subpd	 %xmm14, %xmm9

	movddup	 -6 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
	movddup	 -7 * SIZE(AO), %xmm10
	mulpd	 %xmm1, %xmm10
	subpd	 %xmm10, %xmm13
	movddup	 -8 * SIZE(AO), %xmm12
	mulpd	 %xmm1, %xmm12
	subpd	 %xmm12, %xmm9

	movddup	-11 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
	movddup	-12 * SIZE(AO), %xmm10
	mulpd	 %xmm13, %xmm10
	subpd	 %xmm10, %xmm9

	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
#endif

#ifdef LT
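	# Forward substitution with the transposed triangular block,
	# first row first.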
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm9
	movddup	-15 * SIZE(AO), %xmm10
	mulpd	 %xmm9, %xmm10
	subpd	 %xmm10, %xmm13
	movddup	-14 * SIZE(AO), %xmm12
	mulpd	 %xmm9, %xmm12
	subpd	 %xmm12, %xmm1
	movddup	-13 * SIZE(AO), %xmm14
	mulpd	 %xmm9, %xmm14
	subpd	 %xmm14, %xmm5

	movddup	-11 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm13
	movddup	-10 * SIZE(AO), %xmm10
	mulpd	 %xmm13, %xmm10
	subpd	 %xmm10, %xmm1
	movddup	 -9 * SIZE(AO), %xmm12
	mulpd	 %xmm13, %xmm12
	subpd	 %xmm12, %xmm5

	movddup	 -6 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm1
	movddup	 -5 * SIZE(AO), %xmm10
	mulpd	 %xmm1, %xmm10
	subpd	 %xmm10, %xmm5

	movddup	 -1 * SIZE(AO), %xmm8
	mulpd	 %xmm8, %xmm5
#endif

#ifdef RN
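	# Right-side solve with the 2x2 upper-triangular block of B,
	# first column first.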
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm1

	movddup	-15 * SIZE(BO), %xmm9
	mulpd	 %xmm0, %xmm9
	subpd	 %xmm9, %xmm2
	movddup	-15 * SIZE(BO), %xmm9
	mulpd	 %xmm1, %xmm9
	subpd	 %xmm9, %xmm3

	movddup	-13 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm3
#endif

#ifdef RT
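	# Right-side solve with the transposed 2x2 block, last column first.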
	movddup	-13 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm2
	mulpd	 %xmm8, %xmm3

	movddup	-14 * SIZE(BO), %xmm9
	mulpd	 %xmm2, %xmm9
	subpd	 %xmm9, %xmm0
	movddup	-14 * SIZE(BO), %xmm9
	mulpd	 %xmm3, %xmm9
	subpd	 %xmm9, %xmm1

	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm0
	mulpd	 %xmm8, %xmm1
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
	subq	$4 * SIZE, CO2
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm9,  0 * SIZE(CO1)
	movlpd	%xmm13, 1 * SIZE(CO1)
	movlpd	%xmm1,  2 * SIZE(CO1)
	movlpd	%xmm5,  3 * SIZE(CO1)

	movhpd	%xmm9,  0 * SIZE(CO2)
	movhpd	%xmm13, 1 * SIZE(CO2)
	movhpd	%xmm1,  2 * SIZE(CO2)
	movhpd	%xmm5,  3 * SIZE(CO2)
#else
	movlpd	%xmm0,  0 * SIZE(CO1)
	movhpd	%xmm0,  1 * SIZE(CO1)
	movlpd	%xmm1,  2 * SIZE(CO1)
	movhpd	%xmm1,  3 * SIZE(CO1)

	movlpd	%xmm2,  0 * SIZE(CO2)
	movhpd	%xmm2,  1 * SIZE(CO2)
	movlpd	%xmm3,  2 * SIZE(CO2)
	movhpd	%xmm3,  3 * SIZE(CO2)
#endif

#if defined(LN) || defined(LT)
	movaps	%xmm9,  -16 * SIZE(BO)
	movaps	%xmm13, -14 * SIZE(BO)
	movaps	%xmm1,  -12 * SIZE(BO)
	movaps	%xmm5,  -10 * SIZE(BO)
#else
	movaps	%xmm0,  -16 * SIZE(AO)
	movaps	%xmm1,  -14 * SIZE(AO)
	movaps	%xmm2,  -12 * SIZE(AO)
	movaps	%xmm3,  -10 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$4, KK
#endif

#ifdef LT
	addq	$4, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$2 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L51
	ALIGN_4

.L79:
#ifdef LN
	leaq	(, K, SIZE), %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif
	ALIGN_4

.L80:
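	# Tail panel: handle the final column when N is odd.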
	testq	$1, N
	je	.L999

#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B

	subq	LDC, C
#endif

	movq	C, CO1			# coffset1 = c
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	testq	$1, M
	je	.L100

#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(BO, %rax, SIZE), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-14 * SIZE(AO), %xmm1
	pxor	%xmm9, %xmm9

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
	negq	%rax
	NOBRANCH
	je	.L116
	ALIGN_4

.L112:
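	# 1x1 kernel: accumulate the dot product of one row of A with the
	# remaining column of B in two partial sums, four elements per pass.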
	mulpd	-16 * SIZE(BO, %rax, 1), %xmm0
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO, %rax, 1), %xmm0

	mulpd	-14 * SIZE(BO, %rax, 1), %xmm1
	addpd	%xmm1, %xmm9
	movapd	-10 * SIZE(AO, %rax, 1), %xmm1

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L112
	ALIGN_4

.L116:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je .L118

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
	negq	%rax
	ALIGN_4

.L117:
	mulsd	-16 * SIZE(BO, %rax, 1), %xmm0
	addsd	%xmm0, %xmm8
	movsd	-15 * SIZE(AO, %rax, 1), %xmm0

	addq	$SIZE, %rax
	jl	.L117
	ALIGN_4

.L118:
	addpd	%xmm9, %xmm8
	haddpd	%xmm8, %xmm8
#if defined(LN) || defined(RT)
	movq	KK, %rax
	subq	$1, %rax		# both LN and RT step back one element

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movsd	-16 * SIZE(BO), %xmm10
	subsd	%xmm8,  %xmm10
#else
	movsd	-16 * SIZE(AO), %xmm10
	subsd	%xmm8, %xmm10
#endif

#if defined(LN) || defined(LT)
	movsd	-16 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm10
#endif

#if defined(RN) || defined(RT)
	movsd	-16 * SIZE(BO), %xmm8
	mulsd	 %xmm8, %xmm10
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
#endif

	movsd	%xmm10,  0 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movlpd	%xmm10, -16 * SIZE(BO)
#else
	movlpd	%xmm10, -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	addq	%rax, AO
	addq	%rax, BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L100:
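	# Two-row remainder (M & 2) of the last column: 2x1 micro-kernel.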
	testq	$2, M
	je	.L110

#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(BO, %rax, SIZE), BO
#endif

	movddup	-16 * SIZE(BO), %xmm0
	pxor	%xmm8, %xmm8
	movddup	-15 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movddup	-14 * SIZE(BO), %xmm2
	pxor	%xmm10, %xmm10
	movddup	-13 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
	negq	%rax
	NOBRANCH
	je	.L106
	ALIGN_4

.L102:
	mulpd	-16 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm0, %xmm8
	movddup	-12 * SIZE(BO, %rax, 1), %xmm0

	mulpd	-14 * SIZE(AO, %rax, 2), %xmm1
	addpd	%xmm1, %xmm9
	movddup	-11 * SIZE(BO, %rax, 1), %xmm1

	mulpd	-12 * SIZE(AO, %rax, 2), %xmm2
	addpd	%xmm2, %xmm10
	movddup	-10 * SIZE(BO, %rax, 1), %xmm2

	mulpd	-10 * SIZE(AO, %rax, 2), %xmm3
	addpd	%xmm3, %xmm11
	movddup	 -9 * SIZE(BO, %rax, 1), %xmm3

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L102
	ALIGN_4

.L106:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je .L109

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
	negq	%rax
	ALIGN_4

.L107:
	movddup	-16 * SIZE(BO, %rax, 1), %xmm0
	mulpd	-16 * SIZE(AO, %rax, 2), %xmm0
	addpd	%xmm0, %xmm8

	addq	$SIZE, %rax
	jl	.L107
	ALIGN_4

.L109:
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm10
	subpd	%xmm8,  %xmm10
#else
	movapd	-16 * SIZE(AO), %xmm10
	subpd	%xmm8, %xmm10
#endif

#ifdef LN
	movapd	%xmm10, %xmm8
	unpckhpd %xmm8, %xmm8

	movsd	-13 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm8

	movsd	-14 * SIZE(AO), %xmm13
	mulsd	 %xmm8, %xmm13
	subsd	 %xmm13, %xmm10

	movsd	-16 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm10

	unpcklpd %xmm8, %xmm10
#endif

#ifdef LT
	movapd	%xmm10, %xmm8
	unpckhpd %xmm8, %xmm8

	movsd	-16 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm10

	movsd	-15 * SIZE(AO), %xmm13
	mulsd	 %xmm10, %xmm13
	subsd	 %xmm13, %xmm8

	movsd	-13 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm8

	unpcklpd %xmm8, %xmm10
#endif

#if defined(RN) || defined(RT)
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm10
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
#endif

	movlpd	%xmm10,  0 * SIZE(CO1)
	movhpd	%xmm10,  1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movaps	%xmm10, -16 * SIZE(BO)
#else
	movaps	%xmm10, -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	addq	%rax, BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

.L110:
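	# Loop over M in blocks of four rows for the last column.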
	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L119
	ALIGN_4

.L91:
#ifdef LN
	movq	K, %rax
	salq	$2 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(BO, %rax, SIZE), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movddup	-16 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movddup	-15 * SIZE(BO), %xmm5
	pxor	%xmm11, %xmm11
	movddup	-14 * SIZE(BO), %xmm3

#ifndef LN
	prefetchw      3 * SIZE(CO1)
#else
	prefetchw     -8 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 1), BO
	negq	%rax
	NOBRANCH
	je	.L96
	ALIGN_4

.L92:
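	# 4x1 inner loop, unrolled four times over K, accumulating into
	# xmm8-xmm11.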
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm9
	movddup	-12 * SIZE(BO, %rax, 1), %xmm1
	mulpd	%xmm5, %xmm0
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm0, %xmm10
	movapd	  (AO, %rax, 4), %xmm0
	addpd	%xmm5, %xmm11
	movddup	-13 * SIZE(BO, %rax, 1), %xmm5
	mulpd	%xmm3, %xmm2
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm8
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm2
	addpd	%xmm3, %xmm9
	movddup	-10 * SIZE(BO, %rax, 1), %xmm3
	mulpd	%xmm5, %xmm2
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5
	addpd	%xmm2, %xmm10
	movapd	  8 * SIZE(AO, %rax, 4), %xmm2
	addpd	%xmm5, %xmm11
	movddup	-11 * SIZE(BO, %rax, 1), %xmm5

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L92
	ALIGN_4

.L96:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je .L99

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 1), BO
	negq	%rax
	ALIGN_4

.L97:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm9
	movddup	-15 * SIZE(BO, %rax, 1), %xmm1

	addq	$SIZE, %rax
	jl	.L97
	ALIGN_4

.L99:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax
#else
	subq	$1, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm10
	movapd	-14 * SIZE(BO), %xmm11

	subpd	%xmm8,  %xmm10
	subpd	%xmm9,  %xmm11
#else
	movapd	-16 * SIZE(AO), %xmm10
	movapd	-14 * SIZE(AO), %xmm11

	subpd	%xmm8, %xmm10
	subpd	%xmm9, %xmm11
#endif

#ifdef LN
	movapd	%xmm10, %xmm8
	unpckhpd %xmm8, %xmm8

	movapd	%xmm11, %xmm9
	unpckhpd %xmm9, %xmm9

	movsd	 -1 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm9

	movsd	 -2 * SIZE(AO), %xmm13
	mulsd	 %xmm9, %xmm13
	subsd	 %xmm13, %xmm11
	movsd	 -3 * SIZE(AO), %xmm14
	mulsd	 %xmm9, %xmm14
	subsd	 %xmm14, %xmm8
	movsd	 -4 * SIZE(AO), %xmm15
	mulsd	 %xmm9, %xmm15
	subsd	 %xmm15, %xmm10

	movsd	 -6 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm11

	movsd	 -7 * SIZE(AO), %xmm13
	mulsd	 %xmm11, %xmm13
	subsd	 %xmm13, %xmm8
	movsd	 -8 * SIZE(AO), %xmm14
	mulsd	 %xmm11, %xmm14
	subsd	 %xmm14, %xmm10

	movsd	-11 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm8

	movsd	-12 * SIZE(AO), %xmm13
	mulsd	 %xmm8, %xmm13
	subsd	 %xmm13, %xmm10

	movsd	-16 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm10

	unpcklpd %xmm8, %xmm10
	unpcklpd %xmm9, %xmm11
#endif

#ifdef LT
	movapd	%xmm10, %xmm8
	unpckhpd %xmm8, %xmm8

	movapd	%xmm11, %xmm9
	unpckhpd %xmm9, %xmm9

	movsd	-16 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm10

	movsd	-15 * SIZE(AO), %xmm13
	mulsd	 %xmm10, %xmm13
	subsd	 %xmm13, %xmm8
	movsd	-14 * SIZE(AO), %xmm14
	mulsd	 %xmm10, %xmm14
	subsd	 %xmm14, %xmm11
	movsd	-13 * SIZE(AO), %xmm15
	mulsd	 %xmm10, %xmm15
	subsd	 %xmm15, %xmm9

	movsd	-11 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm8

	movsd	-10 * SIZE(AO), %xmm13
	mulsd	 %xmm8, %xmm13
	subsd	 %xmm13, %xmm11
	movsd	 -9 * SIZE(AO), %xmm14
	mulsd	 %xmm8, %xmm14
	subsd	 %xmm14, %xmm9

	movsd	 -6 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm11

	movsd	 -5 * SIZE(AO), %xmm13
	mulsd	 %xmm11, %xmm13
	subsd	 %xmm13, %xmm9

	movsd	 -1 * SIZE(AO), %xmm12
	mulsd	 %xmm12, %xmm9

	unpcklpd %xmm8, %xmm10
	unpcklpd %xmm9, %xmm11
#endif

#if defined(RN) || defined(RT)
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	 %xmm8, %xmm10
	mulpd	 %xmm8, %xmm11
#endif

#ifdef LN
	subq	$4 * SIZE, CO1
#endif

	movlpd	%xmm10,  0 * SIZE(CO1)
	movhpd	%xmm10,  1 * SIZE(CO1)
	movlpd	%xmm11,  2 * SIZE(CO1)
	movhpd	%xmm11,  3 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movaps	%xmm10, -16 * SIZE(BO)
	movaps	%xmm11, -14 * SIZE(BO)
#else
	movaps	%xmm10, -16 * SIZE(AO)
	movaps	%xmm11, -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K,  %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	addq	%rax, BO
#endif

#ifdef LN
	subq	$4, KK
#endif

#ifdef LT
	addq	$4, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$2 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L91
	ALIGN_4

.L119:
#ifdef LN
	leaq	(B, K, SIZE), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4

.L999:
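	# Epilogue: restore callee-saved registers, release the stack
	# frame, and return.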
	movq	   (%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
