1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define OLD_M	%rdi
43#define OLD_N	%rsi
44#define M	%r13
45#define N	%r14
46#define K	%rdx
47
48#define A	%rcx
49#define B	%r8
50#define C	%r9
51#define LDC	%r10
52
53#define I	%r11
54#define AO	%rdi
55#define BO	%rsi
56#define	CO1	%r15
57#define CO2	%r12
58#define BB	%rbp
59#define	J	%rbx
60
61#ifndef WINDOWS_ABI
62
63#define STACKSIZE 96
64
65#define OFFSET	 48(%rsp)
66#define AORIG	 56(%rsp)
67#define KK	 64(%rsp)
68#define KKK	 72(%rsp)
69
70#else
71
72#define STACKSIZE 256
73
74#define OLD_A		40 + STACKSIZE(%rsp)
75#define OLD_B		48 + STACKSIZE(%rsp)
76#define OLD_C		56 + STACKSIZE(%rsp)
77#define OLD_LDC		64 + STACKSIZE(%rsp)
78#define OLD_OFFSET	72 + STACKSIZE(%rsp)
79
80#define OFFSET	224(%rsp)
81#define AORIG	232(%rsp)
82#define KK	240(%rsp)
83#define KKK	248(%rsp)
84
85#endif
86
87#define PREFETCH     prefetch
88#define PREFETCHSIZE  (8 *  7 + 0)
89
90#define movlpd	movsd
91#define movapd	movups
92#define movupd	movups
93
94#define KERNEL1(xx) \
95	mulpd	%xmm1, %xmm0 ;\
96	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
97	addpd	%xmm0, %xmm8 ;\
98	movapd	%xmm2, %xmm0 ;\
99	addpd	%xmm1, %xmm12 ;\
100	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
101	mulpd	%xmm3, %xmm2 ;\
102	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
103	addpd	%xmm2, %xmm9 ;\
104	movapd	%xmm0, %xmm2 ;\
105	addpd	%xmm3, %xmm13 ;\
106	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
107	mulpd	%xmm1, %xmm0 ;\
108	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
109	addpd	%xmm0, %xmm10 ;\
110	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
111	addpd	%xmm1, %xmm14 ;\
112	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
113	mulpd	%xmm3, %xmm2 ;\
114	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
115	addpd	%xmm2, %xmm11 ;\
116	addpd	%xmm3, %xmm15 ;\
117 	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
118	movapd	%xmm0, %xmm2
119
120
121#define KERNEL2(xx) \
122	mulpd	%xmm1, %xmm0 ;\
123	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
124	addpd	%xmm0, %xmm8 ;\
125	movapd	%xmm2, %xmm0 ;\
126/**/	movapd	  (AO, %rax, 4), %xmm6 ;\
127	addpd	%xmm1, %xmm12 ;\
128	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
129	mulpd	%xmm3, %xmm2 ;\
130	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
131	addpd	%xmm2, %xmm9 ;\
132	movapd	%xmm0, %xmm2 ;\
133	addpd	%xmm3, %xmm13 ;\
134	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
135	mulpd	%xmm1, %xmm0 ;\
136	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
137	addpd	%xmm0, %xmm10 ;\
138	addpd	%xmm1, %xmm14 ;\
139	mulpd	%xmm3, %xmm2 ;\
140/**/	movddup	  (BO, %rax, 4), %xmm1 ;\
141	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
142	addpd	%xmm2, %xmm11 ;\
143	addpd	%xmm3, %xmm15 ;\
144 	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
145	movapd	%xmm4, %xmm2
146
147#define KERNEL3(xx) \
148	mulpd	%xmm5, %xmm4 ;\
149	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
150	addpd	%xmm4, %xmm8 ;\
151	movapd	%xmm2, %xmm4 ;\
152	addpd	%xmm5, %xmm12 ;\
153	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
154	mulpd	%xmm3, %xmm2 ;\
155	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
156	addpd	%xmm2, %xmm9 ;\
157	movapd	%xmm4, %xmm2 ;\
158	addpd	%xmm3, %xmm13 ;\
159	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
160	mulpd	%xmm5, %xmm4 ;\
161	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
162	addpd	%xmm4, %xmm10 ;\
163	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
164	addpd	%xmm5, %xmm14 ;\
165	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
166	mulpd	%xmm3, %xmm2 ;\
167	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
168	addpd	%xmm2, %xmm11 ;\
169	addpd	%xmm3, %xmm15 ;\
170 	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
171	movapd	%xmm4, %xmm2
172
173#define KERNEL4(xx) \
174	mulpd	%xmm5, %xmm4 ;\
175	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
176	addpd	%xmm4, %xmm8 ;\
177	movapd	%xmm2, %xmm4 ;\
178/**/	movapd	  8 * SIZE(AO, %rax, 4), %xmm7 ;\
179	addpd	%xmm5, %xmm12 ;\
180	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
181	mulpd	%xmm3, %xmm2 ;\
182	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
183	addpd	%xmm2, %xmm9 ;\
184	movapd	%xmm4, %xmm2 ;\
185	addpd	%xmm3, %xmm13 ;\
186	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
187	mulpd	%xmm5, %xmm4 ;\
188	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
189	addpd	%xmm4, %xmm10 ;\
190	addpd	%xmm5, %xmm14 ;\
191/**/	movddup	  8 * SIZE(BO, %rax, 4), %xmm5 ;\
192	mulpd	%xmm3, %xmm2 ;\
193	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
194	addpd	%xmm2, %xmm11 ;\
195	addpd	%xmm3, %xmm15 ;\
196 	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
197	movapd	%xmm6, %xmm2
198
199#define KERNEL5(xx) \
200	mulpd	%xmm1, %xmm6 ;\
201	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
202	addpd	%xmm6, %xmm8 ;\
203	movapd	%xmm2, %xmm6 ;\
204	addpd	%xmm1, %xmm12 ;\
205	movddup	  2 * SIZE(BO, %rax, 4), %xmm1 ;\
206	mulpd	%xmm3, %xmm2 ;\
207	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
208	addpd	%xmm2, %xmm9 ;\
209	movapd	%xmm6, %xmm2 ;\
210	addpd	%xmm3, %xmm13 ;\
211	movddup	  3 * SIZE(BO, %rax, 4), %xmm3 ;\
212	mulpd	%xmm1, %xmm6 ;\
213	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
214	addpd	%xmm6, %xmm10 ;\
215	movapd	  4 * SIZE(AO, %rax, 4), %xmm6 ;\
216	addpd	%xmm1, %xmm14 ;\
217	movddup	  4 * SIZE(BO, %rax, 4), %xmm1 ;\
218	mulpd	%xmm3, %xmm2 ;\
219	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
220	addpd	%xmm2, %xmm11 ;\
221	addpd	%xmm3, %xmm15 ;\
222 	movddup	  5 * SIZE(BO, %rax, 4), %xmm3 ;\
223	movapd	%xmm6, %xmm2
224
225#define KERNEL6(xx) \
226	mulpd	%xmm1, %xmm6 ;\
227	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
228	addpd	%xmm6, %xmm8 ;\
229	movapd	%xmm2, %xmm6 ;\
230/***/	movapd	 16 * SIZE(AO, %rax, 4), %xmm0 ;\
231	addpd	%xmm1, %xmm12 ;\
232	movddup	  6 * SIZE(BO, %rax, 4), %xmm1 ;\
233	mulpd	%xmm3, %xmm2 ;\
234	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
235	addpd	%xmm2, %xmm9 ;\
236	movapd	%xmm6, %xmm2 ;\
237	addpd	%xmm3, %xmm13 ;\
238	movddup	  7 * SIZE(BO, %rax, 4), %xmm3 ;\
239	mulpd	%xmm1, %xmm6 ;\
240	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
241	addpd	%xmm6, %xmm10 ;\
242 	addpd	%xmm1, %xmm14 ;\
243/**/	movddup	 16 * SIZE(BO, %rax, 4), %xmm1 ;\
244	mulpd	%xmm3, %xmm2 ;\
245	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
246	addpd	%xmm2, %xmm11 ;\
247	addpd	%xmm3, %xmm15 ;\
248 	movddup	  9 * SIZE(BO, %rax, 4), %xmm3 ;\
249	movapd	%xmm7, %xmm2
250
251#define KERNEL7(xx) \
252	mulpd	%xmm5, %xmm7 ;\
253	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
254	addpd	%xmm7, %xmm8 ;\
255	movapd	%xmm2, %xmm7 ;\
256	addpd	%xmm5, %xmm12 ;\
257	movddup	 10 * SIZE(BO, %rax, 4), %xmm5 ;\
258	mulpd	%xmm3, %xmm2 ;\
259	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
260	addpd	%xmm2, %xmm9 ;\
261	movapd	%xmm7, %xmm2 ;\
262	addpd	%xmm3, %xmm13 ;\
263	movddup	 11 * SIZE(BO, %rax, 4), %xmm3 ;\
264	mulpd	%xmm5, %xmm7 ;\
265	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
266	addpd	%xmm7, %xmm10 ;\
267	movapd	 12 * SIZE(AO, %rax, 4), %xmm7 ;\
268	addpd	%xmm5, %xmm14 ;\
269	movddup	 12 * SIZE(BO, %rax, 4), %xmm5 ;\
270	mulpd	%xmm3, %xmm2 ;\
271	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
272	addpd	%xmm2, %xmm11 ;\
273	addpd	%xmm3, %xmm15 ;\
274 	movddup	 13 * SIZE(BO, %rax, 4), %xmm3 ;\
275	movapd	%xmm7, %xmm2
276
277#define KERNEL8(xx) \
278	mulpd	%xmm5, %xmm7 ;\
279	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
280	addpd	%xmm7, %xmm8 ;\
281	movapd	%xmm2, %xmm7 ;\
282/**/	movapd	 24 * SIZE(AO, %rax, 4), %xmm4 ;\
283	addpd	%xmm5, %xmm12 ;\
284	movddup	 14 * SIZE(BO, %rax, 4), %xmm5 ;\
285	mulpd	%xmm3, %xmm2 ;\
286	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
287	addpd	%xmm2, %xmm9 ;\
288	movapd	%xmm7, %xmm2 ;\
289	addpd	%xmm3, %xmm13 ;\
290	movddup	 15 * SIZE(BO, %rax, 4), %xmm3 ;\
291	mulpd	%xmm5, %xmm7 ;\
292	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
293	addpd	%xmm7, %xmm10 ;\
294	addpd	%xmm5, %xmm14 ;\
295/**/	movddup	 24 * SIZE(BO, %rax, 4), %xmm5 ;\
296	mulpd	%xmm3, %xmm2 ;\
297	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
298	addpd	%xmm2, %xmm11 ;\
299	addpd	%xmm3, %xmm15 ;\
300 	movddup	 17 * SIZE(BO, %rax, 4), %xmm3 ;\
301	movapd	%xmm0, %xmm2 ;\
302	addq	$8 * SIZE, %rax
303
304#define KERNEL_SUB1(xx) \
305	mulpd	%xmm1, %xmm0 ;\
306	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
307	addpd	%xmm0, %xmm8 ;\
308	movapd	%xmm2, %xmm0 ;\
309	addpd	%xmm1, %xmm12 ;\
310	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
311	mulpd	%xmm3, %xmm2 ;\
312	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
313	addpd	%xmm2, %xmm9 ;\
314	movapd	%xmm0, %xmm2 ;\
315	addpd	%xmm3, %xmm13 ;\
316	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
317	mulpd	%xmm1, %xmm0 ;\
318	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
319	addpd	%xmm0, %xmm10 ;\
320	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
321	addpd	%xmm1, %xmm14 ;\
322	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
323	mulpd	%xmm3, %xmm2 ;\
324	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
325	addpd	%xmm2, %xmm11 ;\
326	addpd	%xmm3, %xmm15 ;\
327 	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
328	movapd	%xmm0, %xmm2
329
330#define KERNEL_SUB2(xx) \
331	mulpd	%xmm1, %xmm0 ;\
332	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
333	addpd	%xmm0, %xmm8 ;\
334	movapd	%xmm2, %xmm0 ;\
335	addpd	%xmm1, %xmm12 ;\
336	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
337	mulpd	%xmm3, %xmm2 ;\
338	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
339	addpd	%xmm2, %xmm9 ;\
340	movapd	%xmm0, %xmm2 ;\
341	addpd	%xmm3, %xmm13 ;\
342	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
343	mulpd	%xmm1, %xmm0 ;\
344	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
345	addpd	%xmm0, %xmm10 ;\
346	movapd	  (AO, %rax, 4), %xmm0 ;\
347	addpd	%xmm1, %xmm14 ;\
348	movddup	  (BO, %rax, 4), %xmm1 ;\
349	mulpd	%xmm3, %xmm2 ;\
350	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
351	addpd	%xmm2, %xmm11 ;\
352	addpd	%xmm3, %xmm15 ;\
353 	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
354	movapd	%xmm4, %xmm2
355
356#define KERNEL_SUB3(xx) \
357	mulpd	%xmm5, %xmm4 ;\
358	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
359	addpd	%xmm4, %xmm8 ;\
360	movapd	%xmm2, %xmm4 ;\
361	addpd	%xmm5, %xmm12 ;\
362	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
363	mulpd	%xmm3, %xmm2 ;\
364	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
365	addpd	%xmm2, %xmm9 ;\
366	movapd	%xmm4, %xmm2 ;\
367	addpd	%xmm3, %xmm13 ;\
368	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
369	mulpd	%xmm5, %xmm4 ;\
370	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
371	addpd	%xmm4, %xmm10 ;\
372	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
373	addpd	%xmm5, %xmm14 ;\
374	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
375	mulpd	%xmm3, %xmm2 ;\
376	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
377	addpd	%xmm2, %xmm11 ;\
378	addpd	%xmm3, %xmm15 ;\
379 	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
380	movapd	%xmm4, %xmm2
381
382#define KERNEL_SUB4(xx) \
383	mulpd	%xmm5, %xmm4 ;\
384	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
385	addpd	%xmm4, %xmm8 ;\
386	movapd	%xmm2, %xmm4 ;\
387	addpd	%xmm5, %xmm12 ;\
388	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
389	mulpd	%xmm3, %xmm2 ;\
390	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
391	addpd	%xmm2, %xmm9 ;\
392	movapd	%xmm4, %xmm2 ;\
393	addpd	%xmm3, %xmm13 ;\
394	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
395	mulpd	%xmm5, %xmm4 ;\
396	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
397	addpd	%xmm4, %xmm10 ;\
398	addpd	%xmm5, %xmm14 ;\
399	mulpd	%xmm3, %xmm2 ;\
400	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
401	addpd	%xmm2, %xmm11 ;\
402	addpd	%xmm3, %xmm15 ;\
403 	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
404	movapd	%xmm0, %xmm2
405
406	PROLOGUE
407	PROFCODE
408
409	subq	$STACKSIZE, %rsp
410	movq	%rbx,   (%rsp)
411	movq	%rbp,  8(%rsp)
412	movq	%r12, 16(%rsp)
413	movq	%r13, 24(%rsp)
414	movq	%r14, 32(%rsp)
415	movq	%r15, 40(%rsp)
416
417#ifdef WINDOWS_ABI
418	movq	%rdi,    48(%rsp)
419	movq	%rsi,    56(%rsp)
420	movups	%xmm6,   64(%rsp)
421	movups	%xmm7,   80(%rsp)
422	movups	%xmm8,   96(%rsp)
423	movups	%xmm9,  112(%rsp)
424	movups	%xmm10, 128(%rsp)
425	movups	%xmm11, 144(%rsp)
426	movups	%xmm12, 160(%rsp)
427	movups	%xmm13, 176(%rsp)
428	movups	%xmm14, 192(%rsp)
429	movups	%xmm15, 208(%rsp)
430
431	movq	ARG1,      OLD_M
432	movq	ARG2,      OLD_N
433	movq	ARG3,      K
434	movq	OLD_A,     A
435	movq	OLD_B,     B
436	movq	OLD_C,     C
437	movq	OLD_LDC,   LDC
438	movsd	OLD_OFFSET, %xmm12
439#else
440	movq	STACKSIZE +  8(%rsp), LDC
441	movsd	STACKSIZE + 16(%rsp), %xmm12
442#endif
443
444	movq	OLD_M, M
445	movq	OLD_N, N
446
447	subq	$-16 * SIZE, A
448	subq	$-16 * SIZE, B
449
450	movsd	%xmm12, OFFSET
451	movsd	%xmm12, KK
452
453	leaq	(, LDC, SIZE), LDC
454
455#ifdef LN
456       leaq	(, M, SIZE), %rax
457       addq	%rax, C
458       imulq	K, %rax
459       addq	%rax, A
460#endif
461
462#ifdef RT
463       leaq	(, N, SIZE), %rax
464       imulq	K, %rax
465       addq	%rax, B
466       movq	N, %rax
467       imulq	LDC, %rax
468       addq	%rax, C
469#endif
470
471#ifdef RN
472	negq	KK
473#endif
474
475#ifdef RT
476       movq	N, %rax
477       subq	OFFSET, %rax
478       movq	%rax, KK
479#endif
480
481	movq	N,  J
482	sarq	$2, J		# j = (n >> 2)
483	jle	.L40
484
485.L01:
486#if defined(LT) || defined(RN)
487	movq	A, AO
488#else
489	movq	A, AORIG
490#endif
491
492#ifdef RT
493       movq	K, %rax
494       salq	$2 + BASE_SHIFT, %rax
495       subq	%rax, B
496
497       leaq	(, LDC, 4), %rax
498       subq	%rax, C
499#endif
500
501	movq	C, CO1			# coffset1 = c
502	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
503#ifndef RT
504	leaq	(C, LDC, 4), C
505#endif
506
507#ifdef LN
508	movq	OFFSET, %rax
509	addq	M, %rax
510	movq	%rax, KK
511#endif
512
513	movq	K, %rax
514	salq	$BASE_SHIFT + 2, %rax
515	leaq	(B, %rax), BB
516
517#if defined(LT)
518	movq	OFFSET, %rax
519	movq	%rax, KK
520#endif
521
522	movq	M,  I
523	sarq	$2, I	# i = (m >> 2)
524	jle	.L20
525	ALIGN_4
526
527.L11:
528#ifdef LN
529       movq	K, %rax
530       salq	$2 + BASE_SHIFT, %rax
531       subq	%rax, AORIG
532#endif
533
534#if defined(LN) || defined(RT)
535	movq	KK, %rax
536	movq	AORIG, AO
537	leaq	(, %rax, SIZE), %rax
538	leaq	(AO, %rax, 4), AO
539#endif
540
541	movq	B, BO
542
543#if defined(LN) || defined(RT)
544	movq	KK, %rax
545	leaq	(, %rax, SIZE), %rax
546	leaq	(BO, %rax, 4), BO
547#endif
548
549	movapd	-16 * SIZE(AO), %xmm0
550	movddup	-16 * SIZE(BO), %xmm1
551	pxor	%xmm8, %xmm8
552 	movddup	-15 * SIZE(BO), %xmm3
553	pxor	%xmm9, %xmm9
554	movapd	 -8 * SIZE(AO), %xmm4
555	pxor	%xmm10, %xmm10
556	movddup	 -8 * SIZE(BO), %xmm5
557	pxor	%xmm11, %xmm11
558
559#ifndef LN
560	prefetchw      3 * SIZE(CO1)
561	pxor	%xmm12, %xmm12
562	prefetchw      7 * SIZE(CO2)
563	pxor	%xmm13, %xmm13
564	prefetchw      3 * SIZE(CO1, LDC, 2)
565	pxor	%xmm14, %xmm14
566	prefetchw      7 * SIZE(CO2, LDC, 2)
567	pxor	%xmm15, %xmm15
568	movapd	%xmm0, %xmm2
569#else
570	prefetchw     -8 * SIZE(CO1)
571	pxor	%xmm12, %xmm12
572	prefetchw     -8 * SIZE(CO2)
573	pxor	%xmm13, %xmm13
574	prefetchw     -8 * SIZE(CO1, LDC, 2)
575	pxor	%xmm14, %xmm14
576	prefetchw     -8 * SIZE(CO2, LDC, 2)
577	pxor	%xmm15, %xmm15
578	movapd	%xmm0, %xmm2
579#endif
580
581	prefetch	 -16 * SIZE(BB)
582
583#if defined(LT) || defined(RN)
584	movq	KK, %rax
585#else
586	movq	K, %rax
587	subq	KK, %rax
588#endif
589
590	andq	$-8, %rax
591	leaq	(, %rax, SIZE), %rax
592	leaq	(AO, %rax, 4), AO
593	leaq	(BO, %rax, 4), BO
594	negq	%rax
595	NOBRANCH
596	je	.L15
597	ALIGN_4
598
599.L12:
600	KERNEL1(16 *  0)
601	KERNEL2(16 *  0)
602	KERNEL3(16 *  0)
603	KERNEL4(16 *  0)
604	KERNEL5(16 *  0)
605	KERNEL6(16 *  0)
606	KERNEL7(16 *  0)
607	KERNEL8(16 *  0)
608	BRANCH
609	jl	.L12
610	ALIGN_4
611
612.L15:
613	prefetch	  -8 * SIZE(BB)
614	subq		 $-16 * SIZE, BB
615
616#if defined(LT) || defined(RN)
617	movq	KK, %rax
618#else
619	movq	K, %rax
620	subq	KK, %rax
621#endif
622	testq	$4, %rax
623	je .L16
624	xorq	%rax, %rax
625	ALIGN_4
626
627	KERNEL_SUB1(16 *  0)
628	KERNEL_SUB2(16 *  0)
629	KERNEL_SUB3(16 *  0)
630	KERNEL_SUB4(16 *  0)
631
632	subq	$-16 * SIZE, BO
633	subq	$-16 * SIZE, AO
634	ALIGN_4
635
636.L16:
637#if defined(LT) || defined(RN)
638	movq	KK, %rax
639#else
640	movq	K, %rax
641	subq	KK, %rax
642#endif
643	andq	$3, %rax		# if (k & 1)
644	je .L19
645
646	leaq	(, %rax, SIZE), %rax
647	leaq	(AO, %rax, 4), AO
648	leaq	(BO, %rax, 4), BO
649	negq	%rax
650	ALIGN_4
651
652.L17:
653	mulpd	%xmm1, %xmm0
654	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
655	addpd	%xmm0, %xmm8
656	movapd	%xmm2, %xmm0
657	addpd	%xmm1, %xmm12
658	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
659	mulpd	%xmm3, %xmm2
660	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3
661	addpd	%xmm2, %xmm9
662	movapd	%xmm0, %xmm2
663	addpd	%xmm3, %xmm13
664	movddup	-13 * SIZE(BO, %rax, 4), %xmm3
665	mulpd	%xmm1, %xmm0
666	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
667	addpd	%xmm0, %xmm10
668	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
669	addpd	%xmm1, %xmm14
670	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
671	mulpd	%xmm3, %xmm2
672	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3
673	addpd	%xmm2, %xmm11
674	addpd	%xmm3, %xmm15
675 	movddup	-11 * SIZE(BO, %rax, 4), %xmm3
676	movapd	%xmm0, %xmm2
677
678	addq	$SIZE, %rax
679	jl	.L17
680	ALIGN_4
681
682.L19:
683#if defined(LN) || defined(RT)
684	movq	KK, %rax
685#ifdef LN
686	subq	$4, %rax
687#else
688	subq	$4, %rax
689#endif
690
691	leaq	(, %rax, SIZE), %rax
692
693	movq	AORIG, AO
694	leaq	(AO, %rax, 4), AO
695	leaq	(B,  %rax, 4), BO
696#endif
697
698#if defined(LN) || defined(LT)
699	movapd	%xmm8, %xmm0
700	unpcklpd %xmm9, %xmm8
701	unpckhpd %xmm9, %xmm0
702
703	movapd	%xmm10, %xmm2
704	unpcklpd %xmm11, %xmm10
705	unpckhpd %xmm11, %xmm2
706
707	movapd	%xmm12, %xmm4
708	unpcklpd %xmm13, %xmm12
709	unpckhpd %xmm13, %xmm4
710
711	movapd	%xmm14, %xmm6
712	unpcklpd %xmm15, %xmm14
713	unpckhpd %xmm15, %xmm6
714
715	movapd	-16 * SIZE(BO), %xmm9
716	movapd	-14 * SIZE(BO), %xmm11
717	movapd	-12 * SIZE(BO), %xmm13
718	movapd	-10 * SIZE(BO), %xmm15
719	movapd	 -8 * SIZE(BO), %xmm1
720	movapd	 -6 * SIZE(BO), %xmm3
721	movapd	 -4 * SIZE(BO), %xmm5
722	movapd	 -2 * SIZE(BO), %xmm7
723
724	subpd	%xmm8,  %xmm9
725	subpd	%xmm10, %xmm11
726	subpd	%xmm0,  %xmm13
727	subpd	%xmm2,  %xmm15
728	subpd	%xmm12, %xmm1
729	subpd	%xmm14, %xmm3
730	subpd	%xmm4,  %xmm5
731	subpd	%xmm6,  %xmm7
732#else
733	movapd	-16 * SIZE(AO), %xmm0
734	movapd	-14 * SIZE(AO), %xmm1
735	movapd	-12 * SIZE(AO), %xmm2
736	movapd	-10 * SIZE(AO), %xmm3
737
738	movapd	 -8 * SIZE(AO), %xmm4
739	movapd	 -6 * SIZE(AO), %xmm5
740	movapd	 -4 * SIZE(AO), %xmm6
741	movapd	 -2 * SIZE(AO), %xmm7
742
743	subpd	%xmm8,  %xmm0
744	subpd	%xmm12, %xmm1
745	subpd	%xmm9,  %xmm2
746	subpd	%xmm13, %xmm3
747	subpd	%xmm10, %xmm4
748	subpd	%xmm14, %xmm5
749	subpd	%xmm11, %xmm6
750	subpd	%xmm15, %xmm7
751#endif
752
753#ifdef LN
754	movddup	 -1 * SIZE(AO), %xmm8
755	mulpd	 %xmm8, %xmm5
756	mulpd	 %xmm8, %xmm7
757
758	movddup	 -2 * SIZE(AO), %xmm10
759	mulpd	 %xmm5, %xmm10
760	subpd	 %xmm10, %xmm1
761	movddup	 -2 * SIZE(AO), %xmm10
762	mulpd	 %xmm7, %xmm10
763	subpd	 %xmm10, %xmm3
764
765	movddup	 -3 * SIZE(AO), %xmm12
766	mulpd	 %xmm5, %xmm12
767	subpd	 %xmm12, %xmm13
768	movddup	 -3 * SIZE(AO), %xmm12
769	mulpd	 %xmm7, %xmm12
770	subpd	 %xmm12, %xmm15
771
772	movddup	 -4 * SIZE(AO), %xmm14
773	mulpd	 %xmm5, %xmm14
774	subpd	 %xmm14, %xmm9
775	movddup	 -4 * SIZE(AO), %xmm14
776	mulpd	 %xmm7, %xmm14
777	subpd	 %xmm14, %xmm11
778
779	movddup	 -6 * SIZE(AO), %xmm8
780	mulpd	 %xmm8, %xmm1
781	mulpd	 %xmm8, %xmm3
782
783	movddup	 -7 * SIZE(AO), %xmm10
784	mulpd	 %xmm1, %xmm10
785	subpd	 %xmm10, %xmm13
786	movddup	 -7 * SIZE(AO), %xmm10
787	mulpd	 %xmm3, %xmm10
788	subpd	 %xmm10, %xmm15
789
790	movddup	 -8 * SIZE(AO), %xmm12
791	mulpd	 %xmm1, %xmm12
792	subpd	 %xmm12, %xmm9
793	movddup	 -8 * SIZE(AO), %xmm12
794	mulpd	 %xmm3, %xmm12
795	subpd	 %xmm12, %xmm11
796
797	movddup	-11 * SIZE(AO), %xmm8
798	mulpd	 %xmm8, %xmm13
799	mulpd	 %xmm8, %xmm15
800
801	movddup	-12 * SIZE(AO), %xmm10
802	mulpd	 %xmm13, %xmm10
803	subpd	 %xmm10, %xmm9
804	movddup	-12 * SIZE(AO), %xmm10
805	mulpd	 %xmm15, %xmm10
806	subpd	 %xmm10, %xmm11
807
808	movddup	-16 * SIZE(AO), %xmm8
809	mulpd	 %xmm8, %xmm9
810	mulpd	 %xmm8, %xmm11
811#endif
812
813#ifdef LT
814	movddup -16 * SIZE(AO), %xmm8
815	mulpd	 %xmm8, %xmm9
816	mulpd	 %xmm8, %xmm11
817
818	movddup	-15 * SIZE(AO), %xmm10
819	mulpd	 %xmm9, %xmm10
820	subpd	 %xmm10, %xmm13
821
822	movddup	-15 * SIZE(AO), %xmm10
823	mulpd	 %xmm11, %xmm10
824	subpd	 %xmm10, %xmm15
825
826	movddup	-14 * SIZE(AO), %xmm12
827	mulpd	 %xmm9, %xmm12
828	subpd	 %xmm12, %xmm1
829	movddup	-14 * SIZE(AO), %xmm12
830	mulpd	 %xmm11, %xmm12
831	subpd	 %xmm12, %xmm3
832
833	movddup	-13 * SIZE(AO), %xmm14
834	mulpd	 %xmm9, %xmm14
835	subpd	 %xmm14, %xmm5
836	movddup	-13 * SIZE(AO), %xmm14
837	mulpd	 %xmm11, %xmm14
838	subpd	 %xmm14, %xmm7
839
840	movddup	-11 * SIZE(AO), %xmm8
841	mulpd	 %xmm8, %xmm13
842	mulpd	 %xmm8, %xmm15
843
844	movddup	-10 * SIZE(AO), %xmm10
845	mulpd	 %xmm13, %xmm10
846	subpd	 %xmm10, %xmm1
847	movddup	-10 * SIZE(AO), %xmm10
848	mulpd	 %xmm15, %xmm10
849	subpd	 %xmm10, %xmm3
850
851	movddup	 -9 * SIZE(AO), %xmm12
852	mulpd	 %xmm13, %xmm12
853	subpd	 %xmm12, %xmm5
854	movddup	 -9 * SIZE(AO), %xmm12
855	mulpd	 %xmm15, %xmm12
856	subpd	 %xmm12, %xmm7
857
858	movddup	 -6 * SIZE(AO), %xmm8
859	mulpd	 %xmm8, %xmm1
860	mulpd	 %xmm8, %xmm3
861
862	movddup	 -5 * SIZE(AO), %xmm10
863	mulpd	 %xmm1, %xmm10
864	subpd	 %xmm10, %xmm5
865	movddup	 -5 * SIZE(AO), %xmm10
866	mulpd	 %xmm3, %xmm10
867	subpd	 %xmm10, %xmm7
868
869	movddup	 -1 * SIZE(AO), %xmm8
870	mulpd	 %xmm8, %xmm5
871	mulpd	 %xmm8, %xmm7
872#endif
873
874#ifdef RN
875	movddup	-16 * SIZE(BO), %xmm8
876	mulpd	 %xmm8, %xmm0
877	mulpd	 %xmm8, %xmm1
878
879	movddup	-15 * SIZE(BO), %xmm9
880	mulpd	 %xmm0, %xmm9
881	subpd	 %xmm9, %xmm2
882	movddup	-15 * SIZE(BO), %xmm9
883	mulpd	 %xmm1, %xmm9
884	subpd	 %xmm9, %xmm3
885
886	movddup	-14 * SIZE(BO), %xmm10
887	mulpd	 %xmm0, %xmm10
888	subpd	 %xmm10, %xmm4
889	movddup	-14 * SIZE(BO), %xmm10
890	mulpd	 %xmm1, %xmm10
891	subpd	 %xmm10, %xmm5
892
893	movddup	 -13 * SIZE(BO), %xmm11
894	mulpd	 %xmm0, %xmm11
895	subpd	 %xmm11, %xmm6
896	movddup	 -13 * SIZE(BO), %xmm11
897	mulpd	 %xmm1, %xmm11
898	subpd	 %xmm11, %xmm7
899
900	movddup	 -11 * SIZE(BO), %xmm8
901	mulpd	 %xmm8, %xmm2
902	mulpd	 %xmm8, %xmm3
903
904	movddup	 -10 * SIZE(BO), %xmm9
905	mulpd	 %xmm2, %xmm9
906	subpd	 %xmm9, %xmm4
907	movddup	 -10 * SIZE(BO), %xmm9
908	mulpd	 %xmm3, %xmm9
909	subpd	 %xmm9, %xmm5
910
911	movddup	  -9 * SIZE(BO), %xmm10
912	mulpd	 %xmm2, %xmm10
913	subpd	 %xmm10, %xmm6
914	movddup	  -9 * SIZE(BO), %xmm10
915	mulpd	 %xmm3, %xmm10
916	subpd	 %xmm10, %xmm7
917
918	movddup	 -6 * SIZE(BO), %xmm8
919	mulpd	 %xmm8, %xmm4
920	mulpd	 %xmm8, %xmm5
921
922	movddup	 -5 * SIZE(BO), %xmm9
923	mulpd	 %xmm4, %xmm9
924	subpd	 %xmm9, %xmm6
925	movddup	 -5 * SIZE(BO), %xmm9
926	mulpd	 %xmm5, %xmm9
927	subpd	 %xmm9, %xmm7
928
929	movddup	 -1 * SIZE(BO), %xmm8
930	mulpd	 %xmm8, %xmm6
931	mulpd	 %xmm8, %xmm7
932#endif
933
934#ifdef RT
935	movddup	 -1 * SIZE(BO), %xmm8
936	mulpd	 %xmm8, %xmm6
937	mulpd	 %xmm8, %xmm7
938
939	movddup	 -2 * SIZE(BO), %xmm9
940	mulpd	 %xmm6, %xmm9
941	subpd	 %xmm9, %xmm4
942	movddup	 -2 * SIZE(BO), %xmm9
943	mulpd	 %xmm7, %xmm9
944	subpd	 %xmm9, %xmm5
945
946	movddup	 -3 * SIZE(BO), %xmm10
947	mulpd	 %xmm6, %xmm10
948	subpd	 %xmm10, %xmm2
949	movddup	 -3 * SIZE(BO), %xmm10
950	mulpd	 %xmm7, %xmm10
951	subpd	 %xmm10, %xmm3
952
953	movddup	 -4 * SIZE(BO), %xmm11
954	mulpd	 %xmm6, %xmm11
955	subpd	 %xmm11, %xmm0
956	movddup	 -4 * SIZE(BO), %xmm11
957	mulpd	 %xmm7, %xmm11
958	subpd	 %xmm11, %xmm1
959
960	movddup	 -6 * SIZE(BO), %xmm8
961	mulpd	 %xmm8, %xmm4
962	mulpd	 %xmm8, %xmm5
963
964	movddup	 -7 * SIZE(BO), %xmm9
965	mulpd	 %xmm4, %xmm9
966	subpd	 %xmm9, %xmm2
967	movddup	 -7 * SIZE(BO), %xmm9
968	mulpd	 %xmm5, %xmm9
969	subpd	 %xmm9, %xmm3
970
971	movddup	 -8 * SIZE(BO), %xmm10
972	mulpd	 %xmm4, %xmm10
973	subpd	 %xmm10, %xmm0
974	movddup	 -8 * SIZE(BO), %xmm10
975	mulpd	 %xmm5, %xmm10
976	subpd	 %xmm10, %xmm1
977
978	movddup	-11 * SIZE(BO), %xmm8
979	mulpd	 %xmm8, %xmm2
980	mulpd	 %xmm8, %xmm3
981
982	movddup	-12 * SIZE(BO), %xmm9
983	mulpd	 %xmm2, %xmm9
984	subpd	 %xmm9, %xmm0
985	movddup	-12 * SIZE(BO), %xmm9
986	mulpd	 %xmm3, %xmm9
987	subpd	 %xmm9, %xmm1
988
989	movddup	-16 * SIZE(BO), %xmm8
990	mulpd	 %xmm8, %xmm0
991	mulpd	 %xmm8, %xmm1
992#endif
993
994#ifdef LN
995	subq	$4 * SIZE, CO1
996	subq	$4 * SIZE, CO2
997#endif
998
999#if defined(LN) || defined(LT)
1000	movlpd	%xmm9,  0 * SIZE(CO1)
1001	movlpd	%xmm13, 1 * SIZE(CO1)
1002	movlpd	%xmm1,  2 * SIZE(CO1)
1003	movlpd	%xmm5,  3 * SIZE(CO1)
1004
1005	movhpd	%xmm9,  0 * SIZE(CO2)
1006	movhpd	%xmm13, 1 * SIZE(CO2)
1007	movhpd	%xmm1,  2 * SIZE(CO2)
1008	movhpd	%xmm5,  3 * SIZE(CO2)
1009
1010	movlpd	%xmm11, 0 * SIZE(CO1, LDC, 2)
1011	movlpd	%xmm15, 1 * SIZE(CO1, LDC, 2)
1012	movlpd	%xmm3,  2 * SIZE(CO1, LDC, 2)
1013	movlpd	%xmm7,  3 * SIZE(CO1, LDC, 2)
1014
1015	movhpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
1016	movhpd	%xmm15, 1 * SIZE(CO2, LDC, 2)
1017	movhpd	%xmm3,  2 * SIZE(CO2, LDC, 2)
1018	movhpd	%xmm7,  3 * SIZE(CO2, LDC, 2)
1019#else
1020	movlpd	%xmm0,  0 * SIZE(CO1)
1021	movhpd	%xmm0,  1 * SIZE(CO1)
1022	movlpd	%xmm1,  2 * SIZE(CO1)
1023	movhpd	%xmm1,  3 * SIZE(CO1)
1024
1025	movlpd	%xmm2,  0 * SIZE(CO2)
1026	movhpd	%xmm2,  1 * SIZE(CO2)
1027	movlpd	%xmm3,  2 * SIZE(CO2)
1028	movhpd	%xmm3,  3 * SIZE(CO2)
1029
1030	movlpd	%xmm4,  0 * SIZE(CO1, LDC, 2)
1031	movhpd	%xmm4,  1 * SIZE(CO1, LDC, 2)
1032	movlpd	%xmm5,  2 * SIZE(CO1, LDC, 2)
1033	movhpd	%xmm5,  3 * SIZE(CO1, LDC, 2)
1034
1035	movlpd	%xmm6,  0 * SIZE(CO2, LDC, 2)
1036	movhpd	%xmm6,  1 * SIZE(CO2, LDC, 2)
1037	movlpd	%xmm7,  2 * SIZE(CO2, LDC, 2)
1038	movhpd	%xmm7,  3 * SIZE(CO2, LDC, 2)
1039#endif
1040
1041#if defined(LN) || defined(LT)
1042	movaps	%xmm9,  -16 * SIZE(BO)
1043	movaps	%xmm11, -14 * SIZE(BO)
1044	movaps	%xmm13, -12 * SIZE(BO)
1045	movaps	%xmm15, -10 * SIZE(BO)
1046	movaps	%xmm1,   -8 * SIZE(BO)
1047	movaps	%xmm3,   -6 * SIZE(BO)
1048	movaps	%xmm5,   -4 * SIZE(BO)
1049	movaps	%xmm7,   -2 * SIZE(BO)
1050#else
1051	movaps	%xmm0,  -16 * SIZE(AO)
1052	movaps	%xmm1,  -14 * SIZE(AO)
1053	movaps	%xmm2,  -12 * SIZE(AO)
1054	movaps	%xmm3,  -10 * SIZE(AO)
1055	movaps	%xmm4,   -8 * SIZE(AO)
1056	movaps	%xmm5,   -6 * SIZE(AO)
1057	movaps	%xmm6,   -4 * SIZE(AO)
1058	movaps	%xmm7,   -2 * SIZE(AO)
1059#endif
1060
1061#ifndef LN
1062	addq	$4 * SIZE, CO1
1063	addq	$4 * SIZE, CO2
1064#endif
1065
1066#if defined(LT) || defined(RN)
1067	movq	K,  %rax
1068	subq	KK, %rax
1069	leaq	(,%rax, SIZE), %rax
1070	leaq	(AO, %rax, 4), AO
1071	leaq	(BO, %rax, 4), BO
1072#endif
1073
1074#ifdef LN
1075	subq	$4, KK
1076#endif
1077
1078#ifdef LT
1079	addq	$4, KK
1080#endif
1081
1082#ifdef RT
1083	movq	K, %rax
1084	salq	$2 + BASE_SHIFT, %rax
1085	addq	%rax, AORIG
1086#endif
1087
1088	decq	I			# i --
1089	jg	.L11
1090	ALIGN_4
1091
1092.L20:
1093	testq	$3, M
1094	je	.L39
1095
1096	testq	$2, M
1097	je	.L30
1098	ALIGN_4
1099
1100.L21:
1101#ifdef LN
1102       movq	K, %rax
1103       salq	$1 + BASE_SHIFT, %rax
1104       subq	%rax, AORIG
1105#endif
1106
1107#if defined(LN) || defined(RT)
1108	movq	KK, %rax
1109	movq	AORIG, AO
1110	leaq	(, %rax, SIZE), %rax
1111	leaq	(AO, %rax, 2), AO
1112#endif
1113
1114	movq	B, BO
1115
1116#if defined(LN) || defined(RT)
1117	movq	KK, %rax
1118	leaq	(, %rax, SIZE), %rax
1119	leaq	(BO, %rax, 4), BO
1120#endif
1121
1122	movapd	-16 * SIZE(AO), %xmm0
1123	pxor	%xmm8, %xmm8
1124	movapd	-12 * SIZE(AO), %xmm2
1125	pxor	%xmm9, %xmm9
1126	movddup	-16 * SIZE(BO), %xmm1
1127	pxor	%xmm10, %xmm10
1128	movddup	-15 * SIZE(BO), %xmm5
1129	pxor	%xmm11, %xmm11
1130	movddup	 -8 * SIZE(BO), %xmm3
1131
1132#if defined(LT) || defined(RN)
1133	movq	KK, %rax
1134#else
1135	movq	K, %rax
1136	subq	KK, %rax
1137#endif
1138	andq	$-4, %rax
1139	leaq	(, %rax, SIZE), %rax
1140	leaq	(AO, %rax, 2), AO
1141	leaq	(BO, %rax, 4), BO
1142	negq	%rax
1143	NOBRANCH
1144	je	.L26
1145	ALIGN_4
1146
1147.L22:
1148	mulpd	%xmm0, %xmm1
1149	addpd	%xmm1, %xmm8
1150	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
1151	mulpd	%xmm0, %xmm5
1152	addpd	%xmm5, %xmm9
1153	movddup	-13 * SIZE(BO, %rax, 4), %xmm5
1154	mulpd	%xmm0, %xmm1
1155	addpd	%xmm1, %xmm10
1156	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
1157	mulpd	%xmm0, %xmm5
1158	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
1159	addpd	%xmm5, %xmm11
1160	movddup	-11 * SIZE(BO, %rax, 4), %xmm5
1161	mulpd	%xmm0, %xmm1
1162	addpd	%xmm1, %xmm8
1163	movddup	-10 * SIZE(BO, %rax, 4), %xmm1
1164	mulpd	%xmm0, %xmm5
1165	addpd	%xmm5, %xmm9
1166	movddup	 -9 * SIZE(BO, %rax, 4), %xmm5
1167	mulpd	%xmm0, %xmm1
1168	addpd	%xmm1, %xmm10
1169	movddup	  (BO, %rax, 4), %xmm1
1170	mulpd	%xmm0, %xmm5
1171	movapd	 -8 * SIZE(AO, %rax, 2), %xmm0
1172	addpd	%xmm5, %xmm11
1173	movddup	 -7 * SIZE(BO, %rax, 4), %xmm5
1174	mulpd	%xmm2, %xmm3
1175	addpd	%xmm3, %xmm8
1176	movddup	 -6 * SIZE(BO, %rax, 4), %xmm3
1177	mulpd	%xmm2, %xmm5
1178	addpd	%xmm5, %xmm9
1179	movddup	 -5 * SIZE(BO, %rax, 4), %xmm5
1180	mulpd	%xmm2, %xmm3
1181	addpd	%xmm3, %xmm10
1182	movddup	 -4 * SIZE(BO, %rax, 4), %xmm3
1183	mulpd	%xmm2, %xmm5
1184	movapd	-10 * SIZE(AO, %rax, 2), %xmm2
1185	addpd	%xmm5, %xmm11
1186	movddup	 -3 * SIZE(BO, %rax, 4), %xmm5
1187	mulpd	%xmm2, %xmm3
1188	addpd	%xmm3, %xmm8
1189	movddup	 -2 * SIZE(BO, %rax, 4), %xmm3
1190	mulpd	%xmm2, %xmm5
1191	addpd	%xmm5, %xmm9
1192	movddup	 -1 * SIZE(BO, %rax, 4), %xmm5
1193	mulpd	%xmm2, %xmm3
1194	addpd	%xmm3, %xmm10
1195	movddup	  8 * SIZE(BO, %rax, 4), %xmm3
1196	mulpd	%xmm2, %xmm5
1197	movapd	 -4 * SIZE(AO, %rax, 2), %xmm2
1198	addpd	%xmm5, %xmm11
1199	movddup	  1 * SIZE(BO, %rax, 4), %xmm5
1200
1201	addq	$4 * SIZE, %rax
1202	BRANCH
1203	jl	.L22
1204	ALIGN_4
1205
1206.L26:
1207#if defined(LT) || defined(RN)
1208	movq	KK, %rax
1209#else
1210	movq	K, %rax
1211	subq	KK, %rax
1212#endif
1213	andq	$3, %rax		# if (k & 1)
1214	je .L29
1215
1216	leaq	(, %rax, SIZE), %rax
1217	leaq	(AO, %rax, 2), AO
1218	leaq	(BO, %rax, 4), BO
1219	negq	%rax
1220	ALIGN_4
1221
1222.L27:
1223	mulpd	%xmm0, %xmm1
1224	addpd	%xmm1, %xmm8
1225	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
1226	mulpd	%xmm0, %xmm5
1227	addpd	%xmm5, %xmm9
1228	movddup	-13 * SIZE(BO, %rax, 4), %xmm5
1229	mulpd	%xmm0, %xmm1
1230	addpd	%xmm1, %xmm10
1231	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
1232	mulpd	%xmm0, %xmm5
1233	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
1234	addpd	%xmm5, %xmm11
1235	movddup	-11 * SIZE(BO, %rax, 4), %xmm5
1236
1237	addq	$SIZE, %rax
1238	jl	.L27
1239	ALIGN_4
1240
1241.L29:
1242#if defined(LN) || defined(RT)
1243	movq	KK, %rax
1244#ifdef LN
1245	subq	$2, %rax
1246#else
1247	subq	$4, %rax
1248#endif
1249
1250	leaq	(, %rax, SIZE), %rax
1251
1252	movq	AORIG, AO
1253	leaq	(AO, %rax, 2), AO
1254	leaq	(B,  %rax, 4), BO
1255#endif
1256
1257#if defined(LN) || defined(LT)
1258	movapd	 %xmm8,  %xmm0
1259	unpcklpd %xmm9,  %xmm8
1260	unpckhpd %xmm9,  %xmm0
1261
1262	movapd	 %xmm10, %xmm2
1263	unpcklpd %xmm11, %xmm10
1264	unpckhpd %xmm11, %xmm2
1265
1266	movapd	-16 * SIZE(BO), %xmm9
1267	movapd	-14 * SIZE(BO), %xmm11
1268	movapd	-12 * SIZE(BO), %xmm13
1269	movapd	-10 * SIZE(BO), %xmm15
1270
1271	subpd	%xmm8,  %xmm9
1272	subpd	%xmm10, %xmm11
1273	subpd	%xmm0,  %xmm13
1274	subpd	%xmm2,  %xmm15
1275#else
1276	movapd	-16 * SIZE(AO), %xmm0
1277	movapd	-14 * SIZE(AO), %xmm2
1278	movapd	-12 * SIZE(AO), %xmm4
1279	movapd	-10 * SIZE(AO), %xmm6
1280
1281	subpd	%xmm8,  %xmm0
1282	subpd	%xmm9,  %xmm2
1283	subpd	%xmm10, %xmm4
1284	subpd	%xmm11, %xmm6
1285#endif
1286
1287#ifdef LN
1288	movddup	-13 * SIZE(AO), %xmm8
1289	mulpd	 %xmm8, %xmm13
1290	mulpd	 %xmm8, %xmm15
1291
1292	movddup	-14 * SIZE(AO), %xmm10
1293	mulpd	 %xmm13, %xmm10
1294	subpd	 %xmm10, %xmm9
1295	movddup	-14 * SIZE(AO), %xmm10
1296	mulpd	 %xmm15, %xmm10
1297	subpd	 %xmm10, %xmm11
1298
1299	movddup	-16 * SIZE(AO), %xmm8
1300	mulpd	 %xmm8, %xmm9
1301	mulpd	 %xmm8, %xmm11
1302#endif
1303
1304#ifdef LT
1305	movddup	-16 * SIZE(AO), %xmm8
1306	mulpd	 %xmm8, %xmm9
1307	mulpd	 %xmm8, %xmm11
1308
1309	movddup	-15 * SIZE(AO), %xmm10
1310	mulpd	 %xmm9, %xmm10
1311	subpd	 %xmm10, %xmm13
1312	movddup	-15 * SIZE(AO), %xmm10
1313	mulpd	 %xmm11, %xmm10
1314	subpd	 %xmm10, %xmm15
1315
1316	movddup	-13 * SIZE(AO), %xmm8
1317	mulpd	 %xmm8, %xmm13
1318	mulpd	 %xmm8, %xmm15
1319#endif
1320
1321#ifdef RN
1322	movddup	-16 * SIZE(BO), %xmm8
1323	mulpd	 %xmm8, %xmm0
1324
1325	movddup	-15 * SIZE(BO), %xmm9
1326	mulpd	 %xmm0, %xmm9
1327	subpd	 %xmm9, %xmm2
1328	movddup	-14 * SIZE(BO), %xmm10
1329	mulpd	 %xmm0, %xmm10
1330	subpd	 %xmm10, %xmm4
1331	movddup	-13 * SIZE(BO), %xmm11
1332	mulpd	 %xmm0, %xmm11
1333	subpd	 %xmm11, %xmm6
1334
1335	movddup	-11 * SIZE(BO), %xmm8
1336	mulpd	 %xmm8, %xmm2
1337	movddup	-10 * SIZE(BO), %xmm9
1338	mulpd	 %xmm2, %xmm9
1339	subpd	 %xmm9, %xmm4
1340	movddup	 -9 * SIZE(BO), %xmm10
1341	mulpd	 %xmm2, %xmm10
1342	subpd	 %xmm10, %xmm6
1343
1344	movddup	 -6 * SIZE(BO), %xmm8
1345	mulpd	 %xmm8, %xmm4
1346
1347	movddup	 -5 * SIZE(BO), %xmm9
1348	mulpd	 %xmm4, %xmm9
1349	subpd	 %xmm9, %xmm6
1350
1351	movddup	 -1 * SIZE(BO), %xmm8
1352	mulpd	 %xmm8, %xmm6
1353#endif
1354
1355#ifdef RT
1356	movddup	 -1 * SIZE(BO), %xmm8
1357	mulpd	 %xmm8, %xmm6
1358
1359	movddup	 -2 * SIZE(BO), %xmm9
1360	mulpd	 %xmm6, %xmm9
1361	subpd	 %xmm9, %xmm4
1362	movddup	 -3 * SIZE(BO), %xmm10
1363	mulpd	 %xmm6, %xmm10
1364	subpd	 %xmm10, %xmm2
1365	movddup	 -4 * SIZE(BO), %xmm11
1366	mulpd	 %xmm6, %xmm11
1367	subpd	 %xmm11, %xmm0
1368
1369	movddup	 -6 * SIZE(BO), %xmm8
1370	mulpd	 %xmm8, %xmm4
1371	movddup	 -7 * SIZE(BO), %xmm9
1372	mulpd	 %xmm4, %xmm9
1373	subpd	 %xmm9, %xmm2
1374	movddup	 -8 * SIZE(BO), %xmm10
1375	mulpd	 %xmm4, %xmm10
1376	subpd	 %xmm10, %xmm0
1377
1378	movddup	-11 * SIZE(BO), %xmm8
1379	mulpd	 %xmm8, %xmm2
1380	movddup	-12 * SIZE(BO), %xmm9
1381	mulpd	 %xmm2, %xmm9
1382	subpd	 %xmm9, %xmm0
1383
1384	movddup	-16 * SIZE(BO), %xmm8
1385	mulpd	 %xmm8, %xmm0
1386#endif
1387
1388#ifdef LN
1389	subq	$2 * SIZE, CO1
1390	subq	$2 * SIZE, CO2
1391#endif
1392
1393#if defined(LN) || defined(LT)
1394	movlpd	%xmm9,   0 * SIZE(CO1)
1395	movlpd	%xmm13,  1 * SIZE(CO1)
1396
1397	movhpd	%xmm9,   0 * SIZE(CO2)
1398	movhpd	%xmm13,  1 * SIZE(CO2)
1399
1400	movlpd	%xmm11,  0 * SIZE(CO1, LDC, 2)
1401	movlpd	%xmm15,  1 * SIZE(CO1, LDC, 2)
1402
1403	movhpd	%xmm11,  0 * SIZE(CO2, LDC, 2)
1404	movhpd	%xmm15,  1 * SIZE(CO2, LDC, 2)
1405#else
1406	movlpd	%xmm0,  0 * SIZE(CO1)
1407	movhpd	%xmm0,  1 * SIZE(CO1)
1408
1409	movlpd	%xmm2,  0 * SIZE(CO2)
1410	movhpd	%xmm2,  1 * SIZE(CO2)
1411
1412	movlpd	%xmm4,  0 * SIZE(CO1, LDC, 2)
1413	movhpd	%xmm4,  1 * SIZE(CO1, LDC, 2)
1414
1415	movlpd	%xmm6,  0 * SIZE(CO2, LDC, 2)
1416	movhpd	%xmm6,  1 * SIZE(CO2, LDC, 2)
1417#endif
1418
1419#if defined(LN) || defined(LT)
1420	movaps	%xmm9,  -16 * SIZE(BO)
1421	movaps	%xmm11, -14 * SIZE(BO)
1422	movaps	%xmm13, -12 * SIZE(BO)
1423	movaps	%xmm15, -10 * SIZE(BO)
1424#else
1425	movaps	%xmm0,  -16 * SIZE(AO)
1426	movaps	%xmm2,  -14 * SIZE(AO)
1427	movaps	%xmm4,  -12 * SIZE(AO)
1428	movaps	%xmm6,  -10 * SIZE(AO)
1429#endif
1430
1431#ifndef LN
1432	addq	$2 * SIZE, CO1
1433	addq	$2 * SIZE, CO2
1434#endif
1435
1436#if defined(LT) || defined(RN)
1437	movq	K,  %rax
1438	subq	KK, %rax
1439	leaq	(,%rax, SIZE), %rax
1440	leaq	(AO, %rax, 2), AO
1441	leaq	(BO, %rax, 4), BO
1442#endif
1443
1444#ifdef LN
1445	subq	$2, KK
1446#endif
1447
1448#ifdef LT
1449	addq	$2, KK
1450#endif
1451
1452#ifdef RT
1453       movq	K, %rax
1454       salq	$1 + BASE_SHIFT, %rax
1455       addq	%rax, AORIG
1456#endif
1457	ALIGN_4
1458
1459.L30:
1460	testq	$1, M
1461	je	.L39
1462
1463#ifdef LN
1464       movq	K, %rax
1465       salq	$0 + BASE_SHIFT, %rax
1466       subq	%rax, AORIG
1467#endif
1468
1469#if defined(LN) || defined(RT)
1470	movq	KK, %rax
1471	movq	AORIG, AO
1472	leaq	(, %rax, SIZE), %rax
1473	leaq	(AO, %rax, 1), AO
1474#endif
1475
1476	movq	B, BO
1477
1478#if defined(LN) || defined(RT)
1479	movq	KK, %rax
1480	leaq	(, %rax, SIZE), %rax
1481	leaq	(BO, %rax, 4), BO
1482#endif
1483
1484	movddup	-16 * SIZE(AO), %xmm0
1485	pxor	%xmm8, %xmm8
1486	movddup	-14 * SIZE(AO), %xmm2
1487	pxor	%xmm9, %xmm9
1488	movddup	-15 * SIZE(AO), %xmm4
1489	pxor	%xmm10, %xmm10
1490	movapd	-16 * SIZE(BO), %xmm1
1491	pxor	%xmm11, %xmm11
1492	movapd	 -8 * SIZE(BO), %xmm3
1493
1494#if defined(LT) || defined(RN)
1495	movq	KK, %rax
1496#else
1497	movq	K, %rax
1498	subq	KK, %rax
1499#endif
1500	andq	$-4, %rax
1501	leaq	(, %rax, SIZE), %rax
1502	leaq	(AO, %rax, 1), AO
1503	leaq	(BO, %rax, 4), BO
1504	negq	%rax
1505	NOBRANCH
1506	je	.L36
1507	ALIGN_4
1508
1509.L32:
1510	mulpd	%xmm0, %xmm1
1511	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
1512	addpd	%xmm1, %xmm8
1513	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
1514	addpd	%xmm0, %xmm9
1515	movddup	-12 * SIZE(AO, %rax, 1), %xmm0
1516	mulpd	%xmm4, %xmm1
1517	mulpd	-10 * SIZE(BO, %rax, 4), %xmm4
1518	addpd	%xmm1, %xmm10
1519	movapd	  (BO, %rax, 4), %xmm1
1520	addpd	%xmm4, %xmm11
1521	movddup	-11 * SIZE(AO, %rax, 1), %xmm4
1522	mulpd	%xmm2, %xmm3
1523	mulpd	 -6 * SIZE(BO, %rax, 4), %xmm2
1524	addpd	%xmm3, %xmm8
1525	movapd	 -4 * SIZE(BO, %rax, 4), %xmm3
1526	addpd	%xmm2, %xmm9
1527	movddup	-13 * SIZE(AO, %rax, 1), %xmm2
1528	mulpd	%xmm2, %xmm3
1529	mulpd	 -2 * SIZE(BO, %rax, 4), %xmm2
1530	addpd	%xmm3, %xmm10
1531	movapd	  8 * SIZE(BO, %rax, 4), %xmm3
1532	addpd	%xmm2, %xmm11
1533	movddup	-10 * SIZE(AO, %rax, 1), %xmm2
1534
1535	addq	$4 * SIZE, %rax
1536	BRANCH
1537	jl	.L32
1538	ALIGN_4
1539
1540.L36:
1541#if defined(LT) || defined(RN)
1542	movq	KK, %rax
1543#else
1544	movq	K, %rax
1545	subq	KK, %rax
1546#endif
1547	andq	$3, %rax		# if (k & 1)
1548	je .L38
1549
1550	leaq	(, %rax, SIZE), %rax
1551	leaq	(AO, %rax, 1), AO
1552	leaq	(BO, %rax, 4), BO
1553	negq	%rax
1554	ALIGN_4
1555
1556.L37:
1557	mulpd	%xmm0, %xmm1
1558	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
1559	addpd	%xmm1, %xmm8
1560	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
1561	addpd	%xmm0, %xmm9
1562	movddup	-15 * SIZE(AO, %rax, 1), %xmm0
1563
1564	addq	$SIZE, %rax
1565	jl	.L37
1566	ALIGN_4
1567
1568.L38:
1569	addpd	%xmm10, %xmm8
1570	addpd	%xmm11, %xmm9
1571
1572#if defined(LN) || defined(RT)
1573	movq	KK, %rax
1574#ifdef LN
1575	subq	$1, %rax
1576#else
1577	subq	$4, %rax
1578#endif
1579
1580	leaq	(, %rax, SIZE), %rax
1581
1582	movq	AORIG, AO
1583	leaq	(AO, %rax, 1), AO
1584	leaq	(B,  %rax, 4), BO
1585#endif
1586
1587#if defined(LN) || defined(LT)
1588	movapd	-16 * SIZE(BO), %xmm2
1589	movapd	-14 * SIZE(BO), %xmm3
1590
1591	subpd	%xmm8,  %xmm2
1592	subpd	%xmm9,  %xmm3
1593#else
1594	movapd	-16 * SIZE(AO), %xmm2
1595	movapd	-14 * SIZE(AO), %xmm3
1596
1597	subpd	%xmm8, %xmm2
1598	subpd	%xmm9, %xmm3
1599#endif
1600
1601#if defined(LN) || defined(LT)
1602	movddup	-16 * SIZE(AO), %xmm0
1603	mulpd	 %xmm0, %xmm2
1604	mulpd	 %xmm0, %xmm3
1605#endif
1606
1607#ifdef RN
1608	movapd	%xmm2, %xmm0
1609        unpckhpd %xmm0, %xmm0
1610
1611	movapd	%xmm3, %xmm1
1612        unpckhpd %xmm1, %xmm1
1613
1614	movsd	-16 * SIZE(BO), %xmm4
1615	mulsd	 %xmm4, %xmm2
1616
1617	movsd	-15 * SIZE(BO), %xmm5
1618	mulsd	 %xmm2, %xmm5
1619	subsd	 %xmm5, %xmm0
1620	movsd	-14 * SIZE(BO), %xmm6
1621	mulsd	 %xmm2, %xmm6
1622	subsd	 %xmm6, %xmm3
1623	movsd	-13 * SIZE(BO), %xmm7
1624	mulsd	 %xmm2, %xmm7
1625	subsd	 %xmm7, %xmm1
1626
1627	movsd	-11 * SIZE(BO), %xmm4
1628	mulsd	 %xmm4, %xmm0
1629
1630	movsd	-10 * SIZE(BO), %xmm5
1631	mulsd	 %xmm0, %xmm5
1632	subsd	 %xmm5, %xmm3
1633	movsd	 -9 * SIZE(BO), %xmm6
1634	mulsd	 %xmm0, %xmm6
1635	subsd	 %xmm6, %xmm1
1636
1637	movsd	 -6 * SIZE(BO), %xmm4
1638	mulsd	 %xmm4, %xmm3
1639
1640	movsd	 -5 * SIZE(BO), %xmm5
1641	mulsd	 %xmm3, %xmm5
1642	subsd	 %xmm5, %xmm1
1643
1644	movsd	 -1 * SIZE(BO), %xmm4
1645	mulsd	 %xmm4, %xmm1
1646
1647	unpcklpd %xmm0, %xmm2
1648	unpcklpd %xmm1, %xmm3
1649#endif
1650
1651#ifdef RT
1652	movapd	%xmm2, %xmm0
1653        unpckhpd %xmm0, %xmm0
1654
1655	movapd	%xmm3, %xmm1
1656        unpckhpd %xmm1, %xmm1
1657
1658	movsd	 -1 * SIZE(BO), %xmm4
1659	mulsd	 %xmm4, %xmm1
1660
1661	movsd	 -2 * SIZE(BO), %xmm5
1662	mulsd	 %xmm1, %xmm5
1663	subsd	 %xmm5, %xmm3
1664	movsd	 -3 * SIZE(BO), %xmm6
1665	mulsd	 %xmm1, %xmm6
1666	subsd	 %xmm6, %xmm0
1667	movsd	 -4 * SIZE(BO), %xmm7
1668	mulsd	 %xmm1, %xmm7
1669	subsd	 %xmm7, %xmm2
1670
1671	movsd	 -6 * SIZE(BO), %xmm4
1672	mulsd	 %xmm4, %xmm3
1673
1674	movsd	 -7 * SIZE(BO), %xmm5
1675	mulsd	 %xmm3, %xmm5
1676	subsd	 %xmm5, %xmm0
1677	movsd	 -8 * SIZE(BO), %xmm6
1678	mulsd	 %xmm3, %xmm6
1679	subsd	 %xmm6, %xmm2
1680
1681	movsd	-11 * SIZE(BO), %xmm4
1682	mulsd	 %xmm4, %xmm0
1683
1684	movsd	-12 * SIZE(BO), %xmm5
1685	mulsd	 %xmm0, %xmm5
1686	subsd	 %xmm5, %xmm2
1687
1688	movsd	-16 * SIZE(BO), %xmm4
1689	mulsd	 %xmm4, %xmm2
1690
1691	unpcklpd %xmm0, %xmm2
1692	unpcklpd %xmm1, %xmm3
1693
1694#endif
1695
1696#ifdef LN
1697	subq	$1 * SIZE, CO1
1698	subq	$1 * SIZE, CO2
1699#endif
1700
1701#if defined(LN) || defined(LT)
1702	movlpd	%xmm2,  0 * SIZE(CO1)
1703	movhpd	%xmm2,  0 * SIZE(CO2)
1704	movlpd	%xmm3,  0 * SIZE(CO1, LDC, 2)
1705	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
1706#else
1707	movlpd	%xmm2,  0 * SIZE(CO1)
1708	movhpd	%xmm2,  0 * SIZE(CO2)
1709	movlpd	%xmm3,  0 * SIZE(CO1, LDC, 2)
1710	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
1711#endif
1712
1713#if defined(LN) || defined(LT)
1714	movaps	%xmm2, -16 * SIZE(BO)
1715	movaps	%xmm3, -14 * SIZE(BO)
1716#else
1717	movaps	%xmm2, -16 * SIZE(AO)
1718	movaps	%xmm3, -14 * SIZE(AO)
1719#endif
1720
1721#ifndef LN
1722	addq	$1 * SIZE, CO1
1723	addq	$1 * SIZE, CO2
1724#endif
1725
1726#if defined(LT) || defined(RN)
1727	movq	K,  %rax
1728	subq	KK, %rax
1729	leaq	(,%rax, SIZE), %rax
1730	leaq	(AO, %rax, 1), AO
1731	leaq	(BO, %rax, 4), BO
1732#endif
1733
1734#ifdef LN
1735	subq	$1, KK
1736#endif
1737
1738#ifdef LT
1739	addq	$1, KK
1740#endif
1741
1742#ifdef RT
1743       movq	K, %rax
1744       salq	$0 + BASE_SHIFT, %rax
1745       addq	%rax, AORIG
1746#endif
1747	ALIGN_4
1748
1749.L39:
1750#ifdef LN
1751       leaq	(, K, SIZE), %rax
1752       leaq	(B, %rax, 4), B
1753#endif
1754
1755#if defined(LT) || defined(RN)
1756	movq	BO, B
1757#endif
1758
1759#ifdef RN
1760	addq	$4, KK
1761#endif
1762
1763#ifdef RT
1764	subq	$4, KK
1765#endif
1766
1767	decq	J			# j --
1768	jg	.L01
1769	ALIGN_4
1770
1771.L40:
1772	testq	$2, N
1773	je	.L80
1774
1775#if defined(LT) || defined(RN)
1776	movq	A, AO
1777#else
1778	movq	A, AORIG
1779#endif
1780
1781#ifdef RT
1782       movq	K, %rax
1783       salq	$1 + BASE_SHIFT, %rax
1784       subq	%rax, B
1785
1786       leaq	(, LDC, 2), %rax
1787       subq	%rax, C
1788#endif
1789
1790	movq	C, CO1			# coffset1 = c
1791	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
1792#ifndef RT
1793	leaq	(C, LDC, 2), C
1794#endif
1795
1796#ifdef LN
1797	movq	OFFSET, %rax
1798	addq	M, %rax
1799	movq	%rax, KK
1800#endif
1801
1802#if defined(LT)
1803	movq	OFFSET, %rax
1804	movq	%rax, KK
1805#endif
1806
1807	movq	M,  I
1808	sarq	$2, I	# i = (m >> 2)
1809	jle	.L60
1810	ALIGN_4
1811
1812.L51:
1813#ifdef LN
1814       movq	K, %rax
1815       salq	$2 + BASE_SHIFT, %rax
1816       subq	%rax, AORIG
1817#endif
1818
1819#if defined(LN) || defined(RT)
1820	movq	KK, %rax
1821	movq	AORIG, AO
1822	leaq	(, %rax, SIZE), %rax
1823	leaq	(AO, %rax, 4), AO
1824#endif
1825
1826	movq	B, BO
1827
1828#if defined(LN) || defined(RT)
1829	movq	KK, %rax
1830	leaq	(, %rax, SIZE), %rax
1831	leaq	(BO, %rax, 2), BO
1832#endif
1833
1834	movddup	-16 * SIZE(BO), %xmm1
1835	movddup	-15 * SIZE(BO), %xmm5
1836	pxor	%xmm8, %xmm8
1837	movddup	-12 * SIZE(BO), %xmm3
1838	pxor	%xmm9, %xmm9
1839	movapd	-16 * SIZE(AO), %xmm0
1840	pxor	%xmm12, %xmm12
1841	movapd	 -8 * SIZE(AO), %xmm4
1842	pxor	%xmm13, %xmm13
1843
1844#ifndef LN
1845	prefetchw      3 * SIZE(CO1)
1846	movapd	%xmm0, %xmm2
1847	prefetchw      3 * SIZE(CO2)
1848#else
1849	prefetchw     -8 * SIZE(CO1)
1850	movapd	%xmm0, %xmm2
1851	prefetchw     -8 * SIZE(CO2)
1852#endif
1853
1854
1855#if defined(LT) || defined(RN)
1856	movq	KK, %rax
1857#else
1858	movq	K, %rax
1859	subq	KK, %rax
1860#endif
1861	andq	$-4, %rax
1862	leaq	(, %rax, SIZE), %rax
1863	leaq	(AO, %rax, 4), AO
1864	leaq	(BO, %rax, 2), BO
1865	negq	%rax
1866	NOBRANCH
1867	je	.L56
1868	ALIGN_4
1869
1870.L52:
1871	mulpd	%xmm1, %xmm0
1872	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
1873	addpd	%xmm0, %xmm8
1874	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
1875	addpd	%xmm1, %xmm12
1876	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
1877	mulpd	%xmm5, %xmm2
1878	mulpd	-14 * SIZE(AO, %rax, 4), %xmm5
1879	addpd	%xmm2, %xmm9
1880	addpd	%xmm5, %xmm13
1881	movddup	-13 * SIZE(BO, %rax, 2), %xmm5
1882	movapd	%xmm0, %xmm2
1883	mulpd	%xmm1, %xmm0
1884	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1
1885	addpd	%xmm0, %xmm8
1886	movapd	  (AO, %rax, 4), %xmm0
1887	addpd	%xmm1, %xmm12
1888	movddup	 -8 * SIZE(BO, %rax, 2), %xmm1
1889	mulpd	%xmm5, %xmm2
1890	mulpd	-10 * SIZE(AO, %rax, 4), %xmm5
1891	addpd	%xmm2, %xmm9
1892	addpd	%xmm5, %xmm13
1893	movddup	-11 * SIZE(BO, %rax, 2), %xmm5
1894	movapd	%xmm4, %xmm2
1895	mulpd	%xmm3, %xmm4
1896	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3
1897	addpd	%xmm4, %xmm8
1898	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4
1899	addpd	%xmm3, %xmm12
1900	movddup	-10 * SIZE(BO, %rax, 2), %xmm3
1901	mulpd	%xmm5, %xmm2
1902	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5
1903	addpd	%xmm2, %xmm9
1904	addpd	%xmm5, %xmm13
1905	movddup	 -9 * SIZE(BO, %rax, 2), %xmm5
1906	movapd	%xmm4, %xmm2
1907	mulpd	%xmm3, %xmm4
1908	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3
1909	addpd	%xmm4, %xmm8
1910	movapd	  8 * SIZE(AO, %rax, 4), %xmm4
1911	addpd	%xmm3, %xmm12
1912	movddup	 -4 * SIZE(BO, %rax, 2), %xmm3
1913	mulpd	%xmm5, %xmm2
1914	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5
1915	addpd	%xmm2, %xmm9
1916	addpd	%xmm5, %xmm13
1917	movddup	 -7 * SIZE(BO, %rax, 2), %xmm5
1918	movapd	%xmm0, %xmm2
1919
1920	addq	$4 * SIZE, %rax
1921	BRANCH
1922	jl	.L52
1923	ALIGN_4
1924
1925.L56:
1926#if defined(LT) || defined(RN)
1927	movq	KK, %rax
1928#else
1929	movq	K, %rax
1930	subq	KK, %rax
1931#endif
1932	andq	$3, %rax		# if (k & 1)
1933	je .L59
1934
1935	leaq	(, %rax, SIZE), %rax
1936	leaq	(AO, %rax, 4), AO
1937	leaq	(BO, %rax, 2), BO
1938	negq	%rax
1939	ALIGN_4
1940
1941.L57:
1942	mulpd	%xmm1, %xmm0
1943	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
1944	addpd	%xmm0, %xmm8
1945	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
1946	addpd	%xmm1, %xmm12
1947	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
1948	mulpd	%xmm5, %xmm2
1949	mulpd	-14 * SIZE(AO, %rax, 4), %xmm5
1950	addpd	%xmm2, %xmm9
1951	addpd	%xmm5, %xmm13
1952	movddup	-13 * SIZE(BO, %rax, 2), %xmm5
1953	movapd	%xmm0, %xmm2
1954
1955	addq	$SIZE, %rax
1956	jl	.L57
1957	ALIGN_4
1958
1959.L59:
1960#if defined(LN) || defined(RT)
1961	movq	KK, %rax
1962#ifdef LN
1963	subq	$4, %rax
1964#else
1965	subq	$2, %rax
1966#endif
1967
1968	leaq	(, %rax, SIZE), %rax
1969
1970	movq	AORIG, AO
1971	leaq	(AO, %rax, 4), AO
1972	leaq	(B,  %rax, 2), BO
1973#endif
1974
1975#if defined(LN) || defined(LT)
1976	movapd	%xmm8, %xmm0
1977	unpcklpd %xmm9, %xmm8
1978	unpckhpd %xmm9, %xmm0
1979
1980	movapd	%xmm12, %xmm4
1981	unpcklpd %xmm13, %xmm12
1982	unpckhpd %xmm13, %xmm4
1983
1984	movapd	-16 * SIZE(BO), %xmm9
1985	movapd	-14 * SIZE(BO), %xmm13
1986	movapd	-12 * SIZE(BO), %xmm1
1987	movapd	-10 * SIZE(BO), %xmm5
1988
1989	subpd	%xmm8,  %xmm9
1990	subpd	%xmm0,  %xmm13
1991	subpd	%xmm12, %xmm1
1992	subpd	%xmm4,  %xmm5
1993#else
1994	movapd	-16 * SIZE(AO), %xmm0
1995	movapd	-14 * SIZE(AO), %xmm1
1996	movapd	-12 * SIZE(AO), %xmm2
1997	movapd	-10 * SIZE(AO), %xmm3
1998
1999	subpd	%xmm8, %xmm0
2000	subpd	%xmm12, %xmm1
2001	subpd	%xmm9, %xmm2
2002	subpd	%xmm13, %xmm3
2003#endif
2004
2005#ifdef LN
2006	movddup	 -1 * SIZE(AO), %xmm8
2007	mulpd	 %xmm8, %xmm5
2008	movddup	 -2 * SIZE(AO), %xmm10
2009	mulpd	 %xmm5, %xmm10
2010	subpd	 %xmm10, %xmm1
2011	movddup	 -3 * SIZE(AO), %xmm12
2012	mulpd	 %xmm5, %xmm12
2013	subpd	 %xmm12, %xmm13
2014	movddup	 -4 * SIZE(AO), %xmm14
2015	mulpd	 %xmm5, %xmm14
2016	subpd	 %xmm14, %xmm9
2017
2018	movddup	 -6 * SIZE(AO), %xmm8
2019	mulpd	 %xmm8, %xmm1
2020	movddup	 -7 * SIZE(AO), %xmm10
2021	mulpd	 %xmm1, %xmm10
2022	subpd	 %xmm10, %xmm13
2023	movddup	 -8 * SIZE(AO), %xmm12
2024	mulpd	 %xmm1, %xmm12
2025	subpd	 %xmm12, %xmm9
2026
2027	movddup	-11 * SIZE(AO), %xmm8
2028	mulpd	 %xmm8, %xmm13
2029	movddup	-12 * SIZE(AO), %xmm10
2030	mulpd	 %xmm13, %xmm10
2031	subpd	 %xmm10, %xmm9
2032
2033	movddup	-16 * SIZE(AO), %xmm8
2034	mulpd	 %xmm8, %xmm9
2035#endif
2036
2037#ifdef LT
2038	movddup -16 * SIZE(AO), %xmm8
2039	mulpd	 %xmm8, %xmm9
2040	movddup	-15 * SIZE(AO), %xmm10
2041	mulpd	 %xmm9, %xmm10
2042	subpd	 %xmm10, %xmm13
2043	movddup	-14 * SIZE(AO), %xmm12
2044	mulpd	 %xmm9, %xmm12
2045	subpd	 %xmm12, %xmm1
2046	movddup	-13 * SIZE(AO), %xmm14
2047	mulpd	 %xmm9, %xmm14
2048	subpd	 %xmm14, %xmm5
2049
2050
2051	movddup	-11 * SIZE(AO), %xmm8
2052	mulpd	 %xmm8, %xmm13
2053
2054	movddup	-10 * SIZE(AO), %xmm10
2055	mulpd	 %xmm13, %xmm10
2056	subpd	 %xmm10, %xmm1
2057	movddup	 -9 * SIZE(AO), %xmm12
2058	mulpd	 %xmm13, %xmm12
2059	subpd	 %xmm12, %xmm5
2060
2061	movddup	 -6 * SIZE(AO), %xmm8
2062	mulpd	 %xmm8, %xmm1
2063	movddup	 -5 * SIZE(AO), %xmm10
2064	mulpd	 %xmm1, %xmm10
2065	subpd	 %xmm10, %xmm5
2066
2067	movddup	 -1 * SIZE(AO), %xmm8
2068	mulpd	 %xmm8, %xmm5
2069#endif
2070
2071#ifdef RN
2072	movddup	-16 * SIZE(BO), %xmm8
2073	mulpd	 %xmm8, %xmm0
2074	mulpd	 %xmm8, %xmm1
2075
2076	movddup	-15 * SIZE(BO), %xmm9
2077	mulpd	 %xmm0, %xmm9
2078	subpd	 %xmm9, %xmm2
2079	movddup	-15 * SIZE(BO), %xmm9
2080	mulpd	 %xmm1, %xmm9
2081	subpd	 %xmm9, %xmm3
2082
2083	movddup	-13 * SIZE(BO), %xmm8
2084	mulpd	 %xmm8, %xmm2
2085	mulpd	 %xmm8, %xmm3
2086#endif
2087
2088#ifdef RT
2089	movddup	-13 * SIZE(BO), %xmm8
2090	mulpd	 %xmm8, %xmm2
2091	mulpd	 %xmm8, %xmm3
2092
2093	movddup	-14 * SIZE(BO), %xmm9
2094	mulpd	 %xmm2, %xmm9
2095	subpd	 %xmm9, %xmm0
2096	movddup	-14 * SIZE(BO), %xmm9
2097	mulpd	 %xmm3, %xmm9
2098	subpd	 %xmm9, %xmm1
2099
2100	movddup	-16 * SIZE(BO), %xmm8
2101	mulpd	 %xmm8, %xmm0
2102	mulpd	 %xmm8, %xmm1
2103#endif
2104
2105#ifdef LN
2106	subq	$4 * SIZE, CO1
2107	subq	$4 * SIZE, CO2
2108#endif
2109
2110#if defined(LN) || defined(LT)
2111	movlpd	%xmm9,  0 * SIZE(CO1)
2112	movlpd	%xmm13, 1 * SIZE(CO1)
2113	movlpd	%xmm1,  2 * SIZE(CO1)
2114	movlpd	%xmm5,  3 * SIZE(CO1)
2115
2116	movhpd	%xmm9,  0 * SIZE(CO2)
2117	movhpd	%xmm13, 1 * SIZE(CO2)
2118	movhpd	%xmm1,  2 * SIZE(CO2)
2119	movhpd	%xmm5,  3 * SIZE(CO2)
2120#else
2121	movlpd	%xmm0,  0 * SIZE(CO1)
2122	movhpd	%xmm0,  1 * SIZE(CO1)
2123	movlpd	%xmm1,  2 * SIZE(CO1)
2124	movhpd	%xmm1,  3 * SIZE(CO1)
2125
2126	movlpd	%xmm2,  0 * SIZE(CO2)
2127	movhpd	%xmm2,  1 * SIZE(CO2)
2128	movlpd	%xmm3,  2 * SIZE(CO2)
2129	movhpd	%xmm3,  3 * SIZE(CO2)
2130#endif
2131
2132#if defined(LN) || defined(LT)
2133	movaps	%xmm9, -16 * SIZE(BO)
2134	movaps	%xmm13,-14 * SIZE(BO)
2135	movaps	%xmm1, -12 * SIZE(BO)
2136	movaps	%xmm5, -10 * SIZE(BO)
2137#else
2138	movaps	%xmm0, -16 * SIZE(AO)
2139	movaps	%xmm1, -14 * SIZE(AO)
2140	movaps	%xmm2, -12 * SIZE(AO)
2141	movaps	%xmm3, -10 * SIZE(AO)
2142#endif
2143
2144#ifndef LN
2145	addq	$4 * SIZE, CO1
2146	addq	$4 * SIZE, CO2
2147#endif
2148
2149#if defined(LT) || defined(RN)
2150	movq	K,  %rax
2151	subq	KK, %rax
2152	leaq	(,%rax, SIZE), %rax
2153	leaq	(AO, %rax, 4), AO
2154	leaq	(BO, %rax, 2), BO
2155#endif
2156
2157#ifdef LN
2158	subq	$4, KK
2159#endif
2160
2161#ifdef LT
2162	addq	$4, KK
2163#endif
2164
2165#ifdef RT
2166       movq	K, %rax
2167       salq	$2 + BASE_SHIFT, %rax
2168       addq	%rax, AORIG
2169#endif
2170
2171	decq	I			# i --
2172	jg	.L51
2173	ALIGN_4
2174
2175.L60:
2176	testq	$2, M
2177	je	.L70
2178
2179#ifdef LN
2180       movq	K, %rax
2181       salq	$1 + BASE_SHIFT, %rax
2182       subq	%rax, AORIG
2183#endif
2184
2185#if defined(LN) || defined(RT)
2186	movq	KK, %rax
2187	movq	AORIG, AO
2188	leaq	(, %rax, SIZE), %rax
2189	leaq	(AO, %rax, 2), AO
2190#endif
2191
2192	movq	B, BO
2193
2194#if defined(LN) || defined(RT)
2195	movq	KK, %rax
2196	leaq	(, %rax, SIZE), %rax
2197	leaq	(BO, %rax, 2), BO
2198#endif
2199
2200	movapd	-16 * SIZE(AO), %xmm0
2201	pxor	%xmm8, %xmm8
2202	movapd	-12 * SIZE(AO), %xmm2
2203	pxor	%xmm9, %xmm9
2204	movddup	-16 * SIZE(BO), %xmm1
2205	pxor	%xmm10, %xmm10
2206	movddup	-15 * SIZE(BO), %xmm3
2207	pxor	%xmm11, %xmm11
2208
2209#if defined(LT) || defined(RN)
2210	movq	KK, %rax
2211#else
2212	movq	K, %rax
2213	subq	KK, %rax
2214#endif
2215	andq	$-4, %rax
2216	leaq	(, %rax, SIZE), %rax
2217	leaq	(AO, %rax, 2), AO
2218	leaq	(BO, %rax, 2), BO
2219	negq	%rax
2220	NOBRANCH
2221	je	.L66
2222	ALIGN_4
2223
2224.L62:
2225	mulpd	%xmm0, %xmm1
2226	addpd	%xmm1, %xmm8
2227	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
2228	mulpd	%xmm0, %xmm3
2229	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
2230	addpd	%xmm3, %xmm9
2231	movddup	-13 * SIZE(BO, %rax, 2), %xmm3
2232	mulpd	%xmm0, %xmm1
2233	addpd	%xmm1, %xmm10
2234	movddup	-12 * SIZE(BO, %rax, 2), %xmm1
2235	mulpd	%xmm0, %xmm3
2236	movapd	 -8 * SIZE(AO, %rax, 2), %xmm0
2237	addpd	%xmm3, %xmm11
2238	movddup	-11 * SIZE(BO, %rax, 2), %xmm3
2239	mulpd	%xmm2, %xmm1
2240	addpd	%xmm1, %xmm8
2241	movddup	-10 * SIZE(BO, %rax, 2), %xmm1
2242	mulpd	%xmm2, %xmm3
2243	movapd	-10 * SIZE(AO, %rax, 2), %xmm2
2244	addpd	%xmm3, %xmm9
2245	movddup	 -9 * SIZE(BO, %rax, 2), %xmm3
2246	mulpd	%xmm2, %xmm1
2247	addpd	%xmm1, %xmm10
2248	movddup	 -8 * SIZE(BO, %rax, 2), %xmm1
2249	mulpd	%xmm2, %xmm3
2250	movapd	 -4 * SIZE(AO, %rax, 2), %xmm2
2251	addpd	%xmm3, %xmm11
2252	movddup	 -7 * SIZE(BO, %rax, 2), %xmm3
2253
2254	addq	$4 * SIZE, %rax
2255	BRANCH
2256	jl	.L62
2257	ALIGN_4
2258
2259.L66:
2260#if defined(LT) || defined(RN)
2261	movq	KK, %rax
2262#else
2263	movq	K, %rax
2264	subq	KK, %rax
2265#endif
2266	andq	$3, %rax		# if (k & 1)
2267	je .L69
2268
2269	leaq	(, %rax, SIZE), %rax
2270	leaq	(AO, %rax, 2), AO
2271	leaq	(BO, %rax, 2), BO
2272	negq	%rax
2273	ALIGN_4
2274
2275.L67:
2276	mulpd	%xmm0, %xmm1
2277	addpd	%xmm1, %xmm8
2278	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
2279	mulpd	%xmm0, %xmm3
2280	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
2281	addpd	%xmm3, %xmm9
2282	movddup	-13 * SIZE(BO, %rax, 2), %xmm3
2283
2284	addq	$SIZE, %rax
2285	jl	.L67
2286	ALIGN_4
2287
2288.L69:
2289	addpd	%xmm10, %xmm8
2290	addpd	%xmm11, %xmm9
2291
2292#if defined(LN) || defined(RT)
2293	movq	KK, %rax
2294#ifdef LN
2295	subq	$2, %rax
2296#else
2297	subq	$2, %rax
2298#endif
2299
2300	leaq	(, %rax, SIZE), %rax
2301
2302	movq	AORIG, AO
2303	leaq	(AO, %rax, 2), AO
2304	leaq	(B,  %rax, 2), BO
2305#endif
2306
2307#if defined(LN) || defined(LT)
2308	movapd	%xmm8, %xmm0
2309	unpcklpd %xmm9, %xmm8
2310	unpckhpd %xmm9, %xmm0
2311
2312	movapd	-16 * SIZE(BO), %xmm9
2313	movapd	-14 * SIZE(BO), %xmm13
2314
2315	subpd	%xmm8,  %xmm9
2316	subpd	%xmm0,  %xmm13
2317#else
2318	movapd	-16 * SIZE(AO), %xmm0
2319	movapd	-14 * SIZE(AO), %xmm2
2320
2321	subpd	%xmm8, %xmm0
2322	subpd	%xmm9, %xmm2
2323#endif
2324
2325
2326#ifdef LN
2327	movddup	-13 * SIZE(AO), %xmm8
2328	mulpd	 %xmm8, %xmm13
2329
2330	movddup	-14 * SIZE(AO), %xmm10
2331	mulpd	 %xmm13, %xmm10
2332	subpd	 %xmm10, %xmm9
2333
2334	movddup	-16 * SIZE(AO), %xmm8
2335	mulpd	 %xmm8, %xmm9
2336#endif
2337
2338#ifdef LT
2339	movddup	-16 * SIZE(AO), %xmm8
2340	mulpd	 %xmm8, %xmm9
2341
2342	movddup	-15 * SIZE(AO), %xmm10
2343	mulpd	 %xmm9, %xmm10
2344	subpd	 %xmm10, %xmm13
2345
2346	movddup	-13 * SIZE(AO), %xmm8
2347	mulpd	 %xmm8, %xmm13
2348#endif
2349
2350#ifdef RN
2351	movddup	-16 * SIZE(BO), %xmm8
2352	mulpd	 %xmm8, %xmm0
2353
2354	movddup	-15 * SIZE(BO), %xmm9
2355	mulpd	 %xmm0, %xmm9
2356	subpd	 %xmm9, %xmm2
2357
2358	movddup	-13 * SIZE(BO), %xmm8
2359	mulpd	 %xmm8, %xmm2
2360#endif
2361
2362#ifdef RT
2363	movddup	-13 * SIZE(BO), %xmm8
2364	mulpd	 %xmm8, %xmm2
2365
2366	movddup	-14 * SIZE(BO), %xmm9
2367	mulpd	 %xmm2, %xmm9
2368	subpd	 %xmm9, %xmm0
2369
2370	movddup	-16 * SIZE(BO), %xmm8
2371	mulpd	 %xmm8, %xmm0
2372#endif
2373
2374#ifdef LN
2375	subq	$2 * SIZE, CO1
2376	subq	$2 * SIZE, CO2
2377#endif
2378
2379#if defined(LN) || defined(LT)
2380	movlpd	%xmm9,   0 * SIZE(CO1)
2381	movlpd	%xmm13,  1 * SIZE(CO1)
2382
2383	movhpd	%xmm9,   0 * SIZE(CO2)
2384	movhpd	%xmm13,  1 * SIZE(CO2)
2385#else
2386	movlpd	%xmm0,   0 * SIZE(CO1)
2387	movhpd	%xmm0,   1 * SIZE(CO1)
2388
2389	movlpd	%xmm2,   0 * SIZE(CO2)
2390	movhpd	%xmm2,   1 * SIZE(CO2)
2391#endif
2392
2393#if defined(LN) || defined(LT)
2394	movaps	%xmm9,  -16 * SIZE(BO)
2395	movaps	%xmm13, -14 * SIZE(BO)
2396#else
2397	movaps	%xmm0,  -16 * SIZE(AO)
2398	movaps	%xmm2,  -14 * SIZE(AO)
2399#endif
2400
2401#ifndef LN
2402	addq	$2 * SIZE, CO1
2403	addq	$2 * SIZE, CO2
2404#endif
2405
2406#if defined(LT) || defined(RN)
2407	movq	K,  %rax
2408	subq	KK, %rax
2409	leaq	(,%rax, SIZE), %rax
2410	leaq	(AO, %rax, 2), AO
2411	leaq	(BO, %rax, 2), BO
2412#endif
2413
2414#ifdef LN
2415	subq	$2, KK
2416#endif
2417
2418#ifdef LT
2419	addq	$2, KK
2420#endif
2421
2422#ifdef RT
2423	movq	K, %rax
2424	salq	$1 + BASE_SHIFT, %rax
2425	addq	%rax, AORIG
2426#endif
2427	ALIGN_4
2428
2429.L70:
2430	testq	$1, M
2431	je	.L79
2432	ALIGN_4
2433
2434.L71:
2435#ifdef LN
2436       movq	K, %rax
2437       salq	$0 + BASE_SHIFT, %rax
2438       subq	%rax, AORIG
2439#endif
2440
2441#if defined(LN) || defined(RT)
2442	movq	KK, %rax
2443	movq	AORIG, AO
2444	leaq	(, %rax, SIZE), %rax
2445	leaq	(AO, %rax, 1), AO
2446#endif
2447
2448	movq	B, BO
2449
2450#if defined(LN) || defined(RT)
2451	movq	KK, %rax
2452	salq	$1 + BASE_SHIFT, %rax
2453	leaq	(BO, %rax, 1), BO
2454#endif
2455
2456	movddup	-16 * SIZE(AO), %xmm0
2457	pxor	%xmm8, %xmm8
2458	movddup	-15 * SIZE(AO), %xmm1
2459	pxor	%xmm9, %xmm9
2460	movddup	-14 * SIZE(AO), %xmm2
2461	pxor	%xmm10, %xmm10
2462	movddup	-13 * SIZE(AO), %xmm3
2463	pxor	%xmm11, %xmm11
2464
2465#if defined(LT) || defined(RN)
2466	movq	KK, %rax
2467#else
2468	movq	K, %rax
2469	subq	KK, %rax
2470#endif
2471	andq	$-4, %rax
2472	leaq	(, %rax, SIZE), %rax
2473	leaq	(AO, %rax, 1), AO
2474	leaq	(BO, %rax, 2), BO
2475	negq	%rax
2476	NOBRANCH
2477	je	.L76
2478	ALIGN_4
2479
2480.L72:
2481	mulpd	-16 * SIZE(BO, %rax, 2), %xmm0
2482	addpd	%xmm0, %xmm8
2483	movddup	-12 * SIZE(AO, %rax, 1), %xmm0
2484
2485	mulpd	-14 * SIZE(BO, %rax, 2), %xmm1
2486	addpd	%xmm1, %xmm9
2487	movddup	-11 * SIZE(AO, %rax, 1), %xmm1
2488
2489	mulpd	-12 * SIZE(BO, %rax, 2), %xmm2
2490	addpd	%xmm2, %xmm10
2491	movddup	-10 * SIZE(AO, %rax, 1), %xmm2
2492
2493	mulpd	-10 * SIZE(BO, %rax, 2), %xmm3
2494	addpd	%xmm3, %xmm11
2495	movddup	 -9 * SIZE(AO, %rax, 1), %xmm3
2496
2497	addq	$4 * SIZE, %rax
2498	BRANCH
2499	jl	.L72
2500	ALIGN_4
2501
2502.L76:
2503#if defined(LT) || defined(RN)
2504	movq	KK, %rax
2505#else
2506	movq	K, %rax
2507	subq	KK, %rax
2508#endif
2509	andq	$3, %rax		# if (k & 1)
2510	je .L78
2511
2512	leaq	(, %rax, SIZE), %rax
2513	leaq	(AO, %rax, 1), AO
2514	leaq	(BO, %rax, 2), BO
2515	negq	%rax
2516	ALIGN_4
2517
2518.L77:
2519	mulpd	-16 * SIZE(BO, %rax, 2), %xmm0
2520	addpd	%xmm0, %xmm8
2521	movddup	-15 * SIZE(AO, %rax, 1), %xmm0
2522
2523	addq	$SIZE, %rax
2524	jl	.L77
2525	ALIGN_4
2526
2527.L78:
2528	addpd	%xmm9,  %xmm8
2529	addpd	%xmm11, %xmm10
2530	addpd	%xmm10, %xmm8
2531
2532#if defined(LN) || defined(RT)
2533	movq	KK, %rax
2534#ifdef LN
2535	subq	$1, %rax
2536#else
2537	subq	$2, %rax
2538#endif
2539
2540	leaq	(, %rax, SIZE), %rax
2541
2542	movq	AORIG, AO
2543	leaq	(AO, %rax, 1), AO
2544	leaq	(B,  %rax, 2), BO
2545#endif
2546
2547#if defined(LN) || defined(LT)
2548	movapd	-16 * SIZE(BO), %xmm2
2549#else
2550	movapd	-16 * SIZE(AO), %xmm2
2551#endif
2552
2553	subpd	%xmm8,  %xmm2
2554
2555#if defined(LN) || defined(LT)
2556	movddup	-16 * SIZE(AO), %xmm0
2557
2558	mulpd	 %xmm0, %xmm2
2559#endif
2560
2561#ifdef RN
2562	movapd	%xmm2,  %xmm0
2563        unpckhpd %xmm0, %xmm0
2564
2565	mulsd	-16 * SIZE(BO), %xmm2
2566	movsd	-15 * SIZE(BO), %xmm4
2567	mulsd	 %xmm2, %xmm4
2568	subsd	 %xmm4, %xmm0
2569
2570	mulsd	-13 * SIZE(BO), %xmm0
2571	unpcklpd %xmm0, %xmm2
2572#endif
2573
2574#ifdef RT
2575	movapd	%xmm2,  %xmm0
2576        unpckhpd %xmm0, %xmm0
2577
2578	mulsd	-13 * SIZE(BO), %xmm0
2579
2580	movlpd	-14 * SIZE(BO), %xmm4
2581	mulsd	 %xmm0, %xmm4
2582	subsd	 %xmm4, %xmm2
2583
2584	mulsd	-16 * SIZE(BO), %xmm2
2585	unpcklpd %xmm0, %xmm2
2586#endif
2587
2588#ifdef LN
2589	subq	$1 * SIZE, CO1
2590	subq	$1 * SIZE, CO2
2591#endif
2592
2593	movlpd	%xmm2,  0 * SIZE(CO1)
2594	movhpd	%xmm2,  0 * SIZE(CO2)
2595
2596#if defined(LN) || defined(LT)
2597	movaps	%xmm2, -16 * SIZE(BO)
2598#else
2599	movaps	%xmm2, -16 * SIZE(AO)
2600#endif
2601
2602#ifndef LN
2603	addq	$1 * SIZE, CO1
2604	addq	$1 * SIZE, CO2
2605#endif
2606
2607#if defined(LT) || defined(RN)
2608	movq	K,  %rax
2609	subq	KK, %rax
2610	leaq	(,%rax, SIZE), %rax
2611	leaq	(AO, %rax, 1), AO
2612	leaq	(BO, %rax, 2), BO
2613#endif
2614
2615#ifdef LN
2616	subq	$1, KK
2617#endif
2618
2619#ifdef LT
2620	addq	$1, KK
2621#endif
2622
2623#ifdef RT
2624	movq	K, %rax
2625	salq	$0 + BASE_SHIFT, %rax
2626	addq	%rax, AORIG
2627#endif
2628	ALIGN_4
2629
2630.L79:
2631#ifdef LN
2632       leaq	(, K, SIZE), %rax
2633       leaq	(B, %rax, 2), B
2634#endif
2635
2636#if defined(LT) || defined(RN)
2637	movq	BO, B
2638#endif
2639
2640#ifdef RN
2641	addq	$2, KK
2642#endif
2643
2644#ifdef RT
2645	subq	$2, KK
2646#endif
2647	ALIGN_4
2648
2649.L80:
2650	testq	$1, N
2651	je	.L999
2652
2653#if defined(LT) || defined(RN)
2654	movq	A, AO
2655#else
2656	movq	A, AORIG
2657#endif
2658
2659#ifdef RT
2660	movq	K, %rax
2661	salq	$0 + BASE_SHIFT, %rax
2662	subq	%rax, B
2663
2664	subq	LDC, C
2665#endif
2666
2667	movq	C, CO1			# coffset1 = c
2668#ifndef RT
2669	addq	LDC, C
2670#endif
2671
2672#ifdef LN
2673	movq	OFFSET, %rax
2674	addq	M, %rax
2675	movq	%rax, KK
2676#endif
2677
2678#ifdef LT
2679	movq	OFFSET, %rax
2680	movq	%rax, KK
2681#endif
2682
2683	movq	M,  I
2684	sarq	$2, I	# i = (m >> 2)
2685	jle	.L100
2686	ALIGN_4
2687
2688.L91:
2689#ifdef LN
2690       movq	K, %rax
2691       salq	$2 + BASE_SHIFT, %rax
2692       subq	%rax, AORIG
2693#endif
2694
2695#if defined(LN) || defined(RT)
2696	movq	KK, %rax
2697	movq	AORIG, AO
2698	leaq	(, %rax, SIZE), %rax
2699	leaq	(AO, %rax, 4), AO
2700#endif
2701
2702	movq	B, BO
2703
2704#if defined(LN) || defined(RT)
2705	movq	KK, %rax
2706	leaq	(BO, %rax, SIZE), BO
2707#endif
2708
2709	movapd	-16 * SIZE(AO), %xmm0
2710	pxor	%xmm8, %xmm8
2711	movapd	 -8 * SIZE(AO), %xmm2
2712	pxor	%xmm9, %xmm9
2713	movddup	-16 * SIZE(BO), %xmm1
2714	pxor	%xmm10, %xmm10
2715	movddup	-15 * SIZE(BO), %xmm5
2716	pxor	%xmm11, %xmm11
2717	movddup	-14 * SIZE(BO), %xmm3
2718
2719#ifndef LN
2720	prefetchw      3 * SIZE(CO1)
2721#else
2722	prefetchw     -8 * SIZE(CO1)
2723#endif
2724
2725#if defined(LT) || defined(RN)
2726	movq	KK, %rax
2727#else
2728	movq	K, %rax
2729	subq	KK, %rax
2730#endif
2731	andq	$-4, %rax
2732	leaq	(, %rax, SIZE), %rax
2733	leaq	(AO, %rax, 4), AO
2734	leaq	(BO, %rax, 1), BO
2735	negq	%rax
2736	NOBRANCH
2737	je	.L96
2738	ALIGN_4
2739
2740.L92:
2741	mulpd	%xmm1, %xmm0
2742	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
2743	addpd	%xmm0, %xmm8
2744	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
2745	addpd	%xmm1, %xmm9
2746	movddup	-12 * SIZE(BO, %rax, 1), %xmm1
2747	mulpd	%xmm5, %xmm0
2748	mulpd	-10 * SIZE(AO, %rax, 4), %xmm5
2749	addpd	%xmm0, %xmm10
2750	movapd	  (AO, %rax, 4), %xmm0
2751	addpd	%xmm5, %xmm11
2752	movddup	-13 * SIZE(BO, %rax, 1), %xmm5
2753	mulpd	%xmm3, %xmm2
2754	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3
2755	addpd	%xmm2, %xmm8
2756	movapd	 -4 * SIZE(AO, %rax, 4), %xmm2
2757	addpd	%xmm3, %xmm9
2758	movddup	-10 * SIZE(BO, %rax, 1), %xmm3
2759	mulpd	%xmm5, %xmm2
2760	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5
2761	addpd	%xmm2, %xmm10
2762	movapd	  8 * SIZE(AO, %rax, 4), %xmm2
2763	addpd	%xmm5, %xmm11
2764	movddup	-11 * SIZE(BO, %rax, 1), %xmm5
2765
2766	addq	$4 * SIZE, %rax
2767	BRANCH
2768	jl	.L92
2769	ALIGN_4
2770
2771.L96:
2772#if defined(LT) || defined(RN)
2773	movq	KK, %rax
2774#else
2775	movq	K, %rax
2776	subq	KK, %rax
2777#endif
2778	andq	$3, %rax		# if (k & 1)
2779	je .L99
2780
2781	leaq	(, %rax, SIZE), %rax
2782	leaq	(AO, %rax, 4), AO
2783	leaq	(BO, %rax, 1), BO
2784	negq	%rax
2785	ALIGN_4
2786
2787.L97:
2788	mulpd	%xmm1, %xmm0
2789	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
2790	addpd	%xmm0, %xmm8
2791	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
2792	addpd	%xmm1, %xmm9
2793	movddup	-15 * SIZE(BO, %rax, 1), %xmm1
2794
2795	addq	$SIZE, %rax
2796	jl	.L97
2797	ALIGN_4
2798.L99:
2799	addpd	%xmm10, %xmm8
2800	addpd	%xmm11, %xmm9
2801
2802#if defined(LN) || defined(RT)
2803	movq	KK, %rax
2804#ifdef LN
2805	subq	$4, %rax
2806#else
2807	subq	$1, %rax
2808#endif
2809
2810	leaq	(, %rax, SIZE), %rax
2811
2812	movq	AORIG, AO
2813	leaq	(AO, %rax, 4), AO
2814	leaq	(B,  %rax, 1), BO
2815#endif
2816
2817#if defined(LN) || defined(LT)
2818	movapd	-16 * SIZE(BO), %xmm10
2819	movapd	-14 * SIZE(BO), %xmm11
2820
2821	subpd	%xmm8,  %xmm10
2822	subpd	%xmm9,  %xmm11
2823#else
2824	movapd	-16 * SIZE(AO), %xmm10
2825	movapd	-14 * SIZE(AO), %xmm11
2826
2827	subpd	%xmm8, %xmm10
2828	subpd	%xmm9, %xmm11
2829#endif
2830
2831#ifdef LN
2832	movapd	%xmm10, %xmm8
2833        unpckhpd %xmm8, %xmm8
2834
2835	movapd	%xmm11, %xmm9
2836        unpckhpd %xmm9, %xmm9
2837
2838	movsd	 -1 * SIZE(AO), %xmm12
2839	mulsd	 %xmm12, %xmm9
2840
2841	movsd	 -2 * SIZE(AO), %xmm13
2842	mulsd	 %xmm9, %xmm13
2843	subsd	 %xmm13, %xmm11
2844	movsd	 -3 * SIZE(AO), %xmm14
2845	mulsd	 %xmm9, %xmm14
2846	subsd	 %xmm14, %xmm8
2847	movsd	 -4 * SIZE(AO), %xmm15
2848	mulsd	 %xmm9, %xmm15
2849	subsd	 %xmm15, %xmm10
2850
2851	movsd	 -6 * SIZE(AO), %xmm12
2852	mulsd	 %xmm12, %xmm11
2853
2854	movsd	 -7 * SIZE(AO), %xmm13
2855	mulsd	 %xmm11, %xmm13
2856	subsd	 %xmm13, %xmm8
2857	movsd	 -8 * SIZE(AO), %xmm14
2858	mulsd	 %xmm11, %xmm14
2859	subsd	 %xmm14, %xmm10
2860
2861	movsd	-11 * SIZE(AO), %xmm12
2862	mulsd	 %xmm12, %xmm8
2863
2864	movsd	-12 * SIZE(AO), %xmm13
2865	mulsd	 %xmm8, %xmm13
2866	subsd	 %xmm13, %xmm10
2867
2868	movsd	-16 * SIZE(AO), %xmm12
2869	mulsd	 %xmm12, %xmm10
2870
2871	unpcklpd %xmm8, %xmm10
2872	unpcklpd %xmm9, %xmm11
2873#endif
2874
2875#ifdef LT
2876	movapd	%xmm10, %xmm8
2877        unpckhpd %xmm8, %xmm8
2878
2879	movapd	%xmm11, %xmm9
2880        unpckhpd %xmm9, %xmm9
2881
2882	movsd	-16 * SIZE(AO), %xmm12
2883	mulsd	 %xmm12, %xmm10
2884
2885	movsd	-15 * SIZE(AO), %xmm13
2886	mulsd	 %xmm10, %xmm13
2887	subsd	 %xmm13, %xmm8
2888	movsd	-14 * SIZE(AO), %xmm14
2889	mulsd	 %xmm10, %xmm14
2890	subsd	 %xmm14, %xmm11
2891	movsd	-13 * SIZE(AO), %xmm15
2892	mulsd	 %xmm10, %xmm15
2893	subsd	 %xmm15, %xmm9
2894
2895	movsd	-11 * SIZE(AO), %xmm12
2896	mulsd	 %xmm12, %xmm8
2897
2898	movsd	-10 * SIZE(AO), %xmm13
2899	mulsd	 %xmm8, %xmm13
2900	subsd	 %xmm13, %xmm11
2901	movsd	 -9 * SIZE(AO), %xmm14
2902	mulsd	 %xmm8, %xmm14
2903	subsd	 %xmm14, %xmm9
2904
2905	movsd	 -6 * SIZE(AO), %xmm12
2906	mulsd	 %xmm12, %xmm11
2907
2908	movsd	 -5 * SIZE(AO), %xmm13
2909	mulsd	 %xmm11, %xmm13
2910	subsd	 %xmm13, %xmm9
2911
2912	movsd	 -1 * SIZE(AO), %xmm12
2913	mulsd	 %xmm12, %xmm9
2914
2915	unpcklpd %xmm8, %xmm10
2916	unpcklpd %xmm9, %xmm11
2917#endif
2918
2919#ifdef RN
2920	movddup	-16 * SIZE(BO), %xmm8
2921	mulpd	 %xmm8, %xmm10
2922	mulpd	 %xmm8, %xmm11
2923#endif
2924
2925#ifdef RT
2926	movddup	-16 * SIZE(BO), %xmm8
2927	mulpd	 %xmm8, %xmm10
2928	mulpd	 %xmm8, %xmm11
2929#endif
2930
2931#ifdef LN
2932	subq	$4 * SIZE, CO1
2933#endif
2934
2935	movlpd	%xmm10,  0 * SIZE(CO1)
2936	movhpd	%xmm10,  1 * SIZE(CO1)
2937	movlpd	%xmm11,  2 * SIZE(CO1)
2938	movhpd	%xmm11,  3 * SIZE(CO1)
2939
2940#if defined(LN) || defined(LT)
2941	movaps	%xmm10, -16 * SIZE(BO)
2942	movaps	%xmm11, -14 * SIZE(BO)
2943#else
2944	movaps	%xmm10, -16 * SIZE(AO)
2945	movaps	%xmm11, -14 * SIZE(AO)
2946#endif
2947
2948#ifndef LN
2949	addq	$4 * SIZE, CO1
2950#endif
2951
2952#if defined(LT) || defined(RN)
2953	movq	K,  %rax
2954	subq	KK, %rax
2955	leaq	(,%rax, SIZE), %rax
2956	leaq	(AO, %rax, 4), AO
2957	addq	%rax, BO
2958#endif
2959
2960#ifdef LN
2961	subq	$4, KK
2962#endif
2963
2964#ifdef LT
2965	addq	$4, KK
2966#endif
2967
2968#ifdef RT
2969       movq	K, %rax
2970       salq	$2 + BASE_SHIFT, %rax
2971       addq	%rax, AORIG
2972#endif
2973
2974	decq	I			# i --
2975	jg	.L91
2976	ALIGN_4
2977
2978.L100:
2979	testq	$2, M
2980	je	.L110
2981
2982#ifdef LN
2983      movq	K, %rax
2984       salq	$1 + BASE_SHIFT, %rax
2985       subq	%rax, AORIG
2986#endif
2987
2988#if defined(LN) || defined(RT)
2989	movq	KK, %rax
2990	movq	AORIG, AO
2991	leaq	(, %rax, SIZE), %rax
2992	leaq	(AO, %rax, 2), AO
2993#endif
2994
2995	movq	B, BO
2996
2997#if defined(LN) || defined(RT)
2998	movq	KK, %rax
2999	leaq	(BO, %rax, SIZE), BO
3000#endif
3001
3002	movddup	-16 * SIZE(BO), %xmm0
3003	pxor	%xmm8, %xmm8
3004	movddup	-15 * SIZE(BO), %xmm1
3005	pxor	%xmm9, %xmm9
3006	movddup	-14 * SIZE(BO), %xmm2
3007	pxor	%xmm10, %xmm10
3008	movddup	-13 * SIZE(BO), %xmm3
3009	pxor	%xmm11, %xmm11
3010
3011#if defined(LT) || defined(RN)
3012	movq	KK, %rax
3013#else
3014	movq	K, %rax
3015	subq	KK, %rax
3016#endif
3017	andq	$-4, %rax
3018	leaq	(, %rax, SIZE), %rax
3019	leaq	(AO, %rax, 2), AO
3020	leaq	(BO, %rax, 1), BO
3021	negq	%rax
3022	NOBRANCH
3023	je	.L106
3024	ALIGN_4
3025
3026.L102:
3027	mulpd	-16 * SIZE(AO, %rax, 2), %xmm0
3028	addpd	%xmm0, %xmm8
3029	movddup	-12 * SIZE(BO, %rax, 1), %xmm0
3030
3031	mulpd	-14 * SIZE(AO, %rax, 2), %xmm1
3032	addpd	%xmm1, %xmm9
3033	movddup	-11 * SIZE(BO, %rax, 1), %xmm1
3034
3035	mulpd	-12 * SIZE(AO, %rax, 2), %xmm2
3036	addpd	%xmm2, %xmm10
3037	movddup	-10 * SIZE(BO, %rax, 1), %xmm2
3038
3039	mulpd	-10 * SIZE(AO, %rax, 2), %xmm3
3040	addpd	%xmm3, %xmm11
3041	movddup	 -9 * SIZE(BO, %rax, 1), %xmm3
3042
3043	addq	$4 * SIZE, %rax
3044	BRANCH
3045	jl	.L102
3046	ALIGN_4
3047
3048.L106:
3049#if defined(LT) || defined(RN)
3050	movq	KK, %rax
3051#else
3052	movq	K, %rax
3053	subq	KK, %rax
3054#endif
3055	andq	$3, %rax		# if (k & 1)
3056	je .L109
3057
3058	leaq	(, %rax, SIZE), %rax
3059	leaq	(AO, %rax, 2), AO
3060	leaq	(BO, %rax, 1), BO
3061	negq	%rax
3062	ALIGN_4
3063
3064.L107:
3065	movddup	-16 * SIZE(BO, %rax, 1), %xmm0
3066	mulpd	-16 * SIZE(AO, %rax, 2), %xmm0
3067	addpd	%xmm0, %xmm8
3068
3069	addq	$SIZE, %rax
3070	jl	.L107
3071	ALIGN_4
3072
3073.L109:
3074	addpd	%xmm9, %xmm8
3075	addpd	%xmm11, %xmm10
3076	addpd	%xmm10, %xmm8
3077
3078#if defined(LN) || defined(RT)
3079	movq	KK, %rax
3080#ifdef LN
3081	subq	$2, %rax
3082#else
3083	subq	$1, %rax
3084#endif
3085
3086	leaq	(, %rax, SIZE), %rax
3087
3088	movq	AORIG, AO
3089	leaq	(AO, %rax, 2), AO
3090	leaq	(B,  %rax, 1), BO
3091#endif
3092
3093#if defined(LN) || defined(LT)
3094	movapd	-16 * SIZE(BO), %xmm10
3095	subpd	%xmm8,  %xmm10
3096#else
3097	movapd	-16 * SIZE(AO), %xmm10
3098	subpd	%xmm8, %xmm10
3099#endif
3100
3101#ifdef LN
3102	movapd	%xmm10, %xmm8
3103        unpckhpd %xmm8, %xmm8
3104
3105	movsd	-13 * SIZE(AO), %xmm12
3106	mulsd	 %xmm12, %xmm8
3107
3108	movsd	-14 * SIZE(AO), %xmm13
3109	mulsd	 %xmm8, %xmm13
3110	subsd	 %xmm13, %xmm10
3111
3112	movsd	-16 * SIZE(AO), %xmm12
3113	mulsd	 %xmm12, %xmm10
3114
3115	unpcklpd %xmm8, %xmm10
3116#endif
3117
3118#ifdef LT
3119	movapd	%xmm10, %xmm8
3120        unpckhpd %xmm8, %xmm8
3121
3122	movsd	-16 * SIZE(AO), %xmm12
3123	mulsd	 %xmm12, %xmm10
3124
3125	movsd	-15 * SIZE(AO), %xmm13
3126	mulsd	 %xmm10, %xmm13
3127	subsd	 %xmm13, %xmm8
3128
3129	movsd	-13 * SIZE(AO), %xmm12
3130	mulsd	 %xmm12, %xmm8
3131
3132	unpcklpd %xmm8, %xmm10
3133#endif
3134
3135#ifdef RN
3136	movddup	-16 * SIZE(BO), %xmm8
3137	mulpd	 %xmm8, %xmm10
3138#endif
3139
3140#ifdef RT
3141	movddup	-16 * SIZE(BO), %xmm8
3142	mulpd	 %xmm8, %xmm10
3143#endif
3144
3145#ifdef LN
3146	subq	$2 * SIZE, CO1
3147#endif
3148
3149#if defined(LN) || defined(LT)
3150	movlpd	%xmm10,  0 * SIZE(CO1)
3151	movhpd	%xmm10,  1 * SIZE(CO1)
3152#else
3153	movlpd	%xmm10,  0 * SIZE(CO1)
3154	movhpd	%xmm10,  1 * SIZE(CO1)
3155#endif
3156
3157#if defined(LN) || defined(LT)
3158	movaps	%xmm10, -16 * SIZE(BO)
3159#else
3160	movaps	%xmm10, -16 * SIZE(AO)
3161#endif
3162
3163#ifndef LN
3164	addq	$2 * SIZE, CO1
3165#endif
3166
3167#if defined(LT) || defined(RN)
3168	movq	K,  %rax
3169	subq	KK, %rax
3170	leaq	(,%rax, SIZE), %rax
3171	leaq	(AO, %rax, 2), AO
3172	addq	%rax, BO
3173#endif
3174
3175#ifdef LN
3176	subq	$2, KK
3177#endif
3178
3179#ifdef LT
3180	addq	$2, KK
3181#endif
3182
3183#ifdef RT
3184       movq	K, %rax
3185       salq	$1 + BASE_SHIFT, %rax
3186       addq	%rax, AORIG
3187#endif
3188	ALIGN_4
3189
3190.L110:
3191	testq	$1, M
3192	je	.L119
3193	ALIGN_4
3194
3195.L111:
3196#ifdef LN
3197       movq	K, %rax
3198       salq	$0 + BASE_SHIFT, %rax
3199       subq	%rax, AORIG
3200#endif
3201
3202#if defined(LN) || defined(RT)
3203	movq	KK, %rax
3204	movq	AORIG, AO
3205	leaq	(, %rax, SIZE), %rax
3206	leaq	(AO, %rax, 1), AO
3207#endif
3208
3209	movq	B, BO
3210
3211#if defined(LN) || defined(RT)
3212	movq	KK, %rax
3213	leaq	(BO, %rax, SIZE), BO
3214#endif
3215
3216	movapd	-16 * SIZE(AO), %xmm0
3217	pxor	%xmm8, %xmm8
3218	movapd	-14 * SIZE(AO), %xmm1
3219	pxor	%xmm9, %xmm9
3220
3221#if defined(LT) || defined(RN)
3222	movq	KK, %rax
3223#else
3224	movq	K, %rax
3225	subq	KK, %rax
3226#endif
3227	andq	$-4, %rax
3228	leaq	(, %rax, SIZE), %rax
3229	leaq	(AO, %rax, 1), AO
3230	leaq	(BO, %rax, 1), BO
3231	negq	%rax
3232	NOBRANCH
3233	je	.L116
3234	ALIGN_4
3235
3236.L112:
3237	mulpd	-16 * SIZE(BO, %rax, 1), %xmm0
3238	addpd	%xmm0, %xmm8
3239	movapd	-12 * SIZE(AO, %rax, 1), %xmm0
3240
3241	mulpd	-14 * SIZE(BO, %rax, 1), %xmm1
3242	addpd	%xmm1, %xmm9
3243	movapd	-10 * SIZE(AO, %rax, 1), %xmm1
3244
3245	addq	$4 * SIZE, %rax
3246	BRANCH
3247	jl	.L112
3248	ALIGN_4
3249
3250.L116:
3251#if defined(LT) || defined(RN)
3252	movq	KK, %rax
3253#else
3254	movq	K, %rax
3255	subq	KK, %rax
3256#endif
3257	andq	$3, %rax		# if (k & 1)
3258	je .L118
3259
3260	leaq	(, %rax, SIZE), %rax
3261	leaq	(AO, %rax, 1), AO
3262	leaq	(BO, %rax, 1), BO
3263	negq	%rax
3264	ALIGN_4
3265
3266.L117:
3267	mulsd	-16 * SIZE(BO, %rax, 1), %xmm0
3268	addsd	%xmm0, %xmm8
3269	movsd	-15 * SIZE(AO, %rax, 1), %xmm0
3270
3271	addq	$SIZE, %rax
3272	jl	.L117
3273	ALIGN_4
3274
3275.L118:
3276	addpd	%xmm9, %xmm8
3277	haddpd	%xmm8, %xmm8
3278
3279#if defined(LN) || defined(RT)
3280	movq	KK, %rax
3281#ifdef LN
3282	subq	$1, %rax
3283#else
3284	subq	$1, %rax
3285#endif
3286
3287	leaq	(, %rax, SIZE), %rax
3288
3289	movq	AORIG, AO
3290	leaq	(AO, %rax, 1), AO
3291	leaq	(B,  %rax, 1), BO
3292#endif
3293
3294#if defined(LN) || defined(LT)
3295	movsd	-16 * SIZE(BO), %xmm10
3296	subsd	%xmm8,  %xmm10
3297#else
3298	movsd	-16 * SIZE(AO), %xmm10
3299	subsd	%xmm8, %xmm10
3300#endif
3301
3302#if defined(LN) || defined(LT)
3303	movsd	-16 * SIZE(AO), %xmm12
3304	mulsd	 %xmm12, %xmm10
3305#endif
3306
3307#if defined(RN) || defined(RT)
3308	movsd	-16 * SIZE(BO), %xmm8
3309	mulsd	 %xmm8, %xmm10
3310#endif
3311
3312#ifdef LN
3313	subq	$1 * SIZE, CO1
3314#endif
3315
3316	movsd	%xmm10,  0 * SIZE(CO1)
3317
3318#if defined(LN) || defined(LT)
3319	movlpd	%xmm10, -16 * SIZE(BO)
3320#else
3321	movlpd	%xmm10, -16 * SIZE(AO)
3322#endif
3323
3324#ifndef LN
3325	addq	$1 * SIZE, CO1
3326#endif
3327
3328#if defined(LT) || defined(RN)
3329	movq	K,  %rax
3330	subq	KK, %rax
3331	leaq	(,%rax, SIZE), %rax
3332	addq	%rax, AO
3333	addq	%rax, BO
3334#endif
3335
3336#ifdef LN
3337	subq	$1, KK
3338#endif
3339
3340#ifdef LT
3341	addq	$1, KK
3342#endif
3343
3344#ifdef RT
3345       movq	K, %rax
3346       salq	$0 + BASE_SHIFT, %rax
3347       addq	%rax, AORIG
3348#endif
3349	ALIGN_4
3350
3351.L119:
3352#ifdef LN
3353       leaq	(B, K, SIZE), B
3354#endif
3355
3356#if defined(LT) || defined(RN)
3357	movq	BO, B
3358#endif
3359
3360#ifdef RN
3361	addq	$1, KK
3362#endif
3363
3364#ifdef RT
3365	subq	$1, KK
3366#endif
3367	ALIGN_4
3368
3369
3370.L999:
3371	movq	   (%rsp), %rbx
3372	movq	  8(%rsp), %rbp
3373	movq	 16(%rsp), %r12
3374	movq	 24(%rsp), %r13
3375	movq	 32(%rsp), %r14
3376	movq	 40(%rsp), %r15
3377
3378#ifdef WINDOWS_ABI
3379	movq	 48(%rsp), %rdi
3380	movq	 56(%rsp), %rsi
3381	movups	 64(%rsp), %xmm6
3382	movups	 80(%rsp), %xmm7
3383	movups	 96(%rsp), %xmm8
3384	movups	112(%rsp), %xmm9
3385	movups	128(%rsp), %xmm10
3386	movups	144(%rsp), %xmm11
3387	movups	160(%rsp), %xmm12
3388	movups	176(%rsp), %xmm13
3389	movups	192(%rsp), %xmm14
3390	movups	208(%rsp), %xmm15
3391#endif
3392
3393	addq	$STACKSIZE, %rsp
3394	ret
3395
3396	EPILOGUE
3397