1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define OLD_M	%rdi
43#define OLD_N	%rsi
44#define M	%r13
45#define N	%r14
46#define K	%rdx
47
48#define A	%rcx
49#define B	%r8
50#define C	%r9
51#define LDC	%r10
52
53#define I	%r11
54#define AO	%rdi
55#define BO	%rsi
56#define	CO1	%r15
57#define CO2	%r12
58#define BB	%rbp
59#define	J	%rbx
60
61#ifndef WINDOWS_ABI
62
63#define STACKSIZE 96
64
65#define OFFSET	 48(%rsp)
66#define AORIG	 56(%rsp)
67#define KK	 64(%rsp)
68#define KKK	 72(%rsp)
69
70#else
71
72#define STACKSIZE 256
73
74#define OLD_A		40 + STACKSIZE(%rsp)
75#define OLD_B		48 + STACKSIZE(%rsp)
76#define OLD_C		56 + STACKSIZE(%rsp)
77#define OLD_LDC		64 + STACKSIZE(%rsp)
78#define OLD_OFFSET	72 + STACKSIZE(%rsp)
79
80#define OFFSET	224(%rsp)
81#define AORIG	232(%rsp)
82#define KK	240(%rsp)
83#define KKK	248(%rsp)
84
85#endif
86
87#define PREFETCH     prefetch
88#define PREFETCHSIZE  (8 *  7 + 0)
89
90#define movlpd	movsd
91#define movapd	movups
92#define movupd	movups
93
94#define KERNEL1(xx) \
95	mulpd	%xmm1, %xmm0 ;\
96	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
97	addpd	%xmm0, %xmm8 ;\
98	movapd	%xmm2, %xmm0 ;\
99	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO, %rax, 4) ;\
100	addpd	%xmm1, %xmm12 ;\
101	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
102	mulpd	%xmm3, %xmm2 ;\
103	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
104	addpd	%xmm2, %xmm9 ;\
105	movapd	%xmm0, %xmm2 ;\
106	addpd	%xmm3, %xmm13 ;\
107	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
108	mulpd	%xmm1, %xmm0 ;\
109	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
110	addpd	%xmm0, %xmm10 ;\
111	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
112	addpd	%xmm1, %xmm14 ;\
113	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
114	mulpd	%xmm3, %xmm2 ;\
115	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
116	addpd	%xmm2, %xmm11 ;\
117	addpd	%xmm3, %xmm15 ;\
118 	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
119	movapd	%xmm0, %xmm2
120
121#define KERNEL2(xx) \
122	mulpd	%xmm1, %xmm0 ;\
123	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
124	addpd	%xmm0, %xmm8 ;\
125	movapd	%xmm2, %xmm0 ;\
126	addpd	%xmm1, %xmm12 ;\
127	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
128	mulpd	%xmm3, %xmm2 ;\
129	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
130	addpd	%xmm2, %xmm9 ;\
131	movapd	%xmm0, %xmm2 ;\
132	addpd	%xmm3, %xmm13 ;\
133	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
134	mulpd	%xmm1, %xmm0 ;\
135	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
136	addpd	%xmm0, %xmm10 ;\
137	addpd	%xmm1, %xmm14 ;\
138	mulpd	%xmm3, %xmm2 ;\
139	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
140	addpd	%xmm2, %xmm11 ;\
141	addpd	%xmm3, %xmm15 ;\
142 	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
143/**/	movddup	  (BO, %rax, 4), %xmm1 ;\
144	movapd	%xmm4, %xmm2
145
146#define KERNEL3(xx) \
147	mulpd	%xmm5, %xmm4 ;\
148	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
149	addpd	%xmm4, %xmm8 ;\
150	movapd	%xmm2, %xmm4 ;\
151	addpd	%xmm5, %xmm12 ;\
152	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
153	mulpd	%xmm3, %xmm2 ;\
154	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
155	addpd	%xmm2, %xmm9 ;\
156	movapd	%xmm4, %xmm2 ;\
157	addpd	%xmm3, %xmm13 ;\
158	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
159	mulpd	%xmm5, %xmm4 ;\
160	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
161	addpd	%xmm4, %xmm10 ;\
162	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
163	addpd	%xmm5, %xmm14 ;\
164	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
165	mulpd	%xmm3, %xmm2 ;\
166	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
167	addpd	%xmm2, %xmm11 ;\
168	addpd	%xmm3, %xmm15 ;\
169 	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
170	movapd	%xmm4, %xmm2
171
172#define KERNEL4(xx) \
173	mulpd	%xmm5, %xmm4 ;\
174	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
175	addpd	%xmm4, %xmm8 ;\
176	movapd	%xmm2, %xmm4 ;\
177	addpd	%xmm5, %xmm12 ;\
178	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
179	mulpd	%xmm3, %xmm2 ;\
180	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
181	addpd	%xmm2, %xmm9 ;\
182	movapd	%xmm4, %xmm2 ;\
183	addpd	%xmm3, %xmm13 ;\
184	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
185	mulpd	%xmm5, %xmm4 ;\
186	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
187/***/	movapd	  (AO, %rax, 4), %xmm6 ;\
188	addpd	%xmm4, %xmm10 ;\
189	addpd	%xmm5, %xmm14 ;\
190	mulpd	%xmm3, %xmm2 ;\
191	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
192	addpd	%xmm2, %xmm11 ;\
193	addpd	%xmm3, %xmm15 ;\
194 	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
195	movddup	  8 * SIZE(BO, %rax, 4), %xmm5 ;\
196	movapd	%xmm6, %xmm2
197
198#define KERNEL5(xx) \
199	mulpd	%xmm1, %xmm6 ;\
200	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
201	addpd	%xmm6, %xmm8 ;\
202	movapd	%xmm2, %xmm6 ;\
203	addpd	%xmm1, %xmm12 ;\
204	movddup	  2 * SIZE(BO, %rax, 4), %xmm1 ;\
205	mulpd	%xmm3, %xmm2 ;\
206	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
207	addpd	%xmm2, %xmm9 ;\
208/**/	movapd	  8 * SIZE(AO, %rax, 4), %xmm7 ;\
209	movapd	%xmm6, %xmm2 ;\
210	addpd	%xmm3, %xmm13 ;\
211	movddup	  3 * SIZE(BO, %rax, 4), %xmm3 ;\
212	mulpd	%xmm1, %xmm6 ;\
213	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
214	addpd	%xmm6, %xmm10 ;\
215	movapd	  4 * SIZE(AO, %rax, 4), %xmm6 ;\
216	addpd	%xmm1, %xmm14 ;\
217	movddup	  4 * SIZE(BO, %rax, 4), %xmm1 ;\
218	mulpd	%xmm3, %xmm2 ;\
219	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
220	addpd	%xmm2, %xmm11 ;\
221	addpd	%xmm3, %xmm15 ;\
222 	movddup	  5 * SIZE(BO, %rax, 4), %xmm3 ;\
223	movapd	%xmm6, %xmm2
224
225#define KERNEL6(xx) \
226	mulpd	%xmm1, %xmm6 ;\
227	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
228	addpd	%xmm6, %xmm8 ;\
229	movapd	%xmm2, %xmm6 ;\
230	addpd	%xmm1, %xmm12 ;\
231	movddup	  6 * SIZE(BO, %rax, 4), %xmm1 ;\
232	mulpd	%xmm3, %xmm2 ;\
233	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
234	addpd	%xmm2, %xmm9 ;\
235	movapd	%xmm6, %xmm2 ;\
236	addpd	%xmm3, %xmm13 ;\
237	movddup	  7 * SIZE(BO, %rax, 4), %xmm3 ;\
238	mulpd	%xmm1, %xmm6 ;\
239	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
240	addpd	%xmm6, %xmm10 ;\
241/***/	movapd	 16 * SIZE(AO, %rax, 4), %xmm0 ;\
242	addpd	%xmm1, %xmm14 ;\
243	mulpd	%xmm3, %xmm2 ;\
244	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
245	addpd	%xmm2, %xmm11 ;\
246	addpd	%xmm3, %xmm15 ;\
247 	movddup	  9 * SIZE(BO, %rax, 4), %xmm3 ;\
248	movddup	 16 * SIZE(BO, %rax, 4), %xmm1 ;\
249	movapd	%xmm7, %xmm2
250
251#define KERNEL7(xx) \
252	mulpd	%xmm5, %xmm7 ;\
253	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
254	addpd	%xmm7, %xmm8 ;\
255	movapd	%xmm2, %xmm7 ;\
256	addpd	%xmm5, %xmm12 ;\
257	movddup	 10 * SIZE(BO, %rax, 4), %xmm5 ;\
258	mulpd	%xmm3, %xmm2 ;\
259	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
260	addpd	%xmm2, %xmm9 ;\
261	movapd	%xmm7, %xmm2 ;\
262	addpd	%xmm3, %xmm13 ;\
263	movddup	 11 * SIZE(BO, %rax, 4), %xmm3 ;\
264	mulpd	%xmm5, %xmm7 ;\
265	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
266	addpd	%xmm7, %xmm10 ;\
267	movapd	 12 * SIZE(AO, %rax, 4), %xmm7 ;\
268	addpd	%xmm5, %xmm14 ;\
269	movddup	 12 * SIZE(BO, %rax, 4), %xmm5 ;\
270	mulpd	%xmm3, %xmm2 ;\
271	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
272	addpd	%xmm2, %xmm11 ;\
273	addpd	%xmm3, %xmm15 ;\
274 	movddup	 13 * SIZE(BO, %rax, 4), %xmm3 ;\
275	movapd	%xmm7, %xmm2
276
277#define KERNEL8(xx) \
278	mulpd	%xmm5, %xmm7 ;\
279	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
280	addpd	%xmm7, %xmm8 ;\
281	movapd	%xmm2, %xmm7 ;\
282	addpd	%xmm5, %xmm12 ;\
283	movddup	 14 * SIZE(BO, %rax, 4), %xmm5 ;\
284	mulpd	%xmm3, %xmm2 ;\
285	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
286	addpd	%xmm2, %xmm9 ;\
287	movapd	%xmm7, %xmm2 ;\
288	addpd	%xmm3, %xmm13 ;\
289	movddup	 15 * SIZE(BO, %rax, 4), %xmm3 ;\
290	mulpd	%xmm5, %xmm7 ;\
291	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
292	addpd	%xmm7, %xmm10 ;\
293	addpd	%xmm5, %xmm14 ;\
294/**/	movapd	 24 * SIZE(AO, %rax, 4), %xmm4 ;\
295	mulpd	%xmm3, %xmm2 ;\
296	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
297	addpd	%xmm3, %xmm15 ;\
298 	movddup	 17 * SIZE(BO, %rax, 4), %xmm3 ;\
299	addpd	%xmm2, %xmm11 ;\
300	movddup	 24 * SIZE(BO, %rax, 4), %xmm5 ;\
301	movapd	%xmm0, %xmm2 ;\
302	addq	$8 * SIZE, %rax
303
304#define KERNEL_SUB1(xx) \
305	mulpd	%xmm1, %xmm0 ;\
306	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
307	addpd	%xmm0, %xmm8 ;\
308	movapd	%xmm2, %xmm0 ;\
309	addpd	%xmm1, %xmm12 ;\
310	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
311	mulpd	%xmm3, %xmm2 ;\
312	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
313	addpd	%xmm2, %xmm9 ;\
314	movapd	%xmm0, %xmm2 ;\
315	addpd	%xmm3, %xmm13 ;\
316	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
317	mulpd	%xmm1, %xmm0 ;\
318	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
319	addpd	%xmm0, %xmm10 ;\
320	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
321	addpd	%xmm1, %xmm14 ;\
322	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
323	mulpd	%xmm3, %xmm2 ;\
324	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
325	addpd	%xmm2, %xmm11 ;\
326	addpd	%xmm3, %xmm15 ;\
327 	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
328	movapd	%xmm0, %xmm2
329
330#define KERNEL_SUB2(xx) \
331	mulpd	%xmm1, %xmm0 ;\
332	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
333	addpd	%xmm0, %xmm8 ;\
334	movapd	%xmm2, %xmm0 ;\
335	addpd	%xmm1, %xmm12 ;\
336	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
337	mulpd	%xmm3, %xmm2 ;\
338	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
339	addpd	%xmm2, %xmm9 ;\
340	movapd	%xmm0, %xmm2 ;\
341	addpd	%xmm3, %xmm13 ;\
342	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
343	mulpd	%xmm1, %xmm0 ;\
344	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
345	addpd	%xmm0, %xmm10 ;\
346	movapd	  (AO, %rax, 4), %xmm0 ;\
347	addpd	%xmm1, %xmm14 ;\
348	movddup	  (BO, %rax, 4), %xmm1 ;\
349	mulpd	%xmm3, %xmm2 ;\
350	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
351	addpd	%xmm2, %xmm11 ;\
352	addpd	%xmm3, %xmm15 ;\
353 	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
354	movapd	%xmm4, %xmm2
355
356#define KERNEL_SUB3(xx) \
357	mulpd	%xmm5, %xmm4 ;\
358	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
359	addpd	%xmm4, %xmm8 ;\
360	movapd	%xmm2, %xmm4 ;\
361	addpd	%xmm5, %xmm12 ;\
362	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
363	mulpd	%xmm3, %xmm2 ;\
364	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
365	addpd	%xmm2, %xmm9 ;\
366	movapd	%xmm4, %xmm2 ;\
367	addpd	%xmm3, %xmm13 ;\
368	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
369	mulpd	%xmm5, %xmm4 ;\
370	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
371	addpd	%xmm4, %xmm10 ;\
372	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
373	addpd	%xmm5, %xmm14 ;\
374	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
375	mulpd	%xmm3, %xmm2 ;\
376	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
377	addpd	%xmm2, %xmm11 ;\
378	addpd	%xmm3, %xmm15 ;\
379 	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
380	movapd	%xmm4, %xmm2
381
382#define KERNEL_SUB4(xx) \
383	mulpd	%xmm5, %xmm4 ;\
384	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
385	addpd	%xmm4, %xmm8 ;\
386	movapd	%xmm2, %xmm4 ;\
387	addpd	%xmm5, %xmm12 ;\
388	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
389	mulpd	%xmm3, %xmm2 ;\
390	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
391	addpd	%xmm2, %xmm9 ;\
392	movapd	%xmm4, %xmm2 ;\
393	addpd	%xmm3, %xmm13 ;\
394	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
395	mulpd	%xmm5, %xmm4 ;\
396	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
397	addpd	%xmm4, %xmm10 ;\
398	addpd	%xmm5, %xmm14 ;\
399	mulpd	%xmm3, %xmm2 ;\
400	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
401	addpd	%xmm2, %xmm11 ;\
402	addpd	%xmm3, %xmm15 ;\
403 	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
404	movapd	%xmm0, %xmm2
405
406	PROLOGUE
407	PROFCODE
408
409	subq	$STACKSIZE, %rsp
410	movq	%rbx,   (%rsp)
411	movq	%rbp,  8(%rsp)
412	movq	%r12, 16(%rsp)
413	movq	%r13, 24(%rsp)
414	movq	%r14, 32(%rsp)
415	movq	%r15, 40(%rsp)
416
417#ifdef WINDOWS_ABI
418	movq	%rdi,    48(%rsp)
419	movq	%rsi,    56(%rsp)
420	movups	%xmm6,   64(%rsp)
421	movups	%xmm7,   80(%rsp)
422	movups	%xmm8,   96(%rsp)
423	movups	%xmm9,  112(%rsp)
424	movups	%xmm10, 128(%rsp)
425	movups	%xmm11, 144(%rsp)
426	movups	%xmm12, 160(%rsp)
427	movups	%xmm13, 176(%rsp)
428	movups	%xmm14, 192(%rsp)
429	movups	%xmm15, 208(%rsp)
430
431	movq	ARG1,      OLD_M
432	movq	ARG2,      OLD_N
433	movq	ARG3,      K
434	movq	OLD_A,     A
435	movq	OLD_B,     B
436	movq	OLD_C,     C
437	movq	OLD_LDC,   LDC
438	movsd	OLD_OFFSET, %xmm12
439#else
440	movq	STACKSIZE +  8(%rsp), LDC
441	movsd	STACKSIZE + 16(%rsp), %xmm12
442#endif
443
444	movq	OLD_M, M
445	movq	OLD_N, N
446
447	subq	$-16 * SIZE, A
448	subq	$-16 * SIZE, B
449
450	movsd	%xmm12, OFFSET
451	movsd	%xmm12, KK
452
453	leaq	(, LDC, SIZE), LDC
454
455#ifdef LN
456       leaq	(, M, SIZE), %rax
457       addq	%rax, C
458       imulq	K, %rax
459       addq	%rax, A
460#endif
461
462#ifdef RT
463       leaq	(, N, SIZE), %rax
464       imulq	K, %rax
465       addq	%rax, B
466       movq	N, %rax
467       imulq	LDC, %rax
468       addq	%rax, C
469#endif
470
471#ifdef RN
472	negq	KK
473#endif
474
475#ifdef RT
476       movq	N, %rax
477       subq	OFFSET, %rax
478       movq	%rax, KK
479#endif
480
481	testq	$1, N
482	je	.L40
483
484#if defined(LT) || defined(RN)
485	movq	A, AO
486#else
487	movq	A, AORIG
488#endif
489
490#ifdef RT
491	movq	K, %rax
492	salq	$0 + BASE_SHIFT, %rax
493	subq	%rax, B
494
495	subq	LDC, C
496#endif
497
498	movq	C, CO1			# coffset1 = c
499#ifndef RT
500	addq	LDC, C
501#endif
502
503#ifdef LN
504	movq	OFFSET, %rax
505	addq	M, %rax
506	movq	%rax, KK
507#endif
508
509#ifdef LT
510	movq	OFFSET, %rax
511	movq	%rax, KK
512#endif
513
514	movq	M,  I
515	sarq	$2, I	# i = (m >> 2)
516	jle	.L100
517	ALIGN_4
518
519.L91:
520#ifdef LN
521       movq	K, %rax
522       salq	$2 + BASE_SHIFT, %rax
523       subq	%rax, AORIG
524#endif
525
526#if defined(LN) || defined(RT)
527	movq	KK, %rax
528	movq	AORIG, AO
529	leaq	(, %rax, SIZE), %rax
530	leaq	(AO, %rax, 4), AO
531#endif
532
533	movq	B, BO
534
535#if defined(LN) || defined(RT)
536	movq	KK, %rax
537	leaq	(BO, %rax, SIZE), BO
538#endif
539
540	movapd	-16 * SIZE(AO), %xmm0
541	pxor	%xmm8, %xmm8
542	movapd	 -8 * SIZE(AO), %xmm2
543	pxor	%xmm9, %xmm9
544	movddup	-16 * SIZE(BO), %xmm1
545	pxor	%xmm10, %xmm10
546	movddup	-15 * SIZE(BO), %xmm5
547	pxor	%xmm11, %xmm11
548	movddup	-14 * SIZE(BO), %xmm3
549
550#ifndef LN
551	prefetchw      3 * SIZE(CO1)
552#else
553	prefetchw     -8 * SIZE(CO1)
554#endif
555
556#if defined(LT) || defined(RN)
557	movq	KK, %rax
558#else
559	movq	K, %rax
560	subq	KK, %rax
561#endif
562	andq	$-4, %rax
563	leaq	(, %rax, SIZE), %rax
564	leaq	(AO, %rax, 4), AO
565	leaq	(BO, %rax, 1), BO
566	negq	%rax
567	NOBRANCH
568	je	.L96
569	ALIGN_4
570
571.L92:
572	mulpd	%xmm1, %xmm0
573	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
574	addpd	%xmm0, %xmm8
575	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
576	addpd	%xmm1, %xmm9
577	movddup	-12 * SIZE(BO, %rax, 1), %xmm1
578	mulpd	%xmm5, %xmm0
579	mulpd	-10 * SIZE(AO, %rax, 4), %xmm5
580	addpd	%xmm0, %xmm10
581	movapd	  (AO, %rax, 4), %xmm0
582	addpd	%xmm5, %xmm11
583	movddup	-13 * SIZE(BO, %rax, 1), %xmm5
584	mulpd	%xmm3, %xmm2
585	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3
586	addpd	%xmm2, %xmm8
587	movapd	 -4 * SIZE(AO, %rax, 4), %xmm2
588	addpd	%xmm3, %xmm9
589	movddup	-10 * SIZE(BO, %rax, 1), %xmm3
590	mulpd	%xmm5, %xmm2
591	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5
592	addpd	%xmm2, %xmm10
593	movapd	  8 * SIZE(AO, %rax, 4), %xmm2
594	addpd	%xmm5, %xmm11
595	movddup	-11 * SIZE(BO, %rax, 1), %xmm5
596
597	addq	$4 * SIZE, %rax
598	BRANCH
599	jl	.L92
600	ALIGN_4
601
602.L96:
603#if defined(LT) || defined(RN)
604	movq	KK, %rax
605#else
606	movq	K, %rax
607	subq	KK, %rax
608#endif
609	andq	$3, %rax		# if (k & 1)
610	je .L99
611
612	leaq	(, %rax, SIZE), %rax
613	leaq	(AO, %rax, 4), AO
614	leaq	(BO, %rax, 1), BO
615	negq	%rax
616	ALIGN_4
617
618.L97:
619	mulpd	%xmm1, %xmm0
620	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
621	addpd	%xmm0, %xmm8
622	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
623	addpd	%xmm1, %xmm9
624	movddup	-15 * SIZE(BO, %rax, 1), %xmm1
625
626	addq	$SIZE, %rax
627	jl	.L97
628	ALIGN_4
629.L99:
630	addpd	%xmm10, %xmm8
631	addpd	%xmm11, %xmm9
632
633#if defined(LN) || defined(RT)
634	movq	KK, %rax
635#ifdef LN
636	subq	$4, %rax
637#else
638	subq	$1, %rax
639#endif
640
641	leaq	(, %rax, SIZE), %rax
642
643	movq	AORIG, AO
644	leaq	(AO, %rax, 4), AO
645	leaq	(B,  %rax, 1), BO
646#endif
647
648#if defined(LN) || defined(LT)
649	movapd	-16 * SIZE(BO), %xmm10
650	movapd	-14 * SIZE(BO), %xmm11
651
652	subpd	%xmm8,  %xmm10
653	subpd	%xmm9,  %xmm11
654#else
655	movapd	-16 * SIZE(AO), %xmm10
656	movapd	-14 * SIZE(AO), %xmm11
657
658	subpd	%xmm8, %xmm10
659	subpd	%xmm9, %xmm11
660#endif
661
662#ifdef LN
663	movapd	%xmm10, %xmm8
664        unpckhpd %xmm8, %xmm8
665
666	movapd	%xmm11, %xmm9
667        unpckhpd %xmm9, %xmm9
668
669	movsd	 -1 * SIZE(AO), %xmm12
670	mulsd	 %xmm12, %xmm9
671
672	movsd	 -2 * SIZE(AO), %xmm13
673	mulsd	 %xmm9, %xmm13
674	subsd	 %xmm13, %xmm11
675	movsd	 -3 * SIZE(AO), %xmm14
676	mulsd	 %xmm9, %xmm14
677	subsd	 %xmm14, %xmm8
678	movsd	 -4 * SIZE(AO), %xmm15
679	mulsd	 %xmm9, %xmm15
680	subsd	 %xmm15, %xmm10
681
682	movsd	 -6 * SIZE(AO), %xmm12
683	mulsd	 %xmm12, %xmm11
684
685	movsd	 -7 * SIZE(AO), %xmm13
686	mulsd	 %xmm11, %xmm13
687	subsd	 %xmm13, %xmm8
688	movsd	 -8 * SIZE(AO), %xmm14
689	mulsd	 %xmm11, %xmm14
690	subsd	 %xmm14, %xmm10
691
692	movsd	-11 * SIZE(AO), %xmm12
693	mulsd	 %xmm12, %xmm8
694
695	movsd	-12 * SIZE(AO), %xmm13
696	mulsd	 %xmm8, %xmm13
697	subsd	 %xmm13, %xmm10
698
699	movsd	-16 * SIZE(AO), %xmm12
700	mulsd	 %xmm12, %xmm10
701
702	unpcklpd %xmm8, %xmm10
703	unpcklpd %xmm9, %xmm11
704#endif
705
706#ifdef LT
707	movapd	%xmm10, %xmm8
708        unpckhpd %xmm8, %xmm8
709
710	movapd	%xmm11, %xmm9
711        unpckhpd %xmm9, %xmm9
712
713	movsd	-16 * SIZE(AO), %xmm12
714	mulsd	 %xmm12, %xmm10
715
716	movsd	-15 * SIZE(AO), %xmm13
717	mulsd	 %xmm10, %xmm13
718	subsd	 %xmm13, %xmm8
719	movsd	-14 * SIZE(AO), %xmm14
720	mulsd	 %xmm10, %xmm14
721	subsd	 %xmm14, %xmm11
722	movsd	-13 * SIZE(AO), %xmm15
723	mulsd	 %xmm10, %xmm15
724	subsd	 %xmm15, %xmm9
725
726	movsd	-11 * SIZE(AO), %xmm12
727	mulsd	 %xmm12, %xmm8
728
729	movsd	-10 * SIZE(AO), %xmm13
730	mulsd	 %xmm8, %xmm13
731	subsd	 %xmm13, %xmm11
732	movsd	 -9 * SIZE(AO), %xmm14
733	mulsd	 %xmm8, %xmm14
734	subsd	 %xmm14, %xmm9
735
736	movsd	 -6 * SIZE(AO), %xmm12
737	mulsd	 %xmm12, %xmm11
738
739	movsd	 -5 * SIZE(AO), %xmm13
740	mulsd	 %xmm11, %xmm13
741	subsd	 %xmm13, %xmm9
742
743	movsd	 -1 * SIZE(AO), %xmm12
744	mulsd	 %xmm12, %xmm9
745
746	unpcklpd %xmm8, %xmm10
747	unpcklpd %xmm9, %xmm11
748#endif
749
750#ifdef RN
751	movddup	-16 * SIZE(BO), %xmm8
752	mulpd	 %xmm8, %xmm10
753	mulpd	 %xmm8, %xmm11
754#endif
755
756#ifdef RT
757	movddup	-16 * SIZE(BO), %xmm8
758	mulpd	 %xmm8, %xmm10
759	mulpd	 %xmm8, %xmm11
760#endif
761
762#ifdef LN
763	subq	$4 * SIZE, CO1
764#endif
765
766	movlpd	%xmm10,  0 * SIZE(CO1)
767	movhpd	%xmm10,  1 * SIZE(CO1)
768	movlpd	%xmm11,  2 * SIZE(CO1)
769	movhpd	%xmm11,  3 * SIZE(CO1)
770
771#if defined(LN) || defined(LT)
772	movaps	%xmm10, -16 * SIZE(BO)
773	movaps	%xmm11, -14 * SIZE(BO)
774#else
775	movaps	%xmm10, -16 * SIZE(AO)
776	movaps	%xmm11, -14 * SIZE(AO)
777#endif
778
779#ifndef LN
780	addq	$4 * SIZE, CO1
781#endif
782
783#if defined(LT) || defined(RN)
784	movq	K,  %rax
785	subq	KK, %rax
786	leaq	(,%rax, SIZE), %rax
787	leaq	(AO, %rax, 4), AO
788	addq	%rax, BO
789#endif
790
791#ifdef LN
792	subq	$4, KK
793#endif
794
795#ifdef LT
796	addq	$4, KK
797#endif
798
799#ifdef RT
800       movq	K, %rax
801       salq	$2 + BASE_SHIFT, %rax
802       addq	%rax, AORIG
803#endif
804
805	decq	I			# i --
806	jg	.L91
807	ALIGN_4
808
809.L100:
810	testq	$2, M
811	je	.L110
812
813#ifdef LN
814      movq	K, %rax
815       salq	$1 + BASE_SHIFT, %rax
816       subq	%rax, AORIG
817#endif
818
819#if defined(LN) || defined(RT)
820	movq	KK, %rax
821	movq	AORIG, AO
822	leaq	(, %rax, SIZE), %rax
823	leaq	(AO, %rax, 2), AO
824#endif
825
826	movq	B, BO
827
828#if defined(LN) || defined(RT)
829	movq	KK, %rax
830	leaq	(BO, %rax, SIZE), BO
831#endif
832
833	movddup	-16 * SIZE(BO), %xmm0
834	pxor	%xmm8, %xmm8
835	movddup	-15 * SIZE(BO), %xmm1
836	pxor	%xmm9, %xmm9
837	movddup	-14 * SIZE(BO), %xmm2
838	pxor	%xmm10, %xmm10
839	movddup	-13 * SIZE(BO), %xmm3
840	pxor	%xmm11, %xmm11
841
842#if defined(LT) || defined(RN)
843	movq	KK, %rax
844#else
845	movq	K, %rax
846	subq	KK, %rax
847#endif
848	andq	$-4, %rax
849	leaq	(, %rax, SIZE), %rax
850	leaq	(AO, %rax, 2), AO
851	leaq	(BO, %rax, 1), BO
852	negq	%rax
853	NOBRANCH
854	je	.L106
855	ALIGN_4
856
857.L102:
858	mulpd	-16 * SIZE(AO, %rax, 2), %xmm0
859	addpd	%xmm0, %xmm8
860	movddup	-12 * SIZE(BO, %rax, 1), %xmm0
861
862	mulpd	-14 * SIZE(AO, %rax, 2), %xmm1
863	addpd	%xmm1, %xmm9
864	movddup	-11 * SIZE(BO, %rax, 1), %xmm1
865
866	mulpd	-12 * SIZE(AO, %rax, 2), %xmm2
867	addpd	%xmm2, %xmm10
868	movddup	-10 * SIZE(BO, %rax, 1), %xmm2
869
870	mulpd	-10 * SIZE(AO, %rax, 2), %xmm3
871	addpd	%xmm3, %xmm11
872	movddup	 -9 * SIZE(BO, %rax, 1), %xmm3
873
874	addq	$4 * SIZE, %rax
875	BRANCH
876	jl	.L102
877	ALIGN_4
878
879.L106:
880#if defined(LT) || defined(RN)
881	movq	KK, %rax
882#else
883	movq	K, %rax
884	subq	KK, %rax
885#endif
886	andq	$3, %rax		# if (k & 1)
887	je .L109
888
889	leaq	(, %rax, SIZE), %rax
890	leaq	(AO, %rax, 2), AO
891	leaq	(BO, %rax, 1), BO
892	negq	%rax
893	ALIGN_4
894
895.L107:
896	movddup	-16 * SIZE(BO, %rax, 1), %xmm0
897	mulpd	-16 * SIZE(AO, %rax, 2), %xmm0
898	addpd	%xmm0, %xmm8
899
900	addq	$SIZE, %rax
901	jl	.L107
902	ALIGN_4
903
904.L109:
905	addpd	%xmm9, %xmm8
906	addpd	%xmm11, %xmm10
907	addpd	%xmm10, %xmm8
908
909#if defined(LN) || defined(RT)
910	movq	KK, %rax
911#ifdef LN
912	subq	$2, %rax
913#else
914	subq	$1, %rax
915#endif
916
917	leaq	(, %rax, SIZE), %rax
918
919	movq	AORIG, AO
920	leaq	(AO, %rax, 2), AO
921	leaq	(B,  %rax, 1), BO
922#endif
923
924#if defined(LN) || defined(LT)
925	movapd	-16 * SIZE(BO), %xmm10
926	subpd	%xmm8,  %xmm10
927#else
928	movapd	-16 * SIZE(AO), %xmm10
929	subpd	%xmm8, %xmm10
930#endif
931
932#ifdef LN
933	movapd	%xmm10, %xmm8
934        unpckhpd %xmm8, %xmm8
935
936	movsd	-13 * SIZE(AO), %xmm12
937	mulsd	 %xmm12, %xmm8
938
939	movsd	-14 * SIZE(AO), %xmm13
940	mulsd	 %xmm8, %xmm13
941	subsd	 %xmm13, %xmm10
942
943	movsd	-16 * SIZE(AO), %xmm12
944	mulsd	 %xmm12, %xmm10
945
946	unpcklpd %xmm8, %xmm10
947#endif
948
949#ifdef LT
950	movapd	%xmm10, %xmm8
951        unpckhpd %xmm8, %xmm8
952
953	movsd	-16 * SIZE(AO), %xmm12
954	mulsd	 %xmm12, %xmm10
955
956	movsd	-15 * SIZE(AO), %xmm13
957	mulsd	 %xmm10, %xmm13
958	subsd	 %xmm13, %xmm8
959
960	movsd	-13 * SIZE(AO), %xmm12
961	mulsd	 %xmm12, %xmm8
962
963	unpcklpd %xmm8, %xmm10
964#endif
965
966#ifdef RN
967	movddup	-16 * SIZE(BO), %xmm8
968	mulpd	 %xmm8, %xmm10
969#endif
970
971#ifdef RT
972	movddup	-16 * SIZE(BO), %xmm8
973	mulpd	 %xmm8, %xmm10
974#endif
975
976#ifdef LN
977	subq	$2 * SIZE, CO1
978#endif
979
980#if defined(LN) || defined(LT)
981	movlpd	%xmm10,  0 * SIZE(CO1)
982	movhpd	%xmm10,  1 * SIZE(CO1)
983#else
984	movlpd	%xmm10,  0 * SIZE(CO1)
985	movhpd	%xmm10,  1 * SIZE(CO1)
986#endif
987
988#if defined(LN) || defined(LT)
989	movaps	%xmm10, -16 * SIZE(BO)
990#else
991	movaps	%xmm10, -16 * SIZE(AO)
992#endif
993
994#ifndef LN
995	addq	$2 * SIZE, CO1
996#endif
997
998#if defined(LT) || defined(RN)
999	movq	K,  %rax
1000	subq	KK, %rax
1001	leaq	(,%rax, SIZE), %rax
1002	leaq	(AO, %rax, 2), AO
1003	addq	%rax, BO
1004#endif
1005
1006#ifdef LN
1007	subq	$2, KK
1008#endif
1009
1010#ifdef LT
1011	addq	$2, KK
1012#endif
1013
1014#ifdef RT
1015       movq	K, %rax
1016       salq	$1 + BASE_SHIFT, %rax
1017       addq	%rax, AORIG
1018#endif
1019	ALIGN_4
1020
1021.L110:
1022	testq	$1, M
1023	je	.L119
1024
1025#ifdef LN
1026       movq	K, %rax
1027       salq	$0 + BASE_SHIFT, %rax
1028       subq	%rax, AORIG
1029#endif
1030
1031#if defined(LN) || defined(RT)
1032	movq	KK, %rax
1033	movq	AORIG, AO
1034	leaq	(, %rax, SIZE), %rax
1035	leaq	(AO, %rax, 1), AO
1036#endif
1037
1038	movq	B, BO
1039
1040#if defined(LN) || defined(RT)
1041	movq	KK, %rax
1042	leaq	(BO, %rax, SIZE), BO
1043#endif
1044
1045	movapd	-16 * SIZE(AO), %xmm0
1046	pxor	%xmm8, %xmm8
1047	movapd	-14 * SIZE(AO), %xmm1
1048	pxor	%xmm9, %xmm9
1049
1050#if defined(LT) || defined(RN)
1051	movq	KK, %rax
1052#else
1053	movq	K, %rax
1054	subq	KK, %rax
1055#endif
1056	andq	$-4, %rax
1057	leaq	(, %rax, SIZE), %rax
1058	leaq	(AO, %rax, 1), AO
1059	leaq	(BO, %rax, 1), BO
1060	negq	%rax
1061	NOBRANCH
1062	je	.L116
1063	ALIGN_4
1064
1065.L112:
1066	mulpd	-16 * SIZE(BO, %rax, 1), %xmm0
1067	addpd	%xmm0, %xmm8
1068	movapd	-12 * SIZE(AO, %rax, 1), %xmm0
1069
1070	mulpd	-14 * SIZE(BO, %rax, 1), %xmm1
1071	addpd	%xmm1, %xmm9
1072	movapd	-10 * SIZE(AO, %rax, 1), %xmm1
1073
1074	addq	$4 * SIZE, %rax
1075	BRANCH
1076	jl	.L112
1077	ALIGN_4
1078
1079.L116:
1080#if defined(LT) || defined(RN)
1081	movq	KK, %rax
1082#else
1083	movq	K, %rax
1084	subq	KK, %rax
1085#endif
1086	andq	$3, %rax		# if (k & 1)
1087	je .L118
1088
1089	leaq	(, %rax, SIZE), %rax
1090	leaq	(AO, %rax, 1), AO
1091	leaq	(BO, %rax, 1), BO
1092	negq	%rax
1093	ALIGN_4
1094
1095.L117:
1096	mulsd	-16 * SIZE(BO, %rax, 1), %xmm0
1097	addsd	%xmm0, %xmm8
1098	movsd	-15 * SIZE(AO, %rax, 1), %xmm0
1099
1100	addq	$SIZE, %rax
1101	jl	.L117
1102	ALIGN_4
1103
1104.L118:
1105	addpd	%xmm9, %xmm8
1106	haddpd	%xmm8, %xmm8
1107
1108#if defined(LN) || defined(RT)
1109	movq	KK, %rax
1110#ifdef LN
1111	subq	$1, %rax
1112#else
1113	subq	$1, %rax
1114#endif
1115
1116	leaq	(, %rax, SIZE), %rax
1117
1118	movq	AORIG, AO
1119	leaq	(AO, %rax, 1), AO
1120	leaq	(B,  %rax, 1), BO
1121#endif
1122
1123#if defined(LN) || defined(LT)
1124	movsd	-16 * SIZE(BO), %xmm10
1125	subsd	%xmm8,  %xmm10
1126#else
1127	movsd	-16 * SIZE(AO), %xmm10
1128	subsd	%xmm8, %xmm10
1129#endif
1130
1131#if defined(LN) || defined(LT)
1132	movsd	-16 * SIZE(AO), %xmm12
1133	mulsd	 %xmm12, %xmm10
1134#endif
1135
1136#if defined(RN) || defined(RT)
1137	movsd	-16 * SIZE(BO), %xmm8
1138	mulsd	 %xmm8, %xmm10
1139#endif
1140
1141#ifdef LN
1142	subq	$1 * SIZE, CO1
1143#endif
1144
1145	movsd	%xmm10,  0 * SIZE(CO1)
1146
1147#if defined(LN) || defined(LT)
1148	movlpd	%xmm10, -16 * SIZE(BO)
1149#else
1150	movlpd	%xmm10, -16 * SIZE(AO)
1151#endif
1152
1153#ifndef LN
1154	addq	$1 * SIZE, CO1
1155#endif
1156
1157#if defined(LT) || defined(RN)
1158	movq	K,  %rax
1159	subq	KK, %rax
1160	leaq	(,%rax, SIZE), %rax
1161	addq	%rax, AO
1162	addq	%rax, BO
1163#endif
1164
1165#ifdef LN
1166	subq	$1, KK
1167#endif
1168
1169#ifdef LT
1170	addq	$1, KK
1171#endif
1172
1173#ifdef RT
1174       movq	K, %rax
1175       salq	$0 + BASE_SHIFT, %rax
1176       addq	%rax, AORIG
1177#endif
1178	ALIGN_4
1179
1180.L119:
1181#ifdef LN
1182       leaq	(B, K, SIZE), B
1183#endif
1184
1185#if defined(LT) || defined(RN)
1186	movq	BO, B
1187#endif
1188
1189#ifdef RN
1190	addq	$1, KK
1191#endif
1192
1193#ifdef RT
1194	subq	$1, KK
1195#endif
1196	ALIGN_4
1197
1198.L40:
1199	testq	$2, N
1200	je	.L80
1201
1202#if defined(LT) || defined(RN)
1203	movq	A, AO
1204#else
1205	movq	A, AORIG
1206#endif
1207
1208#ifdef RT
1209       movq	K, %rax
1210       salq	$1 + BASE_SHIFT, %rax
1211       subq	%rax, B
1212
1213       leaq	(, LDC, 2), %rax
1214       subq	%rax, C
1215#endif
1216
1217	movq	C, CO1			# coffset1 = c
1218	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
1219#ifndef RT
1220	leaq	(C, LDC, 2), C
1221#endif
1222
1223#ifdef LN
1224	movq	OFFSET, %rax
1225	addq	M, %rax
1226	movq	%rax, KK
1227#endif
1228
1229#if defined(LT)
1230	movq	OFFSET, %rax
1231	movq	%rax, KK
1232#endif
1233
1234	movq	M,  I
1235	sarq	$2, I	# i = (m >> 2)
1236	jle	.L60
1237	ALIGN_4
1238
1239.L51:
1240#ifdef LN
1241       movq	K, %rax
1242       salq	$2 + BASE_SHIFT, %rax
1243       subq	%rax, AORIG
1244#endif
1245
1246#if defined(LN) || defined(RT)
1247	movq	KK, %rax
1248	movq	AORIG, AO
1249	leaq	(, %rax, SIZE), %rax
1250	leaq	(AO, %rax, 4), AO
1251#endif
1252
1253	movq	B, BO
1254
1255#if defined(LN) || defined(RT)
1256	movq	KK, %rax
1257	leaq	(, %rax, SIZE), %rax
1258	leaq	(BO, %rax, 2), BO
1259#endif
1260
1261	movddup	-16 * SIZE(BO), %xmm1
1262	movddup	-15 * SIZE(BO), %xmm5
1263	pxor	%xmm8, %xmm8
1264	movddup	-12 * SIZE(BO), %xmm3
1265	pxor	%xmm9, %xmm9
1266	movapd	-16 * SIZE(AO), %xmm0
1267	pxor	%xmm12, %xmm12
1268	movapd	 -8 * SIZE(AO), %xmm4
1269	pxor	%xmm13, %xmm13
1270
1271#ifndef LN
1272	prefetchw      3 * SIZE(CO1)
1273	movapd	%xmm0, %xmm2
1274	prefetchw      5 * SIZE(CO2)
1275#else
1276	prefetchw     -4 * SIZE(CO1)
1277	movapd	%xmm0, %xmm2
1278	prefetchw     -4 * SIZE(CO2)
1279#endif
1280
1281
1282#if defined(LT) || defined(RN)
1283	movq	KK, %rax
1284#else
1285	movq	K, %rax
1286	subq	KK, %rax
1287#endif
1288	andq	$-4, %rax
1289	leaq	(, %rax, SIZE), %rax
1290	leaq	(AO, %rax, 4), AO
1291	leaq	(BO, %rax, 2), BO
1292	negq	%rax
1293	NOBRANCH
1294	je	.L56
1295	ALIGN_4
1296
1297.L52:
1298	mulpd	%xmm1, %xmm0
1299	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
1300	addpd	%xmm0, %xmm8
1301	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
1302	addpd	%xmm1, %xmm12
1303	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
1304	mulpd	%xmm5, %xmm2
1305	mulpd	-14 * SIZE(AO, %rax, 4), %xmm5
1306	addpd	%xmm2, %xmm9
1307	addpd	%xmm5, %xmm13
1308	movddup	-13 * SIZE(BO, %rax, 2), %xmm5
1309	movapd	%xmm0, %xmm2
1310	mulpd	%xmm1, %xmm0
1311	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1
1312	addpd	%xmm0, %xmm8
1313	movapd	  (AO, %rax, 4), %xmm0
1314	addpd	%xmm1, %xmm12
1315	movddup	 -8 * SIZE(BO, %rax, 2), %xmm1
1316	mulpd	%xmm5, %xmm2
1317	mulpd	-10 * SIZE(AO, %rax, 4), %xmm5
1318	addpd	%xmm2, %xmm9
1319	addpd	%xmm5, %xmm13
1320	movddup	-11 * SIZE(BO, %rax, 2), %xmm5
1321	movapd	%xmm4, %xmm2
1322	mulpd	%xmm3, %xmm4
1323	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3
1324	addpd	%xmm4, %xmm8
1325	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4
1326	addpd	%xmm3, %xmm12
1327	movddup	-10 * SIZE(BO, %rax, 2), %xmm3
1328	mulpd	%xmm5, %xmm2
1329	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5
1330	addpd	%xmm2, %xmm9
1331	addpd	%xmm5, %xmm13
1332	movddup	 -9 * SIZE(BO, %rax, 2), %xmm5
1333	movapd	%xmm4, %xmm2
1334	mulpd	%xmm3, %xmm4
1335	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3
1336	addpd	%xmm4, %xmm8
1337	movapd	  8 * SIZE(AO, %rax, 4), %xmm4
1338	addpd	%xmm3, %xmm12
1339	movddup	 -4 * SIZE(BO, %rax, 2), %xmm3
1340	mulpd	%xmm5, %xmm2
1341	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5
1342	addpd	%xmm2, %xmm9
1343	addpd	%xmm5, %xmm13
1344	movddup	 -7 * SIZE(BO, %rax, 2), %xmm5
1345	movapd	%xmm0, %xmm2
1346
1347	addq	$4 * SIZE, %rax
1348	BRANCH
1349	jl	.L52
1350	ALIGN_4
1351
1352.L56:
1353#if defined(LT) || defined(RN)
1354	movq	KK, %rax
1355#else
1356	movq	K, %rax
1357	subq	KK, %rax
1358#endif
1359	andq	$3, %rax		# if (k & 1)
1360	je .L59
1361
1362	leaq	(, %rax, SIZE), %rax
1363	leaq	(AO, %rax, 4), AO
1364	leaq	(BO, %rax, 2), BO
1365	negq	%rax
1366	ALIGN_4
1367
1368.L57:
1369	mulpd	%xmm1, %xmm0
1370	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
1371	addpd	%xmm0, %xmm8
1372	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
1373	addpd	%xmm1, %xmm12
1374	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
1375	mulpd	%xmm5, %xmm2
1376	mulpd	-14 * SIZE(AO, %rax, 4), %xmm5
1377	addpd	%xmm2, %xmm9
1378	addpd	%xmm5, %xmm13
1379	movddup	-13 * SIZE(BO, %rax, 2), %xmm5
1380	movapd	%xmm0, %xmm2
1381
1382	addq	$SIZE, %rax
1383	jl	.L57
1384	ALIGN_4
1385
1386.L59:
1387#if defined(LN) || defined(RT)
1388	movq	KK, %rax
1389#ifdef LN
1390	subq	$4, %rax
1391#else
1392	subq	$2, %rax
1393#endif
1394
1395	leaq	(, %rax, SIZE), %rax
1396
1397	movq	AORIG, AO
1398	leaq	(AO, %rax, 4), AO
1399	leaq	(B,  %rax, 2), BO
1400#endif
1401
1402#if defined(LN) || defined(LT)
1403	movapd	%xmm8, %xmm0
1404	unpcklpd %xmm9, %xmm8
1405	unpckhpd %xmm9, %xmm0
1406
1407	movapd	%xmm12, %xmm4
1408	unpcklpd %xmm13, %xmm12
1409	unpckhpd %xmm13, %xmm4
1410
1411	movapd	-16 * SIZE(BO), %xmm9
1412	movapd	-14 * SIZE(BO), %xmm13
1413	movapd	-12 * SIZE(BO), %xmm1
1414	movapd	-10 * SIZE(BO), %xmm5
1415
1416	subpd	%xmm8,  %xmm9
1417	subpd	%xmm0,  %xmm13
1418	subpd	%xmm12, %xmm1
1419	subpd	%xmm4,  %xmm5
1420#else
1421	movapd	-16 * SIZE(AO), %xmm0
1422	movapd	-14 * SIZE(AO), %xmm1
1423	movapd	-12 * SIZE(AO), %xmm2
1424	movapd	-10 * SIZE(AO), %xmm3
1425
1426	subpd	%xmm8, %xmm0
1427	subpd	%xmm12, %xmm1
1428	subpd	%xmm9, %xmm2
1429	subpd	%xmm13, %xmm3
1430#endif
1431
1432#ifdef LN
1433	movddup	 -1 * SIZE(AO), %xmm8
1434	mulpd	 %xmm8, %xmm5
1435	movddup	 -2 * SIZE(AO), %xmm10
1436	mulpd	 %xmm5, %xmm10
1437	subpd	 %xmm10, %xmm1
1438	movddup	 -3 * SIZE(AO), %xmm12
1439	mulpd	 %xmm5, %xmm12
1440	subpd	 %xmm12, %xmm13
1441	movddup	 -4 * SIZE(AO), %xmm14
1442	mulpd	 %xmm5, %xmm14
1443	subpd	 %xmm14, %xmm9
1444
1445	movddup	 -6 * SIZE(AO), %xmm8
1446	mulpd	 %xmm8, %xmm1
1447	movddup	 -7 * SIZE(AO), %xmm10
1448	mulpd	 %xmm1, %xmm10
1449	subpd	 %xmm10, %xmm13
1450	movddup	 -8 * SIZE(AO), %xmm12
1451	mulpd	 %xmm1, %xmm12
1452	subpd	 %xmm12, %xmm9
1453
1454	movddup	-11 * SIZE(AO), %xmm8
1455	mulpd	 %xmm8, %xmm13
1456	movddup	-12 * SIZE(AO), %xmm10
1457	mulpd	 %xmm13, %xmm10
1458	subpd	 %xmm10, %xmm9
1459
1460	movddup	-16 * SIZE(AO), %xmm8
1461	mulpd	 %xmm8, %xmm9
1462#endif
1463
1464#ifdef LT
1465	movddup -16 * SIZE(AO), %xmm8
1466	mulpd	 %xmm8, %xmm9
1467	movddup	-15 * SIZE(AO), %xmm10
1468	mulpd	 %xmm9, %xmm10
1469	subpd	 %xmm10, %xmm13
1470	movddup	-14 * SIZE(AO), %xmm12
1471	mulpd	 %xmm9, %xmm12
1472	subpd	 %xmm12, %xmm1
1473	movddup	-13 * SIZE(AO), %xmm14
1474	mulpd	 %xmm9, %xmm14
1475	subpd	 %xmm14, %xmm5
1476
1477
1478	movddup	-11 * SIZE(AO), %xmm8
1479	mulpd	 %xmm8, %xmm13
1480
1481	movddup	-10 * SIZE(AO), %xmm10
1482	mulpd	 %xmm13, %xmm10
1483	subpd	 %xmm10, %xmm1
1484	movddup	 -9 * SIZE(AO), %xmm12
1485	mulpd	 %xmm13, %xmm12
1486	subpd	 %xmm12, %xmm5
1487
1488	movddup	 -6 * SIZE(AO), %xmm8
1489	mulpd	 %xmm8, %xmm1
1490	movddup	 -5 * SIZE(AO), %xmm10
1491	mulpd	 %xmm1, %xmm10
1492	subpd	 %xmm10, %xmm5
1493
1494	movddup	 -1 * SIZE(AO), %xmm8
1495	mulpd	 %xmm8, %xmm5
1496#endif
1497
1498#ifdef RN
1499	movddup	-16 * SIZE(BO), %xmm8
1500	mulpd	 %xmm8, %xmm0
1501	mulpd	 %xmm8, %xmm1
1502
1503	movddup	-15 * SIZE(BO), %xmm9
1504	mulpd	 %xmm0, %xmm9
1505	subpd	 %xmm9, %xmm2
1506	movddup	-15 * SIZE(BO), %xmm9
1507	mulpd	 %xmm1, %xmm9
1508	subpd	 %xmm9, %xmm3
1509
1510	movddup	-13 * SIZE(BO), %xmm8
1511	mulpd	 %xmm8, %xmm2
1512	mulpd	 %xmm8, %xmm3
1513#endif
1514
1515#ifdef RT
1516	movddup	-13 * SIZE(BO), %xmm8
1517	mulpd	 %xmm8, %xmm2
1518	mulpd	 %xmm8, %xmm3
1519
1520	movddup	-14 * SIZE(BO), %xmm9
1521	mulpd	 %xmm2, %xmm9
1522	subpd	 %xmm9, %xmm0
1523	movddup	-14 * SIZE(BO), %xmm9
1524	mulpd	 %xmm3, %xmm9
1525	subpd	 %xmm9, %xmm1
1526
1527	movddup	-16 * SIZE(BO), %xmm8
1528	mulpd	 %xmm8, %xmm0
1529	mulpd	 %xmm8, %xmm1
1530#endif
1531
1532#ifdef LN
1533	subq	$4 * SIZE, CO1
1534	subq	$4 * SIZE, CO2
1535#endif
1536
1537#if defined(LN) || defined(LT)
1538	movlpd	%xmm9,  0 * SIZE(CO1)
1539	movlpd	%xmm13, 1 * SIZE(CO1)
1540	movlpd	%xmm1,  2 * SIZE(CO1)
1541	movlpd	%xmm5,  3 * SIZE(CO1)
1542
1543	movhpd	%xmm9,  0 * SIZE(CO2)
1544	movhpd	%xmm13, 1 * SIZE(CO2)
1545	movhpd	%xmm1,  2 * SIZE(CO2)
1546	movhpd	%xmm5,  3 * SIZE(CO2)
1547#else
1548	movlpd	%xmm0,  0 * SIZE(CO1)
1549	movhpd	%xmm0,  1 * SIZE(CO1)
1550	movlpd	%xmm1,  2 * SIZE(CO1)
1551	movhpd	%xmm1,  3 * SIZE(CO1)
1552
1553	movlpd	%xmm2,  0 * SIZE(CO2)
1554	movhpd	%xmm2,  1 * SIZE(CO2)
1555	movlpd	%xmm3,  2 * SIZE(CO2)
1556	movhpd	%xmm3,  3 * SIZE(CO2)
1557#endif
1558
1559#if defined(LN) || defined(LT)
1560	movaps	%xmm9, -16 * SIZE(BO)
1561	movaps	%xmm13,-14 * SIZE(BO)
1562	movaps	%xmm1, -12 * SIZE(BO)
1563	movaps	%xmm5, -10 * SIZE(BO)
1564#else
1565	movaps	%xmm0, -16 * SIZE(AO)
1566	movaps	%xmm1, -14 * SIZE(AO)
1567	movaps	%xmm2, -12 * SIZE(AO)
1568	movaps	%xmm3, -10 * SIZE(AO)
1569#endif
1570
1571#ifndef LN
1572	addq	$4 * SIZE, CO1
1573	addq	$4 * SIZE, CO2
1574#endif
1575
1576#if defined(LT) || defined(RN)
1577	movq	K,  %rax
1578	subq	KK, %rax
1579	leaq	(,%rax, SIZE), %rax
1580	leaq	(AO, %rax, 4), AO
1581	leaq	(BO, %rax, 2), BO
1582#endif
1583
1584#ifdef LN
1585	subq	$4, KK
1586#endif
1587
1588#ifdef LT
1589	addq	$4, KK
1590#endif
1591
1592#ifdef RT
1593       movq	K, %rax
1594       salq	$2 + BASE_SHIFT, %rax
1595       addq	%rax, AORIG
1596#endif
1597
1598	decq	I			# i --
1599	jg	.L51
1600	ALIGN_4
1601
1602.L60:
1603	testq	$2, M
1604	je	.L70
1605
1606#ifdef LN
1607       movq	K, %rax
1608       salq	$1 + BASE_SHIFT, %rax
1609       subq	%rax, AORIG
1610#endif
1611
1612#if defined(LN) || defined(RT)
1613	movq	KK, %rax
1614	movq	AORIG, AO
1615	leaq	(, %rax, SIZE), %rax
1616	leaq	(AO, %rax, 2), AO
1617#endif
1618
1619	movq	B, BO
1620
1621#if defined(LN) || defined(RT)
1622	movq	KK, %rax
1623	leaq	(, %rax, SIZE), %rax
1624	leaq	(BO, %rax, 2), BO
1625#endif
1626
1627	movapd	-16 * SIZE(AO), %xmm0
1628	pxor	%xmm8, %xmm8
1629	movapd	-12 * SIZE(AO), %xmm2
1630	pxor	%xmm9, %xmm9
1631	movddup	-16 * SIZE(BO), %xmm1
1632	pxor	%xmm10, %xmm10
1633	movddup	-15 * SIZE(BO), %xmm3
1634	pxor	%xmm11, %xmm11
1635
1636#if defined(LT) || defined(RN)
1637	movq	KK, %rax
1638#else
1639	movq	K, %rax
1640	subq	KK, %rax
1641#endif
1642	andq	$-4, %rax
1643	leaq	(, %rax, SIZE), %rax
1644	leaq	(AO, %rax, 2), AO
1645	leaq	(BO, %rax, 2), BO
1646	negq	%rax
1647	NOBRANCH
1648	je	.L66
1649	ALIGN_4
1650
1651.L62:
1652	mulpd	%xmm0, %xmm1
1653	addpd	%xmm1, %xmm8
1654	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
1655	mulpd	%xmm0, %xmm3
1656	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
1657	addpd	%xmm3, %xmm9
1658	movddup	-13 * SIZE(BO, %rax, 2), %xmm3
1659	mulpd	%xmm0, %xmm1
1660	addpd	%xmm1, %xmm10
1661	movddup	-12 * SIZE(BO, %rax, 2), %xmm1
1662	mulpd	%xmm0, %xmm3
1663	movapd	 -8 * SIZE(AO, %rax, 2), %xmm0
1664	addpd	%xmm3, %xmm11
1665	movddup	-11 * SIZE(BO, %rax, 2), %xmm3
1666	mulpd	%xmm2, %xmm1
1667	addpd	%xmm1, %xmm8
1668	movddup	-10 * SIZE(BO, %rax, 2), %xmm1
1669	mulpd	%xmm2, %xmm3
1670	movapd	-10 * SIZE(AO, %rax, 2), %xmm2
1671	addpd	%xmm3, %xmm9
1672	movddup	 -9 * SIZE(BO, %rax, 2), %xmm3
1673	mulpd	%xmm2, %xmm1
1674	addpd	%xmm1, %xmm10
1675	movddup	 -8 * SIZE(BO, %rax, 2), %xmm1
1676	mulpd	%xmm2, %xmm3
1677	movapd	 -4 * SIZE(AO, %rax, 2), %xmm2
1678	addpd	%xmm3, %xmm11
1679	movddup	 -7 * SIZE(BO, %rax, 2), %xmm3
1680
1681	addq	$4 * SIZE, %rax
1682	BRANCH
1683	jl	.L62
1684	ALIGN_4
1685
1686.L66:
1687#if defined(LT) || defined(RN)
1688	movq	KK, %rax
1689#else
1690	movq	K, %rax
1691	subq	KK, %rax
1692#endif
1693	andq	$3, %rax		# if (k & 1)
1694	je .L69
1695
1696	leaq	(, %rax, SIZE), %rax
1697	leaq	(AO, %rax, 2), AO
1698	leaq	(BO, %rax, 2), BO
1699	negq	%rax
1700	ALIGN_4
1701
1702.L67:
1703	mulpd	%xmm0, %xmm1
1704	addpd	%xmm1, %xmm8
1705	movddup	-14 * SIZE(BO, %rax, 2), %xmm1
1706	mulpd	%xmm0, %xmm3
1707	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
1708	addpd	%xmm3, %xmm9
1709	movddup	-13 * SIZE(BO, %rax, 2), %xmm3
1710
1711	addq	$SIZE, %rax
1712	jl	.L67
1713	ALIGN_4
1714
1715.L69:
1716	addpd	%xmm10, %xmm8
1717	addpd	%xmm11, %xmm9
1718
1719#if defined(LN) || defined(RT)
1720	movq	KK, %rax
1721#ifdef LN
1722	subq	$2, %rax
1723#else
1724	subq	$2, %rax
1725#endif
1726
1727	leaq	(, %rax, SIZE), %rax
1728
1729	movq	AORIG, AO
1730	leaq	(AO, %rax, 2), AO
1731	leaq	(B,  %rax, 2), BO
1732#endif
1733
1734#if defined(LN) || defined(LT)
1735	movapd	%xmm8, %xmm0
1736	unpcklpd %xmm9, %xmm8
1737	unpckhpd %xmm9, %xmm0
1738
1739	movapd	-16 * SIZE(BO), %xmm9
1740	movapd	-14 * SIZE(BO), %xmm13
1741
1742	subpd	%xmm8,  %xmm9
1743	subpd	%xmm0,  %xmm13
1744#else
1745	movapd	-16 * SIZE(AO), %xmm0
1746	movapd	-14 * SIZE(AO), %xmm2
1747
1748	subpd	%xmm8, %xmm0
1749	subpd	%xmm9, %xmm2
1750#endif
1751
1752
1753#ifdef LN
1754	movddup	-13 * SIZE(AO), %xmm8
1755	mulpd	 %xmm8, %xmm13
1756
1757	movddup	-14 * SIZE(AO), %xmm10
1758	mulpd	 %xmm13, %xmm10
1759	subpd	 %xmm10, %xmm9
1760
1761	movddup	-16 * SIZE(AO), %xmm8
1762	mulpd	 %xmm8, %xmm9
1763#endif
1764
1765#ifdef LT
1766	movddup	-16 * SIZE(AO), %xmm8
1767	mulpd	 %xmm8, %xmm9
1768
1769	movddup	-15 * SIZE(AO), %xmm10
1770	mulpd	 %xmm9, %xmm10
1771	subpd	 %xmm10, %xmm13
1772
1773	movddup	-13 * SIZE(AO), %xmm8
1774	mulpd	 %xmm8, %xmm13
1775#endif
1776
1777#ifdef RN
1778	movddup	-16 * SIZE(BO), %xmm8
1779	mulpd	 %xmm8, %xmm0
1780
1781	movddup	-15 * SIZE(BO), %xmm9
1782	mulpd	 %xmm0, %xmm9
1783	subpd	 %xmm9, %xmm2
1784
1785	movddup	-13 * SIZE(BO), %xmm8
1786	mulpd	 %xmm8, %xmm2
1787#endif
1788
1789#ifdef RT
1790	movddup	-13 * SIZE(BO), %xmm8
1791	mulpd	 %xmm8, %xmm2
1792
1793	movddup	-14 * SIZE(BO), %xmm9
1794	mulpd	 %xmm2, %xmm9
1795	subpd	 %xmm9, %xmm0
1796
1797	movddup	-16 * SIZE(BO), %xmm8
1798	mulpd	 %xmm8, %xmm0
1799#endif
1800
1801#ifdef LN
1802	subq	$2 * SIZE, CO1
1803	subq	$2 * SIZE, CO2
1804#endif
1805
1806#if defined(LN) || defined(LT)
1807	movlpd	%xmm9,   0 * SIZE(CO1)
1808	movlpd	%xmm13,  1 * SIZE(CO1)
1809
1810	movhpd	%xmm9,   0 * SIZE(CO2)
1811	movhpd	%xmm13,  1 * SIZE(CO2)
1812#else
1813	movlpd	%xmm0,   0 * SIZE(CO1)
1814	movhpd	%xmm0,   1 * SIZE(CO1)
1815
1816	movlpd	%xmm2,   0 * SIZE(CO2)
1817	movhpd	%xmm2,   1 * SIZE(CO2)
1818#endif
1819
1820#if defined(LN) || defined(LT)
1821	movaps	%xmm9,  -16 * SIZE(BO)
1822	movaps	%xmm13, -14 * SIZE(BO)
1823#else
1824	movaps	%xmm0,  -16 * SIZE(AO)
1825	movaps	%xmm2,  -14 * SIZE(AO)
1826#endif
1827
1828#ifndef LN
1829	addq	$2 * SIZE, CO1
1830	addq	$2 * SIZE, CO2
1831#endif
1832
1833#if defined(LT) || defined(RN)
1834	movq	K,  %rax
1835	subq	KK, %rax
1836	leaq	(,%rax, SIZE), %rax
1837	leaq	(AO, %rax, 2), AO
1838	leaq	(BO, %rax, 2), BO
1839#endif
1840
1841#ifdef LN
1842	subq	$2, KK
1843#endif
1844
1845#ifdef LT
1846	addq	$2, KK
1847#endif
1848
1849#ifdef RT
1850	movq	K, %rax
1851	salq	$1 + BASE_SHIFT, %rax
1852	addq	%rax, AORIG
1853#endif
1854	ALIGN_4
1855
1856.L70:
1857	testq	$1, M
1858	je	.L79
1859	ALIGN_4
1860
1861.L71:
1862#ifdef LN
1863       movq	K, %rax
1864       salq	$0 + BASE_SHIFT, %rax
1865       subq	%rax, AORIG
1866#endif
1867
1868#if defined(LN) || defined(RT)
1869	movq	KK, %rax
1870	movq	AORIG, AO
1871	leaq	(, %rax, SIZE), %rax
1872	leaq	(AO, %rax, 1), AO
1873#endif
1874
1875	movq	B, BO
1876
1877#if defined(LN) || defined(RT)
1878	movq	KK, %rax
1879	salq	$1 + BASE_SHIFT, %rax
1880	leaq	(BO, %rax, 1), BO
1881#endif
1882
1883	movddup	-16 * SIZE(AO), %xmm0
1884	pxor	%xmm8, %xmm8
1885	movddup	-15 * SIZE(AO), %xmm1
1886	pxor	%xmm9, %xmm9
1887	movddup	-14 * SIZE(AO), %xmm2
1888	pxor	%xmm10, %xmm10
1889	movddup	-13 * SIZE(AO), %xmm3
1890	pxor	%xmm11, %xmm11
1891
1892#if defined(LT) || defined(RN)
1893	movq	KK, %rax
1894#else
1895	movq	K, %rax
1896	subq	KK, %rax
1897#endif
1898	andq	$-4, %rax
1899	leaq	(, %rax, SIZE), %rax
1900	leaq	(AO, %rax, 1), AO
1901	leaq	(BO, %rax, 2), BO
1902	negq	%rax
1903	NOBRANCH
1904	je	.L76
1905	ALIGN_4
1906
1907.L72:
1908	mulpd	-16 * SIZE(BO, %rax, 2), %xmm0
1909	addpd	%xmm0, %xmm8
1910	movddup	-12 * SIZE(AO, %rax, 1), %xmm0
1911
1912	mulpd	-14 * SIZE(BO, %rax, 2), %xmm1
1913	addpd	%xmm1, %xmm9
1914	movddup	-11 * SIZE(AO, %rax, 1), %xmm1
1915
1916	mulpd	-12 * SIZE(BO, %rax, 2), %xmm2
1917	addpd	%xmm2, %xmm10
1918	movddup	-10 * SIZE(AO, %rax, 1), %xmm2
1919
1920	mulpd	-10 * SIZE(BO, %rax, 2), %xmm3
1921	addpd	%xmm3, %xmm11
1922	movddup	 -9 * SIZE(AO, %rax, 1), %xmm3
1923
1924	addq	$4 * SIZE, %rax
1925	BRANCH
1926	jl	.L72
1927	ALIGN_4
1928
1929.L76:
1930#if defined(LT) || defined(RN)
1931	movq	KK, %rax
1932#else
1933	movq	K, %rax
1934	subq	KK, %rax
1935#endif
1936	andq	$3, %rax		# if (k & 1)
1937	je .L78
1938
1939	leaq	(, %rax, SIZE), %rax
1940	leaq	(AO, %rax, 1), AO
1941	leaq	(BO, %rax, 2), BO
1942	negq	%rax
1943	ALIGN_4
1944
1945.L77:
1946	mulpd	-16 * SIZE(BO, %rax, 2), %xmm0
1947	addpd	%xmm0, %xmm8
1948	movddup	-15 * SIZE(AO, %rax, 1), %xmm0
1949
1950	addq	$SIZE, %rax
1951	jl	.L77
1952	ALIGN_4
1953
1954.L78:
1955	addpd	%xmm9,  %xmm8
1956	addpd	%xmm11, %xmm10
1957	addpd	%xmm10, %xmm8
1958
1959#if defined(LN) || defined(RT)
1960	movq	KK, %rax
1961#ifdef LN
1962	subq	$1, %rax
1963#else
1964	subq	$2, %rax
1965#endif
1966
1967	leaq	(, %rax, SIZE), %rax
1968
1969	movq	AORIG, AO
1970	leaq	(AO, %rax, 1), AO
1971	leaq	(B,  %rax, 2), BO
1972#endif
1973
1974#if defined(LN) || defined(LT)
1975	movapd	-16 * SIZE(BO), %xmm2
1976#else
1977	movapd	-16 * SIZE(AO), %xmm2
1978#endif
1979
1980	subpd	%xmm8,  %xmm2
1981
1982#if defined(LN) || defined(LT)
1983	movddup	-16 * SIZE(AO), %xmm0
1984
1985	mulpd	 %xmm0, %xmm2
1986#endif
1987
1988#ifdef RN
1989	movapd	%xmm2,  %xmm0
1990        unpckhpd %xmm0, %xmm0
1991
1992	mulsd	-16 * SIZE(BO), %xmm2
1993	movsd	-15 * SIZE(BO), %xmm4
1994	mulsd	 %xmm2, %xmm4
1995	subsd	 %xmm4, %xmm0
1996
1997	mulsd	-13 * SIZE(BO), %xmm0
1998	unpcklpd %xmm0, %xmm2
1999#endif
2000
2001#ifdef RT
2002	movapd	%xmm2,  %xmm0
2003        unpckhpd %xmm0, %xmm0
2004
2005	mulsd	-13 * SIZE(BO), %xmm0
2006
2007	movlpd	-14 * SIZE(BO), %xmm4
2008	mulsd	 %xmm0, %xmm4
2009	subsd	 %xmm4, %xmm2
2010
2011	mulsd	-16 * SIZE(BO), %xmm2
2012	unpcklpd %xmm0, %xmm2
2013#endif
2014
2015#ifdef LN
2016	subq	$1 * SIZE, CO1
2017	subq	$1 * SIZE, CO2
2018#endif
2019
2020	movlpd	%xmm2,  0 * SIZE(CO1)
2021	movhpd	%xmm2,  0 * SIZE(CO2)
2022
2023#if defined(LN) || defined(LT)
2024	movaps	%xmm2, -16 * SIZE(BO)
2025#else
2026	movaps	%xmm2, -16 * SIZE(AO)
2027#endif
2028
2029#ifndef LN
2030	addq	$1 * SIZE, CO1
2031	addq	$1 * SIZE, CO2
2032#endif
2033
2034#if defined(LT) || defined(RN)
2035	movq	K,  %rax
2036	subq	KK, %rax
2037	leaq	(,%rax, SIZE), %rax
2038	leaq	(AO, %rax, 1), AO
2039	leaq	(BO, %rax, 2), BO
2040#endif
2041
2042#ifdef LN
2043	subq	$1, KK
2044#endif
2045
2046#ifdef LT
2047	addq	$1, KK
2048#endif
2049
2050#ifdef RT
2051	movq	K, %rax
2052	salq	$0 + BASE_SHIFT, %rax
2053	addq	%rax, AORIG
2054#endif
2055	ALIGN_4
2056
2057.L79:
2058#ifdef LN
2059       leaq	(, K, SIZE), %rax
2060       leaq	(B, %rax, 2), B
2061#endif
2062
2063#if defined(LT) || defined(RN)
2064	movq	BO, B
2065#endif
2066
2067#ifdef RN
2068	addq	$2, KK
2069#endif
2070
2071#ifdef RT
2072	subq	$2, KK
2073#endif
2074	ALIGN_4
2075
2076.L80:
2077	movq	N,  J
2078	sarq	$2, J		# j = (n >> 2)
2079	jle	.L999
2080
2081.L01:
2082#if defined(LT) || defined(RN)
2083	movq	A, AO
2084#else
2085	movq	A, AORIG
2086#endif
2087
2088#ifdef RT
2089       movq	K, %rax
2090       salq	$2 + BASE_SHIFT, %rax
2091       subq	%rax, B
2092
2093       leaq	(, LDC, 4), %rax
2094       subq	%rax, C
2095#endif
2096
2097	movq	C, CO1			# coffset1 = c
2098	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
2099#ifndef RT
2100	leaq	(C, LDC, 4), C
2101#endif
2102
2103#ifdef LN
2104	movq	OFFSET, %rax
2105	addq	M, %rax
2106	movq	%rax, KK
2107#endif
2108
2109	movq	K, %rax
2110	salq	$BASE_SHIFT + 2, %rax
2111	movq	B, BB
2112	subq	%rax, BB
2113
2114#if defined(LT)
2115	movq	OFFSET, %rax
2116	movq	%rax, KK
2117#endif
2118
2119	movq	M,  I
2120	sarq	$2, I	# i = (m >> 2)
2121	jle	.L20
2122	ALIGN_4
2123
2124.L11:
2125#ifdef LN
2126       movq	K, %rax
2127       salq	$2 + BASE_SHIFT, %rax
2128       subq	%rax, AORIG
2129#endif
2130
2131#if defined(LN) || defined(RT)
2132	movq	KK, %rax
2133	movq	AORIG, AO
2134	leaq	(, %rax, SIZE), %rax
2135	leaq	(AO, %rax, 4), AO
2136#endif
2137
2138	movq	B, BO
2139
2140#if defined(LN) || defined(RT)
2141	movq	KK, %rax
2142	leaq	(, %rax, SIZE), %rax
2143	leaq	(BO, %rax, 4), BO
2144#endif
2145
2146	movapd	-16 * SIZE(AO), %xmm0
2147	movddup	-16 * SIZE(BO), %xmm1
2148	pxor	%xmm8, %xmm8
2149 	movddup	-15 * SIZE(BO), %xmm3
2150	pxor	%xmm9, %xmm9
2151	movapd	 -8 * SIZE(AO), %xmm4
2152	pxor	%xmm10, %xmm10
2153	movddup	 -8 * SIZE(BO), %xmm5
2154	pxor	%xmm11, %xmm11
2155
2156#ifndef LN
2157	prefetchw      3 * SIZE(CO1)
2158	pxor	%xmm12, %xmm12
2159	prefetchw      5 * SIZE(CO2)
2160	pxor	%xmm13, %xmm13
2161	prefetchw      3 * SIZE(CO1, LDC, 2)
2162	pxor	%xmm14, %xmm14
2163	prefetchw      5 * SIZE(CO2, LDC, 2)
2164	pxor	%xmm15, %xmm15
2165	movapd	%xmm0, %xmm2
2166#else
2167	prefetchw     -8 * SIZE(CO1)
2168	pxor	%xmm12, %xmm12
2169	prefetchw     -8 * SIZE(CO2)
2170	pxor	%xmm13, %xmm13
2171	prefetchw     -8 * SIZE(CO1, LDC, 2)
2172	pxor	%xmm14, %xmm14
2173	prefetchw     -8 * SIZE(CO2, LDC, 2)
2174	pxor	%xmm15, %xmm15
2175	movapd	%xmm0, %xmm2
2176#endif
2177
2178	prefetch	 -16 * SIZE(BB)
2179	prefetch	  -8 * SIZE(BB)
2180	subq		 $-16 * SIZE, BB
2181
2182#if defined(LT) || defined(RN)
2183	movq	KK, %rax
2184#else
2185	movq	K, %rax
2186	subq	KK, %rax
2187#endif
2188
2189	andq	$-8, %rax
2190	leaq	(, %rax, SIZE), %rax
2191	leaq	(AO, %rax, 4), AO
2192	leaq	(BO, %rax, 4), BO
2193	negq	%rax
2194	NOBRANCH
2195	je	.L15
2196	ALIGN_4
2197
2198.L12:
2199	KERNEL1(16 *  0)
2200	KERNEL2(16 *  0)
2201	KERNEL3(16 *  0)
2202	KERNEL4(16 *  0)
2203	KERNEL5(16 *  0)
2204	KERNEL6(16 *  0)
2205	KERNEL7(16 *  0)
2206	KERNEL8(16 *  0)
2207	BRANCH
2208	jl	.L12
2209	ALIGN_4
2210
2211.L15:
2212#if defined(LT) || defined(RN)
2213	movq	KK, %rax
2214#else
2215	movq	K, %rax
2216	subq	KK, %rax
2217#endif
2218	testq	$4, %rax
2219	je .L16
2220	xorq	%rax, %rax
2221	ALIGN_4
2222
2223	KERNEL_SUB1(16 *  0)
2224	KERNEL_SUB2(16 *  0)
2225	KERNEL_SUB3(16 *  0)
2226	KERNEL_SUB4(16 *  0)
2227
2228	subq	$-16 * SIZE, BO
2229	subq	$-16 * SIZE, AO
2230	ALIGN_4
2231
2232.L16:
2233#if defined(LT) || defined(RN)
2234	movq	KK, %rax
2235#else
2236	movq	K, %rax
2237	subq	KK, %rax
2238#endif
2239	andq	$3, %rax		# if (k & 1)
2240	je .L19
2241
2242	leaq	(, %rax, SIZE), %rax
2243	leaq	(AO, %rax, 4), AO
2244	leaq	(BO, %rax, 4), BO
2245	negq	%rax
2246	ALIGN_4
2247
2248.L17:
2249	mulpd	%xmm1, %xmm0
2250	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
2251	addpd	%xmm0, %xmm8
2252	movapd	%xmm2, %xmm0
2253	addpd	%xmm1, %xmm12
2254	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
2255	mulpd	%xmm3, %xmm2
2256	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3
2257	addpd	%xmm2, %xmm9
2258	movapd	%xmm0, %xmm2
2259	addpd	%xmm3, %xmm13
2260	movddup	-13 * SIZE(BO, %rax, 4), %xmm3
2261	mulpd	%xmm1, %xmm0
2262	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1
2263	addpd	%xmm0, %xmm10
2264	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
2265	addpd	%xmm1, %xmm14
2266	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
2267	mulpd	%xmm3, %xmm2
2268	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3
2269	addpd	%xmm2, %xmm11
2270	addpd	%xmm3, %xmm15
2271 	movddup	-11 * SIZE(BO, %rax, 4), %xmm3
2272	movapd	%xmm0, %xmm2
2273
2274	addq	$SIZE, %rax
2275	jl	.L17
2276	ALIGN_4
2277
2278.L19:
2279#if defined(LN) || defined(RT)
2280	movq	KK, %rax
2281#ifdef LN
2282	subq	$4, %rax
2283#else
2284	subq	$4, %rax
2285#endif
2286
2287	leaq	(, %rax, SIZE), %rax
2288
2289	movq	AORIG, AO
2290	leaq	(AO, %rax, 4), AO
2291	leaq	(B,  %rax, 4), BO
2292#endif
2293
2294#if defined(LN) || defined(LT)
2295	movapd	%xmm8, %xmm0
2296	unpcklpd %xmm9, %xmm8
2297	unpckhpd %xmm9, %xmm0
2298
2299	movapd	%xmm10, %xmm2
2300	unpcklpd %xmm11, %xmm10
2301	unpckhpd %xmm11, %xmm2
2302
2303	movapd	%xmm12, %xmm4
2304	unpcklpd %xmm13, %xmm12
2305	unpckhpd %xmm13, %xmm4
2306
2307	movapd	%xmm14, %xmm6
2308	unpcklpd %xmm15, %xmm14
2309	unpckhpd %xmm15, %xmm6
2310
2311	movapd	-16 * SIZE(BO), %xmm9
2312	movapd	-14 * SIZE(BO), %xmm11
2313	movapd	-12 * SIZE(BO), %xmm13
2314	movapd	-10 * SIZE(BO), %xmm15
2315	movapd	 -8 * SIZE(BO), %xmm1
2316	movapd	 -6 * SIZE(BO), %xmm3
2317	movapd	 -4 * SIZE(BO), %xmm5
2318	movapd	 -2 * SIZE(BO), %xmm7
2319
2320	subpd	%xmm8,  %xmm9
2321	subpd	%xmm10, %xmm11
2322	subpd	%xmm0,  %xmm13
2323	subpd	%xmm2,  %xmm15
2324	subpd	%xmm12, %xmm1
2325	subpd	%xmm14, %xmm3
2326	subpd	%xmm4,  %xmm5
2327	subpd	%xmm6,  %xmm7
2328#else
2329	movapd	-16 * SIZE(AO), %xmm0
2330	movapd	-14 * SIZE(AO), %xmm1
2331	movapd	-12 * SIZE(AO), %xmm2
2332	movapd	-10 * SIZE(AO), %xmm3
2333
2334	movapd	 -8 * SIZE(AO), %xmm4
2335	movapd	 -6 * SIZE(AO), %xmm5
2336	movapd	 -4 * SIZE(AO), %xmm6
2337	movapd	 -2 * SIZE(AO), %xmm7
2338
2339	subpd	%xmm8,  %xmm0
2340	subpd	%xmm12, %xmm1
2341	subpd	%xmm9,  %xmm2
2342	subpd	%xmm13, %xmm3
2343	subpd	%xmm10, %xmm4
2344	subpd	%xmm14, %xmm5
2345	subpd	%xmm11, %xmm6
2346	subpd	%xmm15, %xmm7
2347#endif
2348
2349#ifdef LN
2350	movddup	 -1 * SIZE(AO), %xmm8
2351	mulpd	 %xmm8, %xmm5
2352	mulpd	 %xmm8, %xmm7
2353
2354	movddup	 -2 * SIZE(AO), %xmm10
2355	mulpd	 %xmm5, %xmm10
2356	subpd	 %xmm10, %xmm1
2357	movddup	 -2 * SIZE(AO), %xmm10
2358	mulpd	 %xmm7, %xmm10
2359	subpd	 %xmm10, %xmm3
2360
2361	movddup	 -3 * SIZE(AO), %xmm12
2362	mulpd	 %xmm5, %xmm12
2363	subpd	 %xmm12, %xmm13
2364	movddup	 -3 * SIZE(AO), %xmm12
2365	mulpd	 %xmm7, %xmm12
2366	subpd	 %xmm12, %xmm15
2367
2368	movddup	 -4 * SIZE(AO), %xmm14
2369	mulpd	 %xmm5, %xmm14
2370	subpd	 %xmm14, %xmm9
2371	movddup	 -4 * SIZE(AO), %xmm14
2372	mulpd	 %xmm7, %xmm14
2373	subpd	 %xmm14, %xmm11
2374
2375	movddup	 -6 * SIZE(AO), %xmm8
2376	mulpd	 %xmm8, %xmm1
2377	mulpd	 %xmm8, %xmm3
2378
2379	movddup	 -7 * SIZE(AO), %xmm10
2380	mulpd	 %xmm1, %xmm10
2381	subpd	 %xmm10, %xmm13
2382	movddup	 -7 * SIZE(AO), %xmm10
2383	mulpd	 %xmm3, %xmm10
2384	subpd	 %xmm10, %xmm15
2385
2386	movddup	 -8 * SIZE(AO), %xmm12
2387	mulpd	 %xmm1, %xmm12
2388	subpd	 %xmm12, %xmm9
2389	movddup	 -8 * SIZE(AO), %xmm12
2390	mulpd	 %xmm3, %xmm12
2391	subpd	 %xmm12, %xmm11
2392
2393	movddup	-11 * SIZE(AO), %xmm8
2394	mulpd	 %xmm8, %xmm13
2395	mulpd	 %xmm8, %xmm15
2396
2397	movddup	-12 * SIZE(AO), %xmm10
2398	mulpd	 %xmm13, %xmm10
2399	subpd	 %xmm10, %xmm9
2400	movddup	-12 * SIZE(AO), %xmm10
2401	mulpd	 %xmm15, %xmm10
2402	subpd	 %xmm10, %xmm11
2403
2404	movddup	-16 * SIZE(AO), %xmm8
2405	mulpd	 %xmm8, %xmm9
2406	mulpd	 %xmm8, %xmm11
2407#endif
2408
2409#ifdef LT
2410	movddup -16 * SIZE(AO), %xmm8
2411	mulpd	 %xmm8, %xmm9
2412	mulpd	 %xmm8, %xmm11
2413
2414	movddup	-15 * SIZE(AO), %xmm10
2415	mulpd	 %xmm9, %xmm10
2416	subpd	 %xmm10, %xmm13
2417
2418	movddup	-15 * SIZE(AO), %xmm10
2419	mulpd	 %xmm11, %xmm10
2420	subpd	 %xmm10, %xmm15
2421
2422	movddup	-14 * SIZE(AO), %xmm12
2423	mulpd	 %xmm9, %xmm12
2424	subpd	 %xmm12, %xmm1
2425	movddup	-14 * SIZE(AO), %xmm12
2426	mulpd	 %xmm11, %xmm12
2427	subpd	 %xmm12, %xmm3
2428
2429	movddup	-13 * SIZE(AO), %xmm14
2430	mulpd	 %xmm9, %xmm14
2431	subpd	 %xmm14, %xmm5
2432	movddup	-13 * SIZE(AO), %xmm14
2433	mulpd	 %xmm11, %xmm14
2434	subpd	 %xmm14, %xmm7
2435
2436	movddup	-11 * SIZE(AO), %xmm8
2437	mulpd	 %xmm8, %xmm13
2438	mulpd	 %xmm8, %xmm15
2439
2440	movddup	-10 * SIZE(AO), %xmm10
2441	mulpd	 %xmm13, %xmm10
2442	subpd	 %xmm10, %xmm1
2443	movddup	-10 * SIZE(AO), %xmm10
2444	mulpd	 %xmm15, %xmm10
2445	subpd	 %xmm10, %xmm3
2446
2447	movddup	 -9 * SIZE(AO), %xmm12
2448	mulpd	 %xmm13, %xmm12
2449	subpd	 %xmm12, %xmm5
2450	movddup	 -9 * SIZE(AO), %xmm12
2451	mulpd	 %xmm15, %xmm12
2452	subpd	 %xmm12, %xmm7
2453
2454	movddup	 -6 * SIZE(AO), %xmm8
2455	mulpd	 %xmm8, %xmm1
2456	mulpd	 %xmm8, %xmm3
2457
2458	movddup	 -5 * SIZE(AO), %xmm10
2459	mulpd	 %xmm1, %xmm10
2460	subpd	 %xmm10, %xmm5
2461	movddup	 -5 * SIZE(AO), %xmm10
2462	mulpd	 %xmm3, %xmm10
2463	subpd	 %xmm10, %xmm7
2464
2465	movddup	 -1 * SIZE(AO), %xmm8
2466	mulpd	 %xmm8, %xmm5
2467	mulpd	 %xmm8, %xmm7
2468#endif
2469
2470#ifdef RN
2471	movddup	-16 * SIZE(BO), %xmm8
2472	mulpd	 %xmm8, %xmm0
2473	mulpd	 %xmm8, %xmm1
2474
2475	movddup	-15 * SIZE(BO), %xmm9
2476	mulpd	 %xmm0, %xmm9
2477	subpd	 %xmm9, %xmm2
2478	movddup	-15 * SIZE(BO), %xmm9
2479	mulpd	 %xmm1, %xmm9
2480	subpd	 %xmm9, %xmm3
2481
2482	movddup	-14 * SIZE(BO), %xmm10
2483	mulpd	 %xmm0, %xmm10
2484	subpd	 %xmm10, %xmm4
2485	movddup	-14 * SIZE(BO), %xmm10
2486	mulpd	 %xmm1, %xmm10
2487	subpd	 %xmm10, %xmm5
2488
2489	movddup	 -13 * SIZE(BO), %xmm11
2490	mulpd	 %xmm0, %xmm11
2491	subpd	 %xmm11, %xmm6
2492	movddup	 -13 * SIZE(BO), %xmm11
2493	mulpd	 %xmm1, %xmm11
2494	subpd	 %xmm11, %xmm7
2495
2496	movddup	 -11 * SIZE(BO), %xmm8
2497	mulpd	 %xmm8, %xmm2
2498	mulpd	 %xmm8, %xmm3
2499
2500	movddup	 -10 * SIZE(BO), %xmm9
2501	mulpd	 %xmm2, %xmm9
2502	subpd	 %xmm9, %xmm4
2503	movddup	 -10 * SIZE(BO), %xmm9
2504	mulpd	 %xmm3, %xmm9
2505	subpd	 %xmm9, %xmm5
2506
2507	movddup	  -9 * SIZE(BO), %xmm10
2508	mulpd	 %xmm2, %xmm10
2509	subpd	 %xmm10, %xmm6
2510	movddup	  -9 * SIZE(BO), %xmm10
2511	mulpd	 %xmm3, %xmm10
2512	subpd	 %xmm10, %xmm7
2513
2514	movddup	 -6 * SIZE(BO), %xmm8
2515	mulpd	 %xmm8, %xmm4
2516	mulpd	 %xmm8, %xmm5
2517
2518	movddup	 -5 * SIZE(BO), %xmm9
2519	mulpd	 %xmm4, %xmm9
2520	subpd	 %xmm9, %xmm6
2521	movddup	 -5 * SIZE(BO), %xmm9
2522	mulpd	 %xmm5, %xmm9
2523	subpd	 %xmm9, %xmm7
2524
2525	movddup	 -1 * SIZE(BO), %xmm8
2526	mulpd	 %xmm8, %xmm6
2527	mulpd	 %xmm8, %xmm7
2528#endif
2529
2530#ifdef RT
2531	movddup	 -1 * SIZE(BO), %xmm8
2532	mulpd	 %xmm8, %xmm6
2533	mulpd	 %xmm8, %xmm7
2534
2535	movddup	 -2 * SIZE(BO), %xmm9
2536	mulpd	 %xmm6, %xmm9
2537	subpd	 %xmm9, %xmm4
2538	movddup	 -2 * SIZE(BO), %xmm9
2539	mulpd	 %xmm7, %xmm9
2540	subpd	 %xmm9, %xmm5
2541
2542	movddup	 -3 * SIZE(BO), %xmm10
2543	mulpd	 %xmm6, %xmm10
2544	subpd	 %xmm10, %xmm2
2545	movddup	 -3 * SIZE(BO), %xmm10
2546	mulpd	 %xmm7, %xmm10
2547	subpd	 %xmm10, %xmm3
2548
2549	movddup	 -4 * SIZE(BO), %xmm11
2550	mulpd	 %xmm6, %xmm11
2551	subpd	 %xmm11, %xmm0
2552	movddup	 -4 * SIZE(BO), %xmm11
2553	mulpd	 %xmm7, %xmm11
2554	subpd	 %xmm11, %xmm1
2555
2556	movddup	 -6 * SIZE(BO), %xmm8
2557	mulpd	 %xmm8, %xmm4
2558	mulpd	 %xmm8, %xmm5
2559
2560	movddup	 -7 * SIZE(BO), %xmm9
2561	mulpd	 %xmm4, %xmm9
2562	subpd	 %xmm9, %xmm2
2563	movddup	 -7 * SIZE(BO), %xmm9
2564	mulpd	 %xmm5, %xmm9
2565	subpd	 %xmm9, %xmm3
2566
2567	movddup	 -8 * SIZE(BO), %xmm10
2568	mulpd	 %xmm4, %xmm10
2569	subpd	 %xmm10, %xmm0
2570	movddup	 -8 * SIZE(BO), %xmm10
2571	mulpd	 %xmm5, %xmm10
2572	subpd	 %xmm10, %xmm1
2573
2574	movddup	-11 * SIZE(BO), %xmm8
2575	mulpd	 %xmm8, %xmm2
2576	mulpd	 %xmm8, %xmm3
2577
2578	movddup	-12 * SIZE(BO), %xmm9
2579	mulpd	 %xmm2, %xmm9
2580	subpd	 %xmm9, %xmm0
2581	movddup	-12 * SIZE(BO), %xmm9
2582	mulpd	 %xmm3, %xmm9
2583	subpd	 %xmm9, %xmm1
2584
2585	movddup	-16 * SIZE(BO), %xmm8
2586	mulpd	 %xmm8, %xmm0
2587	mulpd	 %xmm8, %xmm1
2588#endif
2589
2590#ifdef LN
2591	subq	$4 * SIZE, CO1
2592	subq	$4 * SIZE, CO2
2593#endif
2594
2595#if defined(LN) || defined(LT)
2596	movlpd	%xmm9,  0 * SIZE(CO1)
2597	movlpd	%xmm13, 1 * SIZE(CO1)
2598	movlpd	%xmm1,  2 * SIZE(CO1)
2599	movlpd	%xmm5,  3 * SIZE(CO1)
2600
2601	movhpd	%xmm9,  0 * SIZE(CO2)
2602	movhpd	%xmm13, 1 * SIZE(CO2)
2603	movhpd	%xmm1,  2 * SIZE(CO2)
2604	movhpd	%xmm5,  3 * SIZE(CO2)
2605
2606	movlpd	%xmm11, 0 * SIZE(CO1, LDC, 2)
2607	movlpd	%xmm15, 1 * SIZE(CO1, LDC, 2)
2608	movlpd	%xmm3,  2 * SIZE(CO1, LDC, 2)
2609	movlpd	%xmm7,  3 * SIZE(CO1, LDC, 2)
2610
2611	movhpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
2612	movhpd	%xmm15, 1 * SIZE(CO2, LDC, 2)
2613	movhpd	%xmm3,  2 * SIZE(CO2, LDC, 2)
2614	movhpd	%xmm7,  3 * SIZE(CO2, LDC, 2)
2615#else
2616	movlpd	%xmm0,  0 * SIZE(CO1)
2617	movhpd	%xmm0,  1 * SIZE(CO1)
2618	movlpd	%xmm1,  2 * SIZE(CO1)
2619	movhpd	%xmm1,  3 * SIZE(CO1)
2620
2621	movlpd	%xmm2,  0 * SIZE(CO2)
2622	movhpd	%xmm2,  1 * SIZE(CO2)
2623	movlpd	%xmm3,  2 * SIZE(CO2)
2624	movhpd	%xmm3,  3 * SIZE(CO2)
2625
2626	movlpd	%xmm4,  0 * SIZE(CO1, LDC, 2)
2627	movhpd	%xmm4,  1 * SIZE(CO1, LDC, 2)
2628	movlpd	%xmm5,  2 * SIZE(CO1, LDC, 2)
2629	movhpd	%xmm5,  3 * SIZE(CO1, LDC, 2)
2630
2631	movlpd	%xmm6,  0 * SIZE(CO2, LDC, 2)
2632	movhpd	%xmm6,  1 * SIZE(CO2, LDC, 2)
2633	movlpd	%xmm7,  2 * SIZE(CO2, LDC, 2)
2634	movhpd	%xmm7,  3 * SIZE(CO2, LDC, 2)
2635#endif
2636
2637#if defined(LN) || defined(LT)
2638	movaps	%xmm9,  -16 * SIZE(BO)
2639	movaps	%xmm11, -14 * SIZE(BO)
2640	movaps	%xmm13, -12 * SIZE(BO)
2641	movaps	%xmm15, -10 * SIZE(BO)
2642	movaps	%xmm1,   -8 * SIZE(BO)
2643	movaps	%xmm3,   -6 * SIZE(BO)
2644	movaps	%xmm5,   -4 * SIZE(BO)
2645	movaps	%xmm7,   -2 * SIZE(BO)
2646#else
2647	movaps	%xmm0,  -16 * SIZE(AO)
2648	movaps	%xmm1,  -14 * SIZE(AO)
2649	movaps	%xmm2,  -12 * SIZE(AO)
2650	movaps	%xmm3,  -10 * SIZE(AO)
2651	movaps	%xmm4,   -8 * SIZE(AO)
2652	movaps	%xmm5,   -6 * SIZE(AO)
2653	movaps	%xmm6,   -4 * SIZE(AO)
2654	movaps	%xmm7,   -2 * SIZE(AO)
2655#endif
2656
2657#ifndef LN
2658	addq	$4 * SIZE, CO1
2659	addq	$4 * SIZE, CO2
2660#endif
2661
2662#if defined(LT) || defined(RN)
2663	movq	K,  %rax
2664	subq	KK, %rax
2665	leaq	(,%rax, SIZE), %rax
2666	leaq	(AO, %rax, 4), AO
2667	leaq	(BO, %rax, 4), BO
2668#endif
2669
2670#ifdef LN
2671	subq	$4, KK
2672#endif
2673
2674#ifdef LT
2675	addq	$4, KK
2676#endif
2677
2678#ifdef RT
2679	movq	K, %rax
2680	salq	$2 + BASE_SHIFT, %rax
2681	addq	%rax, AORIG
2682#endif
2683
2684	decq	I			# i --
2685	jg	.L11
2686	ALIGN_4
2687
2688.L20:
2689	testq	$3, M
2690	je	.L39
2691
2692	testq	$2, M
2693	je	.L30
2694	ALIGN_4
2695
2696.L21:
2697#ifdef LN
2698       movq	K, %rax
2699       salq	$1 + BASE_SHIFT, %rax
2700       subq	%rax, AORIG
2701#endif
2702
2703#if defined(LN) || defined(RT)
2704	movq	KK, %rax
2705	movq	AORIG, AO
2706	leaq	(, %rax, SIZE), %rax
2707	leaq	(AO, %rax, 2), AO
2708#endif
2709
2710	movq	B, BO
2711
2712#if defined(LN) || defined(RT)
2713	movq	KK, %rax
2714	leaq	(, %rax, SIZE), %rax
2715	leaq	(BO, %rax, 4), BO
2716#endif
2717
2718	movapd	-16 * SIZE(AO), %xmm0
2719	pxor	%xmm8, %xmm8
2720	movapd	-12 * SIZE(AO), %xmm2
2721	pxor	%xmm9, %xmm9
2722	movddup	-16 * SIZE(BO), %xmm1
2723	pxor	%xmm10, %xmm10
2724	movddup	-15 * SIZE(BO), %xmm5
2725	pxor	%xmm11, %xmm11
2726	movddup	 -8 * SIZE(BO), %xmm3
2727
2728#if defined(LT) || defined(RN)
2729	movq	KK, %rax
2730#else
2731	movq	K, %rax
2732	subq	KK, %rax
2733#endif
2734	andq	$-4, %rax
2735	leaq	(, %rax, SIZE), %rax
2736	leaq	(AO, %rax, 2), AO
2737	leaq	(BO, %rax, 4), BO
2738	negq	%rax
2739	NOBRANCH
2740	je	.L26
2741	ALIGN_4
2742
2743.L22:
2744	mulpd	%xmm0, %xmm1
2745	addpd	%xmm1, %xmm8
2746	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
2747	mulpd	%xmm0, %xmm5
2748	addpd	%xmm5, %xmm9
2749	movddup	-13 * SIZE(BO, %rax, 4), %xmm5
2750	mulpd	%xmm0, %xmm1
2751	addpd	%xmm1, %xmm10
2752	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
2753	mulpd	%xmm0, %xmm5
2754	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
2755	addpd	%xmm5, %xmm11
2756	movddup	-11 * SIZE(BO, %rax, 4), %xmm5
2757	mulpd	%xmm0, %xmm1
2758	addpd	%xmm1, %xmm8
2759	movddup	-10 * SIZE(BO, %rax, 4), %xmm1
2760	mulpd	%xmm0, %xmm5
2761	addpd	%xmm5, %xmm9
2762	movddup	 -9 * SIZE(BO, %rax, 4), %xmm5
2763	mulpd	%xmm0, %xmm1
2764	addpd	%xmm1, %xmm10
2765	movddup	  (BO, %rax, 4), %xmm1
2766	mulpd	%xmm0, %xmm5
2767	movapd	 -8 * SIZE(AO, %rax, 2), %xmm0
2768	addpd	%xmm5, %xmm11
2769	movddup	 -7 * SIZE(BO, %rax, 4), %xmm5
2770	mulpd	%xmm2, %xmm3
2771	addpd	%xmm3, %xmm8
2772	movddup	 -6 * SIZE(BO, %rax, 4), %xmm3
2773	mulpd	%xmm2, %xmm5
2774	addpd	%xmm5, %xmm9
2775	movddup	 -5 * SIZE(BO, %rax, 4), %xmm5
2776	mulpd	%xmm2, %xmm3
2777	addpd	%xmm3, %xmm10
2778	movddup	 -4 * SIZE(BO, %rax, 4), %xmm3
2779	mulpd	%xmm2, %xmm5
2780	movapd	-10 * SIZE(AO, %rax, 2), %xmm2
2781	addpd	%xmm5, %xmm11
2782	movddup	 -3 * SIZE(BO, %rax, 4), %xmm5
2783	mulpd	%xmm2, %xmm3
2784	addpd	%xmm3, %xmm8
2785	movddup	 -2 * SIZE(BO, %rax, 4), %xmm3
2786	mulpd	%xmm2, %xmm5
2787	addpd	%xmm5, %xmm9
2788	movddup	 -1 * SIZE(BO, %rax, 4), %xmm5
2789	mulpd	%xmm2, %xmm3
2790	addpd	%xmm3, %xmm10
2791	movddup	  8 * SIZE(BO, %rax, 4), %xmm3
2792	mulpd	%xmm2, %xmm5
2793	movapd	 -4 * SIZE(AO, %rax, 2), %xmm2
2794	addpd	%xmm5, %xmm11
2795	movddup	  1 * SIZE(BO, %rax, 4), %xmm5
2796
2797	addq	$4 * SIZE, %rax
2798	BRANCH
2799	jl	.L22
2800	ALIGN_4
2801
2802.L26:
2803#if defined(LT) || defined(RN)
2804	movq	KK, %rax
2805#else
2806	movq	K, %rax
2807	subq	KK, %rax
2808#endif
2809	andq	$3, %rax		# if (k & 1)
2810	je .L29
2811
2812	leaq	(, %rax, SIZE), %rax
2813	leaq	(AO, %rax, 2), AO
2814	leaq	(BO, %rax, 4), BO
2815	negq	%rax
2816	ALIGN_4
2817
2818.L27:
2819	mulpd	%xmm0, %xmm1
2820	addpd	%xmm1, %xmm8
2821	movddup	-14 * SIZE(BO, %rax, 4), %xmm1
2822	mulpd	%xmm0, %xmm5
2823	addpd	%xmm5, %xmm9
2824	movddup	-13 * SIZE(BO, %rax, 4), %xmm5
2825	mulpd	%xmm0, %xmm1
2826	addpd	%xmm1, %xmm10
2827	movddup	-12 * SIZE(BO, %rax, 4), %xmm1
2828	mulpd	%xmm0, %xmm5
2829	movapd	-14 * SIZE(AO, %rax, 2), %xmm0
2830	addpd	%xmm5, %xmm11
2831	movddup	-11 * SIZE(BO, %rax, 4), %xmm5
2832
2833	addq	$SIZE, %rax
2834	jl	.L27
2835	ALIGN_4
2836
2837.L29:
2838#if defined(LN) || defined(RT)
2839	movq	KK, %rax
2840#ifdef LN
2841	subq	$2, %rax
2842#else
2843	subq	$4, %rax
2844#endif
2845
2846	leaq	(, %rax, SIZE), %rax
2847
2848	movq	AORIG, AO
2849	leaq	(AO, %rax, 2), AO
2850	leaq	(B,  %rax, 4), BO
2851#endif
2852
2853#if defined(LN) || defined(LT)
2854	movapd	 %xmm8,  %xmm0
2855	unpcklpd %xmm9,  %xmm8
2856	unpckhpd %xmm9,  %xmm0
2857
2858	movapd	 %xmm10, %xmm2
2859	unpcklpd %xmm11, %xmm10
2860	unpckhpd %xmm11, %xmm2
2861
2862	movapd	-16 * SIZE(BO), %xmm9
2863	movapd	-14 * SIZE(BO), %xmm11
2864	movapd	-12 * SIZE(BO), %xmm13
2865	movapd	-10 * SIZE(BO), %xmm15
2866
2867	subpd	%xmm8,  %xmm9
2868	subpd	%xmm10, %xmm11
2869	subpd	%xmm0,  %xmm13
2870	subpd	%xmm2,  %xmm15
2871#else
2872	movapd	-16 * SIZE(AO), %xmm0
2873	movapd	-14 * SIZE(AO), %xmm2
2874	movapd	-12 * SIZE(AO), %xmm4
2875	movapd	-10 * SIZE(AO), %xmm6
2876
2877	subpd	%xmm8,  %xmm0
2878	subpd	%xmm9,  %xmm2
2879	subpd	%xmm10, %xmm4
2880	subpd	%xmm11, %xmm6
2881#endif
2882
2883#ifdef LN
2884	movddup	-13 * SIZE(AO), %xmm8
2885	mulpd	 %xmm8, %xmm13
2886	mulpd	 %xmm8, %xmm15
2887
2888	movddup	-14 * SIZE(AO), %xmm10
2889	mulpd	 %xmm13, %xmm10
2890	subpd	 %xmm10, %xmm9
2891	movddup	-14 * SIZE(AO), %xmm10
2892	mulpd	 %xmm15, %xmm10
2893	subpd	 %xmm10, %xmm11
2894
2895	movddup	-16 * SIZE(AO), %xmm8
2896	mulpd	 %xmm8, %xmm9
2897	mulpd	 %xmm8, %xmm11
2898#endif
2899
2900#ifdef LT
2901	movddup	-16 * SIZE(AO), %xmm8
2902	mulpd	 %xmm8, %xmm9
2903	mulpd	 %xmm8, %xmm11
2904
2905	movddup	-15 * SIZE(AO), %xmm10
2906	mulpd	 %xmm9, %xmm10
2907	subpd	 %xmm10, %xmm13
2908	movddup	-15 * SIZE(AO), %xmm10
2909	mulpd	 %xmm11, %xmm10
2910	subpd	 %xmm10, %xmm15
2911
2912	movddup	-13 * SIZE(AO), %xmm8
2913	mulpd	 %xmm8, %xmm13
2914	mulpd	 %xmm8, %xmm15
2915#endif
2916
2917#ifdef RN
2918	movddup	-16 * SIZE(BO), %xmm8
2919	mulpd	 %xmm8, %xmm0
2920
2921	movddup	-15 * SIZE(BO), %xmm9
2922	mulpd	 %xmm0, %xmm9
2923	subpd	 %xmm9, %xmm2
2924	movddup	-14 * SIZE(BO), %xmm10
2925	mulpd	 %xmm0, %xmm10
2926	subpd	 %xmm10, %xmm4
2927	movddup	-13 * SIZE(BO), %xmm11
2928	mulpd	 %xmm0, %xmm11
2929	subpd	 %xmm11, %xmm6
2930
2931	movddup	-11 * SIZE(BO), %xmm8
2932	mulpd	 %xmm8, %xmm2
2933	movddup	-10 * SIZE(BO), %xmm9
2934	mulpd	 %xmm2, %xmm9
2935	subpd	 %xmm9, %xmm4
2936	movddup	 -9 * SIZE(BO), %xmm10
2937	mulpd	 %xmm2, %xmm10
2938	subpd	 %xmm10, %xmm6
2939
2940	movddup	 -6 * SIZE(BO), %xmm8
2941	mulpd	 %xmm8, %xmm4
2942
2943	movddup	 -5 * SIZE(BO), %xmm9
2944	mulpd	 %xmm4, %xmm9
2945	subpd	 %xmm9, %xmm6
2946
2947	movddup	 -1 * SIZE(BO), %xmm8
2948	mulpd	 %xmm8, %xmm6
2949#endif
2950
2951#ifdef RT
2952	movddup	 -1 * SIZE(BO), %xmm8
2953	mulpd	 %xmm8, %xmm6
2954
2955	movddup	 -2 * SIZE(BO), %xmm9
2956	mulpd	 %xmm6, %xmm9
2957	subpd	 %xmm9, %xmm4
2958	movddup	 -3 * SIZE(BO), %xmm10
2959	mulpd	 %xmm6, %xmm10
2960	subpd	 %xmm10, %xmm2
2961	movddup	 -4 * SIZE(BO), %xmm11
2962	mulpd	 %xmm6, %xmm11
2963	subpd	 %xmm11, %xmm0
2964
2965	movddup	 -6 * SIZE(BO), %xmm8
2966	mulpd	 %xmm8, %xmm4
2967	movddup	 -7 * SIZE(BO), %xmm9
2968	mulpd	 %xmm4, %xmm9
2969	subpd	 %xmm9, %xmm2
2970	movddup	 -8 * SIZE(BO), %xmm10
2971	mulpd	 %xmm4, %xmm10
2972	subpd	 %xmm10, %xmm0
2973
2974	movddup	-11 * SIZE(BO), %xmm8
2975	mulpd	 %xmm8, %xmm2
2976	movddup	-12 * SIZE(BO), %xmm9
2977	mulpd	 %xmm2, %xmm9
2978	subpd	 %xmm9, %xmm0
2979
2980	movddup	-16 * SIZE(BO), %xmm8
2981	mulpd	 %xmm8, %xmm0
2982#endif
2983
2984#ifdef LN
2985	subq	$2 * SIZE, CO1
2986	subq	$2 * SIZE, CO2
2987#endif
2988
2989#if defined(LN) || defined(LT)
2990	movlpd	%xmm9,   0 * SIZE(CO1)
2991	movlpd	%xmm13,  1 * SIZE(CO1)
2992
2993	movhpd	%xmm9,   0 * SIZE(CO2)
2994	movhpd	%xmm13,  1 * SIZE(CO2)
2995
2996	movlpd	%xmm11,  0 * SIZE(CO1, LDC, 2)
2997	movlpd	%xmm15,  1 * SIZE(CO1, LDC, 2)
2998
2999	movhpd	%xmm11,  0 * SIZE(CO2, LDC, 2)
3000	movhpd	%xmm15,  1 * SIZE(CO2, LDC, 2)
3001#else
3002	movlpd	%xmm0,  0 * SIZE(CO1)
3003	movhpd	%xmm0,  1 * SIZE(CO1)
3004
3005	movlpd	%xmm2,  0 * SIZE(CO2)
3006	movhpd	%xmm2,  1 * SIZE(CO2)
3007
3008	movlpd	%xmm4,  0 * SIZE(CO1, LDC, 2)
3009	movhpd	%xmm4,  1 * SIZE(CO1, LDC, 2)
3010
3011	movlpd	%xmm6,  0 * SIZE(CO2, LDC, 2)
3012	movhpd	%xmm6,  1 * SIZE(CO2, LDC, 2)
3013#endif
3014
3015#if defined(LN) || defined(LT)
3016	movaps	%xmm9,  -16 * SIZE(BO)
3017	movaps	%xmm11, -14 * SIZE(BO)
3018	movaps	%xmm13, -12 * SIZE(BO)
3019	movaps	%xmm15, -10 * SIZE(BO)
3020#else
3021	movaps	%xmm0,  -16 * SIZE(AO)
3022	movaps	%xmm2,  -14 * SIZE(AO)
3023	movaps	%xmm4,  -12 * SIZE(AO)
3024	movaps	%xmm6,  -10 * SIZE(AO)
3025#endif
3026
3027#ifndef LN
3028	addq	$2 * SIZE, CO1
3029	addq	$2 * SIZE, CO2
3030#endif
3031
3032#if defined(LT) || defined(RN)
3033	movq	K,  %rax
3034	subq	KK, %rax
3035	leaq	(,%rax, SIZE), %rax
3036	leaq	(AO, %rax, 2), AO
3037	leaq	(BO, %rax, 4), BO
3038#endif
3039
3040#ifdef LN
3041	subq	$2, KK
3042#endif
3043
3044#ifdef LT
3045	addq	$2, KK
3046#endif
3047
3048#ifdef RT
3049       movq	K, %rax
3050       salq	$1 + BASE_SHIFT, %rax
3051       addq	%rax, AORIG
3052#endif
3053	ALIGN_4
3054
3055.L30:
3056	testq	$1, M
3057	je	.L39
3058
3059#ifdef LN
3060       movq	K, %rax
3061       salq	$0 + BASE_SHIFT, %rax
3062       subq	%rax, AORIG
3063#endif
3064
3065#if defined(LN) || defined(RT)
3066	movq	KK, %rax
3067	movq	AORIG, AO
3068	leaq	(, %rax, SIZE), %rax
3069	leaq	(AO, %rax, 1), AO
3070#endif
3071
3072	movq	B, BO
3073
3074#if defined(LN) || defined(RT)
3075	movq	KK, %rax
3076	leaq	(, %rax, SIZE), %rax
3077	leaq	(BO, %rax, 4), BO
3078#endif
3079
3080	movddup	-16 * SIZE(AO), %xmm0
3081	pxor	%xmm8, %xmm8
3082	movddup	-14 * SIZE(AO), %xmm2
3083	pxor	%xmm9, %xmm9
3084	movddup	-15 * SIZE(AO), %xmm4
3085	pxor	%xmm10, %xmm10
3086	movapd	-16 * SIZE(BO), %xmm1
3087	pxor	%xmm11, %xmm11
3088	movapd	 -8 * SIZE(BO), %xmm3
3089
3090#if defined(LT) || defined(RN)
3091	movq	KK, %rax
3092#else
3093	movq	K, %rax
3094	subq	KK, %rax
3095#endif
3096	andq	$-4, %rax
3097	leaq	(, %rax, SIZE), %rax
3098	leaq	(AO, %rax, 1), AO
3099	leaq	(BO, %rax, 4), BO
3100	negq	%rax
3101	NOBRANCH
3102	je	.L36
3103	ALIGN_4
3104
3105.L32:
3106	mulpd	%xmm0, %xmm1
3107	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
3108	addpd	%xmm1, %xmm8
3109	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
3110	addpd	%xmm0, %xmm9
3111	movddup	-12 * SIZE(AO, %rax, 1), %xmm0
3112	mulpd	%xmm4, %xmm1
3113	mulpd	-10 * SIZE(BO, %rax, 4), %xmm4
3114	addpd	%xmm1, %xmm10
3115	movapd	  (BO, %rax, 4), %xmm1
3116	addpd	%xmm4, %xmm11
3117	movddup	-11 * SIZE(AO, %rax, 1), %xmm4
3118	mulpd	%xmm2, %xmm3
3119	mulpd	 -6 * SIZE(BO, %rax, 4), %xmm2
3120	addpd	%xmm3, %xmm8
3121	movapd	 -4 * SIZE(BO, %rax, 4), %xmm3
3122	addpd	%xmm2, %xmm9
3123	movddup	-13 * SIZE(AO, %rax, 1), %xmm2
3124	mulpd	%xmm2, %xmm3
3125	mulpd	 -2 * SIZE(BO, %rax, 4), %xmm2
3126	addpd	%xmm3, %xmm10
3127	movapd	  8 * SIZE(BO, %rax, 4), %xmm3
3128	addpd	%xmm2, %xmm11
3129	movddup	-10 * SIZE(AO, %rax, 1), %xmm2
3130
3131	addq	$4 * SIZE, %rax
3132	BRANCH
3133	jl	.L32
3134	ALIGN_4
3135
3136.L36:
3137#if defined(LT) || defined(RN)
3138	movq	KK, %rax
3139#else
3140	movq	K, %rax
3141	subq	KK, %rax
3142#endif
3143	andq	$3, %rax		# if (k & 1)
3144	je .L38
3145
3146	leaq	(, %rax, SIZE), %rax
3147	leaq	(AO, %rax, 1), AO
3148	leaq	(BO, %rax, 4), BO
3149	negq	%rax
3150	ALIGN_4
3151
3152.L37:
3153	mulpd	%xmm0, %xmm1
3154	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
3155	addpd	%xmm1, %xmm8
3156	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
3157	addpd	%xmm0, %xmm9
3158	movddup	-15 * SIZE(AO, %rax, 1), %xmm0
3159
3160	addq	$SIZE, %rax
3161	jl	.L37
3162	ALIGN_4
3163
3164.L38:
3165	addpd	%xmm10, %xmm8
3166	addpd	%xmm11, %xmm9
3167
3168#if defined(LN) || defined(RT)
3169	movq	KK, %rax
3170#ifdef LN
3171	subq	$1, %rax
3172#else
3173	subq	$4, %rax
3174#endif
3175
3176	leaq	(, %rax, SIZE), %rax
3177
3178	movq	AORIG, AO
3179	leaq	(AO, %rax, 1), AO
3180	leaq	(B,  %rax, 4), BO
3181#endif
3182
3183#if defined(LN) || defined(LT)
3184	movapd	-16 * SIZE(BO), %xmm2
3185	movapd	-14 * SIZE(BO), %xmm3
3186
3187	subpd	%xmm8,  %xmm2
3188	subpd	%xmm9,  %xmm3
3189#else
3190	movapd	-16 * SIZE(AO), %xmm2
3191	movapd	-14 * SIZE(AO), %xmm3
3192
3193	subpd	%xmm8, %xmm2
3194	subpd	%xmm9, %xmm3
3195#endif
3196
3197#if defined(LN) || defined(LT)
3198	movddup	-16 * SIZE(AO), %xmm0
3199	mulpd	 %xmm0, %xmm2
3200	mulpd	 %xmm0, %xmm3
3201#endif
3202
3203#ifdef RN
3204	movapd	%xmm2, %xmm0
3205        unpckhpd %xmm0, %xmm0
3206
3207	movapd	%xmm3, %xmm1
3208        unpckhpd %xmm1, %xmm1
3209
3210	movsd	-16 * SIZE(BO), %xmm4
3211	mulsd	 %xmm4, %xmm2
3212
3213	movsd	-15 * SIZE(BO), %xmm5
3214	mulsd	 %xmm2, %xmm5
3215	subsd	 %xmm5, %xmm0
3216	movsd	-14 * SIZE(BO), %xmm6
3217	mulsd	 %xmm2, %xmm6
3218	subsd	 %xmm6, %xmm3
3219	movsd	-13 * SIZE(BO), %xmm7
3220	mulsd	 %xmm2, %xmm7
3221	subsd	 %xmm7, %xmm1
3222
3223	movsd	-11 * SIZE(BO), %xmm4
3224	mulsd	 %xmm4, %xmm0
3225
3226	movsd	-10 * SIZE(BO), %xmm5
3227	mulsd	 %xmm0, %xmm5
3228	subsd	 %xmm5, %xmm3
3229	movsd	 -9 * SIZE(BO), %xmm6
3230	mulsd	 %xmm0, %xmm6
3231	subsd	 %xmm6, %xmm1
3232
3233	movsd	 -6 * SIZE(BO), %xmm4
3234	mulsd	 %xmm4, %xmm3
3235
3236	movsd	 -5 * SIZE(BO), %xmm5
3237	mulsd	 %xmm3, %xmm5
3238	subsd	 %xmm5, %xmm1
3239
3240	movsd	 -1 * SIZE(BO), %xmm4
3241	mulsd	 %xmm4, %xmm1
3242
3243	unpcklpd %xmm0, %xmm2
3244	unpcklpd %xmm1, %xmm3
3245#endif
3246
3247#ifdef RT
3248	movapd	%xmm2, %xmm0
3249        unpckhpd %xmm0, %xmm0
3250
3251	movapd	%xmm3, %xmm1
3252        unpckhpd %xmm1, %xmm1
3253
3254	movsd	 -1 * SIZE(BO), %xmm4
3255	mulsd	 %xmm4, %xmm1
3256
3257	movsd	 -2 * SIZE(BO), %xmm5
3258	mulsd	 %xmm1, %xmm5
3259	subsd	 %xmm5, %xmm3
3260	movsd	 -3 * SIZE(BO), %xmm6
3261	mulsd	 %xmm1, %xmm6
3262	subsd	 %xmm6, %xmm0
3263	movsd	 -4 * SIZE(BO), %xmm7
3264	mulsd	 %xmm1, %xmm7
3265	subsd	 %xmm7, %xmm2
3266
3267	movsd	 -6 * SIZE(BO), %xmm4
3268	mulsd	 %xmm4, %xmm3
3269
3270	movsd	 -7 * SIZE(BO), %xmm5
3271	mulsd	 %xmm3, %xmm5
3272	subsd	 %xmm5, %xmm0
3273	movsd	 -8 * SIZE(BO), %xmm6
3274	mulsd	 %xmm3, %xmm6
3275	subsd	 %xmm6, %xmm2
3276
3277	movsd	-11 * SIZE(BO), %xmm4
3278	mulsd	 %xmm4, %xmm0
3279
3280	movsd	-12 * SIZE(BO), %xmm5
3281	mulsd	 %xmm0, %xmm5
3282	subsd	 %xmm5, %xmm2
3283
3284	movsd	-16 * SIZE(BO), %xmm4
3285	mulsd	 %xmm4, %xmm2
3286
3287	unpcklpd %xmm0, %xmm2
3288	unpcklpd %xmm1, %xmm3
3289
3290#endif
3291
3292#ifdef LN
3293	subq	$1 * SIZE, CO1
3294	subq	$1 * SIZE, CO2
3295#endif
3296
3297#if defined(LN) || defined(LT)
3298	movlpd	%xmm2,  0 * SIZE(CO1)
3299	movhpd	%xmm2,  0 * SIZE(CO2)
3300	movlpd	%xmm3,  0 * SIZE(CO1, LDC, 2)
3301	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
3302#else
3303	movlpd	%xmm2,  0 * SIZE(CO1)
3304	movhpd	%xmm2,  0 * SIZE(CO2)
3305	movlpd	%xmm3,  0 * SIZE(CO1, LDC, 2)
3306	movhpd	%xmm3,  0 * SIZE(CO2, LDC, 2)
3307#endif
3308
3309#if defined(LN) || defined(LT)
3310	movaps	%xmm2, -16 * SIZE(BO)
3311	movaps	%xmm3, -14 * SIZE(BO)
3312#else
3313	movaps	%xmm2, -16 * SIZE(AO)
3314	movaps	%xmm3, -14 * SIZE(AO)
3315#endif
3316
3317#ifndef LN
3318	addq	$1 * SIZE, CO1
3319	addq	$1 * SIZE, CO2
3320#endif
3321
3322#if defined(LT) || defined(RN)
3323	movq	K,  %rax
3324	subq	KK, %rax
3325	leaq	(,%rax, SIZE), %rax
3326	leaq	(AO, %rax, 1), AO
3327	leaq	(BO, %rax, 4), BO
3328#endif
3329
3330#ifdef LN
3331	subq	$1, KK
3332#endif
3333
3334#ifdef LT
3335	addq	$1, KK
3336#endif
3337
3338#ifdef RT
3339       movq	K, %rax
3340       salq	$0 + BASE_SHIFT, %rax
3341       addq	%rax, AORIG
3342#endif
3343	ALIGN_4
3344
3345.L39:
3346#ifdef LN
3347       leaq	(, K, SIZE), %rax
3348       leaq	(B, %rax, 4), B
3349#endif
3350
3351#if defined(LT) || defined(RN)
3352	movq	BO, B
3353#endif
3354
3355#ifdef RN
3356	addq	$4, KK
3357#endif
3358
3359#ifdef RT
3360	subq	$4, KK
3361#endif
3362
3363	decq	J			# j --
3364	jg	.L01
3365	ALIGN_4
3366
3367.L999:
3368	movq	   (%rsp), %rbx
3369	movq	  8(%rsp), %rbp
3370	movq	 16(%rsp), %r12
3371	movq	 24(%rsp), %r13
3372	movq	 32(%rsp), %r14
3373	movq	 40(%rsp), %r15
3374
3375#ifdef WINDOWS_ABI
3376	movq	 48(%rsp), %rdi
3377	movq	 56(%rsp), %rsi
3378	movups	 64(%rsp), %xmm6
3379	movups	 80(%rsp), %xmm7
3380	movups	 96(%rsp), %xmm8
3381	movups	112(%rsp), %xmm9
3382	movups	128(%rsp), %xmm10
3383	movups	144(%rsp), %xmm11
3384	movups	160(%rsp), %xmm12
3385	movups	176(%rsp), %xmm13
3386	movups	192(%rsp), %xmm14
3387	movups	208(%rsp), %xmm15
3388#endif
3389
3390	addq	$STACKSIZE, %rsp
3391	ret
3392
3393	EPILOGUE
3394