1/*****************************************************************************
2Copyright (c) 2011-2014, The OpenBLAS Project
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9   1. Redistributions of source code must retain the above copyright
10      notice, this list of conditions and the following disclaimer.
11
12   2. Redistributions in binary form must reproduce the above copyright
13      notice, this list of conditions and the following disclaimer in
14      the documentation and/or other materials provided with the
15      distribution.
16   3. Neither the name of the OpenBLAS project nor the names of
17      its contributors may be used to endorse or promote products
18      derived from this software without specific prior written
19      permission.
20
21THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
32 **********************************************************************************/
33
#define ASSEMBLER
#include "common.h"

/* Incoming integer arguments (SysV AMD64 order: rdi, rsi, rdx).
   These hold m, n, k on entry; they are copied into callee-saved
   registers below so rdi/rsi can be recycled as moving pointers. */
#define old_bm %rdi
#define old_bn %rsi
#define old_bk %rdx

/* Loop bounds kept in callee-saved registers for the whole kernel
   (they must survive across the entire loop nest). */
#define bm %r13
#define bn %r14
#define bk %r15

#define ALPHA %xmm0             /* scalar alpha arrives in xmm0 (SysV) */
#define ba %rcx                 /* base of packed A panel */
#define bb %r8                  /* base of packed B panel */
#define C %r9                   /* C matrix base pointer */
#define ldc %r10                /* leading dimension of C; scaled to bytes after entry */

#define i %r11                  /* M-direction (row-block) counter */
#define k %rax                  /* K-direction (inner) counter */

/* rdi/rsi are reused as the walking A/B pointers once old_bm/old_bn
   have been saved into bm/bn. */
#define ptrba %rdi
#define ptrbb %rsi
#define C0 %rbx                 /* current C, columns 0/1 (via ldc index) */
#define C1 %rbp                 /* current C, columns 2/3 (C + 2*ldc)   */

#define prebb %r12              /* prefetch cursor running ahead in B */
60
#ifndef WINDOWS_ABI

/* SysV: 128 bytes of frame; rbx/rbp/r12-r15 are saved at 0..40(%rsp)
   by the prologue, spill slots start at 48(%rsp). */
#define STACKSIZE 128

/* 7th/8th C arguments live on the caller's stack, above the return
   address, hence the +STACKSIZE bias after our subq. */
#define old_ldc 8+STACKSIZE(%rsp)
#define old_offset 16+STACKSIZE(%rsp)
#define MEMALPHA 48(%rsp)       /* alpha spilled to memory for later broadcast */
#define j 56(%rsp)              /* N-direction (column-block) counter */
#define OFFSET 64(%rsp)         /* TRMM: original diagonal offset */
#define kk 72(%rsp)             /* TRMM: running offset for current tile */
#define kkk 80(%rsp)            /* TRMM: effective K length for current tile */

#else

/* Windows x64: larger frame (nonvolatile xmm6-xmm15 must be preserved,
   saved at 64..208(%rsp) by the prologue). */
#define STACKSIZE 512

/* Only 4 register args on Win64; A, B, C, ldc, offset come from the
   caller's stack (32-byte shadow space precedes them, hence 40+). */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define old_ldc		64 + STACKSIZE(%rsp)
#define old_offset	72 + STACKSIZE(%rsp)

#define MEMALPHA	  224(%rsp)
#define j	  232(%rsp)
#define OFFSET	  240(%rsp)
#define kk	  248(%rsp)
#define kkk	  256(%rsp)

#endif
90
/* Prefetch flavors: PREFETCH0/1 pull into all cache levels (t0),
   PREFETCH2 into L2 and lower only (t2) — used for the B panel that
   is streamed rather than reused. */
#define PREFETCH0 prefetcht0
#define PREFETCH1 prefetcht0
#define PREFETCH2 prefetcht2

/* Symbolic names for the 16 xmm/ymm registers so the kernel body reads
   uniformly; xvecN is the low 128-bit half of yvecN. */
#define xvec0	%xmm0
#define xvec1	%xmm1
#define xvec2	%xmm2
#define xvec3	%xmm3
#define xvec4	%xmm4
#define xvec5	%xmm5
#define xvec6	%xmm6
#define xvec7	%xmm7
#define xvec8	%xmm8
#define xvec9	%xmm9
#define xvec10	%xmm10
#define xvec11	%xmm11
#define xvec12	%xmm12
#define xvec13	%xmm13
#define xvec14	%xmm14
#define xvec15	%xmm15

#define yvec0	%ymm0
#define yvec1	%ymm1
#define yvec2	%ymm2
#define yvec3	%ymm3
#define yvec4	%ymm4
#define yvec5	%ymm5
#define yvec6	%ymm6
#define yvec7	%ymm7
#define yvec8	%ymm8
#define yvec9	%ymm9
#define yvec10	%ymm10
#define yvec11	%ymm11
#define yvec12	%ymm12
#define yvec13	%ymm13
#define yvec14	%ymm14
#define yvec15	%ymm15

/* Upper-case scalar-integer aliases; purely cosmetic, they keep the
   kernel body visually consistent with the vector macros. */
#define LEAQ	leaq
#define ADDQ	addq
#define MULQ	imulq
#define SARQ	sarq
#define SALQ	salq
#define ANDQ	andq
#define SUBQ	subq
#define DECQ	decq
#define JG		jg
#define JLE		jle
#define TEST	testq
#define OR		orq
#define JNE		jne
#define NOP
#define XOR		xorpd
#undef	MOVQ
#define MOVQ	movq

/* _DY = 256-bit (ymm) op, _DX = 128-bit (xmm) op.  The double-precision
   kernel uses AVX1 three-operand forms throughout. */
#define XOR_DY	vxorpd
#define XOR_DX	vxorpd

/* Aligned loads; LDL/LDH load only the low/high 64-bit lane. */
#define LD_DY	vmovapd
#define LD_DX	vmovapd
#define LDL_DX	vmovlpd
#define LDL_DY	vmovlpd
#define LDH_DX	vmovhpd
#define LDH_DY	vmovhpd

/* Aligned stores; STL/STH store only the low/high 64-bit lane
   (used on the unaligned-C writeback path). */
#define ST_DY	vmovapd
#define ST_DX	vmovapd
#define STL_DX	vmovlpd
#define STL_DY	vmovlpd
#define STH_DX	vmovhpd
#define STH_DY	vmovhpd

#define EDUP_DY	vmovddup

#define ADD_DY	vaddpd
#define ADD_DX	vaddpd

#define ADD1_DY	vaddpd
#define ADD2_DY	vaddpd
#define ADDSUB_DY	vaddsubpd

#define MUL_DY	vmulpd
#define MUL_DX	vmulpd

/* SHUF_DY permutes 128-bit lanes across a ymm pair; SHUF_DX shuffles
   32-bit elements within an xmm (used to swap the two doubles). */
#define SHUF_DY	vperm2f128
#define SHUF_DX	vpshufd

#define VPERMILP_DY	vpermilpd       /* in-lane 64-bit element permute */

#define BROAD_DY vbroadcastsd           /* splat one double to all 4 lanes */
#define BROAD_DX vmovddup               /* splat one double to both lanes */

#define MOV_DY vmovapd
#define MOV_DX vmovapd

/* REVS: blend/reorder accumulators back into C order before writeback. */
#define REVS_DY vshufpd
#define REVS_DX	vmovsd

#define EXTRA_DY vextractf128           /* extract high 128-bit half */
191
PROLOGUE

/* Frame setup: reserve the frame and save all SysV callee-saved GPRs
   (rbx, rbp, r12-r15) that this kernel uses as working registers. */
subq    $STACKSIZE, %rsp;
movq    %rbx,  0(%rsp);
movq    %rbp,  8(%rsp);
movq    %r12, 16(%rsp);
movq    %r13, 24(%rsp);
movq    %r14, 32(%rsp);
movq    %r15, 40(%rsp);

#ifdef WINDOWS_ABI
	/* Win64: rdi/rsi and xmm6-xmm15 are callee-saved — preserve them,
	   then remap the Windows argument locations onto the SysV names
	   used by the rest of the file. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      old_bm
	movq	ARG2,      old_bn
	movq	ARG3,      old_bk
	movq	OLD_A,     ba
	movq	OLD_B,     bb
	movq	OLD_C,     C
	movq	old_ldc,   ldc
#ifdef TRMMKERNEL
	movq	old_offset, %r11
#endif
	/* Win64 passes alpha (4th FP arg position) in xmm3; move it to
	   xmm0 where the ALPHA macro expects it. */
	movaps	%xmm3, %xmm0
#else

/* SysV: ldc (7th arg) and offset (8th arg) come from the stack. */
movq	old_ldc, ldc
#ifdef	TRMMKERNEL
movq	old_offset, %r11
#endif
#endif

/* Clear upper ymm state to avoid AVX<->SSE transition penalties. */
vzeroupper

vmovlps	ALPHA, MEMALPHA         /* spill the 8-byte alpha for later vbroadcastsd */
movq	old_bm,	bm              /* move m/n/k into callee-saved homes */
movq	old_bn, bn
movq	old_bk, bk
leaq	(, ldc, SIZE), ldc      /* scale ldc from elements to bytes */
#ifdef	TRMMKERNEL
movq	%r11, OFFSET            /* keep original offset; kk is the running copy */
#ifndef	LEFT
negq	%r11;                   /* right-side TRMM starts from -offset */
#endif
movq	%r11, kk
#endif

/* Outer loop over column blocks: j = bn >> 2, i.e. 4 columns of C per
   iteration. */
MOVQ bn,j;
SARQ $2,j;							# Rn = 4
JLE .L0_loopE;
253ALIGN_5;
254.L0_bodyB:;
255#if defined(TRMMKERNEL) && defined(LEFT)
256MOVQ OFFSET, %rax;
257MOVQ %rax, kk;
258#endif
259
260MOVQ C,C0;
261LEAQ (C,ldc,2),C1;
262MOVQ bk, k;
263SALQ $5, k;
264LEAQ (bb, k, 1), prebb;
265MOVQ ba,ptrba;
266MOVQ bm,i;
267SARQ $3,i;							# Rm = 8
268JLE .L1_loopE;
269ALIGN_5;
270.L1_bodyB:;
271#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
272MOVQ bb, ptrbb;
273#else
274MOVQ bb, ptrbb;
275MOVQ kk, %rax;
276LEAQ (, %rax, SIZE), %rax;
277LEAQ (ptrba, %rax, 8), ptrba;
278LEAQ (ptrbb, %rax, 4), ptrbb;
279#endif
280//#### Initial Results Register ####
281PREFETCH2	0*SIZE(prebb);
282XOR_DY yvec15, yvec15, yvec15;
283PREFETCH2	8*SIZE(prebb);
284XOR_DY yvec14, yvec14, yvec14;
285XOR_DY yvec13, yvec13, yvec13;
286ADDQ	$16*SIZE, prebb
287XOR_DY yvec12, yvec12, yvec12;
288PREFETCH0	3*SIZE(C0)
289LD_DY	0*SIZE(ptrbb), yvec2;
290PREFETCH0	3*SIZE(C0, ldc, 1)
291XOR_DY yvec11, yvec11, yvec11;
292PREFETCH0	3*SIZE(C1)
293XOR_DY yvec10, yvec10, yvec10;
294PREFETCH0	3*SIZE(C1, ldc, 1)
295LD_DY	0*SIZE(ptrba), yvec0;
296XOR_DY yvec9, yvec9, yvec9;
297XOR_DY yvec8, yvec8, yvec8;
298VPERMILP_DY	$0x05, yvec2, yvec3;
299#ifndef	TRMMKERNEL
300MOVQ bk,k;
301#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
302MOVQ bk, %rax;
303SUBQ kk, %rax;
304MOVQ %rax, kkk;
305#else
306MOVQ kk, %rax;
307#ifdef LEFT
308ADDQ $8, %rax;
309#else
310ADDQ $4, %rax;
311#endif
312MOVQ %rax, kkk;
313#endif
314SARQ $2,k;
315JLE .L2_loopE;
316ALIGN_5;
317.L2_bodyB:;
318# Computing kernel
319
320//#### Unroll times 1 ####
321LD_DY	4*SIZE(ptrba), yvec1;
322MUL_DY	yvec0, yvec2, yvec6;
323SHUF_DY	$0x03, yvec2, yvec2, yvec4;
324MUL_DY	yvec0, yvec3, yvec7;
325SHUF_DY	$0x03, yvec3, yvec3, yvec5;
326ADD_DY	yvec15, yvec6, yvec15;
327ADD_DY	yvec13, yvec7, yvec13;
328
329PREFETCH0	64*SIZE(ptrba)
330MUL_DY	yvec1, yvec2, yvec6;
331LD_DY	4*SIZE(ptrbb), yvec2;
332MUL_DY	yvec1, yvec3, yvec7;
333VPERMILP_DY	$0x05, yvec2, yvec3;
334ADD_DY	yvec14, yvec6, yvec14;
335ADD_DY	yvec12, yvec7, yvec12;
336
337MUL_DY	yvec0, yvec4, yvec6;
338MUL_DY	yvec0, yvec5, yvec7;
339LD_DY	8*SIZE(ptrba), yvec0;
340ADD_DY	yvec11, yvec6, yvec11;
341ADD_DY	yvec9, yvec7, yvec9;
342
343MUL_DY	yvec1, yvec4, yvec6;
344MUL_DY	yvec1, yvec5, yvec7;
345ADD_DY	yvec10, yvec6, yvec10;
346ADD_DY	yvec8, yvec7, yvec8;
347
348//#### Unroll times 2 ####
349LD_DY	12*SIZE(ptrba), yvec1;
350MUL_DY	yvec0, yvec2, yvec6;
351SHUF_DY	$0x03, yvec2, yvec2, yvec4;
352MUL_DY	yvec0, yvec3, yvec7;
353SHUF_DY	$0x03, yvec3, yvec3, yvec5;
354ADD_DY	yvec15, yvec6, yvec15;
355ADD_DY	yvec13, yvec7, yvec13;
356
357PREFETCH0	72*SIZE(ptrba)
358MUL_DY	yvec1, yvec2, yvec6;
359LD_DY	8*SIZE(ptrbb), yvec2;
360MUL_DY	yvec1, yvec3, yvec7;
361VPERMILP_DY	$0x05, yvec2, yvec3;
362ADD_DY	yvec14, yvec6, yvec14;
363ADD_DY	yvec12, yvec7, yvec12;
364
365MUL_DY	yvec0, yvec4, yvec6;
366MUL_DY	yvec0, yvec5, yvec7;
367LD_DY	16*SIZE(ptrba), yvec0;
368ADD_DY	yvec11, yvec6, yvec11;
369ADD_DY	yvec9, yvec7, yvec9;
370
371MUL_DY	yvec1, yvec4, yvec6;
372MUL_DY	yvec1, yvec5, yvec7;
373ADD_DY	yvec10, yvec6, yvec10;
374ADD_DY	yvec8, yvec7, yvec8;
375
376//#### Unroll times 3 ####
377LD_DY	20*SIZE(ptrba), yvec1;
378MUL_DY	yvec0, yvec2, yvec6;
379SHUF_DY	$0x03, yvec2, yvec2, yvec4;
380MUL_DY	yvec0, yvec3, yvec7;
381SHUF_DY	$0x03, yvec3, yvec3, yvec5;
382ADD_DY	yvec15, yvec6, yvec15;
383ADD_DY	yvec13, yvec7, yvec13;
384
385PREFETCH0	80*SIZE(ptrba)
386MUL_DY	yvec1, yvec2, yvec6;
387LD_DY	12*SIZE(ptrbb), yvec2;
388ADDQ	$16*SIZE, ptrbb;
389MUL_DY	yvec1, yvec3, yvec7;
390VPERMILP_DY	$0x05, yvec2, yvec3;
391ADD_DY	yvec14, yvec6, yvec14;
392ADD_DY	yvec12, yvec7, yvec12;
393
394MUL_DY	yvec0, yvec4, yvec6;
395MUL_DY	yvec0, yvec5, yvec7;
396LD_DY	24*SIZE(ptrba), yvec0;
397ADD_DY	yvec11, yvec6, yvec11;
398ADD_DY	yvec9, yvec7, yvec9;
399
400MUL_DY	yvec1, yvec4, yvec6;
401MUL_DY	yvec1, yvec5, yvec7;
402ADD_DY	yvec10, yvec6, yvec10;
403ADD_DY	yvec8, yvec7, yvec8;
404
405//#### Unroll times 4 ####
406LD_DY	28*SIZE(ptrba), yvec1;
407MUL_DY	yvec0, yvec2, yvec6;
408SHUF_DY	$0x03, yvec2, yvec2, yvec4;
409MUL_DY	yvec0, yvec3, yvec7;
410SHUF_DY	$0x03, yvec3, yvec3, yvec5;
411ADDQ	$32*SIZE, ptrba;
412ADD_DY	yvec15, yvec6, yvec15;
413ADD_DY	yvec13, yvec7, yvec13;
414
415PREFETCH0	88*SIZE(ptrba)
416MUL_DY	yvec1, yvec2, yvec6;
417LD_DY	0*SIZE(ptrbb), yvec2;
418MUL_DY	yvec1, yvec3, yvec7;
419VPERMILP_DY	$0x05, yvec2, yvec3;
420ADD_DY	yvec14, yvec6, yvec14;
421ADD_DY	yvec12, yvec7, yvec12;
422
423MUL_DY	yvec0, yvec4, yvec6;
424MUL_DY	yvec0, yvec5, yvec7;
425LD_DY	0*SIZE(ptrba), yvec0;
426ADD_DY	yvec11, yvec6, yvec11;
427ADD_DY	yvec9, yvec7, yvec9;
428
429MUL_DY	yvec1, yvec4, yvec6;
430MUL_DY	yvec1, yvec5, yvec7;
431ADD_DY	yvec10, yvec6, yvec10;
432ADD_DY	yvec8, yvec7, yvec8;
433.L2_bodyE:;
434DECQ k;
435JG .L2_bodyB;
436ALIGN_5
437.L2_loopE:;
438PREFETCH2	0*SIZE(prebb);
439ADDQ	$8*SIZE, prebb;
440#ifndef	TRMMKERNEL
441TEST	$2, bk;
442#else
443MOVQ	kkk, %rax;
444TEST 	$2, %rax;
445#endif
446JLE		.L3_loopE;
447ALIGN_5
448.L3_bodyB:
449//#### Unroll times 1 ####
450PREFETCH0	64*SIZE(ptrba)
451LD_DY	4*SIZE(ptrba), yvec1;
452MUL_DY	yvec0, yvec2, yvec6;
453SHUF_DY	$0x03, yvec2, yvec2, yvec4;
454MUL_DY	yvec0, yvec3, yvec7;
455SHUF_DY	$0x03, yvec3, yvec3, yvec5;
456ADD_DY	yvec15, yvec6, yvec15;
457ADD_DY	yvec13, yvec7, yvec13;
458
459MUL_DY	yvec1, yvec2, yvec6;
460LD_DY	4*SIZE(ptrbb), yvec2;
461ADDQ	$8*SIZE, ptrbb;
462MUL_DY	yvec1, yvec3, yvec7;
463VPERMILP_DY	$0x05, yvec2, yvec3;
464ADD_DY	yvec14, yvec6, yvec14;
465ADD_DY	yvec12, yvec7, yvec12;
466
467MUL_DY	yvec0, yvec4, yvec6;
468MUL_DY	yvec0, yvec5, yvec7;
469LD_DY	8*SIZE(ptrba), yvec0;
470ADD_DY	yvec11, yvec6, yvec11;
471ADD_DY	yvec9, yvec7, yvec9;
472
473MUL_DY	yvec1, yvec4, yvec6;
474MUL_DY	yvec1, yvec5, yvec7;
475ADD_DY	yvec10, yvec6, yvec10;
476ADD_DY	yvec8, yvec7, yvec8;
477
478//#### Unroll times 2 ####
479PREFETCH0	72*SIZE(ptrba)
480LD_DY	12*SIZE(ptrba), yvec1;
481MUL_DY	yvec0, yvec2, yvec6;
482SHUF_DY	$0x03, yvec2, yvec2, yvec4;
483MUL_DY	yvec0, yvec3, yvec7;
484SHUF_DY	$0x03, yvec3, yvec3, yvec5;
485ADDQ	$16*SIZE, ptrba;
486ADD_DY	yvec15, yvec6, yvec15;
487ADD_DY	yvec13, yvec7, yvec13;
488
489MUL_DY	yvec1, yvec2, yvec6;
490LD_DY	0*SIZE(ptrbb), yvec2;
491MUL_DY	yvec1, yvec3, yvec7;
492VPERMILP_DY	$0x05, yvec2, yvec3;
493ADD_DY	yvec14, yvec6, yvec14;
494ADD_DY	yvec12, yvec7, yvec12;
495
496MUL_DY	yvec0, yvec4, yvec6;
497MUL_DY	yvec0, yvec5, yvec7;
498LD_DY	0*SIZE(ptrba), yvec0;
499ADD_DY	yvec11, yvec6, yvec11;
500ADD_DY	yvec9, yvec7, yvec9;
501
502MUL_DY	yvec1, yvec4, yvec6;
503MUL_DY	yvec1, yvec5, yvec7;
504ADD_DY	yvec10, yvec6, yvec10;
505ADD_DY	yvec8, yvec7, yvec8;
506
507.L3_loopE:
508PREFETCH2	0*SIZE(prebb);
509ADDQ	$8*SIZE, prebb
510#ifndef	TRMMKERNEL
511TEST	$1, bk;
512#else
513MOVQ	kkk, %rax;
514TEST 	$1, %rax;
515#endif
516JLE		.L4_loopE;
517ALIGN_5
518.L4_bodyB:;
519//#### Unroll times 1 ####
520PREFETCH0	64*SIZE(ptrba)
521LD_DY	4*SIZE(ptrba), yvec1;
522MUL_DY	yvec0, yvec2, yvec6;
523SHUF_DY	$0x03, yvec2, yvec2, yvec4;
524MUL_DY	yvec0, yvec3, yvec7;
525SHUF_DY	$0x03, yvec3, yvec3, yvec5;
526ADDQ	$8*SIZE, ptrba;
527ADD_DY	yvec15, yvec6, yvec15;
528ADD_DY	yvec13, yvec7, yvec13;
529
530MUL_DY	yvec1, yvec2, yvec6;
531MUL_DY	yvec1, yvec3, yvec7;
532ADDQ	$4*SIZE, ptrbb;
533ADD_DY	yvec14, yvec6, yvec14;
534ADD_DY	yvec12, yvec7, yvec12;
535
536MUL_DY	yvec0, yvec4, yvec6;
537MUL_DY	yvec0, yvec5, yvec7;
538ADD_DY	yvec11, yvec6, yvec11;
539ADD_DY	yvec9, yvec7, yvec9;
540
541MUL_DY	yvec1, yvec4, yvec6;
542MUL_DY	yvec1, yvec5, yvec7;
543ADD_DY	yvec10, yvec6, yvec10;
544ADD_DY	yvec8, yvec7, yvec8;
545
546.L4_loopE:;
547//#### Load Alpha ####
548BROAD_DY MEMALPHA,yvec7;
549//#### Multiply Alpha ####
550MUL_DY yvec7,yvec15,yvec15;
551MUL_DY yvec7,yvec14,yvec14;
552MUL_DY yvec7,yvec13,yvec13;
553MUL_DY yvec7,yvec12,yvec12;
554MUL_DY yvec7,yvec11,yvec11;
555MUL_DY yvec7,yvec10,yvec10;
556MUL_DY yvec7,yvec9,yvec9;
557MUL_DY yvec7,yvec8,yvec8;
558//#### Reverse the Results ####
559MOV_DY yvec15,yvec7;
560REVS_DY $0x0a,yvec13,yvec15,yvec15;
561REVS_DY $0x0a,yvec7,yvec13,yvec13;
562MOV_DY yvec14,yvec7;
563REVS_DY $0x0a,yvec12,yvec14,yvec14;
564REVS_DY $0x0a,yvec7,yvec12,yvec12;
565MOV_DY yvec11,yvec7;
566REVS_DY $0x0a,yvec9,yvec11,yvec11;
567REVS_DY $0x0a,yvec7,yvec9,yvec9;
568MOV_DY yvec10,yvec7;
569REVS_DY $0x0a,yvec8,yvec10,yvec10;
570REVS_DY $0x0a,yvec7,yvec8,yvec8;
571//#### Testing alignment ####
572MOVQ	C0, %rax;
573OR		ldc, %rax;
574TEST	$15, %rax;
575JNE		.L4_loopEx;					# Unalign part write back
576ALIGN_5
577//#### Writing Back ####
578EXTRA_DY $1,yvec15,xvec7;
579EXTRA_DY $1,yvec14,xvec6;
580EXTRA_DY $1,yvec13,xvec5;
581EXTRA_DY $1,yvec12,xvec4;
582EXTRA_DY $1,yvec11,xvec3;
583EXTRA_DY $1,yvec10,xvec2;
584EXTRA_DY $1,yvec9,xvec1;
585EXTRA_DY $1,yvec8,xvec0;
586#ifndef	TRMMKERNEL
587ADD_DY 0*SIZE(C0),xvec15,xvec15;
588ADD_DY 2*SIZE(C1),xvec7,xvec7;
589ADD_DY 4*SIZE(C0),xvec14,xvec14;
590ADD_DY 6*SIZE(C1),xvec6,xvec6;
591ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13;
592ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5;
593ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12;
594ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4;
595ADD_DY 0*SIZE(C1),xvec11,xvec11;
596ADD_DY 2*SIZE(C0),xvec3,xvec3;
597ADD_DY 4*SIZE(C1),xvec10,xvec10;
598ADD_DY 6*SIZE(C0),xvec2,xvec2;
599ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9;
600ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1;
601ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8;
602ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0;
603#endif
604ST_DY xvec15, 0*SIZE(C0);
605ST_DY xvec7, 2*SIZE(C1);
606ST_DY xvec14, 4*SIZE(C0);
607ST_DY xvec6, 6*SIZE(C1);
608ST_DY xvec13, 0*SIZE(C0,ldc,1);
609ST_DY xvec5, 2*SIZE(C1,ldc,1);
610ST_DY xvec12, 4*SIZE(C0,ldc,1);
611ST_DY xvec4, 6*SIZE(C1,ldc,1);
612ST_DY xvec11, 0*SIZE(C1);
613ST_DY xvec3, 2*SIZE(C0);
614ST_DY xvec10, 4*SIZE(C1);
615ST_DY xvec2, 6*SIZE(C0);
616ST_DY xvec9, 0*SIZE(C1,ldc,1);
617ST_DY xvec1, 2*SIZE(C0,ldc,1);
618ST_DY xvec8, 4*SIZE(C1,ldc,1);
619ST_DY xvec0, 6*SIZE(C0,ldc,1);
620#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
621MOVQ	bk, %rax;
622SUBQ	kkk, %rax;
623LEAQ 	(, %rax, SIZE), %rax;
624LEAQ	(ptrba, %rax, 8), ptrba;
625LEAQ 	(ptrbb, %rax, 4), ptrbb;
626#endif
627#if defined(TRMMKERNEL) && defined(LEFT)
628ADDQ	$8, kk
629#endif
630ADDQ $8*SIZE,C0;
631ADDQ $8*SIZE,C1;
632.L1_bodyE:;
633DECQ i;
634JG .L1_bodyB;
635JMP	.L1_loopE;
636ALIGN_5;
637.L4_loopEx:;
638EXTRA_DY $1, yvec15, xvec7;
639#ifndef	TRMMKERNEL
640LDL_DY	0*SIZE(C0), xvec6, xvec6;
641LDH_DY	1*SIZE(C0), xvec6, xvec6;
642ADD_DY	xvec6, xvec15, xvec15;
643LDL_DY	2*SIZE(C1), xvec5, xvec5;
644LDH_DY	3*SIZE(C1), xvec5, xvec5;
645ADD_DY	xvec5, xvec7, xvec7;
646#endif
647STL_DY	xvec15, 0*SIZE(C0);
648STH_DY	xvec15, 1*SIZE(C0);
649STL_DY	xvec7, 2*SIZE(C1);
650STH_DY	xvec7, 3*SIZE(C1);
651
652EXTRA_DY $1, yvec14, xvec4;
653#ifndef	TRMMKERNEL
654LDL_DY	4*SIZE(C0), xvec3, xvec3;
655LDH_DY	5*SIZE(C0), xvec3, xvec3;
656ADD_DY	xvec3, xvec14, xvec14;
657LDL_DY	6*SIZE(C1), xvec2, xvec2;
658LDH_DY	7*SIZE(C1), xvec2, xvec2;
659ADD_DY	xvec2, xvec4, xvec4;
660#endif
661STL_DY	xvec14, 4*SIZE(C0);
662STH_DY	xvec14, 5*SIZE(C0);
663STL_DY	xvec4, 6*SIZE(C1);
664STH_DY	xvec4, 7*SIZE(C1);
665
666EXTRA_DY $1, yvec13, xvec7;
667#ifndef	TRMMKERNEL
668LDL_DY	0*SIZE(C0, ldc, 1), xvec6, xvec6;
669LDH_DY	1*SIZE(C0, ldc, 1), xvec6, xvec6;
670ADD_DY	xvec6, xvec13, xvec13;
671LDL_DY	2*SIZE(C1, ldc, 1), xvec5, xvec5;
672LDH_DY	3*SIZE(C1, ldc, 1), xvec5, xvec5;
673ADD_DY	xvec5, xvec7, xvec7;
674#endif
675STL_DY	xvec13, 0*SIZE(C0, ldc, 1);
676STH_DY	xvec13, 1*SIZE(C0, ldc, 1);
677STL_DY	xvec7, 2*SIZE(C1, ldc, 1);
678STH_DY	xvec7, 3*SIZE(C1, ldc, 1);
679
680EXTRA_DY $1, yvec12, xvec4;
681#ifndef	TRMMKERNEL
682LDL_DY	4*SIZE(C0, ldc, 1), xvec3, xvec3;
683LDH_DY	5*SIZE(C0, ldc, 1), xvec3, xvec3;
684ADD_DY	xvec3, xvec12, xvec12;
685LDL_DY	6*SIZE(C1, ldc, 1), xvec2, xvec2;
686LDH_DY	7*SIZE(C1, ldc, 1), xvec2, xvec2;
687ADD_DY	xvec2, xvec4, xvec4;
688#endif
689STL_DY	xvec12, 4*SIZE(C0, ldc, 1);
690STH_DY	xvec12, 5*SIZE(C0, ldc ,1);
691STL_DY	xvec4, 6*SIZE(C1, ldc, 1);
692STH_DY	xvec4, 7*SIZE(C1, ldc, 1);
693
694EXTRA_DY $1, yvec11, xvec7;
695#ifndef	TRMMKERNEL
696LDL_DY	0*SIZE(C1), xvec6, xvec6;
697LDH_DY	1*SIZE(C1), xvec6, xvec6;
698ADD_DY	xvec6, xvec11, xvec11;
699LDL_DY	2*SIZE(C0), xvec5, xvec5;
700LDH_DY	3*SIZE(C0), xvec5, xvec5;
701ADD_DY	xvec5, xvec7, xvec7;
702#endif
703STL_DY	xvec11, 0*SIZE(C1);
704STH_DY	xvec11, 1*SIZE(C1);
705STL_DY	xvec7, 2*SIZE(C0);
706STH_DY	xvec7, 3*SIZE(C0);
707
708EXTRA_DY $1, yvec10, xvec4;
709#ifndef	TRMMKERNEL
710LDL_DY	4*SIZE(C1), xvec3, xvec3;
711LDH_DY	5*SIZE(C1), xvec3, xvec3;
712ADD_DY	xvec3, xvec10, xvec10;
713LDL_DY	6*SIZE(C0), xvec2, xvec2;
714LDH_DY	7*SIZE(C0), xvec2, xvec2;
715ADD_DY	xvec2, xvec4, xvec4;
716#endif
717STL_DY	xvec10, 4*SIZE(C1);
718STH_DY	xvec10, 5*SIZE(C1);
719STL_DY	xvec4, 6*SIZE(C0);
720STH_DY	xvec4, 7*SIZE(C0);
721
722EXTRA_DY $1, yvec9, xvec7;
723#ifndef	TRMMKERNEL
724LDL_DY	0*SIZE(C1, ldc, 1), xvec6, xvec6;
725LDH_DY	1*SIZE(C1, ldc, 1), xvec6, xvec6;
726ADD_DY	xvec6, xvec9, xvec9;
727LDL_DY	2*SIZE(C0, ldc, 1), xvec5, xvec5;
728LDH_DY	3*SIZE(C0, ldc ,1), xvec5, xvec5;
729ADD_DY	xvec5, xvec7, xvec7;
730#endif
731STL_DY	xvec9, 0*SIZE(C1, ldc, 1);
732STH_DY	xvec9, 1*SIZE(C1, ldc, 1);
733STL_DY	xvec7, 2*SIZE(C0, ldc, 1);
734STH_DY	xvec7, 3*SIZE(C0, ldc, 1);
735
736EXTRA_DY $1, yvec8, xvec4;
737#ifndef	TRMMKERNEL
738LDL_DY	4*SIZE(C1, ldc, 1), xvec3, xvec3;
739LDH_DY	5*SIZE(C1, ldc, 1), xvec3, xvec3;
740ADD_DY	xvec3, xvec8, xvec8;
741LDL_DY	6*SIZE(C0, ldc, 1), xvec2, xvec2;
742LDH_DY	7*SIZE(C0, ldc, 1), xvec2, xvec2;
743ADD_DY	xvec2, xvec4, xvec4;
744#endif
745STL_DY	xvec8, 4*SIZE(C1, ldc, 1);
746STH_DY	xvec8, 5*SIZE(C1, ldc, 1);
747STL_DY	xvec4, 6*SIZE(C0, ldc, 1);
748STH_DY	xvec4, 7*SIZE(C0, ldc, 1);
749#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
750MOVQ	bk, %rax;
751SUBQ	kkk, %rax;
752LEAQ 	(, %rax, SIZE), %rax;
753LEAQ	(ptrba, %rax, 8), ptrba;
754LEAQ 	(ptrbb, %rax, 4), ptrbb;
755#endif
756#if defined(TRMMKERNEL) && defined(LEFT)
757ADDQ	$8, kk
758#endif
759
760ADDQ	$8*SIZE, C0;
761ADDQ	$8*SIZE, C1;
762DECQ i;
763JG .L1_bodyB;
764ALIGN_5
765.L1_loopE:;
766TEST $4, bm;							# Rm = 4
767JLE	.L5_loopE;
768ALIGN_5
769.L5_bodyB:;
770#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
771MOVQ bb, ptrbb;
772#else
773MOVQ bb, ptrbb;
774MOVQ kk, %rax;
775LEAQ (, %rax, SIZE), %rax;
776LEAQ (ptrba, %rax, 4), ptrba;
777LEAQ (ptrbb, %rax, 4), ptrbb;
778#endif
779//#### Initial Results Register ####
780XOR_DY	yvec15, yvec15, yvec15;
781XOR_DY 	yvec13, yvec13, yvec13;
782LD_DY	0*SIZE(ptrbb), yvec2;
783XOR_DY	yvec11, yvec11, yvec11;
784XOR_DY	yvec9, yvec9, yvec9;
785LD_DY	0*SIZE(ptrba), yvec0;
786VPERMILP_DY	$0x05, yvec2, yvec3;
787#ifndef	TRMMKERNEL
788MOVQ bk, k;
789#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
790MOVQ bk, %rax;
791SUBQ kk, %rax;
792MOVQ %rax, kkk;
793#else
794MOVQ kk, %rax;
795#ifdef	LEFT
796ADDQ $4, %rax;
797#else
798ADDQ $4, %rax;
799#endif
800MOVQ %rax, kkk;
801#endif
802SARQ $2, k;
803JLE .L6_loopE;
804ALIGN_5;
805.L6_bodyB:;
806# Computing kernel
807
808//#### Untoll time 1 ####
809LD_DY	4*SIZE(ptrba), yvec1;
810MUL_DY	yvec0, yvec2, yvec6;
811ADD_DY	yvec15, yvec6, yvec15;
812SHUF_DY	$0x03, yvec2, yvec2, yvec4;
813MUL_DY	yvec0, yvec3, yvec7;
814ADD_DY	yvec13, yvec7, yvec13;
815SHUF_DY	$0x03, yvec3, yvec3, yvec5;
816
817LD_DY	4*SIZE(ptrbb), yvec2;
818MUL_DY	yvec0, yvec4, yvec6;
819ADD_DY	yvec11, yvec6, yvec11;
820VPERMILP_DY	$0x05, yvec2, yvec3;
821MUL_DY	yvec0, yvec5, yvec7;
822ADD_DY	yvec9, yvec7, yvec9;
823
824//#### Untoll time 2 ####
825LD_DY	8*SIZE(ptrba), yvec0;
826MUL_DY	yvec1, yvec2, yvec6;
827ADD_DY	yvec15, yvec6, yvec15;
828SHUF_DY	$0x03, yvec2, yvec2, yvec4;
829MUL_DY	yvec1, yvec3, yvec7;
830ADD_DY	yvec13, yvec7, yvec13;
831SHUF_DY	$0x03, yvec3, yvec3, yvec5;
832
833LD_DY	8*SIZE(ptrbb), yvec2;
834MUL_DY	yvec1, yvec4, yvec6;
835ADD_DY	yvec11, yvec6, yvec11;
836VPERMILP_DY	$0x05, yvec2, yvec3;
837MUL_DY	yvec1, yvec5, yvec7;
838ADD_DY	yvec9, yvec7, yvec9;
839
840//#### Untoll time 3 ####
841LD_DY	12*SIZE(ptrba), yvec1;
842MUL_DY	yvec0, yvec2, yvec6;
843ADD_DY	yvec15, yvec6, yvec15;
844SHUF_DY	$0x03, yvec2, yvec2, yvec4;
845ADDQ	$16*SIZE, ptrba;
846MUL_DY	yvec0, yvec3, yvec7;
847ADD_DY	yvec13, yvec7, yvec13;
848SHUF_DY	$0x03, yvec3, yvec3, yvec5;
849
850LD_DY	12*SIZE(ptrbb), yvec2;
851MUL_DY	yvec0, yvec4, yvec6;
852ADD_DY	yvec11, yvec6, yvec11;
853VPERMILP_DY	$0x05, yvec2, yvec3;
854ADDQ	$16*SIZE, ptrbb;
855MUL_DY	yvec0, yvec5, yvec7;
856ADD_DY	yvec9, yvec7, yvec9;
857
858//#### Untoll time 4 ####
859LD_DY	0*SIZE(ptrba), yvec0;
860MUL_DY	yvec1, yvec2, yvec6;
861ADD_DY	yvec15, yvec6, yvec15;
862SHUF_DY	$0x03, yvec2, yvec2, yvec4;
863MUL_DY	yvec1, yvec3, yvec7;
864ADD_DY	yvec13, yvec7, yvec13;
865SHUF_DY	$0x03, yvec3, yvec3, yvec5;
866
867LD_DY	0*SIZE(ptrbb), yvec2;
868MUL_DY	yvec1, yvec4, yvec6;
869ADD_DY	yvec11, yvec6, yvec11;
870VPERMILP_DY	$0x05, yvec2, yvec3;
871MUL_DY	yvec1, yvec5, yvec7;
872ADD_DY	yvec9, yvec7, yvec9;
873DECQ k;
874JG .L6_bodyB;
875ALIGN_5
876.L6_loopE:;
877#ifndef	TRMMKERNEL
878TEST $2, bk;
879#else
880MOVQ kkk, %rax;
881TEST $2, %rax;
882#endif
883JLE .L7_loopE;
884ALIGN_5
885.L7_bodyB:;
886//#### Untoll time 1 ####
887LD_DY	4*SIZE(ptrba), yvec1;
888MUL_DY	yvec0, yvec2, yvec6;
889ADD_DY	yvec15, yvec6, yvec15;
890SHUF_DY	$0x03, yvec2, yvec2, yvec4;
891ADDQ	$8*SIZE, ptrba;
892MUL_DY	yvec0, yvec3, yvec7;
893ADD_DY	yvec13, yvec7, yvec13;
894SHUF_DY	$0x03, yvec3, yvec3, yvec5;
895
896LD_DY	4*SIZE(ptrbb), yvec2;
897MUL_DY	yvec0, yvec4, yvec6;
898ADD_DY	yvec11, yvec6, yvec11;
899VPERMILP_DY	$0x05, yvec2, yvec3;
900ADDQ	$8*SIZE, ptrbb;
901MUL_DY	yvec0, yvec5, yvec7;
902ADD_DY	yvec9, yvec7, yvec9;
903
904//#### Untoll time 2 ####
905LD_DY	0*SIZE(ptrba), yvec0;
906MUL_DY	yvec1, yvec2, yvec6;
907ADD_DY	yvec15, yvec6, yvec15;
908SHUF_DY	$0x03, yvec2, yvec2, yvec4;
909MUL_DY	yvec1, yvec3, yvec7;
910ADD_DY	yvec13, yvec7, yvec13;
911SHUF_DY	$0x03, yvec3, yvec3, yvec5;
912
913LD_DY	0*SIZE(ptrbb), yvec2;
914MUL_DY	yvec1, yvec4, yvec6;
915ADD_DY	yvec11, yvec6, yvec11;
916VPERMILP_DY	$0x05, yvec2, yvec3;
917MUL_DY	yvec1, yvec5, yvec7;
918ADD_DY	yvec9, yvec7, yvec9;
919
920.L7_loopE:;
921#ifndef	TRMMKERNEL
922TEST $1, bk
923#else
924MOVQ kkk, %rax;
925TEST $1, %rax;
926#endif
927JLE .L8_loopE;
928ALIGN_5
929.L8_bodyB:;
930//#### Untoll time 1 ####
931MUL_DY	yvec0, yvec2, yvec6;
932ADD_DY	yvec15, yvec6, yvec15;
933SHUF_DY	$0x03, yvec2, yvec2, yvec4;
934ADDQ	$4*SIZE, ptrba;
935MUL_DY	yvec0, yvec3, yvec7;
936ADD_DY	yvec13, yvec7, yvec13;
937SHUF_DY	$0x03, yvec3, yvec3, yvec5;
938
939MUL_DY	yvec0, yvec4, yvec6;
940ADD_DY	yvec11, yvec6, yvec11;
941ADDQ	$4*SIZE, ptrbb;
942MUL_DY	yvec0, yvec5, yvec7;
943ADD_DY	yvec9, yvec7, yvec9;
944
945.L8_loopE:;
946//#### Load Alpha ####
947BROAD_DY	MEMALPHA, yvec7;
948//#### Multiply Alpha ####
949MUL_DY yvec7,yvec15,yvec15;
950MUL_DY yvec7,yvec13,yvec13;
951MUL_DY yvec7,yvec11,yvec11;
952MUL_DY yvec7,yvec9,yvec9;
953//#### Reverse the Results ####
954MOV_DY	yvec15, yvec7;
955REVS_DY $0x0a,yvec13,yvec15,yvec15;
956REVS_DY $0x0a,yvec7,yvec13,yvec13;
957MOV_DY yvec11,yvec7;
958REVS_DY $0x0a,yvec9,yvec11,yvec11;
959REVS_DY $0x0a,yvec7,yvec9,yvec9;
960//#### Testing alignment ####
961MOVQ	C0, %rax;
962OR		ldc, %rax;
963TEST	$15, %rax;
964JNE		.L8_loopEx;					# Unalign part write back
965ALIGN_5
966//#### Writing Back ####
967EXTRA_DY $1,yvec15,xvec7;
968EXTRA_DY $1,yvec13,xvec5;
969EXTRA_DY $1,yvec11,xvec3;
970EXTRA_DY $1,yvec9,xvec1;
971#ifndef	TRMMKERNEL
972ADD_DX 0*SIZE(C0), xvec15, xvec15;
973ADD_DX 2*SIZE(C1), xvec7, xvec7;
974ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13;
975ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5;
976ADD_DX 0*SIZE(C1), xvec11, xvec11;
977ADD_DX 2*SIZE(C0), xvec3, xvec3;
978ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9;
979ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1;
980#endif
981ST_DX xvec15, 0*SIZE(C0);
982ST_DX xvec7, 2*SIZE(C1);
983ST_DX xvec13, 0*SIZE(C0,ldc,1);
984ST_DX xvec5, 2*SIZE(C1,ldc,1);
985ST_DX xvec11, 0*SIZE(C1);
986ST_DX xvec3, 2*SIZE(C0);
987ST_DX xvec9, 0*SIZE(C1,ldc,1);
988ST_DX xvec1, 2*SIZE(C0,ldc,1);
989#if	(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
990MOVQ	bk, %rax;
991SUBQ	kkk, %rax;
992LEAQ	(, %rax, SIZE), %rax;
993LEAQ	(ptrba, %rax, 4), ptrba;
994LEAQ	(ptrbb, %rax, 4), ptrbb;
995#endif
996#if	defined(TRMMKERNEL)&&defined(LEFT)
997ADDQ	$4, kk
998#endif
999ADDQ $4*SIZE, C0;
1000ADDQ $4*SIZE, C1;
1001JMP	.L5_loopE;
1002ALIGN_5
1003.L8_loopEx:;
1004EXTRA_DY $1,yvec15,xvec7;
1005EXTRA_DY $1,yvec13,xvec5;
1006EXTRA_DY $1,yvec11,xvec3;
1007EXTRA_DY $1,yvec9,xvec1;
1008#ifndef	TRMMKERNEL
1009LDL_DX 0*SIZE(C0), xvec14, xvec14;
1010LDH_DX 1*SIZE(C0), xvec14, xvec14;
1011LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12;
1012LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12;
1013LDL_DX 0*SIZE(C1), xvec10, xvec10;
1014LDH_DX 1*SIZE(C1), xvec10, xvec10;
1015LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
1016LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8;
1017ADD_DX xvec14, xvec15, xvec15;
1018ADD_DX xvec12, xvec13, xvec13;
1019ADD_DX xvec10, xvec11, xvec11;
1020ADD_DX xvec8, xvec9, xvec9;
1021#endif
1022STL_DX	xvec15, 0*SIZE(C0);
1023STH_DX	xvec15, 1*SIZE(C0);
1024STL_DX	xvec13, 0*SIZE(C0, ldc, 1);
1025STH_DX	xvec13, 1*SIZE(C0, ldc, 1);
1026STL_DX	xvec11, 0*SIZE(C1);
1027STH_DX	xvec11, 1*SIZE(C1);
1028STL_DX	xvec9,	0*SIZE(C1, ldc, 1);
1029STH_DX	xvec9, 	1*SIZE(C1, ldc, 1);
1030#ifndef	TRMMKERNEL
1031LDL_DX 2*SIZE(C0), xvec0, xvec0;
1032LDH_DX 3*SIZE(C0), xvec0, xvec0;
1033LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2;
1034LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2;
1035LDL_DX 2*SIZE(C1), xvec4, xvec4;
1036LDH_DX 3*SIZE(C1), xvec4, xvec4;
1037LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6;
1038LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6;
1039ADD_DX xvec0, xvec3, xvec3;
1040ADD_DX xvec2, xvec1, xvec1;
1041ADD_DX xvec4, xvec7, xvec7;
1042ADD_DX xvec6, xvec5, xvec5;
1043#endif
1044STL_DX	xvec3, 2*SIZE(C0);
1045STH_DX	xvec3, 3*SIZE(C0);
1046STL_DX	xvec1, 2*SIZE(C0, ldc, 1);
1047STH_DX	xvec1, 3*SIZE(C0, ldc, 1);
1048STL_DX	xvec7, 2*SIZE(C1);
1049STH_DX	xvec7, 3*SIZE(C1);
1050STL_DX	xvec5, 2*SIZE(C1, ldc, 1);
1051STH_DX	xvec5, 3*SIZE(C1, ldc, 1);
1052#if	(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
1053MOVQ	bk, %rax;
1054SUBQ	kkk, %rax;
1055LEAQ	(, %rax, SIZE), %rax;
1056LEAQ	(ptrba, %rax, 4), ptrba;
1057LEAQ	(ptrbb, %rax, 4), ptrbb;
1058#endif
1059#if	defined(TRMMKERNEL)&&defined(LEFT)
1060ADDQ	$4, kk
1061#endif
1062
1063ADDQ	$4*SIZE, C0;
1064ADDQ	$4*SIZE, C1;
1065.L5_loopE:;
1066TEST $2, bm;
1067JLE .L9_loopE;
1068ALIGN_5
1069.L9_bodyB:;
1070#if	!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
1071MOVQ bb, ptrbb;
1072#else
1073MOVQ bb, ptrbb;
1074MOVQ kk, %rax;
1075LEAQ (, %rax, SIZE), %rax;
1076LEAQ (ptrba, %rax, 2), ptrba;
1077LEAQ (ptrbb, %rax, 4), ptrbb
1078#endif
1079//#### Initial Results Register ####
1080LD_DX	0*SIZE(ptrbb), xvec2;
1081XOR_DY yvec15, yvec15, yvec15;
1082LD_DX	2*SIZE(ptrbb), xvec3;
1083XOR_DY yvec13, yvec13, yvec13;
1084LD_DX	0*SIZE(ptrba), xvec0;
1085XOR_DY yvec11, yvec11, yvec11;
1086SHUF_DX	$0x4e, xvec2, xvec4;
1087XOR_DY yvec9, yvec9, yvec9;
1088#ifndef	TRMMKERNEL
1089MOVQ bk, k;
1090#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
1091MOVQ	bk, %rax;
1092SUBQ 	kk, %rax;
1093MOVQ 	%rax, kkk;
1094#else
1095MOVQ	kk, %rax;
1096#ifdef	LEFT
1097ADDQ	$2, %rax;
1098#else
1099ADDQ	$4, %rax;
1100#endif
1101MOVQ	%rax, kkk;
1102#endif
1103SARQ $2, k;
//== m%2 x n=4 tile: main K loop, unrolled 4x ==========================
// Accumulators: xvec15/xvec13 (C cols 0-1), xvec11/xvec9 (C cols 2-3),
// two doubles each.  Each unroll consumes 2 doubles of packed A (ptrba)
// and 4 doubles of packed B (ptrbb).  SHUF_DX $0x4e swaps the two
// 64-bit halves of an xmm (macro from common.h -- NOTE(review): confirm
// there); it pre-rotates B so each A pair meets both B orderings.
// On entry xvec0 holds the current A pair and xvec2/3/4 hold B data
// prepared by the (not shown) loop preamble.
JLE .L10_loopE;
ALIGN_5;
.L10_bodyB:;
# Computing kernel

//#### Unroll time 1 ####
LD_DX	4*SIZE(ptrbb), xvec6;
SHUF_DX	$0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX	6*SIZE(ptrbb), xvec7;
MUL_DX xvec0, xvec3, xvec3;
ADD_DX xvec3, xvec11, xvec11;

LD_DX	2*SIZE(ptrba), xvec1;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;
SHUF_DX	$0x4e, xvec6, xvec4;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;

//#### Unroll time 2 ####
LD_DX	8*SIZE(ptrbb), xvec2;
SHUF_DX	$0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
ADD_DX xvec6, xvec15, xvec15;

LD_DX	10*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec7, xvec7;
ADD_DX xvec7, xvec11, xvec11;

LD_DX	4*SIZE(ptrba), xvec0;
MUL_DX xvec1, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;
SHUF_DX	$0x4e, xvec2, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;

//#### Unroll time 3 ####
LD_DX	12*SIZE(ptrbb), xvec6;
SHUF_DX	$0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX	14*SIZE(ptrbb), xvec7;
MUL_DX xvec0, xvec3, xvec3;
ADD_DX xvec3, xvec11, xvec11;
ADDQ	$16*SIZE, ptrbb;	// B advances 4 cols * 4 k-steps per iteration

LD_DX	6*SIZE(ptrba), xvec1;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;
SHUF_DX	$0x4e, xvec6, xvec4;
ADDQ	$8*SIZE, ptrba;		// A advances 2 rows * 4 k-steps per iteration
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;

//#### Unroll time 4 ####
// Unroll 4 already loads from the *advanced* pointers (offset 0/2),
// software-pipelining the first loads of the next iteration.
LD_DX	0*SIZE(ptrbb), xvec2;
SHUF_DX	$0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
ADD_DX xvec6, xvec15, xvec15;

LD_DX	2*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec7, xvec7;
ADD_DX xvec7, xvec11, xvec11;

LD_DX	0*SIZE(ptrba), xvec0;
MUL_DX xvec1, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;
SHUF_DX	$0x4e, xvec2, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
DECQ k;
JG .L10_bodyB;
ALIGN_5
.L10_loopE:;
//== m%2 x n=4 tile: K remainder =======================================
// .L11 handles K%4 >= 2 (two k-steps), .L12 handles the final odd step.
// For TRMM the effective depth is kkk, not bk.  The registers prepared
// by the preceding loop (xvec0, xvec2-xvec5) are live on entry here.
#ifndef	TRMMKERNEL
TEST $2, bk
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L11_loopE;
ALIGN_5
.L11_bodyB:;
//#### Unroll time 1 ####
LD_DX	4*SIZE(ptrbb), xvec6;
SHUF_DX	$0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX	6*SIZE(ptrbb), xvec7;
MUL_DX xvec0, xvec3, xvec3;
ADD_DX xvec3, xvec11, xvec11;
ADDQ	$8*SIZE, ptrbb;		// 4 cols * 2 k-steps

LD_DX	2*SIZE(ptrba), xvec1;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;
SHUF_DX	$0x4e, xvec6, xvec4;
ADDQ	$4*SIZE, ptrba;		// 2 rows * 2 k-steps

MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;

//#### Unroll time 2 ####
LD_DX	0*SIZE(ptrbb), xvec2;
SHUF_DX	$0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
ADD_DX xvec6, xvec15, xvec15;

LD_DX	2*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec7, xvec7;
ADD_DX xvec7, xvec11, xvec11;

LD_DX	0*SIZE(ptrba), xvec0;
MUL_DX xvec1, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;
SHUF_DX	$0x4e, xvec2, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;

.L11_loopE:;
#ifndef	TRMMKERNEL
TEST $1, bk
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L12_loopE;
ALIGN_5
.L12_bodyB:;
// Last odd k-step: A pair already in xvec0, B already in xvec2-xvec4.
SHUF_DX	$0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;
ADDQ	$4*SIZE, ptrbb;

MUL_DX xvec0, xvec3, xvec3;
ADD_DX xvec3, xvec11, xvec11;
ADDQ	$2*SIZE, ptrba;

MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec13, xvec13;

MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;

.L12_loopE:;
//== m%2 x n=4 tile: scale by alpha and write back =====================
// REVS_DX re-pairs the half-swapped accumulators so each xmm holds one
// contiguous C column pair (macro from common.h -- NOTE(review): confirm
// exact shuffle there).  If C0 and ldc*SIZE are both 16-byte aligned the
// full-vector path is used, otherwise .L12_loopEx stores halves via
// LDL/LDH/STL/STH.  Without TRMMKERNEL, results are accumulated into C.
//#### Load Alpha ####
BROAD_DX MEMALPHA, xvec7;
//#### Multiply Alpha ####
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec13, xvec13;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec9, xvec9;
//#### Reverse the Results ####
MOV_DX	xvec15, xvec6;
REVS_DX xvec13, xvec15, xvec15;
REVS_DX xvec6, xvec13, xvec13;
MOV_DX	xvec11, xvec6;
REVS_DX xvec9, xvec11, xvec11;
REVS_DX xvec6, xvec9, xvec9;
//#### Testing Alignment ####
MOVQ	C0, %rax;
OR		ldc, %rax;
TEST	$15, %rax;			// any low bit set => some address unaligned
JNE		.L12_loopEx;
ALIGN_5
//#### Writing Back (16-byte aligned path) ####
#ifndef	TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec13, xvec13;
ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15;
ADD_DX 0*SIZE(C1), xvec9, xvec9;
ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11;
#endif
ST_DX	xvec13, 0*SIZE(C0);
ST_DX	xvec15, 0*SIZE(C0, ldc, 1);
ST_DX	xvec9, 0*SIZE(C1);
ST_DX	xvec11, 0*SIZE(C1, ldc, 1);
// TRMM: skip the part of packed A/B this tile did not consume.
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ	(ptrba, %rax, 2), ptrba;
LEAQ	(ptrbb, %rax, 4), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$2, kk
#endif
ADDQ	$2*SIZE, C0
ADDQ	$2*SIZE, C1
JMP		.L9_loopE;
ALIGN_5
.L12_loopEx:
//#### Writing Back (unaligned path: 64-bit half loads/stores) ####
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec14, xvec14;
LDH_DX 1*SIZE(C0), xvec14, xvec14;
LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12;
LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12;
LDL_DX 0*SIZE(C1), xvec10, xvec10;
LDH_DX 1*SIZE(C1), xvec10, xvec10;
LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8;
LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8;
ADD_DX xvec14, xvec13, xvec13;
ADD_DX xvec12, xvec15, xvec15;
ADD_DX xvec10, xvec9, xvec9;
ADD_DX xvec8, xvec11, xvec11;
#endif
STL_DX	xvec13, 0*SIZE(C0);
STH_DX	xvec13, 1*SIZE(C0);
STL_DX	xvec15, 0*SIZE(C0, ldc, 1);
STH_DX	xvec15, 1*SIZE(C0, ldc, 1);
STL_DX	xvec9,	0*SIZE(C1);
STH_DX	xvec9,	1*SIZE(C1);
STL_DX	xvec11,	0*SIZE(C1, ldc, 1);
STH_DX	xvec11,	1*SIZE(C1, ldc, 1);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ	(ptrba, %rax, 2), ptrba;
LEAQ	(ptrbb, %rax, 4), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$2, kk
#endif
ADDQ	$2*SIZE, C0;
ADDQ	$2*SIZE, C1;
.L9_loopE:;
//== m%1 x n=4 tile: setup =============================================
// Last single row of this 4-column panel.  Accumulates into yvec15
// (4 doubles = one full C row slice).  For TRMM, ptrba/ptrbb are
// advanced past the kk triangular offset and kkk = effective depth.
TEST $1, bm
JLE	.L13_loopE;
ALIGN_5
.L13_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
ADDQ	%rax, ptrba;		// 1 row: A stride is 1*SIZE per k
LEAQ (ptrbb, %rax, 4), ptrbb;	// 4 cols: B stride is 4*SIZE per k
#endif
//#### Initial Results Register ####
XOR_DY	yvec15, yvec15, yvec15;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kk, %rax;
MOVQ	%rax, kkk;
#else
MOVQ	kk, %rax;
#ifdef	LEFT
ADDQ	$1, %rax;		// LEFT: 1 row in this tile
#else
ADDQ	$4, %rax;		// RIGHT: 4 columns in this panel
#endif
MOVQ	%rax, kkk;
#endif
SARQ $2, k;			// k = depth / 4 (4x unrolled loop)
JLE .L14_loopE;
ALIGN_5
//== m%1 x n=4 tile: K loops ===========================================
// Each k-step: broadcast one A element to all 4 lanes, multiply by a
// 4-double row of B, accumulate into yvec15.  .L14 is the 4x-unrolled
// loop, .L15 handles K%4 >= 2, .L16 the final odd step.
.L14_bodyB:;
BROAD_DY 0*SIZE(ptrba), yvec0;
LD_DY	0*SIZE(ptrbb), yvec2;
MUL_DY	yvec0, yvec2, yvec6;
ADD_DY	yvec15, yvec6, yvec15;

BROAD_DY 1*SIZE(ptrba), yvec1;
LD_DY	4*SIZE(ptrbb), yvec3;
MUL_DY	yvec1, yvec3, yvec7;
ADD_DY	yvec15, yvec7, yvec15;

BROAD_DY 2*SIZE(ptrba), yvec0;
LD_DY	8*SIZE(ptrbb), yvec2;
MUL_DY	yvec0, yvec2, yvec6;
ADD_DY	yvec15, yvec6, yvec15;

BROAD_DY 3*SIZE(ptrba), yvec1;
LD_DY	12*SIZE(ptrbb), yvec3;
MUL_DY	yvec1, yvec3, yvec7;
ADD_DY	yvec15, yvec7, yvec15;
ADDQ	$4*SIZE, ptrba;
ADDQ	$16*SIZE, ptrbb;
DECQ	k;
JG	.L14_bodyB;
ALIGN_5
.L14_loopE:
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L15_loopE;
ALIGN_5
.L15_bodyB:
// Two remaining k-steps.
BROAD_DY 0*SIZE(ptrba), yvec0;
LD_DY	0*SIZE(ptrbb), yvec2;
MUL_DY	yvec0, yvec2, yvec6;
ADD_DY	yvec15, yvec6, yvec15;

BROAD_DY 1*SIZE(ptrba), yvec1;
LD_DY	4*SIZE(ptrbb), yvec3;
MUL_DY	yvec1, yvec3, yvec7;
ADD_DY	yvec15, yvec7, yvec15;
ADDQ	$2*SIZE, ptrba;
ADDQ	$8*SIZE, ptrbb;
.L15_loopE:;
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L16_loopE;
ALIGN_5
.L16_bodyB:;
// Final odd k-step.
BROAD_DY 0*SIZE(ptrba), yvec0;
LD_DY	0*SIZE(ptrbb), yvec2;
MUL_DY	yvec0, yvec2, yvec6;
ADD_DY	yvec15, yvec6, yvec15;
ADDQ	$1*SIZE, ptrba;
ADDQ	$4*SIZE, ptrbb;

.L16_loopE:
//== m%1 x n=4 tile: scale and write back; close the n/4 loop ==========
// yvec15 holds one result per C column; EXTRA_DY extracts the high
// 128 bits into xvec7.  Scalar (64-bit) stores are used since each
// column receives exactly one double.
//#### Load Alpha ####
BROAD_DY MEMALPHA, yvec7;
//#### Multiply Alpha ####
MUL_DY	yvec15, yvec7, yvec15;
//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec7;	// xvec7 = results for columns 2,3
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0;
LDL_DX 0*SIZE(C1), xvec1, xvec1;
LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1;
ADD_DX xvec0, xvec15, xvec15;
ADD_DX xvec1, xvec7, xvec7;
#endif
STL_DX	xvec15, 0*SIZE(C0);
STH_DX	xvec15, 0*SIZE(C0, ldc, 1);
STL_DX	xvec7, 0*SIZE(C1);
STH_DX	xvec7, 0*SIZE(C1, ldc, 1);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
ADDQ	%rax, ptrba;
LEAQ	(ptrbb, %rax, 4), ptrbb;
#endif
#if defined(TRMMKERNEL)&&defined(LEFT)
ADDQ	$1, kk
#endif
ADDQ	$1*SIZE, C0
ADDQ	$1*SIZE, C1
.L13_loopE:;
// Bottom of the n/4 panel loop: advance B by bk*4 doubles (SALQ $5 ==
// *32 == 4 cols * 8 bytes, assuming SIZE==8 -- TODO confirm) and C by
// 4 columns, then iterate.
#if	defined(TRMMKERNEL)&&!defined(LEFT)
ADDQ	$4, kk
#endif
MOVQ bk,k;
SALQ $5,k;
ADDQ k,bb;
LEAQ (C,ldc,4),C;
.L0_bodyE:;
DECQ j;
JG .L0_bodyB;
ALIGN_5;
.L0_loopE:;
// Remaining columns: handle n%2.
TEST $2, bn;
JLE .L20_loopE;
ALIGN_5;
//== n%2 panel: setup, then m/8 tiles ==================================
// C0/C1 point at the two output columns; i counts m/8 tiles.
// The 8x2 tile keeps 8 xmm accumulators (xvec8-xvec15), two C elements
// each.
.L20_loopB:;
#if defined(TRMMKERNEL) && defined(LEFT)
MOVQ	OFFSET, %rax;
MOVQ	%rax, kk
#endif
MOVQ C, C0;
MOVQ ba, ptrba;
MOVQ bm, i;
SARQ $3, i;					# Rm = 8
JLE	.L21_loopE;
ALIGN_5;
.L21_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;	// 8 rows per k-step
LEAQ (ptrbb, %rax, 2), ptrbb;	// 2 cols per k-step
#endif
//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
XOR_DY yvec13, yvec13, yvec13;
XOR_DY yvec12, yvec12, yvec12;
XOR_DY yvec11, yvec11, yvec11;
XOR_DY yvec10, yvec10, yvec10;
XOR_DY yvec9, yvec9, yvec9;
XOR_DY yvec8, yvec8, yvec8;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kk, %rax;
MOVQ	%rax, kkk;
#else
MOVQ	kk, %rax;
#ifdef	LEFT
ADDQ	$8, %rax;
#else
ADDQ	$2, %rax;
#endif
MOVQ	%rax, kkk;
#endif
SARQ $2, k;			// k = depth / 4
JLE .L211_loopE;
ALIGN_5;
//== 8x2 tile: main K loop, unrolled 4x ================================
// Per unroll: 8 doubles of A (xvec0-xvec3) against one B pair (xvec4,
// copied down the chain xvec4->5->6->7 so each A pair gets its own
// multiplicand).  First half accumulates into xvec15-xvec12, then the
// half-swapped B (SHUF_DX $0x4e) into xvec11-xvec8.
.L211_bodyB:
# Computing kernel
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	2*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	4*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	6*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;	// xvec4 = B pair swapped, for 2nd half
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;

//#### Unroll time 2 ####
LD_DX	8*SIZE(ptrba), xvec0;
LD_DX	2*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	10*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	12*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	14*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;

//#### Unroll time 3 ####
LD_DX	16*SIZE(ptrba), xvec0;
LD_DX	4*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	18*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	20*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	22*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;

//#### Unroll time 4 ####
LD_DX	24*SIZE(ptrba), xvec0;
LD_DX	6*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$8*SIZE, ptrbb;		// B: 2 cols * 4 k-steps

LD_DX	26*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	28*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	30*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;
ADDQ	$32*SIZE, ptrba;	// A: 8 rows * 4 k-steps

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
DECQ	k;
JG	.L211_bodyB;
ALIGN_5
.L211_loopE:
//== 8x2 tile: K remainder =============================================
// .L212 handles K%4 >= 2 (two k-steps), .L213 the final odd step; same
// dataflow as one unroll of .L211.
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L212_loopE;
ALIGN_5;
.L212_bodyB:
# Computing kernel
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	2*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	4*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	6*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;

//#### Unroll time 2 ####
LD_DX	8*SIZE(ptrba), xvec0;
LD_DX	2*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$4*SIZE, ptrbb;		// 2 cols * 2 k-steps

LD_DX	10*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	12*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	14*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;
ADDQ	$16*SIZE, ptrba;	// 8 rows * 2 k-steps

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;

.L212_loopE:
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L213_loopE;
ALIGN_5
.L213_bodyB:
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$2*SIZE, ptrbb;

LD_DX	2*SIZE(ptrba), xvec1;
MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

LD_DX	4*SIZE(ptrba), xvec2;
MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec13, xvec13;

LD_DX	6*SIZE(ptrba), xvec3;
SHUF_DX	$0x4e, xvec7, xvec4;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec12, xvec12;
ADDQ	$8*SIZE, ptrba;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MOV_DX	xvec5, xvec6;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

MOV_DX	xvec6, xvec7;
MUL_DX xvec2, xvec6, xvec6;
ADD_DX xvec6, xvec9, xvec9;

MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;

.L213_loopE:
//== 8x2 tile: scale by alpha, re-pair halves, write back ==============
// After REVS_DX, xvec11-xvec8 hold column C0 and xvec15-xvec12 column
// C1, 2 doubles per register.  Aligned path uses full-vector ops;
// .L213_loopEx falls back to 64-bit half loads/stores.
//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec14, xvec14;
MUL_DX xvec7, xvec13, xvec13;
MUL_DX xvec7, xvec12, xvec12;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec10, xvec10;
MUL_DX xvec7, xvec9, xvec9;
MUL_DX xvec7, xvec8, xvec8;
//#### Reverse ####
MOV_DX	xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
MOV_DX	xvec14, xvec6;
REVS_DX xvec10, xvec14, xvec14;
REVS_DX xvec6, xvec10, xvec10;
MOV_DX	xvec13, xvec6;
REVS_DX xvec9, xvec13, xvec13;
REVS_DX xvec6, xvec9, xvec9;
MOV_DX	xvec12, xvec6;
REVS_DX xvec8, xvec12, xvec12;
REVS_DX xvec6, xvec8, xvec8;
//#### Testing Alignment ####
MOVQ	C0, %rax;
OR		ldc, %rax;
TEST	$15, %rax;
JNE		.L213_loopEx;
ALIGN_5
//#### Writing Back (aligned path) ####
#ifndef	TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 2*SIZE(C0), xvec10, xvec10;
ADD_DX 4*SIZE(C0), xvec9, xvec9;
ADD_DX 6*SIZE(C0), xvec8, xvec8;
ADD_DX 0*SIZE(C1), xvec15, xvec15;
ADD_DX 2*SIZE(C1), xvec14, xvec14;
ADD_DX 4*SIZE(C1), xvec13, xvec13;
ADD_DX 6*SIZE(C1), xvec12, xvec12;
#endif
ST_DX	xvec11,	0*SIZE(C0);
ST_DX	xvec10,	2*SIZE(C0);
ST_DX	xvec9,	4*SIZE(C0);
ST_DX	xvec8,	6*SIZE(C0);
ST_DX	xvec15, 0*SIZE(C1);
ST_DX	xvec14, 2*SIZE(C1);
ST_DX	xvec13, 4*SIZE(C1);
ST_DX	xvec12, 6*SIZE(C1);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ 	(ptrba, %rax, 8), ptrba;
LEAQ	(ptrbb, %rax, 2), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$8, kk
#endif
ADDQ	$8*SIZE, C0;
ADDQ	$8*SIZE, C1;
DECQ i;
JG	.L21_bodyB;
JMP	.L21_loopE;
ALIGN_5
.L213_loopEx:;
//#### Writing Back (unaligned path) ####
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 1*SIZE(C0), xvec0, xvec0;
LDL_DX 2*SIZE(C0), xvec1, xvec1;
LDH_DX 3*SIZE(C0), xvec1, xvec1;
LDL_DX 4*SIZE(C0), xvec2, xvec2;
LDH_DX 5*SIZE(C0), xvec2, xvec2;
LDL_DX 6*SIZE(C0), xvec3, xvec3;
LDH_DX 7*SIZE(C0), xvec3, xvec3;
ADD_DX xvec0, xvec11, xvec11;
ADD_DX xvec1, xvec10, xvec10;
ADD_DX xvec2, xvec9, xvec9;
ADD_DX xvec3, xvec8, xvec8;
#endif
STL_DX	xvec11, 0*SIZE(C0);
STH_DX	xvec11, 1*SIZE(C0);
STL_DX	xvec10, 2*SIZE(C0);
STH_DX	xvec10, 3*SIZE(C0);
STL_DX	xvec9, 4*SIZE(C0);
STH_DX	xvec9, 5*SIZE(C0);
STL_DX	xvec8, 6*SIZE(C0);
STH_DX	xvec8, 7*SIZE(C0);
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C1), xvec4, xvec4;
LDH_DX 1*SIZE(C1), xvec4, xvec4;
LDL_DX 2*SIZE(C1), xvec5, xvec5;
LDH_DX 3*SIZE(C1), xvec5, xvec5;
LDL_DX 4*SIZE(C1), xvec6, xvec6;
LDH_DX 5*SIZE(C1), xvec6, xvec6;
LDL_DX 6*SIZE(C1), xvec7, xvec7;
LDH_DX 7*SIZE(C1), xvec7, xvec7;
ADD_DX xvec4, xvec15, xvec15;
ADD_DX xvec5, xvec14, xvec14;
ADD_DX xvec6, xvec13, xvec13;
ADD_DX xvec7, xvec12, xvec12;
#endif
STL_DX	xvec15, 0*SIZE(C1);
STH_DX	xvec15, 1*SIZE(C1);
STL_DX	xvec14, 2*SIZE(C1);
STH_DX	xvec14, 3*SIZE(C1);
STL_DX	xvec13, 4*SIZE(C1);
STH_DX	xvec13, 5*SIZE(C1);
STL_DX	xvec12, 6*SIZE(C1);
STH_DX	xvec12, 7*SIZE(C1);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ 	(ptrba, %rax, 8), ptrba;
LEAQ	(ptrbb, %rax, 2), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$8, kk
#endif
ADDQ	$8*SIZE, C0;
ADDQ	$8*SIZE, C1;
DECQ	i;
JG	.L21_bodyB;
.L21_loopE:;
//== m%4 x n=2 tile: setup =============================================
// Four accumulators (xvec15/14 vs xvec11/10) cover the straight and
// half-swapped B orderings for 4 rows x 2 columns.
TEST $4, bm;				# Rm = 4
JLE .L22_loopE;
ALIGN_5;
.L22_bodyB:;
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ	(ptrba, %rax, 4), ptrba;	// 4 rows per k-step
LEAQ	(ptrbb, %rax, 2), ptrbb;	// 2 cols per k-step
#endif
//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
XOR_DY yvec11, yvec11, yvec11;
XOR_DY yvec10, yvec10, yvec10;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kk, %rax;
MOVQ	%rax, kkk;
#else
MOVQ	kk, %rax;
#ifdef	LEFT
ADDQ	$4, %rax;
#else
ADDQ	$2, %rax;
#endif
MOVQ	%rax, kkk;
#endif
SARQ $2, k;			// k = depth / 4
JLE	.L221_loopE;
ALIGN_5
//== 4x2 tile: K loops =================================================
// Per k-step: two A pairs (xvec0, xvec1) against the B pair (xvec4) and
// its half-swapped copy.  .L221 is the 4x-unrolled loop; .L222 handles
// K%4 >= 2, .L223 the final odd step.
.L221_bodyB:;
# Computing kernel
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	2*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;	// swapped B for the second accumulator set
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

//#### Unroll time 2 ####
LD_DX	4*SIZE(ptrba), xvec0;
LD_DX	2*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	6*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

//#### Unroll time 3 ####
LD_DX	8*SIZE(ptrba), xvec0;
LD_DX	4*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	10*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

//#### Unroll time 4 ####
LD_DX	12*SIZE(ptrba), xvec0;
LD_DX	6*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$8*SIZE, ptrbb;		// 2 cols * 4 k-steps

LD_DX	14*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;
ADDQ	$16*SIZE, ptrba;	// 4 rows * 4 k-steps

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
DECQ	k;
JG	.L221_bodyB;
ALIGN_5
.L221_loopE:;
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L222_loopE;
ALIGN_5
.L222_bodyB:
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

LD_DX	2*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

//#### Unroll time 2 ####
LD_DX	4*SIZE(ptrba), xvec0;
LD_DX	2*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$4*SIZE, ptrbb;

LD_DX	6*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;
ADDQ	$8*SIZE, ptrba;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

.L222_loopE:
#ifndef	TRMMKERNEL
TEST $1, bk
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L223_loopE;
ALIGN_5
.L223_bodyB:
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$2*SIZE, ptrbb;

LD_DX	2*SIZE(ptrba), xvec1;
SHUF_DX	$0x4e, xvec5, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec14, xvec14;
ADDQ	$4*SIZE, ptrba;

MOV_DX	xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec11, xvec11;

MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;

.L223_loopE:
//== 4x2 tile: scale by alpha, re-pair halves, write back ==============
// After REVS_DX: xvec11/xvec10 -> column C0, xvec15/xvec14 -> column C1.
// Aligned path uses full 128-bit ops; .L223_loopEx uses half stores.
//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec14, xvec14;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec10, xvec10;
//#### Reverse ####
MOV_DX	xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
MOV_DX	xvec14, xvec6;
REVS_DX xvec10, xvec14, xvec14;
REVS_DX xvec6, xvec10, xvec10;
//#### Testing Alignment ####
MOVQ	C0, %rax;
OR		ldc, %rax;
TEST	$15, %rax;
JNE		.L223_loopEx;
ALIGN_5
//#### Writing Back (aligned path) ####
#ifndef	TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 2*SIZE(C0), xvec10, xvec10;
ADD_DX 0*SIZE(C1), xvec15, xvec15;
ADD_DX 2*SIZE(C1), xvec14, xvec14;
#endif
ST_DX	xvec11,	0*SIZE(C0);
ST_DX	xvec10,	2*SIZE(C0);
ST_DX	xvec15, 0*SIZE(C1);
ST_DX	xvec14, 2*SIZE(C1);
#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ	(ptrba, %rax, 4), ptrba;
LEAQ	(ptrbb, %rax, 2), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$4, kk
#endif
ADDQ	$4*SIZE, C0;
ADDQ	$4*SIZE, C1;
JMP	.L22_loopE;
ALIGN_5
.L223_loopEx:;
//#### Writing Back (unaligned path) ####
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 1*SIZE(C0), xvec0, xvec0;
LDL_DX 2*SIZE(C0), xvec1, xvec1;
LDH_DX 3*SIZE(C0), xvec1, xvec1;
ADD_DX xvec0, xvec11, xvec11;
ADD_DX xvec1, xvec10, xvec10;
#endif
STL_DX	xvec11, 0*SIZE(C0);
STH_DX	xvec11, 1*SIZE(C0);
STL_DX	xvec10, 2*SIZE(C0);
STH_DX	xvec10, 3*SIZE(C0);
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C1), xvec4, xvec4;
LDH_DX 1*SIZE(C1), xvec4, xvec4;
LDL_DX 2*SIZE(C1), xvec5, xvec5;
LDH_DX 3*SIZE(C1), xvec5, xvec5;
ADD_DX xvec4, xvec15, xvec15;
ADD_DX xvec5, xvec14, xvec14;
#endif
STL_DX	xvec15, 0*SIZE(C1);
STH_DX	xvec15, 1*SIZE(C1);
STL_DX	xvec14, 2*SIZE(C1);
STH_DX	xvec14, 3*SIZE(C1);
#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ	(ptrba, %rax, 4), ptrba;
LEAQ	(ptrbb, %rax, 2), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$4, kk
#endif
ADDQ	$4*SIZE, C0;
ADDQ	$4*SIZE, C1;
.L22_loopE:;
//== m%2 x n=2 tile: setup =============================================
// Two accumulators: xvec15 (straight B order) and xvec11 (swapped).
TEST $2, bm;				// Rm = 2
JLE .L23_loopE;
ALIGN_5;
.L23_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ	(,%rax, SIZE), %rax;
LEAQ	(ptrba, %rax, 2), ptrba;	// 2 rows per k-step
LEAQ	(ptrbb, %rax, 2), ptrbb;	// 2 cols per k-step
#endif
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec11, yvec11, yvec11;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kk, %rax;
MOVQ	%rax, kkk;
#else
MOVQ	kk, %rax;
#ifdef	LEFT
ADDQ	$2, %rax;		// 2 rows (LEFT) -- same as 2 cols below
#else
ADDQ	$2, %rax;
#endif
MOVQ	%rax, kkk;
#endif
SARQ $2, k;			// k = depth / 4
JLE .L231_loopE;
ALIGN_5
//== 2x2 tile: K loops =================================================
// Per k-step: one A pair times the B pair (into xvec15) and times the
// half-swapped B pair (into xvec11).  .L231 is unrolled 4x; .L232 is
// the K%4 >= 2 step, .L233 the final odd step.
.L231_bodyB:
# Computing kernel
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
//#### Unroll time 2 ####
LD_DX	2*SIZE(ptrba), xvec0;
LD_DX	2*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
//#### Unroll time 3 ####
LD_DX	4*SIZE(ptrba), xvec0;
LD_DX	4*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
//#### Unroll time 4 ####
LD_DX	6*SIZE(ptrba), xvec0;
LD_DX	6*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$8*SIZE, ptrba;		// 2 rows * 4 k-steps
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
ADDQ	$8*SIZE, ptrbb;		// 2 cols * 4 k-steps
DECQ k;
JG .L231_bodyB;
ALIGN_5
.L231_loopE:
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L232_loopE;
ALIGN_5
.L232_bodyB:
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;

MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
//#### Unroll time 2 ####
LD_DX	2*SIZE(ptrba), xvec0;
LD_DX	2*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$4*SIZE, ptrba;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
ADDQ	$4*SIZE, ptrbb;
.L232_loopE:
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L233_loopE;
ALIGN_5
.L233_bodyB:
//#### Unroll time 1 ####
LD_DX	0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
MUL_DX xvec0, xvec4, xvec4;
ADD_DX xvec4, xvec15, xvec15;
ADDQ	$2*SIZE, ptrba;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
ADDQ	$2*SIZE, ptrbb;
.L233_loopE:
2348//#### Multiply Alpha ####
2349BROAD_DX MEMALPHA, xvec7;
2350MUL_DX xvec7, xvec15, xvec15;
2351MUL_DX xvec7, xvec11, xvec11;
2352//#### Reverse ####
2353MOV_DX	xvec15, xvec6;
2354REVS_DX xvec11, xvec15, xvec15;
2355REVS_DX xvec6, xvec11, xvec11;
2356//#### Testing Alignment ####
2357MOVQ	C0, %rax;
2358OR		ldc, %rax;
2359TEST	$15, %rax;
2360JNE		.L233_loopEx;
2361ALIGN_5
2362//#### Writing Back ####
2363#ifndef	TRMMKERNEL
2364ADD_DX 0*SIZE(C0), xvec11, xvec11;
2365ADD_DX 0*SIZE(C1), xvec15, xvec15;
2366#endif
2367ST_DX	xvec11,	0*SIZE(C0);
2368ST_DX	xvec15, 0*SIZE(C1);
2369#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
2370MOVQ	bk, %rax;
2371SUBQ	kkk, %rax;
2372LEAQ	(,%rax, SIZE), %rax;
2373LEAQ	(ptrba, %rax, 2), ptrba;
2374LEAQ	(ptrbb, %rax, 2), ptrbb;
2375#endif
2376#if defined(TRMMKERNEL) && defined(LEFT)
2377ADDQ	$2, kk;
2378#endif
2379ADDQ	$2*SIZE, C0;
2380ADDQ	$2*SIZE, C1;
2381JMP	.L23_loopE;
2382ALIGN_5
2383.L233_loopEx:;
2384#ifndef	TRMMKERNEL
2385LDL_DX 0*SIZE(C0), xvec0, xvec0;
2386LDH_DX 1*SIZE(C0), xvec0, xvec0;
2387ADD_DX xvec0, xvec11, xvec11;
2388#endif
2389STL_DX	xvec11, 0*SIZE(C0);
2390STH_DX	xvec11, 1*SIZE(C0);
2391#ifndef	TRMMKERNEL
2392LDL_DX 0*SIZE(C1), xvec4, xvec4;
2393LDH_DX 1*SIZE(C1), xvec4, xvec4;
2394ADD_DX xvec4, xvec15, xvec15;
2395#endif
2396STL_DX	xvec15, 0*SIZE(C1);
2397STH_DX	xvec15, 1*SIZE(C1);
2398#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
2399MOVQ	bk, %rax;
2400SUBQ	kkk, %rax;
2401LEAQ	(,%rax, SIZE), %rax;
2402LEAQ	(ptrba, %rax, 2), ptrba;
2403LEAQ	(ptrbb, %rax, 2), ptrbb;
2404#endif
2405#if defined(TRMMKERNEL) && defined(LEFT)
2406ADDQ	$2, kk;
2407#endif
2408ADDQ	$2*SIZE, C0;
2409ADDQ	$2*SIZE, C1;
.L23_loopE:
// ================= Rm = 1, Rn = 2 tile =================
// One remaining row of A against the current pair of B columns.
// The 1x2 result is accumulated in the two lanes of xvec15.
TEST $1, bm;				// Rm = 1
JLE .L24_loopE;
ALIGN_5;
.L24_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
// TRMM: skip the first kk k-steps (1 element of A, 2 of B per step).
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ	(, %rax, SIZE), %rax;
ADDQ	%rax, ptrba;
LEAQ	(ptrbb, %rax, 2), ptrbb;
#endif
XOR_DY	yvec15, yvec15, yvec15;		// zero the accumulator
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef	LEFT
ADDQ	$1, %rax;			// kkk = kk + Rm (LEFT)
#else
ADDQ	$2, %rax;			// kkk = kk + Rn (RIGHT)
#endif
MOVQ	%rax, kkk;
#endif
SARQ $2, k;				// main loop handles 4 k-steps per pass
JLE .L241_loopE;
ALIGN_5
.L241_bodyB:
// 4x unrolled: broadcast a(k), multiply by the B pair b(k,0..1).
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

BROAD_DX 1*SIZE(ptrba), xvec1;
LD_DX	2*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;

BROAD_DX 2*SIZE(ptrba), xvec0;
LD_DX	4*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

BROAD_DX 3*SIZE(ptrba), xvec1;
LD_DX	6*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;
ADDQ	$4*SIZE, ptrba;
ADDQ	$8*SIZE, ptrbb;
DECQ k;
JG .L241_bodyB;
ALIGN_5
.L241_loopE:
// bk&2 remainder
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L242_loopE;
ALIGN_5
.L242_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

BROAD_DX 1*SIZE(ptrba), xvec1;
LD_DX	2*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;
ADDQ	$2*SIZE, ptrba;
ADDQ	$4*SIZE, ptrbb;
.L242_loopE:
// bk&1 remainder
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L243_loopE;
ALIGN_5
.L243_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX	0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;
ADDQ	$1*SIZE, ptrba;
ADDQ	$2*SIZE, ptrbb;

.L243_loopE:
// Scale by alpha and write the two scalars to C0/C1 (low/high half).
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 0*SIZE(C1), xvec0, xvec0;
ADD_DX xvec0, xvec15, xvec15;
#endif
STL_DX	xvec15, 0*SIZE(C0);
STH_DX	xvec15, 0*SIZE(C1);
#if (defined(TRMMKERNEL)&&defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ	bk, %rax;
SUBQ	kkk, %rax;
LEAQ	(,%rax, SIZE), %rax;
ADDQ	%rax, ptrba;
LEAQ	(ptrbb, %rax, 2), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ	$1, kk;
#endif
ADDQ	$1*SIZE, C0;
ADDQ	$1*SIZE, C1;
.L24_loopE:
#if defined(TRMMKERNEL) && !defined(LEFT)
ADDQ	$2, kk;				// finished 2 columns of the RIGHT-side factor
#endif
// Advance bb past this B panel (bk*2 doubles: bk << 4 bytes, SIZE = 8)
// and move C forward by two columns.
MOVQ bk, k;
SALQ $4, k;
ADDQ k, bb;
LEAQ (C, ldc, 2), C;
.L20_loopE:;
// ================= Rn = 1: last single column of C =================
TEST $1, bn;					// Rn = 1
JLE .L30_loopE;
ALIGN_5
.L30_bodyB:
#if defined(TRMMKERNEL)&&defined(LEFT)
MOVQ	OFFSET, %rax;			// reset kk for a fresh column sweep
MOVQ	%rax, kk;
#endif
MOVQ C, C0;
MOVQ ba, ptrba;
MOVQ bm, i;				// i = number of 8-row tiles in this column
SARQ $3, i;
JLE .L31_loopE;
ALIGN_5
.L31_bodyB:
// ---------- 8x1 tile: 8 rows of A x 1 column of B ----------
// yvec15 holds rows 0..3, yvec14 rows 4..7 of the result column.
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
// TRMM: skip the first kk k-steps (8 elements of A, 1 of B per step).
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ	%rax, ptrbb;
#endif
//#### Initial Results Register ####
XOR_DY	yvec15, yvec15, yvec15;
XOR_DY	yvec14, yvec14, yvec14;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef	LEFT
ADDQ $8, %rax;				// kkk = kk + Rm (LEFT)
#else
ADDQ $1, %rax;				// kkk = kk + Rn (RIGHT)
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;				// 4 k-steps per main-loop pass
JLE .L311_loopE;
ALIGN_5
.L311_bodyB:
//#### Unroll time 1 ####
LD_DY	0*SIZE(ptrba), yvec0;
LD_DY	4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY	yvec2, yvec0, yvec0;
ADD_DY	yvec0, yvec15, yvec15;
MUL_DY	yvec2, yvec1, yvec1;
ADD_DY	yvec1, yvec14, yvec14;

//#### Unroll time 2 ####
LD_DY	8*SIZE(ptrba), yvec3;
LD_DY	12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
MUL_DY	yvec5, yvec3, yvec3;
ADD_DY	yvec3, yvec15, yvec15;
MUL_DY	yvec5, yvec4, yvec4
ADD_DY	yvec4, yvec14, yvec14;

//#### Unroll time 3 ####
LD_DY	16*SIZE(ptrba), yvec0;
LD_DY	20*SIZE(ptrba), yvec1;
BROAD_DY 2*SIZE(ptrbb), yvec2;
MUL_DY	yvec2, yvec0, yvec0;
ADD_DY	yvec0, yvec15, yvec15;
MUL_DY	yvec2, yvec1, yvec1;
ADD_DY	yvec1, yvec14, yvec14;

//#### Unroll time 4 ####
LD_DY	24*SIZE(ptrba), yvec3;
LD_DY	28*SIZE(ptrba), yvec4;
BROAD_DY 3*SIZE(ptrbb), yvec5;
MUL_DY	yvec5, yvec3, yvec3;
ADD_DY	yvec3, yvec15, yvec15;
ADDQ	$32*SIZE, ptrba;		// 4 k-steps * 8 rows
MUL_DY	yvec5, yvec4, yvec4;
ADD_DY	yvec4, yvec14, yvec14;
ADDQ	$4*SIZE, ptrbb;
DECQ	k;
JG	.L311_bodyB;
ALIGN_5
.L311_loopE:
// bk&2 remainder
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L312_loopE;
ALIGN_5
.L312_bodyB:
//#### Unroll time 1 ####
LD_DY	0*SIZE(ptrba), yvec0;
LD_DY	4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY	yvec2, yvec0, yvec0;
ADD_DY	yvec0, yvec15, yvec15;
MUL_DY	yvec2, yvec1, yvec1;
ADD_DY	yvec1, yvec14, yvec14;

//#### Unroll time 2 ####
LD_DY	8*SIZE(ptrba), yvec3;
LD_DY	12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
MUL_DY	yvec5, yvec3, yvec3;
ADD_DY	yvec3, yvec15, yvec15;
ADDQ	$16*SIZE, ptrba;
MUL_DY	yvec5, yvec4, yvec4
ADD_DY	yvec4, yvec14, yvec14;
ADDQ	$2*SIZE, ptrbb;

.L312_loopE:
// bk&1 remainder
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L313_loopE;
ALIGN_5
.L313_bodyB:
//#### Unroll time 1 ####
LD_DY	0*SIZE(ptrba), yvec0;
LD_DY	4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY	yvec2, yvec0, yvec0;
ADD_DY	yvec0, yvec15, yvec15;
ADDQ	$8*SIZE, ptrba;
MUL_DY	yvec2, yvec1, yvec1;
ADD_DY	yvec1, yvec14, yvec14;
ADDQ	$1*SIZE, ptrbb;

.L313_loopE:
//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY	yvec7, yvec15, yvec15;
MUL_DY	yvec7, yvec14, yvec14;
//#### Testing Alignment ####
MOVQ	C0, %rax;
OR		ldc, %rax;
TEST 	$15, %rax;
JNE		.L313_loopEx;
ALIGN_5
//#### Writing Back ####
// Aligned path: split each ymm accumulator into its two xmm halves.
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
#ifndef	TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec15, xvec15;
ADD_DX 2*SIZE(C0), xvec13, xvec13;
ADD_DX 4*SIZE(C0), xvec14, xvec14;
ADD_DX 6*SIZE(C0), xvec12, xvec12;
#endif
ST_DX	xvec15, 0*SIZE(C0);
ST_DX	xvec13, 2*SIZE(C0);
ST_DX	xvec14,	4*SIZE(C0);
ST_DX	xvec12, 6*SIZE(C0);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL)&&defined(LEFT)
ADDQ	$8, kk;
#endif
ADDQ	$8*SIZE, C0;
DECQ	i;
JG	.L31_bodyB;
JMP .L31_loopE;
ALIGN_5
.L313_loopEx:
// Unaligned path: 64-bit half loads/stores for all 8 elements.
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec11, xvec11;
LDH_DX 1*SIZE(C0), xvec11, xvec11;
LDL_DX 2*SIZE(C0), xvec10, xvec10;
LDH_DX 3*SIZE(C0), xvec10, xvec10;
LDL_DX 4*SIZE(C0), xvec9, xvec9;
LDH_DX 5*SIZE(C0), xvec9, xvec9;
LDL_DX 6*SIZE(C0), xvec8, xvec8;
LDH_DX 7*SIZE(C0), xvec8, xvec8;
ADD_DX xvec11, xvec15, xvec15;
ADD_DX xvec10, xvec13, xvec13;
ADD_DX xvec9, xvec14, xvec14;
ADD_DX xvec8, xvec12, xvec12;
#endif
STL_DX	xvec15, 0*SIZE(C0);
STH_DX	xvec15, 1*SIZE(C0);
STL_DX	xvec13, 2*SIZE(C0);
STH_DX	xvec13, 3*SIZE(C0);
STL_DX	xvec14, 4*SIZE(C0);
STH_DX	xvec14, 5*SIZE(C0);
STL_DX	xvec12, 6*SIZE(C0);
STH_DX	xvec12, 7*SIZE(C0);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL)&&defined(LEFT)
ADDQ	$8, kk;
#endif
ADDQ	$8*SIZE, C0;
DECQ	i;
JG	.L31_bodyB;
.L31_loopE:
// ---------- 4x1 tile: 4 rows of A x 1 column of B ----------
TEST $4, bm
JLE .L32_loopE;
ALIGN_5
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
// TRMM: skip the first kk k-steps (4 elements of A, 1 of B per step).
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ	%rax, ptrbb;
#endif
//#### Initial Results Register ####
XOR_DY	yvec15, yvec15, yvec15;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef	LEFT
ADDQ $4, %rax;				// kkk = kk + Rm (LEFT)
#else
ADDQ $1, %rax;				// kkk = kk + Rn (RIGHT)
#endif
MOVQ %rax, kkk
#endif
SARQ $2, k;				// 4 k-steps per main-loop pass
JLE .L321_loopE;
ALIGN_5
.L321_bodyB:
LD_DY	0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
MUL_DY	yvec0, yvec1, yvec1;
ADD_DY	yvec1, yvec15, yvec15;

LD_DY	4*SIZE(ptrba), yvec2;
BROAD_DY 1*SIZE(ptrbb), yvec3;
MUL_DY	yvec2, yvec3, yvec3;
ADD_DY	yvec3, yvec15, yvec15;

LD_DY	8*SIZE(ptrba), yvec4;
BROAD_DY 2*SIZE(ptrbb), yvec5;
MUL_DY	yvec4, yvec5, yvec5;
ADD_DY	yvec5, yvec15, yvec15;

LD_DY	12*SIZE(ptrba), yvec6;
BROAD_DY 3*SIZE(ptrbb), yvec7;
MUL_DY	yvec6, yvec7, yvec7;
ADD_DY	yvec7, yvec15, yvec15;
ADDQ	$16*SIZE, ptrba;
ADDQ	$4*SIZE, ptrbb;
DECQ	k;
JG .L321_bodyB;
ALIGN_5
.L321_loopE:
// bk&2 remainder
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L322_loopE;
ALIGN_5
.L322_bodyB:
LD_DY	0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
MUL_DY	yvec0, yvec1, yvec1;
ADD_DY	yvec1, yvec15, yvec15;

LD_DY	4*SIZE(ptrba), yvec2;
BROAD_DY 1*SIZE(ptrbb), yvec3;
MUL_DY	yvec2, yvec3, yvec3;
ADD_DY	yvec3, yvec15, yvec15;
ADDQ	$8*SIZE, ptrba;
ADDQ	$2*SIZE, ptrbb;

.L322_loopE:
// bk&1 remainder
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L323_loopE;
ALIGN_5
.L323_bodyB:
LD_DY	0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
MUL_DY	yvec0, yvec1, yvec1;
ADD_DY	yvec1, yvec15, yvec15;
ADDQ	$4*SIZE, ptrba;
ADDQ	$1*SIZE, ptrbb;

.L323_loopE:
//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY	yvec7, yvec15, yvec15;
//#### Testing Alignment ####
MOVQ	C0, %rax;
OR		ldc, %rax;
TEST	$15, %rax;
JNE	.L323_loopEx;
ALIGN_5
//#### Writing Back ####
// Aligned path: store the two xmm halves of the 4-wide accumulator.
EXTRA_DY $1, yvec15, xvec14;
#ifndef	TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec15, xvec15;
ADD_DX 2*SIZE(C0), xvec14, xvec14;
#endif
ST_DX	xvec15, 0*SIZE(C0);
ST_DX	xvec14, 2*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk
#endif
ADDQ	$4*SIZE, C0;
JMP .L32_loopE;
ALIGN_5
.L323_loopEx:
//#### Writing Back ####
// Unaligned path: 64-bit half loads/stores.
EXTRA_DY $1, yvec15, xvec14;
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec13, xvec13;
LDH_DX 1*SIZE(C0), xvec13, xvec13;
LDL_DX 2*SIZE(C0), xvec12, xvec12;
LDH_DX 3*SIZE(C0), xvec12, xvec12;
ADD_DX xvec13, xvec15, xvec15;
ADD_DX xvec12, xvec14, xvec14;
#endif
STL_DX	xvec15, 0*SIZE(C0);
STH_DX	xvec15, 1*SIZE(C0);
STL_DX	xvec14, 2*SIZE(C0);
STH_DX	xvec14, 3*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk
#endif
ADDQ	$4*SIZE, C0;
.L32_loopE:
// ---------- 2x1 tile: 2 rows of A x 1 column of B ----------
TEST $2, bm
JLE .L33_loopE;
ALIGN_5
.L33_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
#else
// TRMM: skip the first kk k-steps (2 elements of A, 1 of B per step).
MOVQ bb, ptrbb;
MOVQ kk, %rax
LEAQ (, %rax, SIZE), %rax
LEAQ (ptrba, %rax, 2), ptrba
ADDQ %rax, ptrbb;
#endif
//#### Initial Result ####
XOR_DY	yvec15, yvec15, yvec15;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef	LEFT
ADDQ $2, %rax;				// kkk = kk + Rm (LEFT)
#else
ADDQ $1, %rax;				// kkk = kk + Rn (RIGHT)
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;				// 4 k-steps per main-loop pass
JLE .L331_loopE;
ALIGN_5
.L331_bodyB:
LD_DX	0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX	2*SIZE(ptrba), xvec1;
BROAD_DX 1*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;

LD_DX	4*SIZE(ptrba), xvec4;
BROAD_DX 2*SIZE(ptrbb), xvec5;
MUL_DX xvec4, xvec5, xvec5;
ADD_DX xvec5, xvec15, xvec15;

LD_DX	6*SIZE(ptrba), xvec6;
BROAD_DX 3*SIZE(ptrbb), xvec7;
MUL_DX xvec6, xvec7, xvec7;
ADD_DX xvec7, xvec15, xvec15;
ADDQ	$8*SIZE, ptrba;
ADDQ	$4*SIZE, ptrbb;
DECQ	k;
JG	.L331_bodyB;
ALIGN_5
.L331_loopE:
// bk&2 remainder
#ifndef	TRMMKERNEL
TEST $2,bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax
#endif
JLE .L332_loopE;
ALIGN_5
.L332_bodyB:
LD_DX	0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX	2*SIZE(ptrba), xvec1;
BROAD_DX 1*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;
ADDQ	$4*SIZE, ptrba;
ADDQ	$2*SIZE, ptrbb;
.L332_loopE:
// bk&1 remainder
#ifndef	TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L333_loopE;
ALIGN_5
.L333_bodyB:
LD_DX	0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;
ADDQ	$2*SIZE, ptrba;
ADDQ	$1*SIZE, ptrbb;
.L333_loopE:
//#### Multiply Alpha ####
// Unconditional half-load/half-store write-back (no alignment test needed
// for a 2-element result).
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
#ifndef	TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec14, xvec14;
LDH_DX 1*SIZE(C0), xvec14, xvec14;
ADD_DX xvec14, xvec15, xvec15;
#endif
STL_DX	xvec15, 0*SIZE(C0);
STH_DX	xvec15, 1*SIZE(C0);
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 2), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, kk
#endif
ADDQ	$2*SIZE, C0;
.L33_loopE:
// ---------- 1x1 tile: scalar dot product a(:,k).b(k) ----------
TEST $1, bm
JLE .L34_loopE;
ALIGN_5
.L34_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
#else
// TRMM: skip the first kk scalars of both A and B.
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
XOR_DY	yvec15, yvec15, yvec15;
#ifndef	TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef	LEFT
ADDQ $1, %rax;				// kkk = kk + Rm = kk + 1
#else
ADDQ $1, %rax;				// kkk = kk + Rn = kk + 1
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;				// 4 scalar k-steps per pass
JLE .L341_loopE;
ALIGN_5
.L341_bodyB:
vmovsd	0*SIZE(ptrba), xvec0;
vmovsd 	0*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;

vmovsd	1*SIZE(ptrba), xvec0;
vmovsd 	1*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;

vmovsd	2*SIZE(ptrba), xvec0;
vmovsd 	2*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;

vmovsd	3*SIZE(ptrba), xvec0;
vmovsd 	3*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;
addq	$4*SIZE, ptrba;
addq	$4*SIZE, ptrbb;
decq	k;
JG	.L341_bodyB;
ALIGN_5
.L341_loopE:
// bk&2 remainder
#ifndef	TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L342_loopE;
ALIGN_5
.L342_bodyB:
vmovsd	0*SIZE(ptrba), xvec0;
vmovsd 	0*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;

vmovsd	1*SIZE(ptrba), xvec0;
vmovsd 	1*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;
addq	$2*SIZE, ptrba;
addq	$2*SIZE, ptrbb;

.L342_loopE:
// bk&1 remainder
#ifndef	TRMMKERNEL
TEST $1, bk
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L343_loopE;
ALIGN_5
.L343_bodyB:
vmovsd	0*SIZE(ptrba), xvec0;
vmovsd 	0*SIZE(ptrbb), xvec1;
vmulsd	xvec0, xvec1, xvec1;
vaddsd	xvec1, xvec15, xvec15;
addq	$1*SIZE, ptrba;
addq	$1*SIZE, ptrbb;

.L343_loopE:
//#### Writing Back ####
// Scale by alpha; GEMM accumulates into C, TRMM overwrites.
vmovsd	MEMALPHA, xvec7;
vmulsd	xvec7, xvec15, xvec15;
#ifndef	TRMMKERNEL
vmovsd	0*SIZE(C0), xvec0;
vaddsd	xvec0, xvec15, xvec15;
#endif
movsd	xvec15, 0*SIZE(C0);
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, kk
#endif
addq	$1*SIZE, C0;
.L34_loopE:
// Advance bb past this single B column (bk doubles: bk << 3 bytes)
// and C by one column.
MOVQ bk, k
SALQ $3, k;
ADDQ k, bb;
LEAQ (C, ldc, 1), C;

.L30_loopE:
// ================= Epilogue =================
// Restore callee-saved GPRs spilled by the prologue (outside this view;
// offsets assumed to match the prologue's layout — rbx/rbp/r12-r15 at
// rsp+0..40, plus rdi/rsi/xmm6-15 on Windows).
movq      0(%rsp), %rbx;
movq      8(%rsp), %rbp;
movq     16(%rsp), %r12;
movq     24(%rsp), %r13;
movq     32(%rsp), %r14;
movq     40(%rsp), %r15;

// Clear upper YMM state before returning to (possibly SSE) caller code.
vzeroupper

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif
addq    $STACKSIZE, %rsp;
ret

EPILOGUE
