/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/*********************************************************************
*
* 2014/06/28 Saar
*        BLASTEST               : OK
*        CTEST                  : OK
*        TEST                   : OK
*
*
* 2013/10/30 Saar
*
* Parameters:
*       UNROLL_M        2
*       UNROLL_N        2
*       ZGEMM_P         384
*       ZGEMM_Q         168
*       A_PR1           512
*       B_PR1           256
*
* Performance at m x n on AMD 8320 (ACML version 5.3.1):
*
* 3456x3456     82.4    GFLOPS with 8 threads on 4 modules (ACML:  76.3 ) (BULLDOZER:  81.0 )
* 3456x3456     79.9    GFLOPS with 4 threads on 4 modules (ACML:  69.9 ) (BULLDOZER:  74.6 )
* 3456x3456     40.4    GFLOPS with 2 threads on 2 modules (ACML:  35.8 ) (BULLDOZER:  37.9 )
* 3456x3456     20.3    GFLOPS with 1 thread  on 1 module  (ACML:  18.1 ) (BULLDOZER:  19.2 )
*
* Performance at m x n on AMD 6380  (ACML version 5.3.1):
*
* 6912x6912    227.5    GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 )
* 6912x6912    211.6    GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 )
* 6912x6912    123.5    GFLOPS with  8 threads on  8 modules (ACML:  92.7 ) (BULLDOZER: 117.0 )
* 3456x3456     64.1    GFLOPS with  4 threads on  4 modules (ACML:  49.1 ) (BULLDOZER:  61.7 )
* 3456x3456     33.4    GFLOPS with  2 threads on  2 modules (ACML:  28.1 ) (BULLDOZER:  30.9 )
* 3456x3456     17.0    GFLOPS with  1 thread  on  1 module  (ACML:  15.2 ) (BULLDOZER:  15.7 )
*
*********************************************************************/


#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define J	%r14
#define OLD_K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define K	%r12
#define BI	%rbp
#define	SP	%rbx

#define BO1	%rdi
#define BO2	%r15

#ifndef WINDOWS_ABI

#define STACKSIZE 96

#else

#define STACKSIZE 320

#define OLD_ALPHA_I     40 + STACKSIZE(%rsp)
#define OLD_A           48 + STACKSIZE(%rsp)
#define OLD_B           56 + STACKSIZE(%rsp)
#define OLD_C           64 + STACKSIZE(%rsp)
#define OLD_LDC         72 + STACKSIZE(%rsp)
#define OLD_OFFSET      80 + STACKSIZE(%rsp)

#endif

#define L_BUFFER_SIZE 256*8*4

#define Ndiv6	 24(%rsp)
#define Nmod6	 32(%rsp)
#define N	 40(%rsp)
#define ALPHA_R  48(%rsp)
#define ALPHA_I  56(%rsp)
#define OFFSET   64(%rsp)
#define KK       72(%rsp)
#define KKK      80(%rsp)
#define BUFFER1	           128(%rsp)

#if defined(OS_WINDOWS)
#if   L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
        movl    $0,  4096 * 4(%rsp);\
        movl    $0,  4096 * 3(%rsp);\
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
        movl    $0,  4096 * 3(%rsp);\
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
        movl    $0,  4096 * 2(%rsp);\
        movl    $0,  4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
        movl    $0,  4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif


#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADD_R    vfmaddpd
#define VFMADD_I    vfmaddpd
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADD_R    vfnmaddpd
#define VFMADD_I    vfmaddpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADD_R    vfmaddpd
#define VFMADD_I    vfnmaddpd
#else
#define VFMADD_R    vfnmaddpd
#define VFMADD_I    vfnmaddpd
#endif


#define	A_PR1	512
#define	B_PR1	256

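
/*
 * The two-letter variant defines (NN, NT, CN, NR, ...) name the forms of A
 * and B: N/T are the non-conjugated forms, R/C the conjugated ones.
 * VFMADD_R feeds the accumulators holding the b_real partial products and
 * VFMADD_I the b_imag partial products; choosing vfmaddpd vs. vfnmaddpd
 * flips the sign of one group so that the vaddsubpd sequence in the store
 * path recombines them into the correctly (un)conjugated complex product.
 *
 * For reference, a minimal C sketch of what one non-conjugated (NN) update
 * per k amounts to (illustrative only, not the literal register code):
 *
 *     c_re += a_re * b_re - a_im * b_im;
 *     c_im += a_re * b_im + a_im * b_re;
 *
 * Conjugating A negates a_im, conjugating B negates b_im.
 */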
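/*
 * KERNEL2x2_* computes one 2x2 block of complex C per k step: xmm0/xmm1 hold
 * the two complex A values, xmm4..xmm7 hold the real and imaginary parts of
 * the two B values broadcast with vmovddup, and xmm8..xmm15 are the eight
 * accumulators (one "b_real-lane" and one "b_imag-lane" register per C
 * element).  The _1.._4 variants form a 4x-unrolled k iteration; _SUB handles
 * the k remainder.
 *
 * A rough C sketch of what one _SUB step accumulates (illustrative names;
 * the real/imag recombination happens later in the store path):
 *
 *     // a[0], a[1]: two complex A elements;  b[0], b[1]: two complex B elements
 *     for (i = 0; i < 2; i++)
 *         for (j = 0; j < 2; j++) {
 *             acc_r[i][j] += a[i] * b[j].re;   // VFMADD_R (both lanes of a[i])
 *             acc_i[i][j] += a[i] * b[j].im;   // VFMADD_I
 *         }
 */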
#define KERNEL2x2_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE)        ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL2x2_2(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL2x2_3(xx) \
	prefetcht0      A_PR1+64(AO,%rax,SIZE)     ;\
        vmovups           0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL2x2_4(xx) \
        vmovups           4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          5 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup          6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup          7 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\
        addq    $16, BI                            ;\
        addq    $16, %rax                          ;\


#define KERNEL2x2_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\
        addq    $4, BI                            ;\
        addq    $4, %rax                          ;\

/************************************************************************************************/

#define KERNEL1x2_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE)        ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_2(xx) \
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_3(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_4(xx) \
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          5 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup          6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup          7 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        addq    $16, BI                            ;\
        addq    $8 , %rax                          ;\


#define KERNEL1x2_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        addq    $4, BI                            ;\
        addq    $2, %rax                          ;\

/************************************************************************************************/

#define KERNEL2x1_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE)        ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL2x1_2(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL2x1_3(xx) \
	prefetcht0      A_PR1+64(AO,%rax,SIZE)     ;\
        vmovups           0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL2x1_4(xx) \
        vmovups           4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        addq    $8, BI                            ;\
        addq    $16, %rax                          ;\


#define KERNEL2x1_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        addq    $2, BI                            ;\
        addq    $4, %rax                          ;\


/************************************************************************************************/

#define KERNEL1x1_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE)        ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_2(xx) \
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_3(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_4(xx) \
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        addq    $8, BI                            ;\
        addq    $8, %rax                          ;\


#define KERNEL1x1_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        addq    $2, BI                            ;\
        addq    $2, %rax                          ;\


/************************************************************************************************/




	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,   (%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	vzeroupper

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	vmovups	%xmm6,   64(%rsp)
	vmovups	%xmm7,   80(%rsp)
	vmovups	%xmm8,   96(%rsp)
	vmovups	%xmm9,  112(%rsp)
	vmovups	%xmm10, 128(%rsp)
	vmovups	%xmm11, 144(%rsp)
	vmovups	%xmm12, 160(%rsp)
	vmovups	%xmm13, 176(%rsp)
	vmovups	%xmm14, 192(%rsp)
	vmovups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      OLD_K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	vmovsd	OLD_OFFSET, %xmm12
#endif
	vmovaps	%xmm3, %xmm0
	vmovsd   OLD_ALPHA_I, %xmm1

#else
	movq	STACKSIZE +  8(%rsp), LDC
#ifdef TRMMKERNEL
	vmovsd	STACKSIZE + 16(%rsp), %xmm12
#endif

#endif

	movq    %rsp, SP      # save old stack
        subq    $128 + L_BUFFER_SIZE, %rsp
        andq    $-4096, %rsp    # align stack

        STACK_TOUCH

	cmpq	$0, OLD_M
	je	.L999

	cmpq	$0, OLD_N
	je	.L999

	cmpq	$0, OLD_K
	je	.L999

	movq	OLD_M, M
	movq	OLD_N, N
	movq	OLD_K, K

	vmovsd	 %xmm0, ALPHA_R
	vmovsd	 %xmm1, ALPHA_I

	salq	$ZBASE_SHIFT, LDC

	movq    N, %rax
        xorq    %rdx, %rdx
        movq    $2,  %rdi
        divq    %rdi                    //    N / 2
        movq    %rax, Ndiv6             //    N / 2
        movq    %rdx, Nmod6             //    N % 2



#ifdef TRMMKERNEL
	vmovsd	%xmm12, OFFSET
	vmovsd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif

.L2_0:

	movq	Ndiv6,  J
	cmpq	$0, J
	je	.L1_0
	ALIGN_4



.L2_01:
	// copy to sub buffer
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	ALIGN_4

.L2_02b:

	vmovups		(BO1), %xmm0
	vmovups	2 * SIZE(BO1), %xmm1
	vmovups	%xmm0,       (BO)
	vmovups	%xmm1, 2 * SIZE(BO)
	addq	$4*SIZE,BO1
	addq	$4*SIZE,BO
	decq	%rax
	jnz	.L2_02b

.L2_02c:

	movq	BO1, B			// next offset of B

.L2_10:
	movq	C, CO1
	leaq	(C, LDC, 2), C		// c += 2 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

	movq	A, AO		 	// aoffset = a
	addq	$8 * SIZE, AO

	movq	M,  I
	sarq	$1, I			// i = (m >> 1)
	je	.L2_40

	ALIGN_4

.L2_11:
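/*
 * .L2_01 copies the next two columns of the packed B panel (2*K complex
 * values, i.e. 4 doubles per k) into the stack-local BUFFER1, so the inner
 * kernels read B with unit stride from a small, cache-resident buffer.
 * A rough C sketch of the copy loop below (illustrative names only; here
 * one complex value is two doubles):
 *
 *     for (k = 0; k < K; k++) {
 *         buffer[4*k + 0] = b[0];   // column 0: real
 *         buffer[4*k + 1] = b[1];   // column 0: imag
 *         buffer[4*k + 2] = b[2];   // column 1: real
 *         buffer[4*k + 3] = b[3];   // column 1: imag
 *         b += 4;
 *     }
 */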

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$8 * SIZE, BO
#else
        movq    KK, %rax
	leaq	BUFFER1, BO			// first buffer to BO
	addq	$8 * SIZE, BO
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$2, %rax			// rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L2_16
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_12:

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x2_1(xxx)
	KERNEL2x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL2x2_3(xxx)
	KERNEL2x2_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x2_1(xxx)
	KERNEL2x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL2x2_3(xxx)
	KERNEL2x2_4(xxx)

	je	.L2_16

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x2_1(xxx)
	KERNEL2x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL2x2_3(xxx)
	KERNEL2x2_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x2_1(xxx)
	KERNEL2x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL2x2_3(xxx)
	KERNEL2x2_4(xxx)

	je	.L2_16

	jmp	.L2_12
	ALIGN_4

.L2_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 7)
	je .L2_19

	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_17:

	KERNEL2x2_SUB(xxx)
	jl	.L2_17
	ALIGN_4


.L2_19:

	vmovddup	ALPHA_R, %xmm0
	vmovddup	ALPHA_I, %xmm1

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11
        vshufpd $0x01, %xmm13, %xmm13, %xmm13
        vshufpd $0x01, %xmm15, %xmm15, %xmm15

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd %xmm9, %xmm8 , %xmm8
        vaddsubpd %xmm11,%xmm10, %xmm10
        vaddsubpd %xmm13,%xmm12, %xmm12
        vaddsubpd %xmm15,%xmm14, %xmm14

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9
        vshufpd $0x01, %xmm10, %xmm10, %xmm11
        vshufpd $0x01, %xmm12, %xmm12, %xmm13
        vshufpd $0x01, %xmm14, %xmm14, %xmm15

#else
        vaddsubpd %xmm8,  %xmm9 ,%xmm9
        vaddsubpd %xmm10, %xmm11,%xmm11
        vaddsubpd %xmm12, %xmm13,%xmm13
        vaddsubpd %xmm14, %xmm15,%xmm15

        vmovapd   %xmm9,  %xmm8
        vmovapd   %xmm11, %xmm10
        vmovapd   %xmm13, %xmm12
        vmovapd   %xmm15, %xmm14

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11
        vshufpd $0x01, %xmm13, %xmm13, %xmm13
        vshufpd $0x01, %xmm15, %xmm15, %xmm15

#endif

	// multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8
        vmulpd  %xmm10, %xmm0, %xmm10
        vmulpd  %xmm12, %xmm0, %xmm12
        vmulpd  %xmm14, %xmm0, %xmm14

	// multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9
        vmulpd  %xmm11, %xmm1, %xmm11
        vmulpd  %xmm13, %xmm1, %xmm13
        vmulpd  %xmm15, %xmm1, %xmm15

	vaddsubpd %xmm9, %xmm8 , %xmm8
        vaddsubpd %xmm11,%xmm10, %xmm10
        vaddsubpd %xmm13,%xmm12, %xmm12
        vaddsubpd %xmm15,%xmm14, %xmm14


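	/*
	 * Store path for the block: each b_imag-lane accumulator first has its
	 * two 64-bit halves swapped, then vaddsubpd merges it with the matching
	 * b_real-lane accumulator, yielding (re, im) of a[i]*b[j] summed over k.
	 * The result is scaled by the complex alpha with the same
	 * shuffle/multiply/addsub pattern and, unless this is the TRMM kernel,
	 * added to C.  As a plain C sketch of the arithmetic for the
	 * non-conjugated (NN-style) branch only (the other branch differs in
	 * operand order and signs):
	 *
	 *     tmp_re = acc_r.lo - acc_i.hi;               // vaddsubpd, low lane
	 *     tmp_im = acc_r.hi + acc_i.lo;               // vaddsubpd, high lane
	 *     c_re  += alpha_r * tmp_re - alpha_i * tmp_im;
	 *     c_im  += alpha_r * tmp_im + alpha_i * tmp_re;
	 */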

#ifndef TRMMKERNEL

	vaddpd 	 	(CO1), %xmm8 , %xmm8
	vaddpd  2 * SIZE(CO1), %xmm12, %xmm12

	vaddpd 	 	(CO1, LDC), %xmm10, %xmm10
	vaddpd  2 * SIZE(CO1, LDC), %xmm14, %xmm14

#endif

	vmovups	%xmm8 ,  	(CO1)
	vmovups	%xmm12 , 2 * SIZE(CO1)

	vmovups	%xmm10 ,  	(CO1, LDC)
	vmovups	%xmm14 , 2 * SIZE(CO1, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$2, %rax			// rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L2_11
	ALIGN_4


/**************************************************************************
* Rest of M
***************************************************************************/
.L2_40:
	testq	$1, M
	jz	.L2_60		// to next 2 lines of N

	ALIGN_4

.L2_41:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$8 * SIZE, BO
#else
        movq    KK, %rax
	leaq	BUFFER1, BO			// first buffer to BO
	addq	$8 * SIZE, BO
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$1, %rax			// rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $2, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L2_46
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values

	salq	$1, %rax			// rax = rax * 2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_42:

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x2_1(xxx)
	KERNEL1x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL1x2_3(xxx)
	KERNEL1x2_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x2_1(xxx)
	KERNEL1x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL1x2_3(xxx)
	KERNEL1x2_4(xxx)

	je	.L2_46

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x2_1(xxx)
	KERNEL1x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL1x2_3(xxx)
	KERNEL1x2_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x2_1(xxx)
	KERNEL1x2_2(xxx)
	prefetcht0	B_PR1+64(BO,BI,SIZE)
	KERNEL1x2_3(xxx)
	KERNEL1x2_4(xxx)

	je	.L2_46

	jmp	.L2_42
	ALIGN_4

.L2_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 7)
	je .L2_49

	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values

	salq	$1, %rax			// rax = rax * 2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L2_47:

	KERNEL1x2_SUB(xxx)
	jl	.L2_47
	ALIGN_4


.L2_49:

	vmovddup	ALPHA_R, %xmm0
	vmovddup	ALPHA_I, %xmm1

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd %xmm9, %xmm8 , %xmm8
        vaddsubpd %xmm11,%xmm10, %xmm10

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9
        vshufpd $0x01, %xmm10, %xmm10, %xmm11

#else
        vaddsubpd %xmm8, %xmm9, %xmm9
        vaddsubpd %xmm10,%xmm11, %xmm11

        vmovapd   %xmm9,  %xmm8
        vmovapd   %xmm11, %xmm10

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11

#endif

	// multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8
        vmulpd  %xmm10, %xmm0, %xmm10

	// multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9
        vmulpd  %xmm11, %xmm1, %xmm11

	vaddsubpd %xmm9, %xmm8 , %xmm8
        vaddsubpd %xmm11,%xmm10, %xmm10



#ifndef TRMMKERNEL

	vaddpd 	 	(CO1), %xmm8 , %xmm8
	vaddpd 	 	(CO1, LDC), %xmm10, %xmm10

#endif

	vmovups	%xmm8 ,  	(CO1)
	vmovups	%xmm10 ,  	(CO1, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$1, %rax			// rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4




.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq    $2, KK
#endif

	decq	J			// j --
	jg	.L2_01			// next 2 lines of N



.L1_0:

/************************************************************************************************
* Loop for N % 2 > 0
*************************************************************************************************/

	movq	Nmod6, J
	andq	$1, J			// j % 2
	je	.L999
	ALIGN_4

.L1_01:
	// copy to sub buffer
	movq	B, BO1
	leaq    BUFFER1, BO		// first buffer to BO
	movq	K, %rax
	ALIGN_4

.L1_02b:

	vmovups		(BO1), %xmm0
	vmovups	%xmm0,       (BO)
	addq	$2*SIZE,BO1
	addq	$2*SIZE,BO
	decq	%rax
	jnz	.L1_02b

.L1_02c:

	movq	BO1, B			// next offset of B

.L1_10:
	movq	C, CO1
	leaq	(C, LDC, 1), C		// c += 1 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

	movq	A, AO		 	// aoffset = a
	addq	$8 * SIZE, AO

	movq	M,  I
	sarq	$1, I			// i = (m >> 1)
	je	.L1_40

	ALIGN_4

.L1_11:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$4 * SIZE, BO
#else
        movq    KK, %rax
	leaq	BUFFER1, BO			// first buffer to BO
	addq	$4 * SIZE, BO
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$2, %rax			// rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax        // number of values in AO
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L1_16
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_12:

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x1_1(xxx)
	KERNEL2x1_2(xxx)
	KERNEL2x1_3(xxx)
	KERNEL2x1_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x1_1(xxx)
	KERNEL2x1_2(xxx)
	KERNEL2x1_3(xxx)
	KERNEL2x1_4(xxx)

	je	.L1_16

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x1_1(xxx)
	KERNEL2x1_2(xxx)
	KERNEL2x1_3(xxx)
	KERNEL2x1_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL2x1_1(xxx)
	KERNEL2x1_2(xxx)
	KERNEL2x1_3(xxx)
	KERNEL2x1_4(xxx)

	je	.L1_16

	jmp	.L1_12
	ALIGN_4

.L1_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 7)
	je .L1_19

	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values

	salq	$2, %rax			// rax = rax * 4 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_17:

	KERNEL2x1_SUB(xxx)
	jl	.L1_17
	ALIGN_4


.L1_19:

	vmovddup	ALPHA_R, %xmm0
	vmovddup	ALPHA_I, %xmm1

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm13, %xmm13, %xmm13

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd %xmm9, %xmm8  , %xmm8
        vaddsubpd %xmm13,%xmm12 , %xmm12

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9
        vshufpd $0x01, %xmm12, %xmm12, %xmm13

#else
        vaddsubpd %xmm8, %xmm9 , %xmm9
        vaddsubpd %xmm12,%xmm13, %xmm13

        vmovapd   %xmm9,  %xmm8
        vmovapd   %xmm13, %xmm12

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm13, %xmm13, %xmm13

#endif

	// multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8
        vmulpd  %xmm12, %xmm0, %xmm12

	// multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9
        vmulpd  %xmm13, %xmm1, %xmm13

	vaddsubpd %xmm9,  %xmm8 , %xmm8
        vaddsubpd %xmm13, %xmm12, %xmm12



#ifndef TRMMKERNEL

	vaddpd 	 	(CO1), %xmm8 , %xmm8
	vaddpd  2 * SIZE(CO1), %xmm12, %xmm12

#endif

	vmovups	%xmm8 ,  	(CO1)
	vmovups	%xmm12 , 2 * SIZE(CO1)


#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$2, %rax			// rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L1_11
	ALIGN_4


/**************************************************************************
* Rest of M
***************************************************************************/
.L1_40:
	testq	$1, M
	jz	.L999

	ALIGN_4

.L1_41:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leaq	BUFFER1, BO		// first buffer to BO
	addq	$4 * SIZE, BO
#else
        movq    KK, %rax
	leaq	BUFFER1, BO			// first buffer to BO
	addq	$4 * SIZE, BO
	movq    %rax, BI                        //  Index for BO
        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$1, %rax			// rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

	vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax        // number of values in AO
#else
        addq    $1, %rax        // number of values in BO
#endif
        movq    %rax, KKK
#endif


	andq	$-8, %rax			//  K = K - ( K % 8 )
	je	.L1_46
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values

	salq	$1, %rax			// rax = rax * 2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_42:

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x1_1(xxx)
	KERNEL1x1_2(xxx)
	KERNEL1x1_3(xxx)
	KERNEL1x1_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x1_1(xxx)
	KERNEL1x1_2(xxx)
	KERNEL1x1_3(xxx)
	KERNEL1x1_4(xxx)

	je	.L1_46

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x1_1(xxx)
	KERNEL1x1_2(xxx)
	KERNEL1x1_3(xxx)
	KERNEL1x1_4(xxx)

	prefetcht0	B_PR1(BO,BI,SIZE)
	KERNEL1x1_1(xxx)
	KERNEL1x1_2(xxx)
	KERNEL1x1_3(xxx)
	KERNEL1x1_4(xxx)

	je	.L1_46

	jmp	.L1_42
	ALIGN_4

.L1_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

	andq	$7, %rax		# if (k & 7)
	je .L1_49

	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values

	salq	$1, %rax			// rax = rax * 2 ; number of values
	leaq	(AO, %rax, SIZE), AO
	leaq	(BO, BI, SIZE), BO
	negq	BI
	negq	%rax
	ALIGN_4

.L1_47:

	KERNEL1x1_SUB(xxx)
	jl	.L1_47
	ALIGN_4


.L1_49:

	vmovddup	ALPHA_R, %xmm0
	vmovddup	ALPHA_I, %xmm1

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd %xmm9, %xmm8,  %xmm8

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9

#else
        vaddsubpd %xmm8, %xmm9,  %xmm9

        vmovapd   %xmm9,  %xmm8

	// swap the high and low 64-bit halves (real and imaginary parts)
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9

#endif

	// multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8

	// multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9

	vaddsubpd %xmm9 ,%xmm8,  %xmm8



#ifndef TRMMKERNEL

	vaddpd 	 	(CO1), %xmm8 , %xmm8

#endif

	vmovups	%xmm8 ,  	(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
	movq    %rax, BI                        //  Index for BO
        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
	salq	$1, %rax			// rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4



.L999:
	vzeroupper

	movq   		SP, %rsp
	movq	   (%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	vmovups	 64(%rsp), %xmm6
	vmovups	 80(%rsp), %xmm7
	vmovups	 96(%rsp), %xmm8
	vmovups	112(%rsp), %xmm9
	vmovups	128(%rsp), %xmm10
	vmovups	144(%rsp), %xmm11
	vmovups	160(%rsp), %xmm12
	vmovups	176(%rsp), %xmm13
	vmovups	192(%rsp), %xmm14
	vmovups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
