1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36
37
// SGEMM_INPUT_GS_BETA_NZ
//
// Asm template fragment for the general-stride case: gathers eight floats
// from the addresses rcx + j*rsi (j = 0..7) into ymm0, element j landing in
// lane j.
//
// Register contract (set up by the enclosing asm statement):
//   rcx = address of the first element to gather
//   rsi = byte stride between consecutive elements
//   r13 = 3*rsi, r15 = 5*rsi, r10 = 7*rsi
// Clobbers xmm1 and xmm2 in addition to writing ymm0.
//
// Each vmovlps/vmovhps moves 64 bits, i.e. the wanted float plus the float
// that follows it in memory; vshufps $0x88 then keeps only the wanted ones.
// NOTE(review): this means 4 bytes beyond every gathered element are read,
// including beyond the last one -- confirm the caller's buffer tolerates it.
#define SGEMM_INPUT_GS_BETA_NZ \
    "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \
    "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \
    "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \
    "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \
    "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \
    "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \
    "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \
    "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \
    "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \
    "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \
    "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
50
// SGEMM_OUTPUT_GS_BETA_NZ
//
// Asm template fragment for the general-stride case: scatters the eight
// floats held in ymm0 to the addresses rcx + j*rsi (j = 0..7), lane j going
// to address j. Each element is written with a 32-bit vmovss, so nothing
// outside the eight target floats is stored.
//
// Register contract (set up by the enclosing asm statement):
//   rcx = address of the first destination element
//   rsi = byte stride between consecutive elements
//   r13 = 3*rsi, r15 = 5*rsi, r10 = 7*rsi
// vpermilps $0x39 rotates the four lanes down by one so that the next
// element to store is always in lane 0. Clobbers xmm0, xmm1 and xmm2.
#define SGEMM_OUTPUT_GS_BETA_NZ \
    "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \
    "vmovss %%xmm0, (%%rcx ) \n\t" \
    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \
    "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \
    "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \
    "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \
    "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \
    "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \
    "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \
    "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \
    "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \
    "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \
    "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \
    "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \
    "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t"
67
// bli_sgemm_asm_6x16
//
// Single-precision GEMM micro-kernel implemented as one GCC extended-asm
// statement using AVX and FMA instructions. It updates a 6 x 16 tile of C:
//
//     C := beta * C + alpha * A * B
//
// The product A*B is accumulated in ymm4..ymm15 (two 8-float registers per
// row of the tile), scaled by alpha, and then merged into C.
//
// Parameters:
//   k      - number of rank-1 updates. Processed as k/4 passes of a loop
//            unrolled four times, followed by k%4 single-step passes.
//   alpha  - pointer to the scalar applied to A*B.
//   a      - packed micro-panel of A; 6 floats are consumed per k step
//            (each read with vbroadcastss, so no alignment is required).
//   b      - packed micro-panel of B; 16 floats are consumed per k step.
//            Loaded with vmovaps, so it must be 32-byte aligned.
//   beta   - pointer to the scalar applied to C.
//   c      - address of the 6 x 16 output tile.
//   rs_c   - row stride of C, in elements.
//   cs_c   - column stride of C, in elements.
//   data   - not referenced by this kernel.
//   cntx   - not referenced by this kernel.
//
// Storage handling:
//   - cs_c == 1: rows of C are contiguous; each row is updated with two
//     unaligned 256-bit accesses (columns 0-7 via rcx, columns 8-15 via rdx).
//   - otherwise: elements are gathered/scattered one at a time through the
//     SGEMM_INPUT_GS_BETA_NZ / SGEMM_OUTPUT_GS_BETA_NZ fragments.
//   - beta == 0: C is overwritten without being read.
//
// NOTE(review): the rows of B are loaded one step ahead of their use, so the
// 16 floats following the last consumed row of b are read (and the first row
// is read even when k == 0). Presumably the packed buffer is padded for
// this -- confirm against the packing code.
// NOTE(review): vucomiss also sets ZF when the comparison is unordered, so a
// NaN beta takes the beta == 0 path.
void bli_sgemm_asm_6x16
     (
       dim_t               k,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // The asm reads these through "m" operands, hence named locals.
    uint64_t k_iter = k / 4;
    uint64_t k_left = k % 4;

    __asm__ volatile
    (
    " \n\t"
    "vzeroall \n\t" // zero all xmm/ymm registers.
    " \n\t"
    " \n\t"
    "movq %2, %%rax \n\t" // load address of a.
    "movq %3, %%rbx \n\t" // load address of b.
    " \n\t"
    "addq $32 * 4, %%rbx \n\t" // bias rbx so b is addressed with offsets -128..+96
    " \n\t" // initialize loop by pre-loading
    "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
    "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
    " \n\t"
    "movq %6, %%rcx \n\t" // load address of c
    "movq %7, %%rdi \n\t" // load rs_c
    "leaq (,%%rdi,4), %%rdi \n\t" // rs_c *= sizeof(float)
    " \n\t"
    "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c;
    "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c;
    "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c
    "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c
    "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c
    "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c
    "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c
    "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    "movq %0, %%rsi \n\t" // i = k_iter;
    "testq %%rsi, %%rsi \n\t" // check i via logical AND.
    "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
    " \n\t" // contains the k_left loop.
    " \n\t"
    " \n\t"
    ".SLOOPKITER: \n\t" // MAIN LOOP
    " \n\t"
    " \n\t"
    " \n\t" // iteration 0
    "prefetcht0 64 * 4(%%rax) \n\t"
    " \n\t"
    "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
    " \n\t"
    "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
    " \n\t"
    "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
    " \n\t"
    "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t"
    "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t"
    " \n\t"
    " \n\t" // iteration 1
    "vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
    " \n\t"
    "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
    " \n\t"
    "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
    " \n\t"
    "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t"
    "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t"
    " \n\t"
    " \n\t" // iteration 2
    "prefetcht0 76 * 4(%%rax) \n\t"
    " \n\t"
    "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
    " \n\t"
    "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
    " \n\t"
    "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
    " \n\t"
    "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t"
    "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t"
    " \n\t"
    " \n\t" // iteration 3
    "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
    " \n\t"
    "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
    " \n\t"
    "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
    " \n\t"
    "addq $4 * 6 * 4, %%rax \n\t" // a += 4*6 (unroll x mr)
    "addq $4 * 16 * 4, %%rbx \n\t" // b += 4*16 (unroll x nr)
    " \n\t"
    "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" // pre-load b for the next pass
    "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
    " \n\t"
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .SLOOPKITER \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    ".SCONSIDKLEFT: \n\t"
    " \n\t"
    "movq %1, %%rsi \n\t" // i = k_left;
    "testq %%rsi, %%rsi \n\t" // check i via logical AND.
    "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
    " \n\t" // else, we prepare to enter k_left loop.
    " \n\t"
    " \n\t"
    ".SLOOPKLEFT: \n\t" // EDGE LOOP
    " \n\t"
    "prefetcht0 16 * 32(%%rax) \n\t"
    " \n\t"
    "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
    " \n\t"
    "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
    " \n\t"
    "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t"
    "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t"
    "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
    "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
    "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
    "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
    " \n\t"
    "addq $1 * 6 * 4, %%rax \n\t" // a += 1*6 (unroll x mr)
    "addq $1 * 16 * 4, %%rbx \n\t" // b += 1*16 (unroll x nr)
    " \n\t"
    "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" // pre-load b for the next pass
    "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
    " \n\t"
    " \n\t"
    "decq %%rsi \n\t" // i -= 1;
    "jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
    " \n\t"
    " \n\t"
    " \n\t"
    ".SPOSTACCUM: \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    "movq %4, %%rax \n\t" // load address of alpha
    "movq %5, %%rbx \n\t" // load address of beta
    "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate
    "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate
    " \n\t"
    "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha
    "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t"
    "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t"
    "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t"
    "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t"
    "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t"
    "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t"
    "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t"
    "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t"
    "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t"
    "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t"
    "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    "movq %8, %%rsi \n\t" // load cs_c
    "leaq (,%%rsi,4), %%rsi \n\t" // rsi = cs_c * sizeof(float)
    " \n\t"
    "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c + 8*cs_c;
    " \n\t"
    "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c;
    "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c;
    "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c;
    " \n\t"
    " \n\t"
    " \n\t" // now avoid loading C if beta == 0
    " \n\t"
    "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
    "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0.
    "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
    " \n\t"
    " \n\t"
    "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
    "jz .SROWSTORED \n\t" // jump to row storage case
    " \n\t"
    " \n\t"
    " \n\t"
    ".SGENSTORED: \n\t" // general stride, beta != 0: columns 0-7
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" // ymm0 = beta*c + ymm4
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    // last row of this half: rcx is reloaded below, so no advance.
    " \n\t"
    " \n\t"
    "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    SGEMM_INPUT_GS_BETA_NZ
    "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    // last row: no pointer advance needed.
    " \n\t"
    " \n\t"
    " \n\t"
    "jmp .SDONE \n\t" // jump to end.
    " \n\t"
    " \n\t"
    " \n\t"
    ".SROWSTORED: \n\t" // cs_c == 1, beta != 0
    " \n\t"
    " \n\t"
    "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" // ymm4 += beta * c[row, 0:8]
    "vmovups %%ymm4, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" // ymm5 += beta * c[row, 8:16]
    "vmovups %%ymm5, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t"
    "vmovups %%ymm6, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t"
    "vmovups %%ymm7, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t"
    "vmovups %%ymm8, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t"
    "vmovups %%ymm9, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t"
    "vmovups %%ymm10, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t"
    "vmovups %%ymm11, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t"
    "vmovups %%ymm12, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t"
    "vmovups %%ymm13, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    // last row: no pointer advance needed.
    "vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t"
    "vmovups %%ymm14, (%%rcx) \n\t"
    "vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t"
    "vmovups %%ymm15, (%%rdx) \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    "jmp .SDONE \n\t" // jump to end.
    " \n\t"
    " \n\t"
    " \n\t"
    ".SBETAZERO: \n\t"
    " \n\t"
    "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4.
    "jz .SROWSTORBZ \n\t" // jump to row storage case
    " \n\t"
    " \n\t"
    " \n\t"
    ".SGENSTORBZ: \n\t" // general stride, beta == 0: columns 0-7
    " \n\t"
    " \n\t"
    "vmovaps %%ymm4, %%ymm0 \n\t" // the scatter fragment stores from ymm0
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm6, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm8, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm10, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm12, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm14, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    // last row of this half: rcx is reloaded below, so no advance.
    " \n\t"
    " \n\t"
    "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c
    " \n\t"
    " \n\t"
    "vmovaps %%ymm5, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm7, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm9, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm11, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm13, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    "addq %%rdi, %%rcx \n\t" // c += rs_c;
    " \n\t"
    " \n\t"
    "vmovaps %%ymm15, %%ymm0 \n\t"
    SGEMM_OUTPUT_GS_BETA_NZ
    // last row: no pointer advance needed.
    " \n\t"
    " \n\t"
    " \n\t"
    "jmp .SDONE \n\t" // jump to end.
    " \n\t"
    " \n\t"
    " \n\t"
    ".SROWSTORBZ: \n\t" // cs_c == 1, beta == 0: plain stores
    " \n\t"
    " \n\t"
    "vmovups %%ymm4, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vmovups %%ymm5, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    "vmovups %%ymm6, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vmovups %%ymm7, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vmovups %%ymm8, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vmovups %%ymm9, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vmovups %%ymm10, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vmovups %%ymm11, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    "vmovups %%ymm12, (%%rcx) \n\t"
    "addq %%rdi, %%rcx \n\t"
    "vmovups %%ymm13, (%%rdx) \n\t"
    "addq %%rdi, %%rdx \n\t"
    " \n\t"
    " \n\t"
    // last row: no pointer advance needed.
    "vmovups %%ymm14, (%%rcx) \n\t"
    "vmovups %%ymm15, (%%rdx) \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    " \n\t"
    ".SDONE: \n\t"
    " \n\t"

    : // output operands (none)
    : // input operands
      "m" (k_iter), // 0
      "m" (k_left), // 1
      "m" (a),      // 2
      "m" (b),      // 3
      "m" (alpha),  // 4
      "m" (beta),   // 5
      "m" (c),      // 6
      "m" (rs_c),   // 7
      "m" (cs_c)    // 8
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    );
}
623
624
// DGEMM_INPUT_GS_BETA_NZ
//
// Asm template fragment for the general-stride case: gathers four doubles
// from the addresses rcx + j*rsi (j = 0..3) into ymm0, element j landing in
// lane j. Each vmovlpd/vmovhpd moves exactly one 64-bit element.
//
// Register contract (set up by the enclosing asm statement):
//   rcx = address of the first element to gather
//   rsi = byte stride between consecutive elements
//   r13 = 3*rsi
// Clobbers xmm1 in addition to writing ymm0.
#define DGEMM_INPUT_GS_BETA_NZ \
    "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \
    "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \
    "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \
    "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \
    "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t"
636
// DGEMM_OUTPUT_GS_BETA_NZ
//
// Asm template fragment for the general-stride case: scatters the four
// doubles held in ymm0 to the addresses rcx + j*rsi (j = 0..3), lane j going
// to address j.
//
// Register contract (set up by the enclosing asm statement):
//   rcx = address of the first destination element
//   rsi = byte stride between consecutive elements
//   r13 = 3*rsi
// Clobbers xmm1 (receives the upper 128 bits of ymm0).
#define DGEMM_OUTPUT_GS_BETA_NZ \
    "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \
    "vmovlpd %%xmm0, (%%rcx ) \n\t" \
    "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \
    "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \
    "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t"
648
bli_dgemm_asm_6x8(dim_t k,double * restrict alpha,double * restrict a,double * restrict b,double * restrict beta,double * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)649 void bli_dgemm_asm_6x8
650 (
651 dim_t k,
652 double* restrict alpha,
653 double* restrict a,
654 double* restrict b,
655 double* restrict beta,
656 double* restrict c, inc_t rs_c, inc_t cs_c,
657 auxinfo_t* restrict data,
658 cntx_t* restrict cntx
659 )
660 {
661 //void* a_next = bli_auxinfo_next_a( data );
662 //void* b_next = bli_auxinfo_next_b( data );
663 void* c_prefetch = data->c_prefetch;
664
665 uint64_t k_iter = k / 4;
666 uint64_t k_left = k % 4;
667
668 __asm__ volatile
669 (
670 " \n\t"
671 "vzeroall \n\t" // zero all xmm/ymm registers.
672 " \n\t"
673 " \n\t"
674 "movq %2, %%rax \n\t" // load address of a.
675 "movq %3, %%rbx \n\t" // load address of b.
676 //"movq %9, %%r15 \n\t" // load address of b_next.
677 " \n\t"
678 "addq $32 * 4, %%rbx \n\t"
679 " \n\t" // initialize loop by pre-loading
680 "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
681 "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
682 " \n\t"
683 "movq %6, %%rcx \n\t" // load address of c
684 "movq %9, %%r8 \n\t" // load address of c_prefetch
685 "movq %7, %%rdi \n\t" // load rs_c
686 "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double)
687 " \n\t"
688 "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c;
689 "leaq (%%r8,%%r13,1), %%rdx \n\t" // rdx = c_prefetch + 3*rs_c;
690 "prefetcht0 7 * 8(%%r8) \n\t" // prefetch c + 0*rs_c
691 "prefetcht0 7 * 8(%%r8,%%rdi) \n\t" // prefetch c + 1*rs_c
692 "prefetcht0 7 * 8(%%r8,%%rdi,2) \n\t" // prefetch c + 2*rs_c
693 "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c
694 "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c
695 "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c
696 " \n\t"
697 " \n\t"
698 " \n\t"
699 " \n\t"
700 "movq %0, %%rsi \n\t" // i = k_iter;
701 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
702 "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that
703 " \n\t" // contains the k_left loop.
704 " \n\t"
705 " \n\t"
706 ".DLOOPKITER: \n\t" // MAIN LOOP
707 " \n\t"
708 " \n\t"
709 " \n\t" // iteration 0
710 "prefetcht0 64 * 8(%%rax) \n\t"
711 " \n\t"
712 "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t"
713 "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t"
714 "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
715 "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
716 "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
717 "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
718 " \n\t"
719 "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t"
720 "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t"
721 "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
722 "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
723 "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
724 "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
725 " \n\t"
726 "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t"
727 "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t"
728 "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
729 "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
730 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
731 "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
732 " \n\t"
733 "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t"
734 "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t"
735 " \n\t"
736 " \n\t" // iteration 1
737 "prefetcht0 72 * 8(%%rax) \n\t"
738 " \n\t"
739 "vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t"
740 "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t"
741 "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
742 "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
743 "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
744 "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
745 " \n\t"
746 "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t"
747 "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t"
748 "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
749 "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
750 "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
751 "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
752 " \n\t"
753 "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t"
754 "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t"
755 "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
756 "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
757 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
758 "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
759 " \n\t"
760 "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t"
761 "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t"
762 " \n\t"
763 " \n\t" // iteration 2
764 "prefetcht0 80 * 8(%%rax) \n\t"
765 " \n\t"
766 "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t"
767 "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t"
768 "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
769 "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
770 "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
771 "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
772 " \n\t"
773 "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t"
774 "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t"
775 "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
776 "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
777 "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
778 "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
779 " \n\t"
780 "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t"
781 "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t"
782 "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
783 "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
784 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
785 "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
786 " \n\t"
787 "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t"
788 "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t"
789 " \n\t"
790 " \n\t" // iteration 3
791 "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t"
792 "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t"
793 "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
794 "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
795 "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
796 "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
797 " \n\t"
798 "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t"
799 "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t"
800 "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
801 "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
802 "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
803 "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
804 " \n\t"
805 "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t"
806 "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t"
807 "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
808 "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
809 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
810 "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
811 " \n\t"
812 "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr)
813 "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr)
814 " \n\t"
815 "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
816 "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
817 " \n\t"
818 " \n\t"
819 "decq %%rsi \n\t" // i -= 1;
820 "jne .DLOOPKITER \n\t" // iterate again if i != 0.
821 " \n\t"
822 " \n\t"
823 " \n\t"
824 " \n\t"
825 " \n\t"
826 " \n\t"
827 ".DCONSIDKLEFT: \n\t"
828 " \n\t"
829 "movq %1, %%rsi \n\t" // i = k_left;
830 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
831 "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
832 " \n\t" // else, we prepare to enter k_left loop.
833 " \n\t"
834 " \n\t"
835 ".DLOOPKLEFT: \n\t" // EDGE LOOP
836 " \n\t"
837 "prefetcht0 64 * 8(%%rax) \n\t"
838 " \n\t"
839 "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t"
840 "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t"
841 "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
842 "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
843 "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
844 "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
845 " \n\t"
846 "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t"
847 "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t"
848 "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
849 "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
850 "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
851 "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
852 " \n\t"
853 "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t"
854 "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t"
855 "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
856 "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
857 "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
858 "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
859 " \n\t"
860 "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr)
861 "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr)
862 " \n\t"
863 "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
864 "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
865 " \n\t"
866 " \n\t"
867 "decq %%rsi \n\t" // i -= 1;
868 "jne .DLOOPKLEFT \n\t" // iterate again if i != 0.
869 " \n\t"
870 " \n\t"
871 " \n\t"
872 ".DPOSTACCUM: \n\t"
873 " \n\t"
874 " \n\t"
875 " \n\t"
876 " \n\t"
877 "movq %4, %%rax \n\t" // load address of alpha
878 "movq %5, %%rbx \n\t" // load address of beta
879 "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate
880 "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate
881 " \n\t"
882 "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha
883 "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t"
884 "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t"
885 "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t"
886 "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t"
887 "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t"
888 "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t"
889 "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t"
890 "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t"
891 "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t"
892 "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t"
893 "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t"
894 " \n\t"
895 " \n\t"
896 " \n\t"
897 " \n\t"
898 " \n\t"
899 " \n\t"
900 "movq %8, %%rsi \n\t" // load cs_c
901 "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double)
902 " \n\t"
903 "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*cs_c;
904 " \n\t"
905 "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c;
906 //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c;
907 //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c;
908 " \n\t"
909 " \n\t"
910 " \n\t" // now avoid loading C if beta == 0
911 " \n\t"
912 "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
913 "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0.
914 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
915 " \n\t"
916 " \n\t"
917 "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
918 "jz .DROWSTORED \n\t" // jump to row storage case
919 " \n\t"
920 " \n\t"
921 " \n\t"
922 ".DGENSTORED: \n\t"
923 " \n\t"
924 " \n\t"
925 DGEMM_INPUT_GS_BETA_NZ
926 "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t"
927 DGEMM_OUTPUT_GS_BETA_NZ
928 "addq %%rdi, %%rcx \n\t" // c += rs_c;
929 " \n\t"
930 " \n\t"
931 DGEMM_INPUT_GS_BETA_NZ
932 "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t"
933 DGEMM_OUTPUT_GS_BETA_NZ
934 "addq %%rdi, %%rcx \n\t" // c += rs_c;
935 " \n\t"
936 " \n\t"
937 DGEMM_INPUT_GS_BETA_NZ
938 "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t"
939 DGEMM_OUTPUT_GS_BETA_NZ
940 "addq %%rdi, %%rcx \n\t" // c += rs_c;
941 " \n\t"
942 " \n\t"
943 DGEMM_INPUT_GS_BETA_NZ
944 "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t"
945 DGEMM_OUTPUT_GS_BETA_NZ
946 "addq %%rdi, %%rcx \n\t" // c += rs_c;
947 " \n\t"
948 " \n\t"
949 DGEMM_INPUT_GS_BETA_NZ
950 "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t"
951 DGEMM_OUTPUT_GS_BETA_NZ
952 "addq %%rdi, %%rcx \n\t" // c += rs_c;
953 " \n\t"
954 " \n\t"
955 DGEMM_INPUT_GS_BETA_NZ
956 "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t"
957 DGEMM_OUTPUT_GS_BETA_NZ
958 " \n\t"
959 " \n\t"
960 "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c
961 " \n\t"
962 " \n\t"
963 DGEMM_INPUT_GS_BETA_NZ
964 "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t"
965 DGEMM_OUTPUT_GS_BETA_NZ
966 "addq %%rdi, %%rcx \n\t" // c += rs_c;
967 " \n\t"
968 " \n\t"
969 DGEMM_INPUT_GS_BETA_NZ
970 "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t"
971 DGEMM_OUTPUT_GS_BETA_NZ
972 "addq %%rdi, %%rcx \n\t" // c += rs_c;
973 " \n\t"
974 " \n\t"
975 DGEMM_INPUT_GS_BETA_NZ
976 "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t"
977 DGEMM_OUTPUT_GS_BETA_NZ
978 "addq %%rdi, %%rcx \n\t" // c += rs_c;
979 " \n\t"
980 " \n\t"
981 DGEMM_INPUT_GS_BETA_NZ
982 "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t"
983 DGEMM_OUTPUT_GS_BETA_NZ
984 "addq %%rdi, %%rcx \n\t" // c += rs_c;
985 " \n\t"
986 " \n\t"
987 DGEMM_INPUT_GS_BETA_NZ
988 "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t"
989 DGEMM_OUTPUT_GS_BETA_NZ
990 "addq %%rdi, %%rcx \n\t" // c += rs_c;
991 " \n\t"
992 " \n\t"
993 DGEMM_INPUT_GS_BETA_NZ
994 "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t"
995 DGEMM_OUTPUT_GS_BETA_NZ
996 " \n\t"
997 " \n\t"
998 " \n\t"
999 "jmp .DDONE \n\t" // jump to end.
1000 " \n\t"
1001 " \n\t"
1002 " \n\t"
1003 ".DROWSTORED: \n\t"
1004 " \n\t"
1005 " \n\t"
1006 "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t"
1007 "vmovups %%ymm4, (%%rcx) \n\t"
1008 "addq %%rdi, %%rcx \n\t"
1009 "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t"
1010 "vmovups %%ymm5, (%%rdx) \n\t"
1011 "addq %%rdi, %%rdx \n\t"
1012 " \n\t"
1013 " \n\t"
1014 "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t"
1015 "vmovups %%ymm6, (%%rcx) \n\t"
1016 "addq %%rdi, %%rcx \n\t"
1017 "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t"
1018 "vmovups %%ymm7, (%%rdx) \n\t"
1019 "addq %%rdi, %%rdx \n\t"
1020 " \n\t"
1021 " \n\t"
1022 "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t"
1023 "vmovups %%ymm8, (%%rcx) \n\t"
1024 "addq %%rdi, %%rcx \n\t"
1025 "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t"
1026 "vmovups %%ymm9, (%%rdx) \n\t"
1027 "addq %%rdi, %%rdx \n\t"
1028 " \n\t"
1029 " \n\t"
1030 "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t"
1031 "vmovups %%ymm10, (%%rcx) \n\t"
1032 "addq %%rdi, %%rcx \n\t"
1033 "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t"
1034 "vmovups %%ymm11, (%%rdx) \n\t"
1035 "addq %%rdi, %%rdx \n\t"
1036 " \n\t"
1037 " \n\t"
1038 "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t"
1039 "vmovups %%ymm12, (%%rcx) \n\t"
1040 "addq %%rdi, %%rcx \n\t"
1041 "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t"
1042 "vmovups %%ymm13, (%%rdx) \n\t"
1043 "addq %%rdi, %%rdx \n\t"
1044 " \n\t"
1045 " \n\t"
1046 "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t"
1047 "vmovups %%ymm14, (%%rcx) \n\t"
1048 //"addq %%rdi, %%rcx \n\t"
1049 "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t"
1050 "vmovups %%ymm15, (%%rdx) \n\t"
1051 //"addq %%rdi, %%rdx \n\t"
1052 " \n\t"
1053 " \n\t"
1054 " \n\t"
1055 "jmp .DDONE \n\t" // jump to end.
1056 " \n\t"
1057 " \n\t"
1058 " \n\t"
1059 ".DBETAZERO: \n\t"
1060 " \n\t"
1061 "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
1062 "jz .DROWSTORBZ \n\t" // jump to row storage case
1063 " \n\t"
1064 " \n\t"
1065 " \n\t"
1066 ".DGENSTORBZ: \n\t"
1067 " \n\t"
1068 " \n\t"
1069 "vmovaps %%ymm4, %%ymm0 \n\t"
1070 DGEMM_OUTPUT_GS_BETA_NZ
1071 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1072 " \n\t"
1073 " \n\t"
1074 "vmovaps %%ymm6, %%ymm0 \n\t"
1075 DGEMM_OUTPUT_GS_BETA_NZ
1076 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1077 " \n\t"
1078 " \n\t"
1079 "vmovaps %%ymm8, %%ymm0 \n\t"
1080 DGEMM_OUTPUT_GS_BETA_NZ
1081 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1082 " \n\t"
1083 " \n\t"
1084 "vmovaps %%ymm10, %%ymm0 \n\t"
1085 DGEMM_OUTPUT_GS_BETA_NZ
1086 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1087 " \n\t"
1088 " \n\t"
1089 "vmovaps %%ymm12, %%ymm0 \n\t"
1090 DGEMM_OUTPUT_GS_BETA_NZ
1091 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1092 " \n\t"
1093 " \n\t"
1094 "vmovaps %%ymm14, %%ymm0 \n\t"
1095 DGEMM_OUTPUT_GS_BETA_NZ
1096 " \n\t"
1097 " \n\t"
1098 "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c
1099 " \n\t"
1100 " \n\t"
1101 "vmovaps %%ymm5, %%ymm0 \n\t"
1102 DGEMM_OUTPUT_GS_BETA_NZ
1103 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1104 " \n\t"
1105 " \n\t"
1106 "vmovaps %%ymm7, %%ymm0 \n\t"
1107 DGEMM_OUTPUT_GS_BETA_NZ
1108 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1109 " \n\t"
1110 " \n\t"
1111 "vmovaps %%ymm9, %%ymm0 \n\t"
1112 DGEMM_OUTPUT_GS_BETA_NZ
1113 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1114 " \n\t"
1115 " \n\t"
1116 "vmovaps %%ymm11, %%ymm0 \n\t"
1117 DGEMM_OUTPUT_GS_BETA_NZ
1118 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1119 " \n\t"
1120 " \n\t"
1121 "vmovaps %%ymm13, %%ymm0 \n\t"
1122 DGEMM_OUTPUT_GS_BETA_NZ
1123 "addq %%rdi, %%rcx \n\t" // c += rs_c;
1124 " \n\t"
1125 " \n\t"
1126 "vmovaps %%ymm15, %%ymm0 \n\t"
1127 DGEMM_OUTPUT_GS_BETA_NZ
1128 " \n\t"
1129 " \n\t"
1130 " \n\t"
1131 "jmp .DDONE \n\t" // jump to end.
1132 " \n\t"
1133 " \n\t"
1134 " \n\t"
1135 ".DROWSTORBZ: \n\t"
1136 " \n\t"
1137 " \n\t"
1138 "vmovups %%ymm4, (%%rcx) \n\t"
1139 "addq %%rdi, %%rcx \n\t"
1140 "vmovups %%ymm5, (%%rdx) \n\t"
1141 "addq %%rdi, %%rdx \n\t"
1142 " \n\t"
1143 "vmovups %%ymm6, (%%rcx) \n\t"
1144 "addq %%rdi, %%rcx \n\t"
1145 "vmovups %%ymm7, (%%rdx) \n\t"
1146 "addq %%rdi, %%rdx \n\t"
1147 " \n\t"
1148 " \n\t"
1149 "vmovups %%ymm8, (%%rcx) \n\t"
1150 "addq %%rdi, %%rcx \n\t"
1151 "vmovups %%ymm9, (%%rdx) \n\t"
1152 "addq %%rdi, %%rdx \n\t"
1153 " \n\t"
1154 " \n\t"
1155 "vmovups %%ymm10, (%%rcx) \n\t"
1156 "addq %%rdi, %%rcx \n\t"
1157 "vmovups %%ymm11, (%%rdx) \n\t"
1158 "addq %%rdi, %%rdx \n\t"
1159 " \n\t"
1160 " \n\t"
1161 "vmovups %%ymm12, (%%rcx) \n\t"
1162 "addq %%rdi, %%rcx \n\t"
1163 "vmovups %%ymm13, (%%rdx) \n\t"
1164 "addq %%rdi, %%rdx \n\t"
1165 " \n\t"
1166 " \n\t"
1167 "vmovups %%ymm14, (%%rcx) \n\t"
1168 //"addq %%rdi, %%rcx \n\t"
1169 "vmovups %%ymm15, (%%rdx) \n\t"
1170 //"addq %%rdi, %%rdx \n\t"
1171 " \n\t"
1172 " \n\t"
1173 " \n\t"
1174 " \n\t"
1175 " \n\t"
1176 " \n\t"
1177 ".DDONE: \n\t"
1178 " \n\t"
1179
1180 : // output operands (none)
1181 : // input operands
1182 "m" (k_iter), // 0
1183 "m" (k_left), // 1
1184 "m" (a), // 2
1185 "m" (b), // 3
1186 "m" (alpha), // 4
1187 "m" (beta), // 5
1188 "m" (c), // 6
1189 "m" (rs_c), // 7
1190 "m" (cs_c), // 8
1191 "m" (c_prefetch)/*, // 9
1192 "m" (b_next), // 9
1193 "m" (a_next)*/ // 10
1194 : // register clobber list
1195 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1196 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1197 "xmm0", "xmm1", "xmm2", "xmm3",
1198 "xmm4", "xmm5", "xmm6", "xmm7",
1199 "xmm8", "xmm9", "xmm10", "xmm11",
1200 "xmm12", "xmm13", "xmm14", "xmm15",
1201 "memory"
1202 );
1203 }
1204
1205 #if 0
1206
1207 void bli_cgemm_asm_
1208 (
1209 dim_t k,
1210 scomplex* restrict alpha,
1211 scomplex* restrict a,
1212 scomplex* restrict b,
1213 scomplex* restrict beta,
1214 scomplex* restrict c, inc_t rs_c, inc_t cs_c,
1215 auxinfo_t* restrict data,
1216 cntx_t* restrict cntx
1217 )
1218 {
1219 //void* a_next = bli_auxinfo_next_a( data );
1220 //void* b_next = bli_auxinfo_next_b( data );
1221
1222 //dim_t k_iter = k / 4;
1223 //dim_t k_left = k % 4;
1224
1225 }
1226
1227
1228
1229 void bli_zgemm_asm_
1230 (
1231 dim_t k,
1232 dcomplex* restrict alpha,
1233 dcomplex* restrict a,
1234 dcomplex* restrict b,
1235 dcomplex* restrict beta,
1236 dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
1237 auxinfo_t* restrict data,
1238 cntx_t* restrict cntx
1239 )
1240 {
1241 //void* a_next = bli_auxinfo_next_a( data );
1242 //void* b_next = bli_auxinfo_next_b( data );
1243
1244 //dim_t k_iter = k / 4;
1245 //dim_t k_left = k % 4;
1246
1247 }
1248
1249 #endif
1250