1 /*********************************************************************************
2 Copyright (c) 2020, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 **********************************************************************************/
27 #include "common.h"
28 #include <altivec.h>
29
/* Opaque 16-byte vector.  The A-side operands (rowA[]) and the halves of a
   hand-assembled B pair (rb[]) are passed to the MMA builtins as this type. */
typedef __vector unsigned char vec_t;
/* 16-byte vector of FLOAT.  The kernel initialises it with two lanes
   (e.g. { alpha, alpha }), so it is used as a pair of 8-byte elements. */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
/* When the compiler does not report the __builtin_vsx_* spelling of the
   pair-assemble builtin, fall back to the __builtin_mma_* spelling. */
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif

/* Same fallback for the pair-disassemble builtin. */
#if !__has_builtin(__builtin_vsx_disassemble_pair)
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
#endif
39
/*
 * Write-back helpers for MMA accumulator tiles.
 *
 * All of them expect these names to be in scope at the point of use:
 *   result  - v4sf_t[4] scratch that receives the disassembled accumulator
 *   rowC    - v4sf_t * scratch pointer
 *   CO      - FLOAT * to the current C tile
 *   ldc     - stride between consecutive C columns
 *   alpha   - scalar applied to every product
 *
 * SAVE_C_ROW merges one two-lane vector V into C at CO[COL * ldc + J]:
 * the TRMM build overwrites C, the GEMM build accumulates into it.
 */
#ifdef TRMMKERNEL
#define SAVE_C_ROW(COL, J, V) \
  rowC = (v4sf_t *) &CO[(COL) * ldc + (J)]; \
  rowC[0] = (V) * alpha;
#else
#define SAVE_C_ROW(COL, J, V) \
  rowC = (v4sf_t *) &CO[(COL) * ldc + (J)]; \
  rowC[0] += (V) * alpha;
#endif

/* Store all four accumulator rows into C columns 0..3 at row offset J. */
#define SAVE_ACC(ACC, J) \
  __builtin_mma_disassemble_acc ((void *)result, ACC); \
  SAVE_C_ROW (0, J, result[0]) \
  SAVE_C_ROW (1, J, result[1]) \
  SAVE_C_ROW (2, J, result[2]) \
  SAVE_C_ROW (3, J, result[3])

/* Same as SAVE_ACC, but targets C columns 4..7. */
#define SAVE_ACC1(ACC, J) \
  __builtin_mma_disassemble_acc ((void *)result, ACC); \
  SAVE_C_ROW (4, J, result[0]) \
  SAVE_C_ROW (5, J, result[1]) \
  SAVE_C_ROW (6, J, result[2]) \
  SAVE_C_ROW (7, J, result[3])

/* Store only the first two accumulator rows (C columns 0..1). */
#define SAVE2x4_ACC(ACC, J) \
  __builtin_mma_disassemble_acc ((void *)result, ACC); \
  SAVE_C_ROW (0, J, result[0]) \
  SAVE_C_ROW (1, J, result[1])
95
/* Emit a `dcbt` instruction (PowerPC data-cache-block-touch hint) with x and
   y as its two register operands; the kernel passes a panel pointer as x and
   a byte offset as y.  The "memory" clobber stops the compiler from moving
   memory accesses across the statement. */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
97
/*
 * TRMM bookkeeping helpers.  They operate on names that must be in scope
 * where they are expanded: temp, off, k, AO, BO and B.  In every use, x is
 * the tile width along m (A side) and y is the tile width along n (B side).
 *
 * Two build configurations are distinguished:
 *   defined(LEFT) != defined(TRANSA)   exactly one of LEFT / TRANSA is set
 *   defined(LEFT) == defined(TRANSA)   both or neither are set
 */

/* REFRESH_TEMP_BK: choose the number of k iterations (temp) for this tile. */
#if defined(LEFT) != defined(TRANSA)
#define REFRESH_TEMP_BK(x, y) temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) temp = off + (x);
#else
#define REFRESH_TEMP_BK(x, y) temp = off + (y);
#endif

/* REFRESH_POINTERS: position AO / BO at the first k step of this tile and
   set temp.  When both or neither of LEFT / TRANSA are set the tile starts
   at the head of the B panel; otherwise the first `off` k steps are skipped
   in both A and B. */
#if defined(LEFT) == defined(TRANSA)
#define REFRESH_POINTERS(x, y) \
  BO = B; \
  REFRESH_TEMP_BK (x, y)
#else
#define REFRESH_POINTERS(x, y) \
  AO += off * (x); \
  BO = B + off * (y); \
  REFRESH_TEMP_BK (x, y)
#endif

/* REFRESH_OFF: with LEFT, advance off by the m-tile width; otherwise no-op.
   UPDATE_TEMP: subtract the tile width on the triangular side from temp. */
#ifdef LEFT
#define REFRESH_OFF(x) off += (x);
#define UPDATE_TEMP(x, y) temp -= (x);
#else
#define REFRESH_OFF(x)
#define UPDATE_TEMP(x, y) temp -= (y);
#endif

/* REFRESH_TMP_AFTER_SAVE: when the tile began at the head of the panel,
   step AO / BO over the k steps that were not consumed; otherwise no-op. */
#if defined(LEFT) == defined(TRANSA)
#define REFRESH_TMP_AFTER_SAVE(x, y) \
  temp = k - off; \
  UPDATE_TEMP (x, y) \
  AO += temp * (x); \
  BO += temp * (y);
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif

/* REFRESH_AFTER_SAVE: everything that has to happen after a tile is stored. */
#define REFRESH_AFTER_SAVE(x, y) \
  REFRESH_TMP_AFTER_SAVE (x, y) \
  REFRESH_OFF (x)
147 /*************************************************************************************
148 * GEMM Kernel
149 *************************************************************************************/
150 int
CNAME(BLASLONG m,BLASLONG n,BLASLONG k,FLOAT alpha,FLOAT * A,FLOAT * B,FLOAT * C,BLASLONG ldc,BLASLONG offset)151 CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
152 FLOAT * C, BLASLONG ldc
153 #ifdef TRMMKERNEL
154 , BLASLONG offset
155 #endif
156 )
157 {
158 BLASLONG i1;
159 #if defined(TRMMKERNEL)
160 BLASLONG off;
161 #endif
162 #if defined(TRMMKERNEL) && !defined(LEFT)
163 off = -offset;
164 #endif
165 v4sf_t valpha = { alpha, alpha };
166 for (i1 = 0; i1 < (n >> 3); i1++)
167 {
168 BLASLONG j, temp;
169 FLOAT *CO;
170 FLOAT *AO;
171 #if defined(TRMMKERNEL) && defined(LEFT)
172 off = offset;
173 #endif
174 CO = C;
175 C += ldc << 3;
176 AO = A;
177 PREFETCH1 (A, 128);
178 PREFETCH1 (A, 256);
179 for (j = 0; j < (m >> 3); j++)
180 {
181 FLOAT *BO;
182 #if defined(TRMMKERNEL)
183 REFRESH_POINTERS (8, 8);
184 #else
185 BO = B;
186 temp = k;
187 #endif
188 v4sf_t *rowC;
189 v4sf_t result[4];
190 __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
191 BLASLONG l = 0;
192 vec_t *rowA = (vec_t *) & AO[0];
193 __vector_pair rowB, rowB1;
194 rowB = *((__vector_pair *)((void *)&BO[0]));
195 rowB1 = *((__vector_pair *)((void *)&BO[4]));
196 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
197 __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
198 __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
199 __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
200 __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
201 __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
202 __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
203 __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
204 for (l = 1; l < temp; l++)
205 {
206 rowA = (vec_t *) & AO[l << 3];
207 rowB = *((__vector_pair *)((void *)&BO[l << 3]));
208 rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
209 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
210 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
211 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
212 __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
213 __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
214 __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
215 __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
216 __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
217 }
218 SAVE_ACC (&acc0, 0);
219 SAVE_ACC1 (&acc1, 0);
220 SAVE_ACC (&acc2, 2);
221 SAVE_ACC1 (&acc3, 2);
222 SAVE_ACC (&acc4, 4);
223 SAVE_ACC1 (&acc5, 4);
224 SAVE_ACC (&acc6, 6);
225 SAVE_ACC1 (&acc7, 6);
226 CO += 8;
227 AO += temp << 3;
228 BO += temp << 3;
229 #if defined(TRMMKERNEL)
230 REFRESH_AFTER_SAVE (8, 8)
231 #endif
232 }
233 if (m & 4)
234 {
235 FLOAT *BO;
236 #if defined(TRMMKERNEL)
237 REFRESH_POINTERS (4, 8);
238 #else
239 BO = B;
240 temp = k;
241 #endif
242 v4sf_t *rowC;
243 v4sf_t result[4];
244 __vector_quad acc0, acc1, acc2, acc3;
245 BLASLONG l = 0;
246 vec_t *rowA = (vec_t *) & AO[0];
247 __vector_pair rowB, rowB1;
248 rowB = *((__vector_pair *)((void *)&BO[0]));
249 rowB1 = *((__vector_pair *)((void *)&BO[4]));
250 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
251 __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
252 __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
253 __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
254 for (l = 1; l < temp; l++)
255 {
256 rowA = (vec_t *) & AO[l << 2];
257 rowB = *((__vector_pair *)((void *)&BO[l << 3]));
258 rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
259 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
260 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
261 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
262 __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
263 }
264 SAVE_ACC (&acc0, 0);
265 SAVE_ACC1 (&acc1, 0);
266 SAVE_ACC (&acc2, 2);
267 SAVE_ACC1 (&acc3, 2);
268 CO += 4;
269 AO += temp << 2;
270 BO += temp << 3;
271 #if defined(TRMMKERNEL)
272 REFRESH_AFTER_SAVE (4, 8)
273 #endif
274 }
275 if (m & 2)
276 {
277 FLOAT *BO;
278 #if defined(TRMMKERNEL)
279 REFRESH_POINTERS (2, 8);
280 #else
281 BO = B;
282 temp = k;
283 #endif
284 v4sf_t *rowC;
285 v4sf_t result[4];
286 __vector_quad acc0, acc1;
287 BLASLONG l = 0;
288 vec_t *rowA = (vec_t *) & AO[0];
289 __vector_pair rowB, rowB1;
290 rowB = *((__vector_pair *)((void *)&BO[0]));
291 rowB1 = *((__vector_pair *)((void *)&BO[4]));
292 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
293 __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
294 for (l = 1; l < temp; l++)
295 {
296 rowA = (vec_t *) & AO[l << 1];
297 rowB = *((__vector_pair *)((void *)&BO[l << 3]));
298 rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
299 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
300 __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
301 }
302 SAVE_ACC (&acc0, 0);
303 SAVE_ACC1 (&acc1, 0);
304 CO += 2;
305 AO += temp << 1;
306 BO += temp << 3;
307 #if defined(TRMMKERNEL)
308 REFRESH_AFTER_SAVE (2, 8)
309 #endif
310 }
311 if (m & 1)
312 {
313 FLOAT *BO;
314 #if defined(TRMMKERNEL)
315 REFRESH_POINTERS (1, 8);
316 #else
317 BO = B;
318 temp = k;
319 #endif
320 BLASLONG l = 0;
321 v4sf_t t = { 0, 0 };
322 v4sf_t t1 = { 0, 0 };
323 v4sf_t t2 = { 0, 0 };
324 v4sf_t t3 = { 0, 0 };
325 for (l = 0; l < temp; l++)
326 {
327 v4sf_t rowA = { AO[l], AO[l] };
328 v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
329 v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
330 v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
331 v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
332 t += rowA * rowB;
333 t1 += rowA * rowB1;
334 t2 += rowA * rowB2;
335 t3 += rowA * rowB3;
336 }
337 t = t * valpha;
338 t1 = t1 * valpha;
339 t2 = t2 * valpha;
340 t3 = t3 * valpha;
341 #if defined(TRMMKERNEL)
342 CO[0 * ldc] = t[0];
343 CO[1 * ldc] = t[1];
344 CO[2 * ldc] = t1[0];
345 CO[3 * ldc] = t1[1];
346 CO[4 * ldc] = t2[0];
347 CO[5 * ldc] = t2[1];
348 CO[6 * ldc] = t3[0];
349 CO[7 * ldc] = t3[1];
350 #else
351 CO[0 * ldc] += t[0];
352 CO[1 * ldc] += t[1];
353 CO[2 * ldc] += t1[0];
354 CO[3 * ldc] += t1[1];
355 CO[4 * ldc] += t2[0];
356 CO[5 * ldc] += t2[1];
357 CO[6 * ldc] += t3[0];
358 CO[7 * ldc] += t3[1];
359 #endif
360 CO += 1;
361 AO += temp;
362 BO += temp << 3;
363 #if defined(TRMMKERNEL)
364 REFRESH_AFTER_SAVE (1, 8)
365 #endif
366 }
367 #if defined(TRMMKERNEL) && !defined(LEFT)
368 off += 8; // number of values in A
369 #endif
370 B += k << 3;
371 }
372 if (n & 4)
373 {
374 BLASLONG j, temp;
375 FLOAT *CO;
376 FLOAT *AO;
377 #if defined(TRMMKERNEL) && defined(LEFT)
378 off = offset;
379 #endif
380 CO = C;
381 C += ldc << 2;
382 AO = A;
383 PREFETCH1 (A, 128);
384 PREFETCH1 (A, 256);
385 for (j = 0; j < (m >> 3); j++)
386 {
387 FLOAT *BO;
388 #if defined(TRMMKERNEL)
389 REFRESH_POINTERS (8, 4);
390 #else
391 BO = B;
392 temp = k;
393 #endif
394 v4sf_t *rowC;
395 v4sf_t result[4];
396 __vector_quad acc0, acc1, acc2, acc3;
397 BLASLONG l = 0;
398 vec_t *rowA = (vec_t *) & AO[0];
399 __vector_pair rowB;
400 rowB = *((__vector_pair *)((void *)&BO[0]));
401 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
402 __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
403 __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
404 __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
405 for (l = 1; l < temp; l++)
406 {
407 rowA = (vec_t *) & AO[l << 3];
408 rowB = *((__vector_pair *)((void *)&BO[l << 2]));
409 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
410 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
411 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
412 __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
413 }
414 SAVE_ACC (&acc0, 0);
415 SAVE_ACC (&acc2, 4);
416 SAVE_ACC (&acc1, 2);
417 SAVE_ACC (&acc3, 6);
418 CO += 8;
419 AO += temp << 3;
420 BO += temp << 2;
421 #if defined(TRMMKERNEL)
422 REFRESH_AFTER_SAVE (8, 4)
423 #endif
424 }
425 if (m & 4)
426 {
427 FLOAT *BO;
428 #if defined(TRMMKERNEL)
429 REFRESH_POINTERS (4, 4);
430 #else
431 BO = B;
432 temp = k;
433 #endif
434 v4sf_t *rowC;
435 v4sf_t result[4];
436 __vector_quad acc0, acc1;
437 BLASLONG l = 0;
438 vec_t *rowA = (vec_t *) & AO[0];
439 __vector_pair rowB;
440 rowB = *((__vector_pair *)((void *)&BO[0]));
441 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
442 __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
443 for (l = 1; l < temp; l++)
444 {
445 rowA = (vec_t *) & AO[l << 2];
446 rowB = *((__vector_pair *)((void *)&BO[l << 2]));
447 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
448 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
449 }
450 SAVE_ACC (&acc0, 0);
451 SAVE_ACC (&acc1, 2);
452 CO += 4;
453 AO += temp << 2;
454 BO += temp << 2;
455 #if defined(TRMMKERNEL)
456 REFRESH_AFTER_SAVE (4, 4)
457 #endif
458 }
459 if (m & 2)
460 {
461 FLOAT *BO;
462 #if defined(TRMMKERNEL)
463 REFRESH_POINTERS (2, 4);
464 #else
465 BO = B;
466 temp = k;
467 #endif
468 v4sf_t *rowC;
469 v4sf_t result[4];
470 __vector_quad acc0;
471 BLASLONG l = 0;
472 vec_t *rowA = (vec_t *) & AO[0];
473 __vector_pair rowB;
474 rowB = *((__vector_pair *)((void *)&BO[0]));
475 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
476 for (l = 1; l < temp; l++)
477 {
478 rowA = (vec_t *) & AO[l << 1];
479 rowB = *((__vector_pair *)((void *)&BO[l << 2]));
480 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
481 }
482 SAVE_ACC (&acc0, 0);
483 CO += 2;
484 AO += temp << 1;
485 BO += temp << 2;
486 #if defined(TRMMKERNEL)
487 REFRESH_AFTER_SAVE (2, 4)
488 #endif
489 }
490 if (m & 1)
491 {
492 FLOAT *BO;
493 #if defined(TRMMKERNEL)
494 REFRESH_POINTERS (1, 4);
495 #else
496 BO = B;
497 temp = k;
498 #endif
499 BLASLONG l = 0;
500 v4sf_t t = { 0, 0 };
501 v4sf_t t1 = { 0, 0 };
502 for (l = 0; l < temp; l++)
503 {
504 v4sf_t rowA = { AO[l], AO[l] };
505 v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
506 v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
507 t += rowA * rowB;
508 t1 += rowA * rowB1;
509 }
510 t = t * valpha;
511 t1 = t1 * valpha;
512 #if defined(TRMMKERNEL)
513 CO[0 * ldc] = t[0];
514 CO[1 * ldc] = t[1];
515 CO[2 * ldc] = t1[0];
516 CO[3 * ldc] = t1[1];
517 #else
518 CO[0 * ldc] += t[0];
519 CO[1 * ldc] += t[1];
520 CO[2 * ldc] += t1[0];
521 CO[3 * ldc] += t1[1];
522 #endif
523 CO += 1;
524 AO += temp;
525 BO += temp << 2;
526 #if defined(TRMMKERNEL)
527 REFRESH_AFTER_SAVE (1, 4)
528 #endif
529 }
530 #if defined(TRMMKERNEL) && !defined(LEFT)
531 off += 4; // number of values in A
532 #endif
533 B += k << 2;
534 }
535 if (n & 2)
536 {
537 BLASLONG j, temp;
538 #if defined(TRMMKERNEL) && defined(LEFT)
539 off = offset;
540 #endif
541 FLOAT *CO;
542 FLOAT *AO;
543 CO = C;
544 C += ldc << 1;
545 AO = A;
546 for (j = 0; j < (m >> 3); j++)
547 {
548 FLOAT *BO;
549 #if defined(TRMMKERNEL)
550 REFRESH_POINTERS (8, 2);
551 #else
552 BO = B;
553 temp = k;
554 #endif
555 v4sf_t *rowC;
556 v4sf_t result[4];
557 __vector_quad acc0, acc1, acc2, acc3;
558 BLASLONG l = 0;
559 __vector_pair rowB;
560 vec_t *rb = (vec_t *) & BO[0];
561 __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
562 vec_t *rowA = (vec_t *) & AO[0];
563 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
564 __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
565 __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
566 __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
567 for (l = 1; l < temp; l++)
568 {
569 rb = (vec_t *) & BO[l << 1];
570 __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
571 rowA = (vec_t *) & AO[l << 3];
572 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
573 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
574 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
575 __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
576 }
577 SAVE2x4_ACC (&acc0, 0);
578 SAVE2x4_ACC (&acc1, 2);
579 SAVE2x4_ACC (&acc2, 4);
580 SAVE2x4_ACC (&acc3, 6);
581 CO += 8;
582 AO += temp << 3;
583 BO += temp << 1;
584 #if defined(TRMMKERNEL)
585 REFRESH_AFTER_SAVE (8, 2)
586 #endif
587 }
588 if (m & 4)
589 {
590 FLOAT *BO;
591 #if defined(TRMMKERNEL)
592 REFRESH_POINTERS (4, 2);
593 #else
594 BO = B;
595 temp = k;
596 #endif
597 v4sf_t *rowC;
598 v4sf_t result[4];
599 __vector_quad acc0, acc1;
600 BLASLONG l = 0;
601 __vector_pair rowB;
602 vec_t *rb = (vec_t *) & BO[0];
603 __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
604 vec_t *rowA = (vec_t *) & AO[0];
605 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
606 __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
607 for (l = 1; l < temp; l++)
608 {
609 rb = (vec_t *) & BO[l << 1];
610 __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
611 rowA = (vec_t *) & AO[l << 2];
612 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
613 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
614 }
615 SAVE2x4_ACC (&acc0, 0);
616 SAVE2x4_ACC (&acc1, 2);
617 CO += 4;
618 AO += temp << 2;
619 BO += temp << 1;
620 #if defined(TRMMKERNEL)
621 REFRESH_AFTER_SAVE (4, 2)
622 #endif
623 }
624 if (m & 2)
625 {
626 FLOAT *BO;
627 #if defined(TRMMKERNEL)
628 REFRESH_POINTERS (2, 2);
629 #else
630 BO = B;
631 temp = k;
632 #endif
633 v4sf_t *rowC;
634 v4sf_t result[4];
635 __vector_quad acc0;
636 BLASLONG l = 0;
637 __vector_pair rowB;
638 vec_t *rb = (vec_t *) & BO[0];
639 __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
640 vec_t *rowA = (vec_t *) & AO[0];
641 __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
642 for (l = 1; l < temp; l++)
643 {
644 rb = (vec_t *) & BO[l << 1];
645 __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
646 rowA = (vec_t *) & AO[l << 1];
647 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
648 }
649 SAVE2x4_ACC (&acc0, 0);
650 CO += 2;
651 AO += temp << 1;
652 BO += temp << 1;
653 #if defined(TRMMKERNEL)
654 REFRESH_AFTER_SAVE (2, 2)
655 #endif
656 }
657 if (m & 1)
658 {
659 FLOAT *BO;
660 #if defined(TRMMKERNEL)
661 REFRESH_POINTERS (1, 2);
662 #else
663 BO = B;
664 temp = k;
665 #endif
666 BLASLONG l = 0;
667 v4sf_t t = { 0, 0 };
668 for (l = 0; l < temp; l++)
669 {
670 v4sf_t rowA = { AO[l], AO[l] };
671 v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
672 t += rowA * rowB;
673 }
674 t = t * valpha;
675 #if defined(TRMMKERNEL)
676 CO[0 * ldc] = t[0];
677 CO[1 * ldc] = t[1];
678 #else
679 CO[0 * ldc] += t[0];
680 CO[1 * ldc] += t[1];
681 #endif
682 CO += 1;
683 AO += temp;
684 BO += temp << 1;
685 #if defined(TRMMKERNEL)
686 REFRESH_AFTER_SAVE (1, 2)
687 #endif
688 }
689 #if defined(TRMMKERNEL) && !defined(LEFT)
690 off += 2; // number of values in A
691 #endif
692 B += k << 1;
693 }
694 if (n & 1)
695 {
696 BLASLONG i, temp;
697 #if defined(TRMMKERNEL) && defined(LEFT)
698 off = offset;
699 #endif
700 FLOAT *CO;
701 FLOAT *AO;
702 CO = C;
703 C += ldc;
704 AO = A;
705 for (i = 0; i < (m >> 3); i++)
706 {
707 FLOAT *BO;
708 #if defined(TRMMKERNEL)
709 REFRESH_POINTERS (8, 1)
710 #else
711 BO = B;
712 temp = k;
713 #endif
714 BLASLONG l = 0;
715 v4sf_t t = { 0, 0 };
716 v4sf_t t1 = { 0, 0 };
717 v4sf_t t2 = { 0, 0 };
718 v4sf_t t3 = { 0, 0 };
719 for (l = 0; l < temp; l++)
720 {
721 v4sf_t rowB = { BO[l], BO[l] };
722 v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
723 v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
724 v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
725 v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
726 t += rowA * rowB;
727 t1 += rowA1 * rowB;
728 t2 += rowA2 * rowB;
729 t3 += rowA3 * rowB;
730 }
731 t = t * valpha;
732 t1 = t1 * valpha;
733 t2 = t2 * valpha;
734 t3 = t3 * valpha;
735 #if defined(TRMMKERNEL)
736 CO[0] = t[0];
737 CO[1] = t[1];
738 CO[2] = t1[0];
739 CO[3] = t1[1];
740 CO[4] = t2[0];
741 CO[5] = t2[1];
742 CO[6] = t3[0];
743 CO[7] = t3[1];
744 #else
745 CO[0] += t[0];
746 CO[1] += t[1];
747 CO[2] += t1[0];
748 CO[3] += t1[1];
749 CO[4] += t2[0];
750 CO[5] += t2[1];
751 CO[6] += t3[0];
752 CO[7] += t3[1];
753 #endif
754 AO += temp << 3;
755 BO += temp;
756 CO += 8;
757 #if defined(TRMMKERNEL)
758 REFRESH_AFTER_SAVE (8, 1)
759 #endif
760 }
761 if (m & 4)
762 {
763 FLOAT *BO;
764 #if defined(TRMMKERNEL)
765 REFRESH_POINTERS (4, 1)
766 #else
767 BO = B;
768 temp = k;
769 #endif
770 BLASLONG l = 0;
771 v4sf_t t = { 0, 0 };
772 v4sf_t t1 = { 0, 0 };
773 for (l = 0; l < temp; l++)
774 {
775 v4sf_t rowB = { BO[l], BO[l] };
776 v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
777 v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
778 t += rowA * rowB;
779 t1 += rowA1 * rowB;
780 }
781 t = t * valpha;
782 t1 = t1 * valpha;
783 #if defined(TRMMKERNEL)
784 CO[0] = t[0];
785 CO[1] = t[1];
786 CO[2] = t1[0];
787 CO[3] = t1[1];
788 #else
789 CO[0] += t[0];
790 CO[1] += t[1];
791 CO[2] += t1[0];
792 CO[3] += t1[1];
793 #endif
794 AO += temp << 2;
795 BO += temp;
796 CO += 4;
797 #if defined(TRMMKERNEL)
798 REFRESH_AFTER_SAVE (4, 1)
799 #endif
800 }
801 if (m & 2)
802 {
803 FLOAT *BO;
804 #if defined(TRMMKERNEL)
805 REFRESH_POINTERS (2, 1)
806 #else
807 BO = B;
808 temp = k;
809 #endif
810 BLASLONG l = 0;
811 v4sf_t t = { 0, 0 };
812 for (l = 0; l < temp; l++)
813 {
814 v4sf_t rowB = { BO[l], BO[l] };
815 v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
816 t += rowA * rowB;
817 }
818 t = t * valpha;
819 #if defined(TRMMKERNEL)
820 CO[0] = t[0];
821 CO[1] = t[1];
822 #else
823 CO[0] += t[0];
824 CO[1] += t[1];
825 #endif
826 AO += temp << 1;
827 BO += temp;
828 CO += 2;
829 #if defined(TRMMKERNEL)
830 REFRESH_AFTER_SAVE (2, 1)
831 #endif
832 }
833 if (m & 1)
834 {
835 FLOAT *BO;
836 #if defined(TRMMKERNEL)
837 REFRESH_POINTERS (1, 1)
838 #else
839 BO = B;
840 temp = k;
841 #endif
842 BLASLONG l = 0;
843 FLOAT t = 0;
844 for (l = 0; l < temp; l++)
845 {
846 t += AO[l] * BO[l];
847 }
848 AO += temp;
849 BO += temp;
850 #if defined(TRMMKERNEL)
851 CO[0] = t * alpha;
852 #else
853 CO[0] += t * alpha;
854 #endif
855 CO += 1;
856 #if defined(TRMMKERNEL)
857 REFRESH_AFTER_SAVE (1, 1)
858 #endif
859 }
860 #if defined(TRMMKERNEL) && !defined(LEFT)
861 off += 1; // number of values in A
862 #endif
863 B += k;
864 }
865 return 0;
866 }
867