/*********************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
27 #include "common.h"
28 #include <altivec.h>
29 
30 typedef __vector unsigned char  vec_t;
31 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
32 #if !__has_builtin(__builtin_vsx_assemble_pair)
33 #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
34 #endif
35 
36 #if !__has_builtin(__builtin_vsx_disassemble_pair)
37 #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
38 #endif
39 
/*
 * Write-back helpers for MMA accumulator tiles.
 *
 * Each SAVE* macro unpacks the accumulator pointed to by ACC into result[],
 * scales every unpacked row vector by alpha and stores it into the current
 * C tile at column offset J.  The following names must be in scope at the
 * expansion site: result (v4sf_t[4]), rowC (v4sf_t *), CO (FLOAT *), ldc and
 * alpha.
 *
 *   SAVE_ACC     - result[0..3] -> tile rows 0..3
 *   SAVE_ACC1    - result[0..3] -> tile rows 4..7
 *   SAVE2x4_ACC  - result[0..1] -> tile rows 0..1
 *
 * TRMM builds overwrite C; every other build accumulates into it.  That is
 * the only difference between the two variants, so it is captured once in
 * SAVE_ACC_STORE instead of duplicating all three macros.
 *
 * The macros are wrapped in do { } while (0) so that each expands to a single
 * statement and stays correct under an unbraced if/else.
 */
#ifdef TRMMKERNEL
#define SAVE_ACC_STORE =
#else
#define SAVE_ACC_STORE +=
#endif

/* Store result[IDX] * alpha at row ROW, column J of the current C tile. */
#define SAVE_ACC_ROW(ROW, J, IDX) \
	  rowC = (v4sf_t *) &CO[(ROW) * ldc + (J)]; \
	  rowC[0] SAVE_ACC_STORE result[(IDX)] * alpha

#define SAVE_ACC(ACC, J)  \
	do { \
	  __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
	  SAVE_ACC_ROW (0, J, 0); \
	  SAVE_ACC_ROW (1, J, 1); \
	  SAVE_ACC_ROW (2, J, 2); \
	  SAVE_ACC_ROW (3, J, 3); \
	} while (0)

#define SAVE_ACC1(ACC, J)  \
	do { \
	  __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
	  SAVE_ACC_ROW (4, J, 0); \
	  SAVE_ACC_ROW (5, J, 1); \
	  SAVE_ACC_ROW (6, J, 2); \
	  SAVE_ACC_ROW (7, J, 3); \
	} while (0)

#define SAVE2x4_ACC(ACC, J)  \
	do { \
	  __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
	  SAVE_ACC_ROW (0, J, 0); \
	  SAVE_ACC_ROW (1, J, 1); \
	} while (0)
95 
/*
 * Issue a dcbt cache-touch hint for the address x + y.
 *
 * dcbt RA,RB computes its address as (RA|0) + (RB): an RA field of 0 means
 * the literal value zero, not the contents of r0.  The base operand x is
 * therefore given the "b" constraint (base register, never r0) while the
 * index y may live in any general register.
 *
 * Expands to a single expression statement without a trailing semicolon;
 * the caller supplies it.
 */
#define PREFETCH1(x, y) \
	__asm__ __volatile__ ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory")
97 
/*
 * TRMM bookkeeping performed before a tile of x-by-y values is computed.
 * These macros read and write the locals temp, off, k, AO, BO and B of the
 * kernel function and each expands to complete, ';'-terminated statements
 * (call sites may or may not add their own semicolon).
 *
 * REFRESH_TEMP_BK(x, y)  - set temp, the number of inner-product steps for
 *                          the tile, from k, off and the tile dimensions.
 * REFRESH_POINTERS(x, y) - reset BO to the start of the B panel (optionally
 *                          skipping the first off steps of both panels) and
 *                          then refresh temp.
 *
 * Macro parameters are parenthesized so expression arguments keep their
 * intended precedence inside the products.
 */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
#define REFRESH_TEMP_BK(x, y) \
            temp = k - off;
#elif defined(LEFT)
#define REFRESH_TEMP_BK(x, y) \
            temp = off + (x);
#else
#define REFRESH_TEMP_BK(x, y) \
            temp = off + (y);
#endif
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_POINTERS(x, y) \
          BO = B; \
          REFRESH_TEMP_BK(x, y)
#else
#define REFRESH_POINTERS(x, y) \
          AO += off * (x); \
          BO = B + off * (y); \
          REFRESH_TEMP_BK(x, y)
#endif
118 
/*
 * TRMM bookkeeping performed after a tile of x-by-y values has been stored.
 * These macros read and write the locals temp, off, k, AO and BO of the
 * kernel function and each expands to zero or more complete, ';'-terminated
 * statements.
 *
 * REFRESH_OFF(x)               - LEFT builds: advance off by the tile height.
 * UPDATE_TEMP(x, y)            - subtract the tile dimension on the
 *                                triangular side from temp.
 * REFRESH_TMP_AFTER_SAVE(x, y) - in the configurations that did not skip the
 *                                leading steps up front, step AO and BO over
 *                                the remaining part of their panels.
 * REFRESH_AFTER_SAVE(x, y)     - both of the above, in that order.
 *
 * Macro parameters are parenthesized so that expression arguments such as
 * "1 + 1" scale the pointer advance correctly.
 */
#ifdef LEFT
#define REFRESH_OFF(x) \
            off += (x);
#else
#define REFRESH_OFF(x)
#endif

#ifdef LEFT
#define UPDATE_TEMP(x, y) \
            temp -= (x);
#else
#define UPDATE_TEMP(x, y) \
            temp -= (y);
#endif

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
#define REFRESH_TMP_AFTER_SAVE(x, y) \
            temp = k - off; \
            UPDATE_TEMP(x, y) \
            AO += temp * (x); \
            BO += temp * (y);
#else
#define REFRESH_TMP_AFTER_SAVE(x, y)
#endif

#define REFRESH_AFTER_SAVE(x,y) \
        REFRESH_TMP_AFTER_SAVE(x, y) \
        REFRESH_OFF(x)
147 /*************************************************************************************
148 * GEMM Kernel
149 *************************************************************************************/
150 int
CNAME(BLASLONG m,BLASLONG n,BLASLONG k,FLOAT alpha,FLOAT * A,FLOAT * B,FLOAT * C,BLASLONG ldc,BLASLONG offset)151 CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
152        FLOAT * C, BLASLONG ldc
153 #ifdef TRMMKERNEL
154        , BLASLONG offset
155 #endif
156   )
157 {
158   BLASLONG i1;
159 #if defined(TRMMKERNEL)
160   BLASLONG off;
161 #endif
162 #if defined(TRMMKERNEL) && !defined(LEFT)
163   off = -offset;
164 #endif
165   v4sf_t valpha = { alpha, alpha };
166   for (i1 = 0; i1 < (n >> 3); i1++)
167     {
168       BLASLONG j, temp;
169       FLOAT *CO;
170       FLOAT *AO;
171 #if defined(TRMMKERNEL) && defined(LEFT)
172       off = offset;
173 #endif
174       CO = C;
175       C += ldc << 3;
176       AO = A;
177       PREFETCH1 (A, 128);
178       PREFETCH1 (A, 256);
179       for (j = 0; j < (m >> 3); j++)
180 	{
181 	  FLOAT *BO;
182 #if defined(TRMMKERNEL)
183           REFRESH_POINTERS (8, 8);
184 #else
185           BO = B;
186           temp = k;
187 #endif
188 	  v4sf_t *rowC;
189 	  v4sf_t result[4];
190 	  __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
191 	  BLASLONG l = 0;
192 	  vec_t *rowA = (vec_t *) & AO[0];
193 	  __vector_pair rowB, rowB1;
194 	  rowB = *((__vector_pair *)((void *)&BO[0]));
195 	  rowB1 = *((__vector_pair *)((void *)&BO[4]));
196 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
197 	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
198 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
199 	  __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
200 	  __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
201 	  __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
202 	  __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
203 	  __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
204 	  for (l = 1; l < temp; l++)
205 	    {
206 	      rowA = (vec_t *) & AO[l << 3];
207 	      rowB = *((__vector_pair *)((void *)&BO[l << 3]));
208 	      rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
209 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
210 	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
211 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
212 	      __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
213 	      __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
214 	      __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
215 	      __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
216 	      __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
217 	    }
218 	  SAVE_ACC (&acc0, 0);
219 	  SAVE_ACC1 (&acc1, 0);
220 	  SAVE_ACC (&acc2, 2);
221 	  SAVE_ACC1 (&acc3, 2);
222 	  SAVE_ACC (&acc4, 4);
223 	  SAVE_ACC1 (&acc5, 4);
224 	  SAVE_ACC (&acc6, 6);
225 	  SAVE_ACC1 (&acc7, 6);
226 	  CO += 8;
227 	  AO += temp << 3;
228 	  BO += temp << 3;
229 #if defined(TRMMKERNEL)
230           REFRESH_AFTER_SAVE (8, 8)
231 #endif
232 	}
233       if (m & 4)
234 	{
235 	  FLOAT *BO;
236 #if defined(TRMMKERNEL)
237           REFRESH_POINTERS (4, 8);
238 #else
239           BO = B;
240           temp = k;
241 #endif
242 	  v4sf_t *rowC;
243 	  v4sf_t result[4];
244 	  __vector_quad acc0, acc1, acc2, acc3;
245 	  BLASLONG l = 0;
246 	  vec_t *rowA = (vec_t *) & AO[0];
247 	  __vector_pair rowB, rowB1;
248 	  rowB = *((__vector_pair *)((void *)&BO[0]));
249 	  rowB1 = *((__vector_pair *)((void *)&BO[4]));
250 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
251 	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
252 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
253 	  __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
254 	  for (l = 1; l < temp; l++)
255 	    {
256 	      rowA = (vec_t *) & AO[l << 2];
257 	      rowB = *((__vector_pair *)((void *)&BO[l << 3]));
258 	      rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
259 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
260 	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
261 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
262 	      __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
263 	    }
264 	  SAVE_ACC (&acc0, 0);
265 	  SAVE_ACC1 (&acc1, 0);
266 	  SAVE_ACC (&acc2, 2);
267 	  SAVE_ACC1 (&acc3, 2);
268 	  CO += 4;
269 	  AO += temp << 2;
270 	  BO += temp << 3;
271 #if defined(TRMMKERNEL)
272           REFRESH_AFTER_SAVE (4, 8)
273 #endif
274 	}
275       if (m & 2)
276 	{
277 	  FLOAT *BO;
278 #if defined(TRMMKERNEL)
279           REFRESH_POINTERS (2, 8);
280 #else
281           BO = B;
282           temp = k;
283 #endif
284 	  v4sf_t *rowC;
285 	  v4sf_t result[4];
286 	  __vector_quad acc0, acc1;
287 	  BLASLONG l = 0;
288 	  vec_t *rowA = (vec_t *) & AO[0];
289 	  __vector_pair rowB, rowB1;
290 	  rowB = *((__vector_pair *)((void *)&BO[0]));
291 	  rowB1 = *((__vector_pair *)((void *)&BO[4]));
292 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
293 	  __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
294 	  for (l = 1; l < temp; l++)
295 	    {
296 	      rowA = (vec_t *) & AO[l << 1];
297 	      rowB = *((__vector_pair *)((void *)&BO[l << 3]));
298 	      rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
299 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
300 	      __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
301 	    }
302 	  SAVE_ACC (&acc0, 0);
303 	  SAVE_ACC1 (&acc1, 0);
304 	  CO += 2;
305 	  AO += temp << 1;
306 	  BO += temp << 3;
307 #if defined(TRMMKERNEL)
308           REFRESH_AFTER_SAVE (2, 8)
309 #endif
310 	}
311       if (m & 1)
312 	{
313 	  FLOAT *BO;
314 #if defined(TRMMKERNEL)
315           REFRESH_POINTERS (1, 8);
316 #else
317           BO = B;
318           temp = k;
319 #endif
320 	  BLASLONG l = 0;
321 	  v4sf_t t = { 0, 0 };
322 	  v4sf_t t1 = { 0, 0 };
323 	  v4sf_t t2 = { 0, 0 };
324 	  v4sf_t t3 = { 0, 0 };
325 	  for (l = 0; l < temp; l++)
326 	    {
327 	      v4sf_t rowA = { AO[l], AO[l] };
328 	      v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
329 	      v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
330 	      v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
331 	      v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
332 	      t += rowA * rowB;
333 	      t1 += rowA * rowB1;
334 	      t2 += rowA * rowB2;
335 	      t3 += rowA * rowB3;
336 	    }
337 	  t = t * valpha;
338 	  t1 = t1 * valpha;
339 	  t2 = t2 * valpha;
340 	  t3 = t3 * valpha;
341 #if defined(TRMMKERNEL)
342 	  CO[0 * ldc] = t[0];
343 	  CO[1 * ldc] = t[1];
344 	  CO[2 * ldc] = t1[0];
345 	  CO[3 * ldc] = t1[1];
346 	  CO[4 * ldc] = t2[0];
347 	  CO[5 * ldc] = t2[1];
348 	  CO[6 * ldc] = t3[0];
349 	  CO[7 * ldc] = t3[1];
350 #else
351 	  CO[0 * ldc] += t[0];
352 	  CO[1 * ldc] += t[1];
353 	  CO[2 * ldc] += t1[0];
354 	  CO[3 * ldc] += t1[1];
355 	  CO[4 * ldc] += t2[0];
356 	  CO[5 * ldc] += t2[1];
357 	  CO[6 * ldc] += t3[0];
358 	  CO[7 * ldc] += t3[1];
359 #endif
360 	  CO += 1;
361 	  AO += temp;
362 	  BO += temp << 3;
363 #if defined(TRMMKERNEL)
364           REFRESH_AFTER_SAVE (1, 8)
365 #endif
366 	}
367 #if defined(TRMMKERNEL) && !defined(LEFT)
368       off += 8;                 // number of values in A
369 #endif
370       B += k << 3;
371     }
372   if (n & 4)
373     {
374       BLASLONG j, temp;
375       FLOAT *CO;
376       FLOAT *AO;
377 #if defined(TRMMKERNEL) && defined(LEFT)
378       off = offset;
379 #endif
380       CO = C;
381       C += ldc << 2;
382       AO = A;
383       PREFETCH1 (A, 128);
384       PREFETCH1 (A, 256);
385       for (j = 0; j < (m >> 3); j++)
386 	{
387 	  FLOAT *BO;
388 #if defined(TRMMKERNEL)
389           REFRESH_POINTERS (8, 4);
390 #else
391           BO = B;
392           temp = k;
393 #endif
394 	  v4sf_t *rowC;
395 	  v4sf_t result[4];
396 	  __vector_quad acc0, acc1, acc2, acc3;
397 	  BLASLONG l = 0;
398 	  vec_t *rowA = (vec_t *) & AO[0];
399 	  __vector_pair rowB;
400 	  rowB = *((__vector_pair *)((void *)&BO[0]));
401 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
402 	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
403 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
404 	  __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
405 	  for (l = 1; l < temp; l++)
406 	    {
407 	      rowA = (vec_t *) & AO[l << 3];
408 	      rowB = *((__vector_pair *)((void *)&BO[l << 2]));
409 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
410 	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
411 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
412 	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
413 	    }
414 	  SAVE_ACC (&acc0, 0);
415 	  SAVE_ACC (&acc2, 4);
416 	  SAVE_ACC (&acc1, 2);
417 	  SAVE_ACC (&acc3, 6);
418 	  CO += 8;
419 	  AO += temp << 3;
420 	  BO += temp << 2;
421 #if defined(TRMMKERNEL)
422           REFRESH_AFTER_SAVE (8, 4)
423 #endif
424 	}
425       if (m & 4)
426 	{
427 	  FLOAT *BO;
428 #if defined(TRMMKERNEL)
429           REFRESH_POINTERS (4, 4);
430 #else
431           BO = B;
432           temp = k;
433 #endif
434 	  v4sf_t *rowC;
435 	  v4sf_t result[4];
436 	  __vector_quad acc0, acc1;
437 	  BLASLONG l = 0;
438 	  vec_t *rowA = (vec_t *) & AO[0];
439 	  __vector_pair rowB;
440 	  rowB = *((__vector_pair *)((void *)&BO[0]));
441 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
442 	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
443 	  for (l = 1; l < temp; l++)
444 	    {
445 	      rowA = (vec_t *) & AO[l << 2];
446 	      rowB = *((__vector_pair *)((void *)&BO[l << 2]));
447 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
448 	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
449 	    }
450 	  SAVE_ACC (&acc0, 0);
451 	  SAVE_ACC (&acc1, 2);
452 	  CO += 4;
453 	  AO += temp << 2;
454 	  BO += temp << 2;
455 #if defined(TRMMKERNEL)
456           REFRESH_AFTER_SAVE (4, 4)
457 #endif
458 	}
459       if (m & 2)
460 	{
461 	  FLOAT *BO;
462 #if defined(TRMMKERNEL)
463           REFRESH_POINTERS (2, 4);
464 #else
465           BO = B;
466           temp = k;
467 #endif
468 	  v4sf_t *rowC;
469 	  v4sf_t result[4];
470 	  __vector_quad acc0;
471 	  BLASLONG l = 0;
472 	  vec_t *rowA = (vec_t *) & AO[0];
473 	  __vector_pair rowB;
474 	  rowB = *((__vector_pair *)((void *)&BO[0]));
475 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
476 	  for (l = 1; l < temp; l++)
477 	    {
478 	      rowA = (vec_t *) & AO[l << 1];
479 	      rowB = *((__vector_pair *)((void *)&BO[l << 2]));
480 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
481 	    }
482 	  SAVE_ACC (&acc0, 0);
483 	  CO += 2;
484 	  AO += temp << 1;
485 	  BO += temp << 2;
486 #if defined(TRMMKERNEL)
487           REFRESH_AFTER_SAVE (2, 4)
488 #endif
489 	}
490       if (m & 1)
491 	{
492 	  FLOAT *BO;
493 #if defined(TRMMKERNEL)
494           REFRESH_POINTERS (1, 4);
495 #else
496           BO = B;
497           temp = k;
498 #endif
499 	  BLASLONG l = 0;
500 	  v4sf_t t = { 0, 0 };
501 	  v4sf_t t1 = { 0, 0 };
502 	  for (l = 0; l < temp; l++)
503 	    {
504 	      v4sf_t rowA = { AO[l], AO[l] };
505 	      v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
506 	      v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
507 	      t += rowA * rowB;
508 	      t1 += rowA * rowB1;
509 	    }
510 	  t = t * valpha;
511 	  t1 = t1 * valpha;
512 #if defined(TRMMKERNEL)
513 	  CO[0 * ldc] = t[0];
514 	  CO[1 * ldc] = t[1];
515 	  CO[2 * ldc] = t1[0];
516 	  CO[3 * ldc] = t1[1];
517 #else
518 	  CO[0 * ldc] += t[0];
519 	  CO[1 * ldc] += t[1];
520 	  CO[2 * ldc] += t1[0];
521 	  CO[3 * ldc] += t1[1];
522 #endif
523 	  CO += 1;
524 	  AO += temp;
525 	  BO += temp << 2;
526 #if defined(TRMMKERNEL)
527           REFRESH_AFTER_SAVE (1, 4)
528 #endif
529 	}
530 #if defined(TRMMKERNEL) && !defined(LEFT)
531       off += 4;                 // number of values in A
532 #endif
533       B += k << 2;
534     }
535   if (n & 2)
536     {
537       BLASLONG j, temp;
538 #if defined(TRMMKERNEL) && defined(LEFT)
539       off = offset;
540 #endif
541       FLOAT *CO;
542       FLOAT *AO;
543       CO = C;
544       C += ldc << 1;
545       AO = A;
546       for (j = 0; j < (m >> 3); j++)
547 	{
548 	  FLOAT *BO;
549 #if defined(TRMMKERNEL)
550           REFRESH_POINTERS (8, 2);
551 #else
552           BO = B;
553           temp = k;
554 #endif
555 	  v4sf_t *rowC;
556 	  v4sf_t result[4];
557 	  __vector_quad acc0, acc1, acc2, acc3;
558 	  BLASLONG l = 0;
559 	  __vector_pair rowB;
560 	  vec_t *rb = (vec_t *) & BO[0];
561 	  __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
562 	  vec_t *rowA = (vec_t *) & AO[0];
563 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
564 	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
565 	  __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
566 	  __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
567 	  for (l = 1; l < temp; l++)
568 	    {
569 	      rb = (vec_t *) & BO[l << 1];
570 	      __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
571 	      rowA = (vec_t *) & AO[l << 3];
572 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
573 	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
574 	      __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
575 	      __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
576 	    }
577 	  SAVE2x4_ACC (&acc0, 0);
578 	  SAVE2x4_ACC (&acc1, 2);
579 	  SAVE2x4_ACC (&acc2, 4);
580 	  SAVE2x4_ACC (&acc3, 6);
581 	  CO += 8;
582 	  AO += temp << 3;
583 	  BO += temp << 1;
584 #if defined(TRMMKERNEL)
585           REFRESH_AFTER_SAVE (8, 2)
586 #endif
587 	}
588       if (m & 4)
589 	{
590 	  FLOAT *BO;
591 #if defined(TRMMKERNEL)
592           REFRESH_POINTERS (4, 2);
593 #else
594           BO = B;
595           temp = k;
596 #endif
597 	  v4sf_t *rowC;
598 	  v4sf_t result[4];
599 	  __vector_quad acc0, acc1;
600 	  BLASLONG l = 0;
601 	  __vector_pair rowB;
602 	  vec_t *rb = (vec_t *) & BO[0];
603 	  __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
604 	  vec_t *rowA = (vec_t *) & AO[0];
605 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
606 	  __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
607 	  for (l = 1; l < temp; l++)
608 	    {
609 	      rb = (vec_t *) & BO[l << 1];
610 	      __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
611 	      rowA = (vec_t *) & AO[l << 2];
612 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
613 	      __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
614 	    }
615 	  SAVE2x4_ACC (&acc0, 0);
616 	  SAVE2x4_ACC (&acc1, 2);
617 	  CO += 4;
618 	  AO += temp << 2;
619 	  BO += temp << 1;
620 #if defined(TRMMKERNEL)
621           REFRESH_AFTER_SAVE (4, 2)
622 #endif
623 	}
624       if (m & 2)
625 	{
626 	  FLOAT *BO;
627 #if defined(TRMMKERNEL)
628           REFRESH_POINTERS (2, 2);
629 #else
630           BO = B;
631           temp = k;
632 #endif
633 	  v4sf_t *rowC;
634 	  v4sf_t result[4];
635 	  __vector_quad acc0;
636 	  BLASLONG l = 0;
637 	  __vector_pair rowB;
638 	  vec_t *rb = (vec_t *) & BO[0];
639 	  __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
640 	  vec_t *rowA = (vec_t *) & AO[0];
641 	  __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
642 	  for (l = 1; l < temp; l++)
643 	    {
644 	      rb = (vec_t *) & BO[l << 1];
645 	      __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
646 	      rowA = (vec_t *) & AO[l << 1];
647 	      __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
648 	    }
649 	  SAVE2x4_ACC (&acc0, 0);
650 	  CO += 2;
651 	  AO += temp << 1;
652 	  BO += temp << 1;
653 #if defined(TRMMKERNEL)
654           REFRESH_AFTER_SAVE (2, 2)
655 #endif
656 	}
657       if (m & 1)
658 	{
659 	  FLOAT *BO;
660 #if defined(TRMMKERNEL)
661           REFRESH_POINTERS (1, 2);
662 #else
663           BO = B;
664           temp = k;
665 #endif
666 	  BLASLONG l = 0;
667 	  v4sf_t t = { 0, 0 };
668 	  for (l = 0; l < temp; l++)
669 	    {
670 	      v4sf_t rowA = { AO[l], AO[l] };
671 	      v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
672 	      t += rowA * rowB;
673 	    }
674 	  t = t * valpha;
675 #if defined(TRMMKERNEL)
676 	  CO[0 * ldc] = t[0];
677 	  CO[1 * ldc] = t[1];
678 #else
679 	  CO[0 * ldc] += t[0];
680 	  CO[1 * ldc] += t[1];
681 #endif
682 	  CO += 1;
683 	  AO += temp;
684 	  BO += temp << 1;
685 #if defined(TRMMKERNEL)
686           REFRESH_AFTER_SAVE (1, 2)
687 #endif
688 	}
689 #if defined(TRMMKERNEL) && !defined(LEFT)
690       off += 2;                 // number of values in A
691 #endif
692       B += k << 1;
693     }
694   if (n & 1)
695     {
696       BLASLONG i, temp;
697 #if defined(TRMMKERNEL) && defined(LEFT)
698       off = offset;
699 #endif
700       FLOAT *CO;
701       FLOAT *AO;
702       CO = C;
703       C += ldc;
704       AO = A;
705       for (i = 0; i < (m >> 3); i++)
706 	{
707 	  FLOAT *BO;
708 #if defined(TRMMKERNEL)
709           REFRESH_POINTERS (8, 1)
710 #else
711           BO = B;
712           temp = k;
713 #endif
714 	  BLASLONG l = 0;
715 	  v4sf_t t = { 0, 0 };
716 	  v4sf_t t1 = { 0, 0 };
717 	  v4sf_t t2 = { 0, 0 };
718 	  v4sf_t t3 = { 0, 0 };
719 	  for (l = 0; l < temp; l++)
720 	    {
721 	      v4sf_t rowB = { BO[l], BO[l] };
722 	      v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
723 	      v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
724 	      v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
725 	      v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
726 	      t += rowA * rowB;
727 	      t1 += rowA1 * rowB;
728 	      t2 += rowA2 * rowB;
729 	      t3 += rowA3 * rowB;
730 	    }
731 	  t = t * valpha;
732 	  t1 = t1 * valpha;
733 	  t2 = t2 * valpha;
734 	  t3 = t3 * valpha;
735 #if defined(TRMMKERNEL)
736 	  CO[0] = t[0];
737 	  CO[1] = t[1];
738 	  CO[2] = t1[0];
739 	  CO[3] = t1[1];
740 	  CO[4] = t2[0];
741 	  CO[5] = t2[1];
742 	  CO[6] = t3[0];
743 	  CO[7] = t3[1];
744 #else
745 	  CO[0] += t[0];
746 	  CO[1] += t[1];
747 	  CO[2] += t1[0];
748 	  CO[3] += t1[1];
749 	  CO[4] += t2[0];
750 	  CO[5] += t2[1];
751 	  CO[6] += t3[0];
752 	  CO[7] += t3[1];
753 #endif
754 	  AO += temp << 3;
755 	  BO += temp;
756 	  CO += 8;
757 #if defined(TRMMKERNEL)
758           REFRESH_AFTER_SAVE (8, 1)
759 #endif
760 	}
761       if (m & 4)
762 	{
763 	  FLOAT *BO;
764 #if defined(TRMMKERNEL)
765           REFRESH_POINTERS (4, 1)
766 #else
767           BO = B;
768           temp = k;
769 #endif
770 	  BLASLONG l = 0;
771 	  v4sf_t t = { 0, 0 };
772 	  v4sf_t t1 = { 0, 0 };
773 	  for (l = 0; l < temp; l++)
774 	    {
775 	      v4sf_t rowB = { BO[l], BO[l] };
776 	      v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
777 	      v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
778 	      t += rowA * rowB;
779 	      t1 += rowA1 * rowB;
780 	    }
781 	  t = t * valpha;
782 	  t1 = t1 * valpha;
783 #if defined(TRMMKERNEL)
784 	  CO[0] = t[0];
785 	  CO[1] = t[1];
786 	  CO[2] = t1[0];
787 	  CO[3] = t1[1];
788 #else
789 	  CO[0] += t[0];
790 	  CO[1] += t[1];
791 	  CO[2] += t1[0];
792 	  CO[3] += t1[1];
793 #endif
794 	  AO += temp << 2;
795 	  BO += temp;
796 	  CO += 4;
797 #if defined(TRMMKERNEL)
798           REFRESH_AFTER_SAVE (4, 1)
799 #endif
800 	}
801       if (m & 2)
802 	{
803 	  FLOAT *BO;
804 #if defined(TRMMKERNEL)
805           REFRESH_POINTERS (2, 1)
806 #else
807           BO = B;
808           temp = k;
809 #endif
810 	  BLASLONG l = 0;
811 	  v4sf_t t = { 0, 0 };
812 	  for (l = 0; l < temp; l++)
813 	    {
814 	      v4sf_t rowB = { BO[l], BO[l] };
815 	      v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
816 	      t += rowA * rowB;
817 	    }
818 	  t = t * valpha;
819 #if defined(TRMMKERNEL)
820 	  CO[0] = t[0];
821 	  CO[1] = t[1];
822 #else
823 	  CO[0] += t[0];
824 	  CO[1] += t[1];
825 #endif
826 	  AO += temp << 1;
827 	  BO += temp;
828 	  CO += 2;
829 #if defined(TRMMKERNEL)
830           REFRESH_AFTER_SAVE (2, 1)
831 #endif
832 	}
833       if (m & 1)
834 	{
835 	  FLOAT *BO;
836 #if defined(TRMMKERNEL)
837           REFRESH_POINTERS (1, 1)
838 #else
839           BO = B;
840           temp = k;
841 #endif
842 	  BLASLONG l = 0;
843 	  FLOAT t = 0;
844 	  for (l = 0; l < temp; l++)
845 	    {
846 	      t += AO[l] * BO[l];
847 	    }
848 	  AO += temp;
849 	  BO += temp;
850 #if defined(TRMMKERNEL)
851 	  CO[0] = t * alpha;
852 #else
853 	  CO[0] += t * alpha;
854 #endif
855 	  CO += 1;
856 #if defined(TRMMKERNEL)
857           REFRESH_AFTER_SAVE (1, 1)
858 #endif
859 	}
860 #if defined(TRMMKERNEL) && !defined(LEFT)
861       off += 1;                 // number of values in A
862 #endif
863       B += k;
864     }
865   return 0;
866 }
867