1/***************************************************************************
2Copyright (c) 2021, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28#define ASSEMBLER
29
30#include "common.h"
31
/*
 * Register map for the kernel.
 *
 * Integer registers $r4-$r10 hold the incoming arguments (M, N, K, A, B, C,
 * LDC) -- presumably in the usual OpenBLAS GEMM-kernel argument order;
 * confirm against the caller.  LDC is scaled by BASE_SHIFT in the prologue.
 */
32#define M      $r4
33#define N      $r5
34#define K      $r6
35#define A      $r7
36#define B      $r8
37#define C      $r9
38#define LDC    $r10
/* AO / BO: running read pointers into A and B inside a tile. */
39#define AO     $r12
40#define BO     $r13
/* I: row-tile counter, J: column-panel counter, L: k-loop counter. */
41#define I      $r17
42#define J      $r18
43#define L      $r30
/* PREFETCHSIZE is not referenced in the part of the file reviewed here. */
44#define PREFETCHSIZE (4 * 10)
/* CO1..CO8: write pointers for the (up to) eight columns of C in a panel. */
45#define CO1    $r14
46#define CO2    $r15
47#define CO3    $r23
48#define CO4    $r24
49#define CO5    $r25
50#define CO6    $r26
51#define CO7    $r27
52#define CO8    $r28
/* BB: pointer into B used only as a preld (prefetch) address. */
53#define BB     $r29
54
/*
 * TRMM-only registers: OFFSET is an incoming value, KK is the running
 * offset derived from it, TEMP is scratch (trip counts / byte offsets).
 */
55#if defined(TRMMKERNEL)
56#define OFFSET $r11
57#define KK     $r20
58#define TEMP   $r16
59#endif
60
/* a1..a4: values loaded from A; b1..b8: values loaded from B. */
61#define a1     $f22
62#define a2     $f8
63#define a3     $f27
64#define a4     $f28
65#define b1     $f23
66#define b2     $f9
67#define b3     $f10
68#define b4     $f11
69#define b5     $f12
70#define b6     $f13
71#define b7     $f14
72#define b8     $f15
/* a5 is an alias of b8 ($f15), not a separate register. */
73#define a5     b8
/*
 * cXY: accumulator for column X (stored through CO<X>) and row Y of the
 * current tile, e.g. c11 -> CO1[0], c12 -> CO1[1], c82 -> CO8[1].
 */
74#define c11    $f16
75#define c12    $f17
76#define c21    $f3
77#define c22    $f1
78#define c31    $f2
79#define c32    $f4
80#define c41    $f5
81#define c42    $f6
82#define c51    $f7
83#define c52    $f18
84#define c61    $f19
85#define c62    $f20
86#define c71    $f21
87#define c72    $f24
88#define c81    $f25
89#define c82    $f26
/* ALPHA: scalar multiplied into every accumulator at write-back. */
90#define ALPHA  $f0
91
/*
 * Kernel entry: build a 160-byte stack frame, save the integer and FP
 * registers this kernel overwrites, scale LDC, and start the loop over
 * 8-column panels of C (J = N >> 3).  If there is no full 8-column panel,
 * jump straight to the 4-column handling at .L30.
 *
 * Frame layout (byte offsets from $sp):
 *     0..48   $r23-$r29          96       $r30
 *    56..88   $f24-$f28         104,112   $r20, $r16   (TRMMKERNEL only)
 *   120..144  $f18-$f21         (only when __64BIT__ is not defined)
 */
92   PROLOGUE
93
94   addi.d  $sp, $sp, -160
95   SDARG  $r23,  $sp,    0
96   SDARG  $r24,  $sp,    8
97   SDARG  $r25,  $sp,   16
98   SDARG  $r26,  $sp,   24
99   SDARG  $r27,  $sp,   32
100   SDARG  $r28,  $sp,   40
101   SDARG  $r29,  $sp,   48
102   SDARG  $r30,  $sp,   96
103   fst.d  $f24,  $sp,  56
104   fst.d  $f25,  $sp,  64
105   fst.d  $f26,  $sp,  72
106   fst.d  $f27,  $sp,  80
107   fst.d  $f28,  $sp,  88
108#if defined(TRMMKERNEL)
109   SDARG  $r20,  $sp,  104
110   SDARG  $r16,  $sp,  112
111#endif
112#ifndef __64BIT__
113   fst.d  $f18,  $sp, 120
114   fst.d  $f19,  $sp, 128
115   fst.d  $f20,  $sp, 136
116   fst.d  $f21,  $sp, 144
117#endif
/* LDC <<= BASE_SHIFT: presumably converts the leading dimension from
   elements to bytes (BASE_SHIFT comes from common.h -- confirm). */
118   slli.d     LDC,    LDC, BASE_SHIFT
119#if defined(TRMMKERNEL) && !defined(LEFT)
/* Right-side TRMM starts with KK = -OFFSET. */
120   sub.d   KK, $r0, OFFSET
121#endif
/* J = number of full 8-column panels; skip to .L30 when J <= 0. */
122   srai.d  J,  N, 3
123nop
124   bge $r0,    J, .L30
/*
 * .L10 -- start of one 8-column panel.
 * Sets CO1..CO8 to eight consecutive columns of C (each LDC bytes apart),
 * advances C past the panel, rewinds AO to the start of A, zeroes the
 * first accumulators, and computes I = M >> 1 (number of 2-row tiles).
 * BB = B + (K << (2 + BASE_SHIFT)) is used later only as a prefetch address.
 * Falls through to .L11, or jumps to .L20 when there is no 2-row tile.
 */
125.L10:
126   move    CO1, C
127   MTC  c11, $r0
128   add.d   CO2, C,      LDC
129   move    AO, A
130   add.d   CO3, CO2,    LDC
131   addi.d  J, J, -1
132   add.d   CO4, CO3,    LDC
133   MOV c21, c11
134   add.d   CO5, CO4,    LDC
135   MOV c31, c11
136   add.d   CO6, CO5,    LDC
137   MOV c41, c11
138   add.d   CO7, CO6,    LDC
139   MOV c51, c11
140   add.d   CO8, CO7,    LDC
141   srai.d  I,  M, 1
142   add.d   C,   CO8,    LDC
143   slli.d  BB, K, 2 + BASE_SHIFT
144   add.d   BB, B, BB
145#if defined(TRMMKERNEL) &&  defined(LEFT)
/* Left-side TRMM restarts KK from OFFSET for every column panel. */
146   move    KK, OFFSET
147#endif
148MOV    c61, c11
149   bge $r0,    I, .L20
/*
 * .L11 -- set up one 2-row x 8-column tile.
 * Clears the remaining accumulators (c11..c61 are already zero on entry),
 * preloads the first A and B operands, and computes the 4x-unrolled trip
 * count L.  A trip count <= 0 goes directly to the remainder loop at .L15.
 *
 * TRMMKERNEL build: depending on LEFT/TRANSA either BO = B, or AO and BO
 * are advanced by KK k-steps (2 and 8 elements per step respectively).
 * TEMP then holds the number of k-steps for this tile: K - KK, KK + 2 or
 * KK + 8.
 * Plain GEMM build: operands are read through B directly and BO = B is set
 * just before the loop; L = K >> 2.
 */
150.L11:
151#if defined(TRMMKERNEL)
152#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
153   move    BO,  B
154#else
155   slli.d  L,    KK, 1 + BASE_SHIFT
156   slli.d  TEMP, KK, 3 + BASE_SHIFT
157   add.d   AO, AO, L
158   add.d   BO, B,  TEMP
159#endif
160   LD a1,  AO,   0 * SIZE
161   MOV c71, c11
162   LD b1,  BO,   0 * SIZE
163   MOV c81, c11
164   LD a3,  AO,   4 * SIZE
165   MOV c12, c11
166   LD b2,  BO,   1 * SIZE
167   MOV c22, c11
168   MOV c32, c11
169   LD b3,  BO,   2 * SIZE
170   MOV c42, c11
171   LD b4,  BO,   3 * SIZE
172   MOV c52, c11
173   LD b5,  BO,   4 * SIZE
174   MOV c62, c11
175   LD b6,  BO,   8 * SIZE
176   MOV c72, c11
177   LD b7,  BO,  12 * SIZE
178   MOV c82, c11
179#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
180   sub.d   TEMP, K, KK
181#elif defined(LEFT)
182   addi.d  TEMP, KK, 2
183#else
184   addi.d  TEMP, KK, 8
185#endif
186   srai.d  L,  TEMP, 2
187   bge $r0,    L, .L15
188#else
189   LD a1,  AO,   0 * SIZE
190   MOV c71, c11
191   LD b1,  B,   0 * SIZE
192   MOV c81, c11
193   preld  1,  CO1,  3 * SIZE
194   preld  1,  CO2,  3 * SIZE
195   LD a3,  AO,   4 * SIZE
196   MOV c12, c11
197   LD b2,  B,   1 * SIZE
198   MOV c22, c11
199   srai.d  L,  K, 2
200   MOV c32, c11
201   LD b3,  B,   2 * SIZE
202   MOV c42, c11
203   LD b4,  B,   3 * SIZE
204   MOV c52, c11
205   LD b5,  B,   4 * SIZE
206   MOV c62, c11
207   LD b6,  B,   8 * SIZE
208   MOV c72, c11
209   LD b7,  B,  12 * SIZE
210   MOV c82, c11
211move   BO,  B
212   bge $r0,    L, .L15
213#endif
/*
 * Pipeline priming: issue the row-1 products of k-step 0 for columns 1..4
 * here; .L12/.L13 begin with the matching row-2 products.  When only one
 * unrolled pass remains (L - 1 <= 0) go to the non-looping copy at .L13.
 */
214   MADD  c11, b1, a1, c11
215   LD a2,  AO,   1 * SIZE
216   MADD  c21, b2, a1, c21
217   addi.d  L, L, -1
218   MADD  c31, b3, a1, c31
219   MADD  c41, b4, a1, c41
220   bge $r0,    L, .L13
221   preld  1,  CO3,  2 * SIZE
222   .align  3
/*
 * .L12 -- main k-loop of the 2x8 tile, unrolled four k-steps per pass.
 * One pass reads 8 elements of A (AO += 8 * SIZE) and 32 elements of B
 * (BO += 32 * SIZE) and updates all sixteen accumulators c11..c82.
 * The loop is software-pipelined: it is entered with the row-1 products
 * of columns 1..4 for the first k-step already issued, and before
 * branching back it issues the same four products for the next pass.
 * Loads are interleaved with the MADDs that consume earlier loads, so the
 * instruction order must be preserved.
 * (MADD d, x, y, z presumably computes d = x * y + z -- macro from
 * common.h, confirm.)
 */
223.L12:
/* k-step 0, row 2 of columns 1..4, then both rows of columns 5..8. */
224   MADD  c12, b1, a2, c12
225   LD b1,  BO,  16 * SIZE
226   MADD  c22, b2, a2, c22
227   LD b2,  BO,   5 * SIZE
228   MADD  c32, b3, a2, c32
229   LD b3,  BO,   6 * SIZE
230   MADD  c42, b4, a2, c42
231   LD b4,  BO,   7 * SIZE
232   MADD  c51, b5, a1, c51
233   LD a4,  AO,   2 * SIZE
234   MADD  c61, b2, a1, c61
235   MADD  c71, b3, a1, c71
236   MADD  c81, b4, a1, c81
237   LD a1,  AO,   8 * SIZE
238   MADD  c52, b5, a2, c52
239   LD b5,  BO,  20 * SIZE
240   MADD  c62, b2, a2, c62
241   LD b2,  BO,   9 * SIZE
242   MADD  c72, b3, a2, c72
243   LD b3,  BO,  10 * SIZE
244   MADD  c82, b4, a2, c82
245   LD b4,  BO,  11 * SIZE
/* k-step 1: A operands a4 (row 1) and a2 (row 2), B from BO[8..15]. */
246   MADD  c11, b6, a4, c11
247   LD a2,  AO,   3 * SIZE
248   MADD  c21, b2, a4, c21
249   MADD  c31, b3, a4, c31
250   MADD  c41, b4, a4, c41
251   MADD  c12, b6, a2, c12
252   LD b6,  BO,  24 * SIZE
253   MADD  c22, b2, a2, c22
254   LD b2,  BO,  13 * SIZE
255   MADD  c32, b3, a2, c32
256   LD b3,  BO,  14 * SIZE
257   MADD  c42, b4, a2, c42
258   LD b4,  BO,  15 * SIZE
259   MADD  c51, b7, a4, c51
260   MADD  c61, b2, a4, c61
261   MADD  c71, b3, a4, c71
262   MADD  c81, b4, a4, c81
263   MADD  c52, b7, a2, c52
264   LD b7,  BO,  28 * SIZE
265   MADD  c62, b2, a2, c62
266   LD b2,  BO,  17 * SIZE
267   MADD  c72, b3, a2, c72
268   LD b3,  BO,  18 * SIZE
269   MADD  c82, b4, a2, c82
270   LD b4,  BO,  19 * SIZE
/* k-step 2: A operands a3 (row 1) and a2 (row 2), B from BO[16..23]. */
271   MADD  c11, b1, a3, c11
272   LD a2,  AO,   5 * SIZE
273   MADD  c21, b2, a3, c21
274   MADD  c31, b3, a3, c31
275   MADD  c41, b4, a3, c41
276   MADD  c12, b1, a2, c12
277   LD b1,  BO,  32 * SIZE
278   MADD  c22, b2, a2, c22
279   LD b2,  BO,  21 * SIZE
280   MADD  c32, b3, a2, c32
281   LD b3,  BO,  22 * SIZE
282   MADD  c42, b4, a2, c42
283   LD b4,  BO,  23 * SIZE
284   MADD  c51, b5, a3, c51
285   LD a4,  AO,   6 * SIZE
286   MADD  c61, b2, a3, c61
287   MADD  c71, b3, a3, c71
288   MADD  c81, b4, a3, c81
289   LD a3,  AO,  12 * SIZE
290   MADD  c52, b5, a2, c52
291   LD b5,  BO,  36 * SIZE
292   MADD  c62, b2, a2, c62
293   LD b2,  BO,  25 * SIZE
294   MADD  c72, b3, a2, c72
295   LD b3,  BO,  26 * SIZE
296   MADD  c82, b4, a2, c82
297   LD b4,  BO,  27 * SIZE
/* k-step 3: A operands a4 (row 1) and a2 (row 2), B from BO[24..31]. */
298   MADD  c11, b6, a4, c11
299   LD a2,  AO,   7 * SIZE
300   MADD  c21, b2, a4, c21
301   MADD  c31, b3, a4, c31
302   MADD  c41, b4, a4, c41
303   addi.d  L, L, -1
304   MADD  c12, b6, a2, c12
305   LD b6,  BO,  40 * SIZE
306   MADD  c22, b2, a2, c22
307   LD b2,  BO,  29 * SIZE
308   MADD  c32, b3, a2, c32
309   LD b3,  BO,  30 * SIZE
310   MADD  c42, b4, a2, c42
311   LD b4,  BO,  31 * SIZE
312   MADD  c51, b7, a4, c51
/* Pointers advance here; all offsets below are relative to the new AO/BO. */
313   addi.d  BO, BO, 32 * SIZE
314   MADD  c61, b2, a4, c61
315   addi.d  AO, AO,  8 * SIZE
316   MADD  c71, b3, a4, c71
317   MADD  c81, b4, a4, c81
318   MADD  c52, b7, a2, c52
319   LD b7,  BO,  12 * SIZE
320   MADD  c62, b2, a2, c62
321   LD b2,  BO,   1 * SIZE
322   MADD  c72, b3, a2, c72
323   LD b3,  BO,   2 * SIZE
324   MADD  c82, b4, a2, c82
325   LD b4,  BO,   3 * SIZE
/* Prime the next pass: row-1 products of its first k-step, columns 1..4. */
326   MADD  c11, b1, a1, c11
327   LD a2,  AO,   1 * SIZE
328   MADD  c21, b2, a1, c21
329   MADD  c31, b3, a1, c31
330   MADD  c41, b4, a1, c41
331   blt $r0,    L, .L12
332   .align 3
333
/*
 * .L13 -- last 4x-unrolled pass of the 2x8 k-loop.
 * Same multiply-accumulate sequence as one pass of .L12, but it does not
 * branch back and does not issue the priming products for a following
 * pass; it ends with the operand loads that the remainder loop (.L16)
 * expects.  Prefetches of the C columns CO4..CO7 are interleaved with the
 * arithmetic.  Falls through to .L15.
 */
334.L13:
335   MADD  c12, b1, a2, c12
336   LD b1,  BO,  16 * SIZE
337   MADD  c22, b2, a2, c22
338   LD b2,  BO,   5 * SIZE
339   MADD  c32, b3, a2, c32
340   LD b3,  BO,   6 * SIZE
341   MADD  c42, b4, a2, c42
342   LD b4,  BO,   7 * SIZE
343   MADD  c51, b5, a1, c51
344   MADD  c61, b2, a1, c61
345   LD a4,  AO,   2 * SIZE
346   MADD  c71, b3, a1, c71
347   MADD  c81, b4, a1, c81
348   LD a1,  AO,   8 * SIZE
349   MADD  c52, b5, a2, c52
350   LD b5,  BO,  20 * SIZE
351   MADD  c62, b2, a2, c62
352   LD b2,  BO,   9 * SIZE
353   MADD  c72, b3, a2, c72
354   LD b3,  BO,  10 * SIZE
355   MADD  c82, b4, a2, c82
356   LD b4,  BO,  11 * SIZE
357   MADD  c11, b6, a4, c11
358   LD a2,  AO,   3 * SIZE
359   MADD  c21, b2, a4, c21
360   MADD  c31, b3, a4, c31
361   preld  1,  CO4,  3 * SIZE
362   MADD  c41, b4, a4, c41
363   MADD  c12, b6, a2, c12
364   LD b6,  BO,  24 * SIZE
365   MADD  c22, b2, a2, c22
366   LD b2,  BO,  13 * SIZE
367   MADD  c32, b3, a2, c32
368   LD b3,  BO,  14 * SIZE
369   MADD  c42, b4, a2, c42
370   LD b4,  BO,  15 * SIZE
371   MADD  c51, b7, a4, c51
372   preld  1,  CO5,  3 * SIZE
373   MADD  c61, b2, a4, c61
374   MADD  c71, b3, a4, c71
375   preld  1,  CO6,  3 * SIZE
376   MADD  c81, b4, a4, c81
377   MADD  c52, b7, a2, c52
378   LD b7,  BO,  28 * SIZE
379   MADD  c62, b2, a2, c62
380   LD b2,  BO,  17 * SIZE
381   MADD  c72, b3, a2, c72
382   LD b3,  BO,  18 * SIZE
383   MADD  c82, b4, a2, c82
384   LD b4,  BO,  19 * SIZE
385   MADD  c11, b1, a3, c11
386   LD a2,  AO,   5 * SIZE
387   MADD  c21, b2, a3, c21
388   MADD  c31, b3, a3, c31
389   preld  1,  CO7,  3 * SIZE
390   MADD  c41, b4, a3, c41
391   MADD  c12, b1, a2, c12
392   LD b1,  BO,  32 * SIZE
393   MADD  c22, b2, a2, c22
394   LD b2,  BO,  21 * SIZE
395   MADD  c32, b3, a2, c32
396   LD b3,  BO,  22 * SIZE
397   MADD  c42, b4, a2, c42
398   LD b4,  BO,  23 * SIZE
399   MADD  c51, b5, a3, c51
400   MADD  c61, b2, a3, c61
401   LD a4,  AO,   6 * SIZE
402   MADD  c71, b3, a3, c71
403   MADD  c81, b4, a3, c81
404   MADD  c52, b5, a2, c52
405   LD b5,  BO,  36 * SIZE
406   MADD  c62, b2, a2, c62
407   LD b2,  BO,  25 * SIZE
408   MADD  c72, b3, a2, c72
409   LD b3,  BO,  26 * SIZE
410   MADD  c82, b4, a2, c82
411   LD b4,  BO,  27 * SIZE
412   MADD  c11, b6, a4, c11
413   LD a2,  AO,   7 * SIZE
414   MADD  c21, b2, a4, c21
415   MADD  c31, b3, a4, c31
416   MADD  c41, b4, a4, c41
417   MADD  c12, b6, a2, c12
418   LD b6,  BO,  40 * SIZE
419   MADD  c22, b2, a2, c22
420   LD b2,  BO,  29 * SIZE
421   MADD  c32, b3, a2, c32
422   LD b3,  BO,  30 * SIZE
423   MADD  c42, b4, a2, c42
424   LD b4,  BO,  31 * SIZE
425   MADD  c51, b7, a4, c51
/* Advance past the 4 k-steps just consumed (32 B values, 8 A values). */
426   addi.d  BO, BO, 32 * SIZE
427   MADD  c61, b2, a4, c61
428   addi.d  AO, AO,  8 * SIZE
429   MADD  c71, b3, a4, c71
430   MADD  c81, b4, a4, c81
431   MADD  c52, b7, a2, c52
432   LD b7,  BO,  12 * SIZE
433   MADD  c62, b2, a2, c62
434   LD b2,  BO,   1 * SIZE
435   MADD  c72, b3, a2, c72
436   LD b3,  BO,   2 * SIZE
437   MADD  c82, b4, a2, c82
438   LD b4,  BO,   3 * SIZE
439   .align 3
440
/*
 * .L15 / .L16 -- k-remainder of the 2x8 tile.
 * L = (K or TEMP) & 3 leftover k-steps are processed one per iteration:
 * each iteration reads 2 A values and 8 B values, updates all sixteen
 * accumulators, and reloads a1, b1..b5 for the next iteration.
 * CO8 is prefetched on the way in.  Skips to .L18 when nothing is left.
 */
441.L15:
442#ifndef TRMMKERNEL
443   andi    L,  K, 3
444#else
445   andi    L,  TEMP, 3
446#endif
447   preld  1,  CO8,  3 * SIZE
448   bge $r0,    L, .L18
449   .align  3
450.L16:
451   MADD  c11, b1, a1, c11
452   LD a2,  AO,   1 * SIZE
453   MADD  c21, b2, a1, c21
454   MADD  c31, b3, a1, c31
455   MADD  c41, b4, a1, c41
456   MADD  c12, b1, a2, c12
457   LD b1,  BO,   8 * SIZE
458   MADD  c22, b2, a2, c22
459   LD b2,  BO,   5 * SIZE
460   MADD  c32, b3, a2, c32
461   LD b3,  BO,   6 * SIZE
462   MADD  c42, b4, a2, c42
463   LD b4,  BO,   7 * SIZE
464   MADD  c51, b5, a1, c51
465   addi.d  L, L, -1
466   MADD  c61, b2, a1, c61
467   addi.d  AO, AO,  2 * SIZE
468   MADD  c71, b3, a1, c71
469   addi.d  BO, BO,  8 * SIZE
470   MADD  c81, b4, a1, c81
/* AO/BO already advanced: these offsets address the next k-step. */
471   LD a1,  AO,   0 * SIZE
472   MADD  c52, b5, a2, c52
473   LD b5,  BO,   4 * SIZE
474   MADD  c62, b2, a2, c62
475   LD b2,  BO,   1 * SIZE
476   MADD  c72, b3, a2, c72
477   LD b3,  BO,   2 * SIZE
478   MADD  c82, b4, a2, c82
479   LD b4,  BO,   3 * SIZE
480   blt $r0,    L, .L16
/*
 * .L18 -- write back one finished 2x8 tile and loop to the next one.
 *
 * Plain GEMM build: the existing C values are loaded into $f22/$f8/$f23/
 * $f9..$f13 (the a1/a2/b1..b6 registers, free at this point), each
 * accumulator is combined as  acc * ALPHA + C_old  and stored back.
 * CO1..CO8 are advanced by 2 elements as the code goes, which is why most
 * loads and stores use the negative offsets -2 / -1.
 *
 * TRMMKERNEL build: the accumulators are only scaled (acc * ALPHA) and
 * stored; C is not read.  Afterwards AO/BO are moved past the k-steps this
 * tile did not consume (for the LEFT&&TRANSA / !LEFT&&!TRANSA variants)
 * and KK is advanced by 2 for LEFT.
 *
 * Both builds also prefetch through BB (BB += 16 * SIZE per tile), zero
 * c11..c61 for the next tile, decrement I and branch back to .L11 while
 * I > 0.
 */
481.L18:
482#ifndef TRMMKERNEL
483   LD $f22,  CO1,  0 * SIZE
484   addi.d  CO3,CO3, 2 * SIZE
485   LD $f8,  CO1,  1 * SIZE
486   addi.d  CO1,CO1, 2 * SIZE
487   LD $f23,  CO2,  0 * SIZE
488   addi.d  CO4,CO4, 2 * SIZE
489   LD $f9,  CO2,  1 * SIZE
490   addi.d  CO2,CO2, 2 * SIZE
491   LD $f10,  CO3,  -2 * SIZE
492   addi.d  CO5,CO5, 2 * SIZE
493   LD $f11,  CO3,  -1 * SIZE
494   addi.d  CO6,CO6, 2 * SIZE
495   LD $f12,  CO4,  -2 * SIZE
496   addi.d  CO7,CO7, 2 * SIZE
497   LD $f13,  CO4,  -1 * SIZE
498   addi.d  I, I, -1
/* Columns 1..4: acc = acc * ALPHA + C_old, while loading C_old of 5..8. */
499   MADD  c11, c11, ALPHA, $f22
500   LD $f22,  CO5, -2 * SIZE
501   MADD  c12, c12, ALPHA, $f8
502   LD $f8,  CO5, -1 * SIZE
503   MADD  c21, c21, ALPHA, $f23
504   LD $f23,  CO6, -2 * SIZE
505   MADD  c22, c22, ALPHA, $f9
506   LD $f9,  CO6, -1 * SIZE
507   MADD  c31, c31, ALPHA, $f10
508   LD $f10,  CO7, -2 * SIZE
509   MADD  c32, c32, ALPHA, $f11
510   LD $f11,  CO7, -1 * SIZE
511   MADD  c41, c41, ALPHA, $f12
/* CO8 has not been advanced yet, hence offsets 0 / 1 here. */
512   LD $f12,  CO8,  0 * SIZE
513   MADD  c42, c42, ALPHA, $f13
514   LD $f13,  CO8,  1 * SIZE
515   preld  0,  BB,  0 * SIZE
516   preld  0,  BB,  8 * SIZE
/* Stores are interleaved with re-zeroing accumulators for the next tile. */
517   ST c11,  CO1,  -2 * SIZE
518   MTC  c11, $r0
519   ST c12,  CO1,  -1 * SIZE
520   addi.d  CO8,CO8, 2 * SIZE
521   ST c21,  CO2,  -2 * SIZE
522   MOV c21, c11
523   ST c22,  CO2,  -1 * SIZE
524   addi.d  BB, BB, 16 * SIZE
525   MADD  c51, c51, ALPHA, $f22
526   ST c31,  CO3,  -2 * SIZE
527   MADD  c52, c52, ALPHA, $f8
528   ST c32,  CO3,  -1 * SIZE
529   MADD  c61, c61, ALPHA, $f23
530   ST c41,  CO4,  -2 * SIZE
531   MADD  c62, c62, ALPHA, $f9
532   ST c42,  CO4,  -1 * SIZE
533   MADD  c71, c71, ALPHA, $f10
534   ST c51,  CO5,  -2 * SIZE
535   MADD  c72, c72, ALPHA, $f11
536   ST c52,  CO5,  -1 * SIZE
537   MADD  c81, c81, ALPHA, $f12
538   ST c61,  CO6,  -2 * SIZE
539   MADD  c82, c82, ALPHA, $f13
540   ST c62,  CO6,  -1 * SIZE
541   ST c71,  CO7,  -2 * SIZE
542   MOV c31, c11
543   ST c72,  CO7,  -1 * SIZE
544   MOV c41, c11
545   ST c81,  CO8,  -2 * SIZE
546   MOV c51, c11
547   ST c82,  CO8,  -1 * SIZE
548MOV    c61, c11
549   blt $r0,    I, .L11
550#else
551   addi.d  CO4,CO4, 2 * SIZE
552   addi.d  CO5,CO5, 2 * SIZE
553   addi.d  CO6,CO6, 2 * SIZE
554   addi.d  CO7,CO7, 2 * SIZE
555   preld  0,  BB,  0 * SIZE
556   preld  0,  BB,  8 * SIZE
557   MUL c11, ALPHA, c11
558   addi.d  CO1,CO1, 2 * SIZE
559   MUL c12, ALPHA, c12
/* a1 is reused as a zero source for clearing accumulators below;
   .L11 reloads a1 before it is used as an operand again. */
560   MTC  a1, $r0
561   MUL c21, ALPHA, c21
562   addi.d  CO2,CO2, 2 * SIZE
563   MUL c22, ALPHA, c22
564   addi.d  CO3,CO3, 2 * SIZE
565   ST c11,  CO1,  -2 * SIZE
566   MUL c31, ALPHA, c31
567   ST c12,  CO1,  -1 * SIZE
568   MUL c32, ALPHA, c32
569   ST c21,  CO2,  -2 * SIZE
570   MUL c41, ALPHA, c41
571   ST c22,  CO2,  -1 * SIZE
572   MUL c42, ALPHA, c42
573   ST c31,  CO3,  -2 * SIZE
574   MUL c51, ALPHA, c51
575   ST c32,  CO3,  -1 * SIZE
576   MUL c52, ALPHA, c52
577   ST c41,  CO4,  -2 * SIZE
578   MUL c61, ALPHA, c61
579   ST c42,  CO4,  -1 * SIZE
580   MUL c62, ALPHA, c62
581   ST c51,  CO5,  -2 * SIZE
582   MUL c71, ALPHA, c71
583   ST c52,  CO5,  -1 * SIZE
584   MUL c72, ALPHA, c72
585   ST c61,  CO6,  -2 * SIZE
586   MUL c81, ALPHA, c81
587   ST c62,  CO6,  -1 * SIZE
588   MUL c82, ALPHA, c82
589   ST c71,  CO7,  -2 * SIZE
590   MOV c11, a1
591   ST c72,  CO7,  -1 * SIZE
592   MOV c21, a1
593   addi.d  CO8,CO8, 2 * SIZE
594   addi.d  BB, BB, 16 * SIZE
595   ST c81,  CO8,  -2 * SIZE
596   MOV c31, a1
597   ST c82,  CO8,  -1 * SIZE
598   MOV c41, a1
599   addi.d  I, I, -1
600   MOV c51, a1
601#if ( defined(LEFT) &&  defined(TRANSA)) || \
602    (!defined(LEFT) && !defined(TRANSA))
/* Skip the K - KK - (2 or 8) k-steps this tile did not read:
   2 A elements and 8 B elements per skipped step. */
603   sub.d   TEMP, K, KK
604#ifdef LEFT
605   addi.d  TEMP, TEMP, -2
606#else
607   addi.d  TEMP, TEMP, -8
608#endif
609   slli.d  L,    TEMP, 1 + BASE_SHIFT
610   slli.d  TEMP, TEMP, 3 + BASE_SHIFT
611   add.d   AO, AO, L
612   add.d   BO, BO, TEMP
613#endif
614#ifdef LEFT
615   addi.d  KK, KK, 2
616#endif
617MOV    c61, a1
618   blt $r0,    I, .L11
619#endif
620   .align 3
621
/*
 * .L20 -- leftover single row of the 8-column panel (taken when M is odd).
 * I = M & 1; when it is zero the panel is finished and control goes to
 * .L29.  Otherwise c61/c71/c81 are cleared (c11..c51 are already zero on
 * entry), four A values and the first B values are preloaded, and the
 * 4x-unrolled trip count L is computed; L <= 0 goes to the remainder loop
 * at .L25.
 *
 * TRMMKERNEL build: same AO/BO/TEMP set-up pattern as the 2-row tile, but
 * with 1 A element per k-step and TEMP = K - KK, KK + 1 or KK + 8.
 */
622.L20:
623   andi    I,  M, 1
624   MOV c61, c11
625MOV    c71, c11
626   bge $r0,    I, .L29
627#if defined(TRMMKERNEL)
628#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
629   move    BO,  B
630#else
631   slli.d  L,    KK, 0 + BASE_SHIFT
632   slli.d  TEMP, KK, 3 + BASE_SHIFT
633   add.d   AO, AO, L
634   add.d   BO, B,  TEMP
635#endif
636   LD a1,  AO,   0 * SIZE
637   LD a2,  AO,   1 * SIZE
638   LD a3,  AO,   2 * SIZE
639   LD a4,  AO,   3 * SIZE
640   LD b1,  BO,   0 * SIZE
641   LD b2,  BO,   1 * SIZE
642   LD b3,  BO,   2 * SIZE
643   LD b4,  BO,   3 * SIZE
644   LD b5,  BO,   4 * SIZE
645   LD b6,  BO,   8 * SIZE
646   LD b7,  BO,  12 * SIZE
647#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
648   sub.d   TEMP, K, KK
649#elif defined(LEFT)
650   addi.d  TEMP, KK, 1
651#else
652   addi.d  TEMP, KK, 8
653#endif
654   srai.d  L,  TEMP, 2
655MOV    c81, c11
656   bge $r0,    L, .L25
657#else
658   LD a1,  AO,   0 * SIZE
659   LD a2,  AO,   1 * SIZE
660   LD a3,  AO,   2 * SIZE
661   LD a4,  AO,   3 * SIZE
662   LD b1,  B,   0 * SIZE
663   LD b2,  B,   1 * SIZE
664   LD b3,  B,   2 * SIZE
665   LD b4,  B,   3 * SIZE
666   LD b5,  B,   4 * SIZE
667   LD b6,  B,   8 * SIZE
668   LD b7,  B,  12 * SIZE
669   srai.d  L,  K, 2
670   MOV c81, c11
671move   BO,  B
672   bge $r0,    L, .L25
673#endif
674   .align  3
/*
 * .L22 -- main k-loop of the 1x8 tile, unrolled four k-steps per pass.
 * a1..a4 hold the A values of the four k-steps; each step multiplies one
 * of them with eight B values into c11..c81.  Per pass AO advances by
 * 4 elements and BO by 32.  AO is bumped in the middle of the pass and BO
 * near the end, so the load offsets after each bump are relative to the
 * updated pointer (including the negative BO offsets -3..-1).
 */
675.L22:
/* k-step 0 (a1). */
676   MADD  c11, b1, a1, c11
677   LD b1,  BO,  16 * SIZE
678   MADD  c21, b2, a1, c21
679   LD b2,  BO,   5 * SIZE
680   MADD  c31, b3, a1, c31
681   LD b3,  BO,   6 * SIZE
682   MADD  c41, b4, a1, c41
683   LD b4,  BO,   7 * SIZE
684   MADD  c51, b5, a1, c51
685   LD b5,  BO,  20 * SIZE
686   MADD  c61, b2, a1, c61
687   LD b2,  BO,   9 * SIZE
688   MADD  c71, b3, a1, c71
689   LD b3,  BO,  10 * SIZE
690   MADD  c81, b4, a1, c81
691   LD b4,  BO,  11 * SIZE
692   LD a1,  AO,   4 * SIZE
693   addi.d  L, L, -1
/* k-step 1 (a2). */
694   MADD  c11, b6, a2, c11
695   LD b6,  BO,  24 * SIZE
696   MADD  c21, b2, a2, c21
697   LD b2,  BO,  13 * SIZE
698   MADD  c31, b3, a2, c31
699   LD b3,  BO,  14 * SIZE
700   MADD  c41, b4, a2, c41
701   LD b4,  BO,  15 * SIZE
702   MADD  c51, b7, a2, c51
703   LD b7,  BO,  28 * SIZE
704   MADD  c61, b2, a2, c61
705   LD b2,  BO,  17 * SIZE
706   MADD  c71, b3, a2, c71
707   LD b3,  BO,  18 * SIZE
708   MADD  c81, b4, a2, c81
709   LD b4,  BO,  19 * SIZE
710   LD a2,  AO,   5 * SIZE
711   addi.d  AO, AO,  4 * SIZE
/* k-step 2 (a3). */
712   MADD  c11, b1, a3, c11
713   LD b1,  BO,  32 * SIZE
714   MADD  c21, b2, a3, c21
715   LD b2,  BO,  21 * SIZE
716   MADD  c31, b3, a3, c31
717   LD b3,  BO,  22 * SIZE
718   MADD  c41, b4, a3, c41
719   LD b4,  BO,  23 * SIZE
720   MADD  c51, b5, a3, c51
721   LD b5,  BO,  36 * SIZE
722   MADD  c61, b2, a3, c61
723   LD b2,  BO,  25 * SIZE
724   MADD  c71, b3, a3, c71
725   LD b3,  BO,  26 * SIZE
726   MADD  c81, b4, a3, c81
727   LD b4,  BO,  27 * SIZE
728   LD a3,  AO,   2 * SIZE
729   addi.d  BO, BO, 32 * SIZE
/* k-step 3 (a4); BO already points at the next pass. */
730   MADD  c11, b6, a4, c11
731   LD b6,  BO,   8 * SIZE
732   MADD  c21, b2, a4, c21
733   LD b2,  BO,  -3 * SIZE
734   MADD  c31, b3, a4, c31
735   LD b3,  BO,  -2 * SIZE
736   MADD  c41, b4, a4, c41
737   LD b4,  BO,  -1 * SIZE
738   MADD  c51, b7, a4, c51
739   LD b7,  BO,  12 * SIZE
740   MADD  c61, b2, a4, c61
741   LD b2,  BO,   1 * SIZE
742   MADD  c71, b3, a4, c71
743   LD b3,  BO,   2 * SIZE
744   MADD  c81, b4, a4, c81
745   LD b4,  BO,   3 * SIZE
746   LD a4,  AO,   3 * SIZE
747   blt $r0,    L, .L22
748   .align 3
749
/*
 * .L25 / .L26 -- k-remainder of the 1x8 tile.
 * Processes the (K or TEMP) & 3 leftover k-steps one at a time: one A
 * value (a1) times eight B values, AO += 1 element, BO += 8 elements.
 * Skips to .L28 when nothing is left.
 */
750.L25:
751#ifndef TRMMKERNEL
752   andi    L,  K, 3
753#else
754   andi    L,  TEMP, 3
755#endif
756   bge $r0,    L, .L28
757   .align  3
758.L26:
759   MADD  c11, b1, a1, c11
760   LD b1,  BO,   8 * SIZE
761   MADD  c21, b2, a1, c21
762   LD b2,  BO,   5 * SIZE
763   MADD  c31, b3, a1, c31
764   LD b3,  BO,   6 * SIZE
765   MADD  c41, b4, a1, c41
766   LD b4,  BO,   7 * SIZE
767   addi.d  L, L, -1
/* Self-move: a2 is copied onto itself, so its value does not change. */
768   MOV a2, a2
769   addi.d  AO, AO,  1 * SIZE
770   addi.d  BO, BO,  8 * SIZE
771   MADD  c51, b5, a1, c51
772   LD b5,  BO,   4 * SIZE
773   MADD  c61, b2, a1, c61
774   LD b2,  BO,   1 * SIZE
775   MADD  c71, b3, a1, c71
776   LD b3,  BO,   2 * SIZE
777   MADD  c81, b4, a1, c81
778   LD a1,  AO,   0 * SIZE
779   LD b4,  BO,   3 * SIZE
780   blt $r0,    L, .L26
/*
 * .L28 -- write back the 1x8 tile (one element in each of CO1..CO8).
 * Plain GEMM build: each result is  acc * ALPHA + C_old.
 * TRMMKERNEL build: each result is  acc * ALPHA  (C is not read); then
 * AO/BO are moved past the unconsumed k-steps for the LEFT&&TRANSA /
 * !LEFT&&!TRANSA variants and KK is advanced by 1 for LEFT.
 * The CO pointers are not advanced here.
 */
781.L28:
782#ifndef TRMMKERNEL
783   LD $f22,  CO1,  0 * SIZE
784   LD $f8,  CO2,  0 * SIZE
785   LD $f23,  CO3,  0 * SIZE
786   LD $f9,  CO4,  0 * SIZE
787   MADD  c11, c11, ALPHA, $f22
788   LD $f10,  CO5,  0 * SIZE
789   MADD  c21, c21, ALPHA, $f8
790   LD $f11,  CO6,  0 * SIZE
791   MADD  c31, c31, ALPHA, $f23
792   LD $f12,  CO7,  0 * SIZE
793   MADD  c41, c41, ALPHA, $f9
794   LD $f13,  CO8,  0 * SIZE
795   MADD  c51, c51, ALPHA, $f10
796   ST c11,  CO1,   0 * SIZE
797   MADD  c61, c61, ALPHA, $f11
798   ST c21,  CO2,   0 * SIZE
799   MADD  c71, c71, ALPHA, $f12
800   ST c31,  CO3,   0 * SIZE
801   MADD  c81, c81, ALPHA, $f13
802   ST c41,  CO4,   0 * SIZE
803   ST c51,  CO5,   0 * SIZE
804   ST c61,  CO6,   0 * SIZE
805   ST c71,  CO7,   0 * SIZE
806   ST c81,  CO8,   0 * SIZE
807#else
808   MUL c11, ALPHA, c11
809   MUL c21, ALPHA, c21
810   MUL c31, ALPHA, c31
811   MUL c41, ALPHA, c41
812   ST c11,  CO1,   0 * SIZE
813   MUL c51, ALPHA, c51
814   ST c21,  CO2,   0 * SIZE
815   MUL c61, ALPHA, c61
816   ST c31,  CO3,   0 * SIZE
817   MUL c71, ALPHA, c71
818   ST c41,  CO4,   0 * SIZE
819   MUL c81, ALPHA, c81
820   ST c51,  CO5,   0 * SIZE
821   ST c61,  CO6,   0 * SIZE
822   ST c71,  CO7,   0 * SIZE
823   ST c81,  CO8,   0 * SIZE
824#if ( defined(LEFT) &&  defined(TRANSA)) || \
825    (!defined(LEFT) && !defined(TRANSA))
/* Skip K - KK - (1 or 8) unread k-steps: 1 A and 8 B elements per step. */
826   sub.d   TEMP, K, KK
827#ifdef LEFT
828   addi.d  TEMP, TEMP, -1
829#else
830   addi.d  TEMP, TEMP, -8
831#endif
832   slli.d  L,    TEMP, 0 + BASE_SHIFT
833   slli.d  TEMP, TEMP, 3 + BASE_SHIFT
834   add.d   AO, AO, L
835   add.d   BO, BO, TEMP
836#endif
837#ifdef LEFT
838   addi.d  KK, KK, 1
839#endif
840#endif
841   .align 3
842
/*
 * .L29 -- end of one 8-column panel.
 * B is set to BO (the position the tile loops left the B read pointer at),
 * KK is advanced by 8 for right-side TRMM, and the panel loop repeats
 * from .L10 while J > 0 (J was decremented at the top of .L10).
 */
843.L29:
844#if defined(TRMMKERNEL) && !defined(LEFT)
845   addi.d  KK, KK, 8
846#endif
847move   B, BO
848   blt $r0,    J, .L10
849   .align 3
850
/*
 * .L30 -- optional 4-column panel (taken when bit 2 of N is set).
 * J = N & 4; when it is zero, control goes to .L50 (outside the part of
 * the file reviewed here).  Otherwise CO1..CO4 are set to four
 * consecutive columns of C, C is advanced past them, AO is rewound to A,
 * c11..c41 are zeroed and I = M >> 1 two-row tiles are processed starting
 * at .L31 (or .L40 directly when I <= 0).
 */
851.L30:
852   andi    J,  N, 4
853move   AO, A
854   bge $r0,    J, .L50
855   move    CO1, C
856   MTC  c11, $r0
857   add.d   CO2, C,      LDC
858   add.d   CO3, CO2,    LDC
859   add.d   CO4, CO3,    LDC
860   MOV c21, c11
861   add.d   C,   CO4,    LDC
862   MOV c31, c11
863#if defined(TRMMKERNEL) &&  defined(LEFT)
/* Left-side TRMM restarts KK from OFFSET for this panel. */
864   move    KK, OFFSET
865#endif
866   srai.d  I,  M, 1
867MOV    c41, c11
868   bge $r0,    I, .L40
/*
 * .L31 -- set up one 2-row x 4-column tile.
 * Clears c12..c42 (c11..c41 are already zero on entry), preloads a1/a3
 * and b1..b7, and computes the 4x-unrolled trip count L; L <= 0 goes to
 * the remainder loop at .L35.
 *
 * TRMMKERNEL build: either BO = B, or AO/BO are advanced by KK k-steps
 * (2 A and 4 B elements per step); TEMP = K - KK, KK + 2 or KK + 4.
 */
869.L31:
870#if defined(TRMMKERNEL)
871#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
872   move    BO,  B
873#else
874   slli.d  L,    KK, 1 + BASE_SHIFT
875   slli.d  TEMP, KK, 2 + BASE_SHIFT
876   add.d   AO, AO, L
877   add.d   BO, B,  TEMP
878#endif
879   LD a1,  AO,   0 * SIZE
880   LD a3,  AO,   4 * SIZE
881   LD b1,  BO,   0 * SIZE
882   MOV c12, c11
883   LD b2,  BO,   1 * SIZE
884   MOV c22, c11
885   LD b3,  BO,   2 * SIZE
886   MOV c32, c11
887   LD b4,  BO,   3 * SIZE
888   MOV c42, c11
889   LD b5,  BO,   4 * SIZE
890   LD b6,  BO,   8 * SIZE
891   LD b7,  BO,  12 * SIZE
892#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
893   sub.d   TEMP, K, KK
894#elif defined(LEFT)
895   addi.d  TEMP, KK, 2
896#else
897   addi.d  TEMP, KK, 4
898#endif
899   srai.d  L,  TEMP, 2
900   bge $r0,    L, .L35
901#else
902   LD a1,  AO,   0 * SIZE
903   LD a3,  AO,   4 * SIZE
904   LD b1,  B,   0 * SIZE
905   MOV c12, c11
906   LD b2,  B,   1 * SIZE
907   MOV c22, c11
908   LD b3,  B,   2 * SIZE
909   MOV c32, c11
910   LD b4,  B,   3 * SIZE
911   MOV c42, c11
912   LD b5,  B,   4 * SIZE
913   srai.d  L,  K, 2
914   LD b6,  B,   8 * SIZE
915   LD b7,  B,  12 * SIZE
916move   BO,  B
917   bge $r0,    L, .L35
918#endif
919   .align  3
/*
 * .L32 -- main k-loop of the 2x4 tile, unrolled four k-steps per pass.
 * Each k-step multiplies two A values (row 1 / row 2) with four B values
 * into c11..c41 (row 1) and c12..c42 (row 2).  b1, b5, b6, b7 carry the
 * first B value of k-steps 0..3; b2..b4 are reloaded for every step.
 * Per pass AO advances by 8 elements and BO by 16.
 */
920.L32:
/* k-step 0. */
921   MADD  c11, b1, a1, c11
922   LD a2,  AO,   1 * SIZE
923   MADD  c21, b2, a1, c21
924   addi.d  L, L, -1
925   MADD  c31, b3, a1, c31
926   MADD  c41, b4, a1, c41
927   LD a1,  AO,   2 * SIZE
928   MADD  c12, b1, a2, c12
929   LD b1,  BO,  16 * SIZE
930   MADD  c22, b2, a2, c22
931   LD b2,  BO,   5 * SIZE
932   MADD  c32, b3, a2, c32
933   LD b3,  BO,   6 * SIZE
934   MADD  c42, b4, a2, c42
935   LD b4,  BO,   7 * SIZE
/* k-step 1. */
936   MADD  c11, b5, a1, c11
937   LD a2,  AO,   3 * SIZE
938   MADD  c21, b2, a1, c21
939   MADD  c31, b3, a1, c31
940   MADD  c41, b4, a1, c41
941   LD a1,  AO,   8 * SIZE
942   MADD  c12, b5, a2, c12
943   LD b5,  BO,  20 * SIZE
944   MADD  c22, b2, a2, c22
945   LD b2,  BO,   9 * SIZE
946   MADD  c32, b3, a2, c32
947   LD b3,  BO,  10 * SIZE
948   MADD  c42, b4, a2, c42
949   LD b4,  BO,  11 * SIZE
/* k-step 2 (row 1 operand was preloaded into a3). */
950   MADD  c11, b6, a3, c11
951   LD a2,  AO,   5 * SIZE
952   MADD  c21, b2, a3, c21
953   MADD  c31, b3, a3, c31
954   MADD  c41, b4, a3, c41
955   LD a3,  AO,   6 * SIZE
956   MADD  c12, b6, a2, c12
957   LD b6,  BO,  24 * SIZE
958   MADD  c22, b2, a2, c22
959   LD b2,  BO,  13 * SIZE
960   MADD  c32, b3, a2, c32
961   LD b3,  BO,  14 * SIZE
962   MADD  c42, b4, a2, c42
963   LD b4,  BO,  15 * SIZE
/* k-step 3; AO/BO advance while it executes. */
964   MADD  c11, b7, a3, c11
965   LD a2,  AO,   7 * SIZE
966   MADD  c21, b2, a3, c21
967   addi.d  AO, AO,  8 * SIZE
968   MADD  c31, b3, a3, c31
969   addi.d  BO, BO, 16 * SIZE
970   MADD  c41, b4, a3, c41
971   LD a3,  AO,   4 * SIZE
972   MADD  c12, b7, a2, c12
973   LD b7,  BO,  12 * SIZE
974   MADD  c22, b2, a2, c22
975   LD b2,  BO,   1 * SIZE
976   MADD  c32, b3, a2, c32
977   LD b3,  BO,   2 * SIZE
978   MADD  c42, b4, a2, c42
979   LD b4,  BO,   3 * SIZE
980   blt $r0,    L, .L32
981   .align 3
982
/*
 * .L35 / .L36 -- k-remainder of the 2x4 tile.
 * Processes the (K or TEMP) & 3 leftover k-steps one at a time: two A
 * values times four B values, AO += 2 elements, BO += 4 elements.
 * Skips to .L38 when nothing is left.
 */
983.L35:
984#ifndef TRMMKERNEL
985   andi    L,  K, 3
986#else
987   andi    L,  TEMP, 3
988#endif
989   bge $r0,    L, .L38
990   .align  3
991.L36:
992   MADD  c11, b1, a1, c11
993   LD a2,  AO,   1 * SIZE
994   MADD  c21, b2, a1, c21
995   addi.d  L, L, -1
996   MADD  c31, b3, a1, c31
997   addi.d  AO, AO,  2 * SIZE
998   MADD  c41, b4, a1, c41
999   LD a1,  AO,   0 * SIZE
1000   MADD  c12, b1, a2, c12
/* BO is advanced only after these loads, so offsets 4..7 are the next
   k-step's B values. */
1001   LD b1,  BO,   4 * SIZE
1002   MADD  c22, b2, a2, c22
1003   LD b2,  BO,   5 * SIZE
1004   MADD  c32, b3, a2, c32
1005   LD b3,  BO,   6 * SIZE
1006   MADD  c42, b4, a2, c42
1007   LD b4,  BO,   7 * SIZE
1008addi.d BO, BO,  4 * SIZE
1009   blt $r0,    L, .L36
/*
 * .L38 -- write back one finished 2x4 tile and loop to the next one.
 * Plain GEMM build: result = acc * ALPHA + C_old for the two elements in
 * each of CO1..CO4.
 * TRMMKERNEL build: result = acc * ALPHA (C is not read); then AO/BO are
 * moved past the unconsumed k-steps for the LEFT&&TRANSA /
 * !LEFT&&!TRANSA variants and KK is advanced by 2 for LEFT.
 * In both builds CO1..CO4 advance by 2 elements (hence the -2 / -1 store
 * offsets), c11..c41 are re-zeroed, I is decremented and the code
 * branches back to .L31 while I > 0.
 */
1010.L38:
1011#ifndef TRMMKERNEL
1012   LD $f22,  CO1,  0 * SIZE
1013   addi.d  CO3,CO3, 2 * SIZE
1014   LD $f8,  CO1,  1 * SIZE
1015   addi.d  CO1,CO1, 2 * SIZE
1016   LD $f23,  CO2,  0 * SIZE
1017   addi.d  CO4,CO4, 2 * SIZE
1018   LD $f9,  CO2,  1 * SIZE
1019   addi.d  CO2,CO2, 2 * SIZE
1020   LD $f10,  CO3,  -2 * SIZE
1021   MADD  c11, c11, ALPHA, $f22
1022   LD $f11,  CO3,  -1 * SIZE
1023   MADD  c12, c12, ALPHA, $f8
1024   LD $f12,  CO4,  -2 * SIZE
1025   MADD  c21, c21, ALPHA, $f23
1026   LD $f13,  CO4,  -1 * SIZE
1027   MADD  c22, c22, ALPHA, $f9
1028   MADD  c31, c31, ALPHA, $f10
1029   ST c11,  CO1,  -2 * SIZE
1030   MADD  c32, c32, ALPHA, $f11
1031   ST c12,  CO1,  -1 * SIZE
1032   MADD  c41, c41, ALPHA, $f12
1033   ST c21,  CO2,  -2 * SIZE
1034   MADD  c42, c42, ALPHA, $f13
1035   ST c22,  CO2,  -1 * SIZE
1036   ST c31,  CO3,  -2 * SIZE
1037   MTC  c11, $r0
1038   ST c32,  CO3,  -1 * SIZE
1039   addi.d  I, I, -1
1040   ST c41,  CO4,  -2 * SIZE
1041   MOV c21, c11
1042   ST c42,  CO4,  -1 * SIZE
1043   MOV c31, c11
1044#else
1045   MUL c11, ALPHA, c11
1046   addi.d  CO3,CO3, 2 * SIZE
1047   MUL c12, ALPHA, c12
1048   addi.d  CO1,CO1, 2 * SIZE
1049   MUL c21, ALPHA, c21
1050   addi.d  CO4,CO4, 2 * SIZE
1051   MUL c22, ALPHA, c22
1052   addi.d  CO2,CO2, 2 * SIZE
1053   ST c11,  CO1,  -2 * SIZE
1054   MUL c31, ALPHA, c31
1055   ST c12,  CO1,  -1 * SIZE
1056   MUL c32, ALPHA, c32
1057   ST c21,  CO2,  -2 * SIZE
1058   MUL c41, ALPHA, c41
1059   ST c22,  CO2,  -1 * SIZE
1060   MUL c42, ALPHA, c42
1061   ST c31,  CO3,  -2 * SIZE
1062   MTC  c11, $r0
1063   ST c32,  CO3,  -1 * SIZE
1064   addi.d  I, I, -1
1065   ST c41,  CO4,  -2 * SIZE
1066   MOV c21, c11
1067   ST c42,  CO4,  -1 * SIZE
1068   MOV c31, c11
1069#if ( defined(LEFT) &&  defined(TRANSA)) || \
1070    (!defined(LEFT) && !defined(TRANSA))
/* Skip K - KK - (2 or 4) unread k-steps: 2 A and 4 B elements per step. */
1071   sub.d   TEMP, K, KK
1072#ifdef LEFT
1073   addi.d  TEMP, TEMP, -2
1074#else
1075   addi.d  TEMP, TEMP, -4
1076#endif
1077   slli.d  L,    TEMP, 1 + BASE_SHIFT
1078   slli.d  TEMP, TEMP, 2 + BASE_SHIFT
1079   add.d   AO, AO, L
1080   add.d   BO, BO, TEMP
1081#endif
1082#ifdef LEFT
1083   addi.d  KK, KK, 2
1084#endif
1085#endif
1086MOV    c41, c11
1087   blt $r0,    I, .L31
1088   .align 3
1089
/*
 * .L40 -- leftover single row of the 4-column panel (taken when M is odd).
 * I = M & 1; when it is zero control goes to .L49 (outside the part of
 * the file reviewed here).  Otherwise a1/a2 and b1..b7 are preloaded and
 * the 4x-unrolled trip count L is computed; L <= 0 goes to the remainder
 * loop at .L45.  c61/c71/c81 are also cleared here, although the visible
 * 1x4 loops and write-back only use c11..c41.
 *
 * TRMMKERNEL build: either BO = B, or AO/BO are advanced by KK k-steps
 * (1 A and 4 B elements per step); TEMP = K - KK, KK + 1 or KK + 4.
 */
1090.L40:
1091   andi    I,  M, 1
1092MOV    c61, c11
1093   bge $r0,    I, .L49
1094#if defined(TRMMKERNEL)
1095#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1096   move    BO,  B
1097#else
1098   slli.d  L,    KK, 0 + BASE_SHIFT
1099   slli.d  TEMP, KK, 2 + BASE_SHIFT
1100   add.d   AO, AO, L
1101   add.d   BO, B,  TEMP
1102#endif
1103   LD a1,  AO,   0 * SIZE
1104   MOV c71, c11
1105   LD a2,  AO,   1 * SIZE
1106   MOV c81, c11
1107   LD b1,  BO,   0 * SIZE
1108   LD b2,  BO,   1 * SIZE
1109   LD b3,  BO,   2 * SIZE
1110   LD b4,  BO,   3 * SIZE
1111   LD b5,  BO,   4 * SIZE
1112   LD b6,  BO,   8 * SIZE
1113   LD b7,  BO,  12 * SIZE
1114#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1115   sub.d   TEMP, K, KK
1116#elif defined(LEFT)
1117   addi.d  TEMP, KK, 1
1118#else
1119   addi.d  TEMP, KK, 4
1120#endif
1121   srai.d  L,  TEMP, 2
1122   bge $r0,    L, .L45
1123#else
1124   LD a1,  AO,   0 * SIZE
1125   MOV c71, c11
1126   LD a2,  AO,   1 * SIZE
1127   MOV c81, c11
1128   LD b1,  B,   0 * SIZE
1129   LD b2,  B,   1 * SIZE
1130   LD b3,  B,   2 * SIZE
1131   LD b4,  B,   3 * SIZE
1132   LD b5,  B,   4 * SIZE
1133   LD b6,  B,   8 * SIZE
1134   LD b7,  B,  12 * SIZE
1135   srai.d  L,  K, 2
1136move   BO,  B
1137   bge $r0,    L, .L45
1138#endif
1139   .align  3
/*
 * .L42 -- main k-loop of the 1x4 tile, unrolled four k-steps per pass.
 * k-step 0 uses a1; k-steps 1..3 each use a2, which is reloaded between
 * steps.  b1, b5, b6, b7 carry the first B value of k-steps 0..3; b2..b4
 * are reloaded for every step.  Per pass AO advances by 4 elements and BO
 * by 16; AO is bumped mid-pass, so the following A loads use offsets
 * relative to the new AO (including -1).
 */
1140.L42:
/* k-step 0. */
1141   MADD  c11, b1, a1, c11
1142   LD b1,  BO,  16 * SIZE
1143   MADD  c21, b2, a1, c21
1144   LD b2,  BO,   5 * SIZE
1145   MADD  c31, b3, a1, c31
1146   LD b3,  BO,   6 * SIZE
1147   MADD  c41, b4, a1, c41
1148   LD b4,  BO,   7 * SIZE
1149   LD a1,  AO,   4 * SIZE
1150   addi.d  L, L, -1
/* k-step 1. */
1151   MADD  c11, b5, a2, c11
1152   LD b5,  BO,  20 * SIZE
1153   MADD  c21, b2, a2, c21
1154   LD b2,  BO,   9 * SIZE
1155   MADD  c31, b3, a2, c31
1156   LD b3,  BO,  10 * SIZE
1157   MADD  c41, b4, a2, c41
1158   LD b4,  BO,  11 * SIZE
1159   LD a2,  AO,   2 * SIZE
1160   addi.d  AO, AO,  4 * SIZE
/* k-step 2. */
1161   MADD  c11, b6, a2, c11
1162   LD b6,  BO,  24 * SIZE
1163   MADD  c21, b2, a2, c21
1164   LD b2,  BO,  13 * SIZE
1165   MADD  c31, b3, a2, c31
1166   LD b3,  BO,  14 * SIZE
1167   MADD  c41, b4, a2, c41
1168   LD b4,  BO,  15 * SIZE
1169   LD a2,  AO,  -1 * SIZE
1170   addi.d  BO, BO, 16 * SIZE
/* k-step 3; BO already points at the next pass. */
1171   MADD  c11, b7, a2, c11
1172   LD b7,  BO,  12 * SIZE
1173   MADD  c21, b2, a2, c21
1174   LD b2,  BO,   1 * SIZE
1175   MADD  c31, b3, a2, c31
1176   LD b3,  BO,   2 * SIZE
1177   MADD  c41, b4, a2, c41
1178   LD b4,  BO,   3 * SIZE
1179   LD a2,  AO,   1 * SIZE
1180   blt $r0,    L, .L42
1181   .align 3
1182
/*
 * .L45/.L46: remainder of the k loop for the 1x4 tile.
 * Runs (K & 3) single k-steps (TEMP & 3 for TRMM).  On entry a1 and b1..b4
 * already hold the operands of the first step; each pass multiplies them
 * into c11..c41 and reloads the operands of the following step before
 * AO/BO are advanced by one k-step (1 * SIZE and 4 * SIZE).
 * The former "MOV a2, a2" self-move in the loop body had no effect and has
 * been dropped, saving one instruction per remainder step.
 */
1183.L45:
1184#ifndef TRMMKERNEL
1185   andi    L,  K, 3
1186#else
1187   andi    L,  TEMP, 3
1188#endif
1189   bge $r0,    L, .L48               /* no remainder: go to the write-back */
1190   .align  3
1191.L46:
1192   MADD  c11, b1, a1, c11
1193   LD b1,  BO,   4 * SIZE
1194   MADD  c21, b2, a1, c21
1195   LD b2,  BO,   5 * SIZE
1196   MADD  c31, b3, a1, c31
1197   LD b3,  BO,   6 * SIZE
1198   MADD  c41, b4, a1, c41
1199   LD a1,  AO,   1 * SIZE
1200   LD b4,  BO,   7 * SIZE
1201   addi.d  L, L, -1
1202   addi.d  AO, AO,  1 * SIZE
1204addi.d BO, BO,  4 * SIZE
1205   blt $r0,    L, .L46               /* repeat while L > 0 */
/*
 * .L48: write back the 1x4 tile, one value through each of CO1..CO4.
 *   GEMM: C = acc * ALPHA + C   (old C values are loaded into $f22/$f8/$f23/$f9)
 *   TRMM: C = ALPHA * acc       (C is not read), followed by the AO/BO/KK
 *         bookkeeping for the part of the k range that was not traversed.
 * .L49: end of the four-column panel: optional KK update and B = BO.
 */
1206.L48:
1207#ifndef TRMMKERNEL
1208   LD $f22,  CO1,  0 * SIZE
1209   LD $f8,  CO2,  0 * SIZE
1210   LD $f23,  CO3,  0 * SIZE
1211   LD $f9,  CO4,  0 * SIZE
1212   MADD  c11, c11, ALPHA, $f22
1213   MADD  c21, c21, ALPHA, $f8
1214   MADD  c31, c31, ALPHA, $f23
1215   MADD  c41, c41, ALPHA, $f9
1216   ST c11,  CO1,   0 * SIZE
1217   ST c21,  CO2,   0 * SIZE
1218   ST c31,  CO3,   0 * SIZE
1219   ST c41,  CO4,   0 * SIZE
1220#else
1221   MUL c11, ALPHA, c11
1222   MUL c21, ALPHA, c21
1223   MUL c31, ALPHA, c31
1224   MUL c41, ALPHA, c41
1225   ST c11,  CO1,   0 * SIZE
1226   ST c21,  CO2,   0 * SIZE
1227   ST c31,  CO3,   0 * SIZE
1228   ST c41,  CO4,   0 * SIZE
/* Advance AO/BO over the remaining (K - KK - 1) or (K - KK - 4) k-steps: */
/* one A value and four B values per step.                                */
1229#if ( defined(LEFT) &&  defined(TRANSA)) || \
1230    (!defined(LEFT) && !defined(TRANSA))
1231   sub.d   TEMP, K, KK
1232#ifdef LEFT
1233   addi.d  TEMP, TEMP, -1
1234#else
1235   addi.d  TEMP, TEMP, -4
1236#endif
1237   slli.d  L,    TEMP, 0 + BASE_SHIFT
1238   slli.d  TEMP, TEMP, 2 + BASE_SHIFT
1239   add.d   AO, AO, L
1240   add.d   BO, BO, TEMP
1241#endif
1242#ifdef LEFT
1243   addi.d  KK, KK, 1                 /* one more row handled */
1244#endif
1245#endif
1246   .align 3
1247
1248.L49:
1249#if defined(TRMMKERNEL) && !defined(LEFT)
1250   addi.d  KK, KK, 4                 /* four more columns handled */
1251#endif
1252   move    B, BO                     /* B = BO: continue from where the B stream stopped */
1253   .align 3
1254
/*
 * .L50: two-column panel, taken when bit 1 of N is set (N & 2).
 * Resets AO to A, sets up the column pointers CO1/CO2, moves C past the
 * two columns and starts the loop over pairs of rows (I = M >> 1).
 */
1255.L50:
1256   andi    J,  N, 2
1257move   AO, A
1258   bge $r0,    J, .L70               /* no two-column panel: try the single column */
1259   move    CO1, C                    /* CO1 = first column of the panel */
1260   add.d   CO2, C,      LDC          /* CO2 = CO1 + LDC */
1261#if defined(TRMMKERNEL) &&  defined(LEFT)
1262   move    KK, OFFSET
1263#endif
1264   srai.d  I,  M, 1                  /* I = M >> 1 row pairs */
1265add.d  C,   CO2,    LDC              /* C now points past this panel */
1266   bge $r0,    I, .L60               /* fewer than two rows: only the M & 1 case */
/*
 * .L51: set up one 2x2 tile (two rows, two columns).
 * Clears the four accumulators c11/c21/c12/c22 (MTC c11, $r0 presumably
 * moves the integer zero register into c11; the others are copies),
 * preloads the first A and B operands and derives the k trip count.
 * b6 and b7 are loaded here but are not referenced by name in
 * .L52/.L56/.L58.
 */
1267.L51:
1268#if defined(TRMMKERNEL)
1269#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1270   move    BO,  B
1271#else
/* Skip the first KK k-steps: two A values and two B values per step. */
1272   slli.d  L,    KK, 1 + BASE_SHIFT
1273   slli.d  TEMP, KK, 1 + BASE_SHIFT
1274   add.d   AO, AO, L
1275   add.d   BO, B,  TEMP
1276#endif
1277   LD a1,  AO,   0 * SIZE
1278   MTC  c11, $r0
1279   LD a2,  AO,   1 * SIZE
1280   MOV c21, c11
1281   LD a5,  AO,   4 * SIZE
1282   LD b1,  BO,   0 * SIZE
1283   MOV c12, c11
1284   LD b2,  BO,   1 * SIZE
1285   MOV c22, c11
1286   LD b3,  BO,   2 * SIZE
1287   LD b5,  BO,   4 * SIZE
1288   LD b6,  BO,   8 * SIZE
1289   LD b7,  BO,  12 * SIZE
/* TEMP = number of k-steps for this tile; both non-"K - KK" cases use KK + 2. */
1290#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1291   sub.d   TEMP, K, KK
1292#elif defined(LEFT)
1293   addi.d  TEMP, KK, 2
1294#else
1295   addi.d  TEMP, KK, 2
1296#endif
1297   srai.d  L,  TEMP, 2               /* L = TEMP >> 2 unrolled passes */
1298   bge $r0,    L, .L55
1299#else
/* Plain GEMM: same setup, addressed through B; BO is set to B afterwards. */
1300   LD a1,  AO,   0 * SIZE
1301   MTC  c11, $r0
1302   LD a2,  AO,   1 * SIZE
1303   MOV c21, c11
1304   LD a5,  AO,   4 * SIZE
1305   LD b1,  B,   0 * SIZE
1306   MOV c12, c11
1307   LD b2,  B,   1 * SIZE
1308   MOV c22, c11
1309   LD b3,  B,   2 * SIZE
1310   LD b5,  B,   4 * SIZE
1311   srai.d  L,  K, 2                  /* L = K >> 2 unrolled passes */
1312   LD b6,  B,   8 * SIZE
1313   LD b7,  B,  12 * SIZE
1314move   BO,  B
1315   bge $r0,    L, .L55
1316#endif
1317   .align  3
/*
 * .L52: main k loop of the 2x2 tile, unrolled four k-steps per pass
 * (AO and BO each advance by 8 * SIZE).  Per k-step:
 *   c11 += b(col0) * a(row0)    c21 += b(col1) * a(row0)
 *   c12 += b(col0) * a(row1)    c22 += b(col1) * a(row1)
 * Loads for later steps and for the next pass are interleaved with the
 * multiply-adds, so the instruction order must be kept as is.
 */
1318.L52:
/* k-step 0: rows a1/a2, columns b1/b2. */
1319   MADD  c11, b1, a1, c11
1320   LD a3,  AO,   2 * SIZE
1321   MADD  c21, b2, a1, c21
1322   LD b4,  BO,   3 * SIZE
1323   MADD  c12, b1, a2, c12
1324   LD a4,  AO,   3 * SIZE
1325   MADD  c22, b2, a2, c22
1326   LD b1,  BO,   8 * SIZE
/* k-step 1: rows a3/a4 (AO[2..3]), columns b3/b4 (BO[2..3]). */
1327   MADD  c11, b3, a3, c11
1328   LD a1,  AO,   8 * SIZE
1329   MADD  c21, b4, a3, c21
1330   LD b2,  BO,   5 * SIZE
1331   MADD  c12, b3, a4, c12
1332   LD a2,  AO,   5 * SIZE
1333   MADD  c22, b4, a4, c22
1334   LD b3,  BO,   6 * SIZE
/* k-step 2: rows a5/a2 (AO[4..5]), columns b5/b2 (BO[4..5]). */
1335   MADD  c11, b5, a5, c11
1336   LD a3,  AO,   6 * SIZE
1337   MADD  c21, b2, a5, c21
1338   LD b4,  BO,   7 * SIZE
1339   MADD  c12, b5, a2, c12
1340   LD a4,  AO,   7 * SIZE
1341   MADD  c22, b2, a2, c22
1342   LD b5,  BO,  12 * SIZE
/* k-step 3: rows a3/a4 (AO[6..7]), columns b3/b4 (BO[6..7]). */
1343   MADD  c11, b3, a3, c11
1344   LD a5,  AO,  12 * SIZE
1345   MADD  c21, b4, a3, c21
1346   LD b2,  BO,   9 * SIZE
1347   MADD  c12, b3, a4, c12
1348   LD a2,  AO,   9 * SIZE
1349   MADD  c22, b4, a4, c22
1350   LD b3,  BO,  10 * SIZE
1351   addi.d  AO, AO,  8 * SIZE
1352   addi.d  L, L, -1
1353addi.d BO, BO,  8 * SIZE
1354   blt $r0,    L, .L52               /* repeat while L > 0 */
1355   .align 3
1356
/*
 * .L55/.L56: remainder of the k loop for the 2x2 tile.
 * Runs (K & 3) single k-steps (TEMP & 3 for TRMM).  a1, b1 and b2 are
 * carried in from the previous step; a2 is reloaded from AO[1] at the top
 * of every pass.  AO and BO advance by 2 * SIZE per step.
 */
1357.L55:
1358#ifndef TRMMKERNEL
1359   andi    L,  K, 3
1360#else
1361   andi    L,  TEMP, 3
1362#endif
1363   bge $r0,    L, .L58               /* no remainder: go to the write-back */
1364   .align  3
1365.L56:
1366   MADD  c11, b1, a1, c11
1367   LD a2,  AO,   1 * SIZE
1368   MADD  c21, b2, a1, c21
1369   LD a1,  AO,   2 * SIZE            /* row 0 operand of the next step */
1370   MADD  c12, b1, a2, c12
1371   LD b1,  BO,   2 * SIZE            /* column operands of the next step */
1372   MADD  c22, b2, a2, c22
1373   LD b2,  BO,   3 * SIZE
1374   addi.d  L, L, -1
1375   addi.d  AO, AO,  2 * SIZE
1376addi.d BO, BO,  2 * SIZE
1377   blt $r0,    L, .L56               /* repeat while L > 0 */
/*
 * .L58: write back the 2x2 tile and loop over the row pairs.
 * c11/c12 go to CO1[0..1] and c21/c22 to CO2[0..1].  CO1/CO2 are advanced
 * by 2 * SIZE before the stores, hence the -2/-1 store offsets.
 *   GEMM: C = acc * ALPHA + C   (old C values are loaded into $f22/$f8/$f23/$f9)
 *   TRMM: C = ALPHA * acc, followed by the AO/BO/KK bookkeeping.
 * Branches back to .L51 while I > 0.
 */
1378.L58:
1379#ifndef TRMMKERNEL
1380   LD $f22,  CO1,  0 * SIZE
1381   addi.d  I, I, -1                  /* one row pair done */
1382   LD $f8,  CO1,  1 * SIZE
1383   addi.d  CO1,CO1, 2 * SIZE
1384   LD $f23,  CO2,  0 * SIZE
1385   LD $f9,  CO2,  1 * SIZE
1386   addi.d  CO2,CO2, 2 * SIZE
1387   MADD  c11, c11, ALPHA, $f22
1388   MADD  c12, c12, ALPHA, $f8
1389   MADD  c21, c21, ALPHA, $f23
1390   MADD  c22, c22, ALPHA, $f9
1391   ST c11,  CO1,  -2 * SIZE
1392   ST c12,  CO1,  -1 * SIZE
1393   ST c21,  CO2,  -2 * SIZE
1394   ST c22,  CO2,  -1 * SIZE
1395   blt $r0,    I, .L51
1396#else
1397   addi.d  I, I, -1                  /* one row pair done */
1398   addi.d  CO1,CO1, 2 * SIZE
1399   addi.d  CO2,CO2, 2 * SIZE
1400   MUL c11, ALPHA, c11
1401   MUL c12, ALPHA, c12
1402   MUL c21, ALPHA, c21
1403   MUL c22, ALPHA, c22
1404   ST c11,  CO1,  -2 * SIZE
1405   ST c12,  CO1,  -1 * SIZE
1406   ST c21,  CO2,  -2 * SIZE
1407   ST c22,  CO2,  -1 * SIZE
/* Advance AO/BO over the remaining (K - KK - 2) k-steps: two A values and */
/* two B values per step.                                                  */
1408#if ( defined(LEFT) &&  defined(TRANSA)) || \
1409    (!defined(LEFT) && !defined(TRANSA))
1410   sub.d   TEMP, K, KK
1411#ifdef LEFT
1412   addi.d  TEMP, TEMP, -2
1413#else
1414   addi.d  TEMP, TEMP, -2
1415#endif
1416   slli.d  L,    TEMP, 1 + BASE_SHIFT
1417   slli.d  TEMP, TEMP, 1 + BASE_SHIFT
1418   add.d   AO, AO, L
1419   add.d   BO, BO, TEMP
1420#endif
1421#ifdef LEFT
1422   addi.d  KK, KK, 2                 /* two more rows handled */
1423#endif
1424   blt $r0,    I, .L51
1425#endif
1426   .align 3
1427
/*
 * .L60: leftover single row (M & 1) of the two-column panel.
 * Clears c11/c21/c31/c41 (MTC c11, $r0 presumably writes zero; the others
 * are copies).  The unrolled loop at .L62 alternates between the pair
 * c11/c21 and the pair c31/c41; the two pairs are summed at .L68.
 * b5, b6 and b7 are loaded here but are not referenced by name in
 * .L62/.L66/.L68.
 */
1428.L60:
1429   andi    I,  M, 1
1430   bge $r0,    I, .L69               /* M even: nothing left for this panel */
1431#if defined(TRMMKERNEL)
1432#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1433   move    BO,  B
1434#else
/* Skip the first KK k-steps: one A value and two B values per step. */
1435   slli.d  L,    KK, 0 + BASE_SHIFT
1436   slli.d  TEMP, KK, 1 + BASE_SHIFT
1437   add.d   AO, AO, L
1438   add.d   BO, B,  TEMP
1439#endif
1440   LD a1,  AO,   0 * SIZE
1441   MTC  c11, $r0
1442   LD a2,  AO,   1 * SIZE
1443   MOV c21, c11
1444   LD a3,  AO,   2 * SIZE
1445   MOV c31, c11
1446   LD a4,  AO,   3 * SIZE
1447   MOV c41, c11
1448   LD b1,  BO,   0 * SIZE
1449   LD b2,  BO,   1 * SIZE
1450   LD b3,  BO,   2 * SIZE
1451   LD b4,  BO,   3 * SIZE
1452   LD b5,  BO,   4 * SIZE
1453   LD b6,  BO,   8 * SIZE
1454   LD b7,  BO,  12 * SIZE
/* TEMP = number of k-steps for this tile in the TRMM case. */
1455#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1456   sub.d   TEMP, K, KK
1457#elif defined(LEFT)
1458   addi.d  TEMP, KK, 1
1459#else
1460   addi.d  TEMP, KK, 2
1461#endif
1462   srai.d  L,  TEMP, 2               /* L = TEMP >> 2 unrolled passes */
1463   bge $r0,    L, .L65
1464#else
/* Plain GEMM: same setup, addressed through B; BO is set to B afterwards. */
1465   srai.d  L,  K, 2                  /* L = K >> 2 unrolled passes */
1466   LD a1,  AO,   0 * SIZE
1467   MTC  c11, $r0
1468   LD a2,  AO,   1 * SIZE
1469   MOV c21, c11
1470   LD a3,  AO,   2 * SIZE
1471   MOV c31, c11
1472   LD a4,  AO,   3 * SIZE
1473   MOV c41, c11
1474   LD b1,  B,   0 * SIZE
1475   LD b2,  B,   1 * SIZE
1476   LD b3,  B,   2 * SIZE
1477   LD b4,  B,   3 * SIZE
1478   LD b5,  B,   4 * SIZE
1479   LD b6,  B,   8 * SIZE
1480   LD b7,  B,  12 * SIZE
1481move   BO,  B
1482   bge $r0,    L, .L65
1483#endif
1484   .align  3
/*
 * .L62: main k loop of the 1x2 tile, unrolled four k-steps per pass
 * (AO advances by 4 * SIZE, BO by 8 * SIZE).
 * Even k-steps accumulate into c11/c21, odd k-steps into c31/c41.
 */
1485.L62:
/* k-step 0 (a1 with b1/b2) and k-step 1 (a2 with b3/b4). */
1486   MADD  c11, b1, a1, c11
1487   LD b1,  BO,   4 * SIZE
1488   MADD  c21, b2, a1, c21
1489   LD b2,  BO,   5 * SIZE
1490   MADD  c31, b3, a2, c31
1491   LD b3,  BO,   6 * SIZE
1492   MADD  c41, b4, a2, c41
1493   LD b4,  BO,   7 * SIZE
1494   LD a1,  AO,   4 * SIZE            /* a1/a2 for the next pass */
1495   LD a2,  AO,   5 * SIZE
/* k-step 2 (a3 with BO[4..5]) and k-step 3 (a4 with BO[6..7]). */
1496   MADD  c11, b1, a3, c11
1497   LD b1,  BO,   8 * SIZE
1498   MADD  c21, b2, a3, c21
1499   LD b2,  BO,   9 * SIZE
1500   MADD  c31, b3, a4, c31
1501   LD b3,  BO,  10 * SIZE
1502   MADD  c41, b4, a4, c41
1503   LD b4,  BO,  11 * SIZE
1504   LD a3,  AO,   6 * SIZE            /* a3/a4 for the next pass */
1505   LD a4,  AO,   7 * SIZE
1506   addi.d  L, L, -1
1507   addi.d  AO, AO,  4 * SIZE
1508addi.d BO, BO,  8 * SIZE
1509   blt $r0,    L, .L62               /* repeat while L > 0 */
1510   .align 3
1511
/*
 * .L65/.L66: remainder of the k loop for the 1x2 tile.
 * Runs (K & 3) single k-steps (TEMP & 3 for TRMM), accumulating into
 * c11/c21 only.  a1, b1 and b2 are carried in from the previous step and
 * reloaded for the following one; AO advances by 1 * SIZE, BO by 2 * SIZE.
 */
1512.L65:
1513#ifndef TRMMKERNEL
1514   andi    L,  K, 3
1515#else
1516   andi    L,  TEMP, 3
1517#endif
1518   bge $r0,    L, .L68               /* no remainder: go to the write-back */
1519   .align  3
1520.L66:
1521   MADD  c11, b1, a1, c11
1522   LD b1,  BO,   2 * SIZE
1523   MADD  c21, b2, a1, c21
1524   LD b2,  BO,   3 * SIZE
1525   LD a1,  AO,   1 * SIZE
1526   addi.d  L, L, -1
1527   addi.d  AO, AO,  1 * SIZE
1528addi.d BO, BO,  2 * SIZE
1529   blt $r0,    L, .L66               /* repeat while L > 0 */
/*
 * .L68: write back the 1x2 tile.
 * First folds the two partial sums together (c11 += c31, c21 += c41), then
 *   GEMM: C = acc * ALPHA + C   (old C values are loaded into $f22/$f8)
 *   TRMM: C = ALPHA * acc, followed by the AO/BO/KK bookkeeping.
 * .L69: end of the two-column panel: optional KK update and B = BO.
 */
1530.L68:
1531#ifndef TRMMKERNEL
1532   LD $f22,  CO1,  0 * SIZE
1533   LD $f8,  CO2,  0 * SIZE
1534   ADD c11, c11, c31
1535   ADD c21, c21, c41
1536   MADD  c11, c11, ALPHA, $f22
1537   MADD  c21, c21, ALPHA, $f8
1538   ST c11,  CO1,   0 * SIZE
1539   ST c21,  CO2,   0 * SIZE
1540#else
1541   ADD c11, c11, c31
1542   ADD c21, c21, c41
1543   MUL c11, ALPHA, c11
1544   MUL c21, ALPHA, c21
1545   ST c11,  CO1,   0 * SIZE
1546   ST c21,  CO2,   0 * SIZE
/* Advance AO/BO over the remaining (K - KK - 1) or (K - KK - 2) k-steps: */
/* one A value and two B values per step.                                 */
1547#if ( defined(LEFT) &&  defined(TRANSA)) || \
1548    (!defined(LEFT) && !defined(TRANSA))
1549   sub.d   TEMP, K, KK
1550#ifdef LEFT
1551   addi.d  TEMP, TEMP, -1
1552#else
1553   addi.d  TEMP, TEMP, -2
1554#endif
1555   slli.d  L,    TEMP, 0 + BASE_SHIFT
1556   slli.d  TEMP, TEMP, 1 + BASE_SHIFT
1557   add.d   AO, AO, L
1558   add.d   BO, BO, TEMP
1559#endif
1560#ifdef LEFT
1561   addi.d  KK, KK, 1                 /* one more row handled */
1562#endif
1563#endif
1564   .align 3
1565
1566.L69:
1567#if defined(TRMMKERNEL) && !defined(LEFT)
1568   addi.d  KK, KK, 2                 /* two more columns handled */
1569#endif
1570   move    B, BO                     /* B = BO: continue from where the B stream stopped */
1571   .align 3
1572
/*
 * .L70: single-column panel, taken when N is odd (N & 1).
 * Resets AO to A, sets CO1 to the column, moves C past it and starts the
 * loop over pairs of rows (I = M >> 1).  Jumps to the exit at .L999 when
 * N is even.
 */
1573.L70:
1574   andi    J,  N, 1
1575move   AO, A
1576   bge $r0,    J, .L999              /* N even: all columns are done */
1577   move    CO1, C
1578#if defined(TRMMKERNEL) &&  defined(LEFT)
1579   move    KK, OFFSET
1580#endif
1581   srai.d  I,  M, 1                  /* I = M >> 1 row pairs */
1582add.d  C,   CO1,    LDC              /* C now points past this column */
1583   bge $r0,    I, .L80               /* fewer than two rows: only the M & 1 case */
/*
 * .L71: set up one 2x1 tile (two rows, one column).
 * Clears c11/c21/c12/c22 (MTC c11, $r0 presumably writes zero; the others
 * are copies) and derives the k trip count.
 * NOTE(review): .L72 and .L76 reload a1, a2 and b1 at the top of every
 * pass, and a5, b2, b3, b5, b6, b7 are not referenced by name in
 * .L72/.L76/.L78, so the operand loads in this section look unused -
 * confirm against the register aliases before changing them.
 */
1584.L71:
1585#if defined(TRMMKERNEL)
1586#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1587   move    BO,  B
1588#else
/* Skip the first KK k-steps: two A values and one B value per step. */
1589   slli.d  L,    KK, 1 + BASE_SHIFT
1590   slli.d  TEMP, KK, 0 + BASE_SHIFT
1591   add.d   AO, AO, L
1592   add.d   BO, B,  TEMP
1593#endif
1594   LD a1,  AO,   0 * SIZE
1595   MTC  c11, $r0
1596   LD a2,  AO,   1 * SIZE
1597   MOV c21, c11
1598   LD a5,  AO,   4 * SIZE
1599   LD b1,  BO,   0 * SIZE
1600   MOV c12, c11
1601   LD b2,  BO,   1 * SIZE
1602   MOV c22, c11
1603   LD b3,  BO,   2 * SIZE
1604   LD b5,  BO,   4 * SIZE
1605   LD b6,  BO,   8 * SIZE
1606   LD b7,  BO,  12 * SIZE
/* TEMP = number of k-steps for this tile in the TRMM case. */
1607#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1608   sub.d   TEMP, K, KK
1609#elif defined(LEFT)
1610   addi.d  TEMP, KK, 2
1611#else
1612   addi.d  TEMP, KK, 1
1613#endif
1614   srai.d  L,  TEMP, 2               /* L = TEMP >> 2 unrolled passes */
1615   bge $r0,    L, .L75
1616#else
/* Plain GEMM: same setup, addressed through B; BO is set to B afterwards. */
1617   LD a1,  AO,   0 * SIZE
1618   MTC  c11, $r0
1619   LD a2,  AO,   1 * SIZE
1620   MOV c21, c11
1621   LD a5,  AO,   4 * SIZE
1622   LD b1,  B,   0 * SIZE
1623   MOV c12, c11
1624   LD b2,  B,   1 * SIZE
1625   MOV c22, c11
1626   LD b3,  B,   2 * SIZE
1627   LD b5,  B,   4 * SIZE
1628   srai.d  L,  K, 2                  /* L = K >> 2 unrolled passes */
1629   LD b6,  B,   8 * SIZE
1630   LD b7,  B,  12 * SIZE
1631move   BO,  B
1632   bge $r0,    L, .L75
1633#endif
1634   .align  3
/*
 * .L72: main k loop of the 2x1 tile, four k-steps per pass
 * (AO advances by 8 * SIZE, BO by 4 * SIZE).
 * Every k-step loads its own operands (two A values, one B value) and
 * then updates c11 (row 0) and c12 (row 1); nothing is carried between
 * steps except the accumulators.
 */
1635.L72:
/* k-step 0 */
1636   LD a1,  AO,   0 * SIZE
1637   LD a2,  AO,   1 * SIZE
1638   LD b1,  BO,   0 * SIZE
1639   MADD  c11, b1, a1, c11
1640   MADD  c12, b1, a2, c12
/* k-step 1 */
1641   LD a1,  AO,   2 * SIZE
1642   LD a2,  AO,   3 * SIZE
1643   LD b1,  BO,   1 * SIZE
1644   MADD  c11, b1, a1, c11
1645   MADD  c12, b1, a2, c12
/* k-step 2 */
1646   LD a1,  AO,   4 * SIZE
1647   LD a2,  AO,   5 * SIZE
1648   LD b1,  BO,   2 * SIZE
1649   MADD  c11, b1, a1, c11
1650   MADD  c12, b1, a2, c12
/* k-step 3 */
1651   LD a1,  AO,   6 * SIZE
1652   LD a2,  AO,   7 * SIZE
1653   LD b1,  BO,   3 * SIZE
1654   MADD  c11, b1, a1, c11
1655   MADD  c12, b1, a2, c12
1656   addi.d  L, L, -1
1657   addi.d  AO, AO,  8 * SIZE
1658addi.d BO, BO,  4 * SIZE
1659   blt $r0,    L, .L72               /* repeat while L > 0 */
1660   .align 3
1661
/*
 * .L75/.L76: remainder of the k loop for the 2x1 tile.
 * Runs (K & 3) single k-steps (TEMP & 3 for TRMM); each step loads two A
 * values and one B value, updates c11/c12 and advances AO by 2 * SIZE and
 * BO by 1 * SIZE.
 */
1662.L75:
1663#ifndef TRMMKERNEL
1664   andi    L,  K, 3
1665#else
1666   andi    L,  TEMP, 3
1667#endif
1668   bge $r0,    L, .L78               /* no remainder: go to the write-back */
1669   .align  3
1670.L76:
1671   LD a1,  AO,   0 * SIZE
1672   LD a2,  AO,   1 * SIZE
1673   LD b1,  BO,   0 * SIZE
1674   MADD  c11, b1, a1, c11
1675   MADD  c12, b1, a2, c12
1676   addi.d  L, L, -1
1677   addi.d  AO, AO,  2 * SIZE
1678addi.d BO, BO,  1 * SIZE
1679   blt $r0,    L, .L76               /* repeat while L > 0 */
/*
 * .L78: write back the 2x1 tile and loop over the row pairs.
 * Adds c21 into c11 and c22 into c12 (c21/c22 were cleared at .L71 and are
 * not accumulated into by .L72/.L76), then
 *   GEMM: C = acc * ALPHA + C   (old C values are loaded into $f22/$f8)
 *   TRMM: C = ALPHA * acc, followed by the AO/BO/KK bookkeeping.
 * CO1 is advanced by 2 * SIZE before the stores, hence the -2/-1 offsets.
 * Branches back to .L71 while I > 0.
 */
1680.L78:
1681#ifndef TRMMKERNEL
1682   LD $f22,  CO1,  0 * SIZE
1683   addi.d  I, I, -1                  /* one row pair done */
1684   LD $f8,  CO1,  1 * SIZE
1685   addi.d  CO1,CO1, 2 * SIZE
1686   ADD c11, c11, c21
1687   ADD c12, c12, c22
1688   MADD  c11, c11, ALPHA, $f22
1689   MADD  c12, c12, ALPHA, $f8
1690   ST c11,  CO1,  -2 * SIZE
1691   ST c12,  CO1,  -1 * SIZE
1692   blt $r0,    I, .L71
1693#else
1694   ADD c11, c11, c21
1695   addi.d  I, I, -1                  /* one row pair done */
1696   ADD c12, c12, c22
1697   addi.d  CO1,CO1, 2 * SIZE
1698   MUL c11, ALPHA, c11
1699   MUL c12, ALPHA, c12
1700   ST c11,  CO1,  -2 * SIZE
1701   ST c12,  CO1,  -1 * SIZE
/* Advance AO/BO over the remaining (K - KK - 2) or (K - KK - 1) k-steps: */
/* two A values and one B value per step.                                 */
1702#if ( defined(LEFT) &&  defined(TRANSA)) || \
1703    (!defined(LEFT) && !defined(TRANSA))
1704   sub.d   TEMP, K, KK
1705#ifdef LEFT
1706   addi.d  TEMP, TEMP, -2
1707#else
1708   addi.d  TEMP, TEMP, -1
1709#endif
1710   slli.d  L,    TEMP, 1 + BASE_SHIFT
1711   slli.d  TEMP, TEMP, 0 + BASE_SHIFT
1712   add.d   AO, AO, L
1713   add.d   BO, BO, TEMP
1714#endif
1715#ifdef LEFT
1716   addi.d  KK, KK, 2                 /* two more rows handled */
1717#endif
1718   blt $r0,    I, .L71
1719#endif
1720   .align 3
1721
/*
 * .L80: last element: leftover single row (M & 1) of the single column.
 * Clears c11 and c21 (MTC c11, $r0 presumably writes zero; c21 is a copy)
 * and derives the k trip count.
 * NOTE(review): .L82 and .L86 reload a1 and b1 at the top of every pass,
 * and a2..a4 / b2..b7 are not referenced by name in .L82/.L86/.L88, so the
 * operand loads in this section look unused - confirm against the register
 * aliases before changing them.
 */
1722.L80:
1723   andi    I,  M, 1
1724   bge $r0,    I, .L89               /* M even: nothing left for this column */
1725#if defined(TRMMKERNEL)
1726#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1727   move    BO,  B
1728#else
/* Skip the first KK k-steps: one A value and one B value per step. */
1729   slli.d  L,    KK, 0 + BASE_SHIFT
1730   slli.d  TEMP, KK, 0 + BASE_SHIFT
1731   add.d   AO, AO, L
1732   add.d   BO, B,  TEMP
1733#endif
1734   LD a1,  AO,   0 * SIZE
1735   MTC  c11, $r0
1736   LD a2,  AO,   1 * SIZE
1737   MOV c21, c11
1738   LD a3,  AO,   2 * SIZE
1739   LD a4,  AO,   3 * SIZE
1740   LD b1,  BO,   0 * SIZE
1741   LD b2,  BO,   1 * SIZE
1742   LD b3,  BO,   2 * SIZE
1743   LD b4,  BO,   3 * SIZE
1744   LD b5,  BO,   4 * SIZE
1745   LD b6,  BO,   8 * SIZE
1746   LD b7,  BO,  12 * SIZE
/* TEMP = number of k-steps for this tile; both non-"K - KK" cases use KK + 1. */
1747#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1748   sub.d   TEMP, K, KK
1749#elif defined(LEFT)
1750   addi.d  TEMP, KK, 1
1751#else
1752   addi.d  TEMP, KK, 1
1753#endif
1754   srai.d  L,  TEMP, 2               /* L = TEMP >> 2 unrolled passes */
1755   bge $r0,    L, .L85
1756#else
/* Plain GEMM: same setup, addressed through B; BO is set to B afterwards. */
1757   LD a1,  AO,   0 * SIZE
1758   MTC  c11, $r0
1759   LD a2,  AO,   1 * SIZE
1760   MOV c21, c11
1761   LD a3,  AO,   2 * SIZE
1762   LD a4,  AO,   3 * SIZE
1763   LD b1,  B,   0 * SIZE
1764   LD b2,  B,   1 * SIZE
1765   LD b3,  B,   2 * SIZE
1766   LD b4,  B,   3 * SIZE
1767   LD b5,  B,   4 * SIZE
1768   LD b6,  B,   8 * SIZE
1769   LD b7,  B,  12 * SIZE
1770   srai.d  L,  K, 2                  /* L = K >> 2 unrolled passes */
1771move   BO,  B
1772   bge $r0,    L, .L85
1773#endif
1774   .align  3
/*
 * .L82: main k loop of the 1x1 tile, four k-steps per pass
 * (AO and BO each advance by 4 * SIZE).
 * Every k-step loads one A and one B value; even steps accumulate into
 * c11 and odd steps into c21.  The two sums are added together at .L88.
 */
1775.L82:
1776   LD a1,  AO,   0 * SIZE
1777   LD b1,  BO,   0 * SIZE
1778   MADD  c11, b1, a1, c11
1779   LD a1,  AO,   1 * SIZE
1780   LD b1,  BO,   1 * SIZE
1781   MADD  c21, b1, a1, c21
1782   LD a1,  AO,   2 * SIZE
1783   LD b1,  BO,   2 * SIZE
1784   MADD  c11, b1, a1, c11
1785   LD a1,  AO,   3 * SIZE
1786   LD b1,  BO,   3 * SIZE
1787   MADD  c21, b1, a1, c21
1788   addi.d  L, L, -1
1789   addi.d  AO, AO,  4 * SIZE
1790addi.d BO, BO,  4 * SIZE
1791   blt $r0,    L, .L82               /* repeat while L > 0 */
1792   .align 3
1793
/*
 * .L85/.L86: remainder of the k loop for the 1x1 tile.
 * Runs (K & 3) single k-steps (TEMP & 3 for TRMM); each step loads one A
 * and one B value, accumulates into c11 and advances AO and BO by
 * 1 * SIZE.
 */
1794.L85:
1795#ifndef TRMMKERNEL
1796   andi    L,  K, 3
1797#else
1798   andi    L,  TEMP, 3
1799#endif
1800   bge $r0,    L, .L88               /* no remainder: go to the write-back */
1801   .align  3
1802.L86:
1803   LD a1,  AO,   0 * SIZE
1804   LD b1,  BO,   0 * SIZE
1805   MADD  c11, b1, a1, c11
1806   addi.d  L, L, -1
1807   addi.d  AO, AO,  1 * SIZE
1808addi.d BO, BO,  1 * SIZE
1809   blt $r0,    L, .L86               /* repeat while L > 0 */
/*
 * .L88: write back the 1x1 tile.
 * Adds the two partial sums (c11 += c21), then
 *   GEMM: C = acc * ALPHA + C   (old C value is loaded into $f22)
 *   TRMM: C = ALPHA * acc
 * No AO/BO fix-up follows in the TRMM case; this is the last tile before
 * the exit at .L999.
 * .L89: optional KK update and B = BO.
 */
1810.L88:
1811#ifndef TRMMKERNEL
1812   LD $f22,  CO1,  0 * SIZE
1813   ADD c11, c11, c21
1814   MADD  c11, c11, ALPHA, $f22
1815   ST c11,  CO1,   0 * SIZE
1816#else
1817   ADD c11, c11, c21
1818   MUL c11, ALPHA, c11
1819   ST c11,  CO1,   0 * SIZE
1820#endif
1821   .align 3
1822
1823.L89:
1824#if defined(TRMMKERNEL) && !defined(LEFT)
1825   addi.d  KK, KK, 1                 /* one more column handled */
1826#endif
1827   move    B, BO                     /* B = BO */
1828   .align 3
1829
/*
 * .L999: common exit.
 * Reloads registers from the stack frame (presumably the slots written by
 * the prologue outside this chunk - the offsets must match it), releases
 * the 160-byte frame, copies $r17 into $r4 and $f22 into $f0, and returns
 * to the address in $r1.
 */
1830.L999:
/* Integer registers; $r30 (used as L) lives at offset 96, after the FP slots. */
1831   LDARG  $r23,  $sp,    0
1832   LDARG  $r24,  $sp,    8
1833   LDARG  $r25,  $sp,   16
1834   LDARG  $r26,  $sp,   24
1835   LDARG  $r27,  $sp,   32
1836   LDARG  $r28,  $sp,   40
1837   LDARG  $r29,  $sp,   48
1838   LDARG  $r30,  $sp,   96
/* Floating-point registers $f24..$f28. */
1839   fld.d  $f24,  $sp,  56
1840   fld.d  $f25,  $sp,  64
1841   fld.d  $f26,  $sp,  72
1842   fld.d  $f27,  $sp,  80
1843   fld.d  $f28,  $sp,  88
/* Two extra integer registers are reloaded only in the TRMM build. */
1844#if defined(TRMMKERNEL)
1845   LDARG  $r20,  $sp,  104
1846   LDARG  $r16,  $sp,  112
1847#endif
/* $f18..$f21 are reloaded only when __64BIT__ is not defined. */
1848#ifndef __64BIT__
1849   fld.d  $f18,  $sp, 120
1850   fld.d  $f19,  $sp, 128
1851   fld.d  $f20,  $sp, 136
1852   fld.d  $f21,  $sp, 144
1853#endif
1854   addi.d  $sp, $sp, 160             /* release the stack frame */
1855   move $r4, $r17                    /* $r4 = $r17 (the register named I) */
1856   fmov.d $f0, $f22
1857   jirl    $r0, $r1, 0x0             /* jump to $r1 without writing a link register */
1858
1859   EPILOGUE
1860