/*********************************************************************/ /* Copyright 2005-2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Software-prefetch tuning for the A panel: look-ahead distance (in  */
/* elements, it is multiplied by SIZE at the use sites) and the       */
/* function code passed as the second operand of `prefetch`.          */
#define APREFETCHSIZE 24
#define APREFETCH_CATEGORY 0

/* ------------------------------------------------------------------ */
/* Integer register roles.                                             */
/* SPARC window naming: %i = in, %o = out, %l = local, %g = global.    */
/* ------------------------------------------------------------------ */

/* Problem dimensions, taken from the incoming argument registers. */
#define M %i0
#define N %i1
#define K %i2

/* A and B trade places for 32-bit DOUBLE builds; in that build B is   */
/* also re-read from the stack by the entry code below.                */
/* NOTE(review): presumably because a double scalar argument occupies  */
/* two 32-bit argument registers and shifts the rest -- confirm        */
/* against the caller's prototype.                                     */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A %i5
#define B %i4
#else
#define A %i4
#define B %i5
#endif

/* C base pointer and its leading dimension; both are loaded from the  */
/* stack by the entry code below.                                      */
#define C %o4
#define LDC %o5

/* Working pointers into A and B, and the three loop counters. */
#define AO %l0
#define BO %l1
#define I %l2
#define J %l3
#define L %l4

/* Eight pointers into C, spaced LDC apart (set up at .LL11). */
#define C1 %o0
#define C2 %o1
#define C3 %o2
#define C4 %o3
#define C5 %l5
#define C6 %l6
#define C7 %l7
#define C8 %i3

/* OFFSET is the last argument (loaded from the stack); KK is derived  */
/* from it per LN/LT/RN/RT variant. TEMP1/TEMP2 are scratch. AORIG     */
/* holds the A base used to recompute AO in the LN/RT variants.        */
#define OFFSET %g1
#define KK %g2
#define TEMP1 %g3
#define TEMP2 %g4
#define AORIG %o7

/* ------------------------------------------------------------------ */
/* Floating-point register aliases.                                    */
/*   c01..c16 : the 16 accumulators of the 2x8 tile                    */
/*   a1..a5   : operands loaded through AO                             */
/*   b1..b9   : operands loaded through BO                             */
/* ccNN/aaN/bbN are the same registers written as bare numbers; that   */
/* is the form handed to the FCLR/FMADD/FNMSUB macros.                 */
/* ------------------------------------------------------------------ */
#ifdef DOUBLE
/* Double precision: even-numbered registers %f0..%f58. */
#define c01 %f0
#define c02 %f2
#define c03 %f4
#define c04 %f6
#define c05 %f8
#define c06 %f10
#define c07 %f12
#define c08 %f14
#define c09 %f16
#define c10 %f18
#define c11 %f20
#define c12 %f22
#define c13 %f24
#define c14 %f26
#define c15 %f28
#define c16 %f30
#define a1 %f32
#define a2 %f34
#define a3 %f36
#define a4 %f38
#define a5 %f40
#define b1 %f42
#define b2 %f44
#define b3 %f46
#define b4 %f48
#define b5 %f50
#define b6 %f52
#define b7 %f54
#define b8 %f56
#define b9 %f58
/* Numeric forms. For %f32 and above the number is odd (%f32 -> 1,     */
/* %f34 -> 3, ...).                                                    */
/* NOTE(review): this looks like the SPARC V9 5-bit encoding of        */
/* double registers (bit 5 of the register number folded into bit 0)   */
/* -- confirm against the macro definitions in common.h.               */
#define cc01 0
#define cc02 2
#define cc03 4
#define cc04 6
#define cc05 8
#define cc06 10
#define cc07 12
#define cc08 14
#define cc09 16
#define cc10 18
#define cc11 20
#define cc12 22
#define cc13 24
#define cc14 26
#define cc15 28
#define cc16 30
#define aa1 1
#define aa2 3
#define aa3 5
#define aa4 7
#define aa5 9
#define bb1 11
#define bb2 13
#define bb3 15
#define bb4 17
#define bb5 19
#define bb6 21
#define bb7 23
#define bb8 25
#define bb9 27
#else
/* Single precision: consecutive registers %f0..%f29; the numeric      */
/* form equals the register number.                                    */
#define c01 %f0
#define c02 %f1
#define c03 %f2
#define c04 %f3
#define c05 %f4
#define c06 %f5
#define c07 %f6
#define c08 %f7
#define c09 %f8
#define c10 %f9
#define c11 %f10
#define c12 %f11
#define c13 %f12
#define c14 %f13
#define c15 %f14
#define c16 %f15
#define a1 %f16
#define a2 %f17
#define a3 %f18
#define a4 %f19
#define a5 %f20
#define b1 %f21
#define b2 %f22
#define b3 %f23
#define b4 %f24
#define b5 %f25
#define b6 %f26
#define b7 %f27
#define b8 %f28
#define b9 %f29
#define cc01 0
#define cc02 1
#define cc03 2
#define cc04 3
#define cc05 4
#define cc06 5
#define cc07 6
#define cc08 7
#define cc09 8
#define cc10 9
#define cc11 10
#define cc12 11
#define cc13 12
#define cc14 13
#define cc15 14
#define cc16 15
#define aa1 16
#define aa2 17
#define aa3 18
#define aa4 19
#define aa5 20
#define bb1 21
#define bb2 22
#define bb3 23
#define bb4 24
#define bb5 25
#define bb6 26
#define bb7 27
#define bb8 28
#define bb9 29
#endif

/* Declare to the assembler that %g2 and %g3 (KK and TEMP1) are used   */
/* as scratch registers.                                               */
	.register %g2, #scratch
	.register %g3, #scratch

/* Routine entry. PROLOGUE and SAVESP are macros from common.h.        */
/* NOTE(review): presumably they emit the symbol/label and the         */
/* register-window `save` -- confirm in common.h.                      */
	PROLOGUE
	SAVESP
	nop

/* Fetch the arguments that are read from the stack frame, then spill  */
/* %g1..%g4 to the frame.                                              */
/* NOTE(review): OFFSET is %g1, so by the time %g1 is stored it        */
/* already holds the freshly loaded OFFSET, not the caller's %g1.      */
/* Whether that matters depends on the epilogue, which is not visible  */
/* in this chunk.                                                      */
#ifndef __64BIT__
#ifdef DOUBLE
	ld [%sp + STACK_START + 28], B
	ld [%sp + STACK_START + 32], C
	ld [%sp + STACK_START + 36], LDC
	ld [%sp + STACK_START + 40], OFFSET
#else
	ld [%sp + STACK_START + 28], C
	ld [%sp + STACK_START + 32], LDC
	ld [%sp + STACK_START + 36], OFFSET
#endif
	st %g1, [%sp + STACK_START + 8]
	st %g2, [%sp + STACK_START + 12]
	st %g3, [%sp + STACK_START + 16]
	st %g4, [%sp + STACK_START + 20]
#else
	ldx [%sp+ STACK_START + 56], C
	ldx [%sp+ STACK_START + 64], LDC
	ldx [%sp+ STACK_START + 72], OFFSET
	stx %g1, [%sp + STACK_START + 32]
	stx %g2, [%sp + STACK_START + 40]
	stx %g3, [%sp + STACK_START + 48]
	/* NOTE(review): this reuses the slot C was just loaded from. */
	stx %g4, [%sp + STACK_START + 56]
#endif

/* KK = -OFFSET when built as a right-side TRMM kernel. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	neg OFFSET, KK
#endif

/* LDC <<= BASE_SHIFT.                                                 */
/* NOTE(review): presumably converts the leading dimension from        */
/* elements to bytes (BASE_SHIFT = log2(SIZE)) -- confirm in common.h. */
	sll LDC, BASE_SHIFT, LDC

/* LN: advance A by M*K elements and C by M elements. */
#ifdef LN
	smul M, K, TEMP1
	sll TEMP1, BASE_SHIFT, TEMP1
	add A, TEMP1, A
	sll M, BASE_SHIFT, TEMP1
	add C, TEMP1, C
#endif

/* RN: KK = -OFFSET. */
#ifdef RN
	neg OFFSET, KK
#endif

/* RT: advance B by N*K elements and C by N*LDC (LDC is already        */
/* shifted); KK = N - OFFSET.                                          */
#ifdef RT
	smul N, K, TEMP1
	sll TEMP1, BASE_SHIFT, TEMP1
	add B, TEMP1, B
	smul N, LDC, TEMP1
	add C, TEMP1, C
	sub N, OFFSET, KK
#endif

/* J = N >> 3. Branch to .LL30 when J <= 0; the nop fills the branch   */
/* delay slot.                                                         */
	sra N, 3, J
	cmp J, 0
	ble,pn %icc, .LL30
	nop
	.align 4

.LL11:
/* RT: step B back by K << (BASE_SHIFT + 3), i.e. 8*K elements. */
#ifdef RT
	sll K, BASE_SHIFT + 3, TEMP1
	sub B, TEMP1, B
#endif

/* Set up C1..C8, spaced LDC apart.                                    */
/* Non-RT: C1 = C upward, then C += 8*LDC.                             */
/* RT:     C8 = C - LDC and downward from there.                       */
#ifndef RT
	mov C, C1
	add C, LDC, C2
	add C2, LDC, C3
	add C3, LDC, C4
	add C4, LDC, C5
	add C5, LDC, C6
	add C6, LDC, C7
	add C7, LDC, C8
	add C8, LDC, C
#else
	sub C, LDC, C8
	sub C8, LDC, C7
	sub C7, LDC, C6
	sub C6, LDC, C5
	sub C5, LDC, C4
sub C4, LDC, C3 sub C3, LDC, C2 sub C2, LDC, C1 sub C2, LDC, C #endif #ifdef LN add M, OFFSET, KK #endif #ifdef LT mov OFFSET, KK #endif #if defined(LN) || defined(RT) mov A, AORIG #else mov A, AO #endif sra M, 1, I cmp I, 0 ble,pn %icc, .LL20 nop .align 4 .LL12: #if defined(LT) || defined(RN) mov B, BO #else #ifdef LN sll K, BASE_SHIFT + 1, TEMP1 sub AORIG, TEMP1, AORIG #endif sll KK, BASE_SHIFT + 1, TEMP1 sll KK, BASE_SHIFT + 3, TEMP2 add AORIG, TEMP1, AO add B, TEMP2, BO #endif LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 8 * SIZE], a5 LDF [BO + 0 * SIZE], b1 LDF [BO + 1 * SIZE], b2 FCLR (cc01) LDF [BO + 2 * SIZE], b3 FCLR (cc05) LDF [BO + 3 * SIZE], b4 FCLR (cc09) LDF [BO + 4 * SIZE], b5 FCLR (cc13) LDF [BO + 5 * SIZE], b6 FCLR (cc02) LDF [BO + 6 * SIZE], b7 FCLR (cc06) LDF [BO + 7 * SIZE], b8 FCLR (cc10) LDF [BO + 8 * SIZE], b9 FCLR (cc14) prefetch [C1 + 1 * SIZE], 3 FCLR (cc03) prefetch [C2 + 2 * SIZE], 3 FCLR (cc07) prefetch [C3 + 1 * SIZE], 3 FCLR (cc11) prefetch [C4 + 2 * SIZE], 3 FCLR (cc15) prefetch [C5 + 1 * SIZE], 3 FCLR (cc04) prefetch [C6 + 2 * SIZE], 3 FCLR (cc08) prefetch [C7 + 1 * SIZE], 3 FCLR (cc12) prefetch [C8 + 2 * SIZE], 3 FCLR (cc16) #if defined(LT) || defined(RN) sra KK, 3, L #else sub K, KK, L sra L, 3, L #endif cmp L, 0 ble,pn %icc, .LL15 nop .align 4 .LL13: FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD 
(aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 
FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 
57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) ble,pn %icc, .LL15 LDF [BO + 7 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 16 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 2 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 3 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 14 * SIZE], b7 
FMADD (aa2, bb8, cc16, cc16) LDF [BO + 15 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 24 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 17 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 18 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 19 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 4 * SIZE], a1 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 5 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) add L, -1, L FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 20 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 21 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 22 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 23 * SIZE], b8 FMADD (aa1, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa1, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa1, bb3, cc05, cc05) LDF [BO + 32 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 25 * SIZE], b2 FMADD (aa1, bb4, cc07, cc07) LDF [BO + 26 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 27 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) LDF [AO + 6 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 7 * SIZE], a4 FMADD (aa1, bb6, cc11, cc11) nop FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa1, bb7, cc13, cc13) LDF [BO + 28 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 29 * SIZE], b6 FMADD (aa1, bb8, cc15, cc15) LDF [BO + 30 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 31 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 40 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 33 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 34 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 35 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 16 * SIZE], a1 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 9 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) nop FMADD (aa4, bb6, 
cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 36 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 37 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 38 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 39 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 48 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 41 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF [BO + 42 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 43 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 10 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 11 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY FMADD (aa2, bb6, cc12, cc12) nop FMADD (aa5, bb7, cc13, cc13) LDF [BO + 44 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO + 45 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO + 46 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO + 47 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 56 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 49 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 50 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 51 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 12 * SIZE], a5 FMADD (aa4, bb5, cc10, cc10) LDF [AO + 13 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) cmp L, 0 FMADD (aa4, bb6, cc12, cc12) nop FMADD (aa3, bb7, cc13, cc13) LDF [BO + 52 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 53 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 54 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) LDF [BO + 55 * SIZE], b8 FMADD (aa5, bb1, cc01, cc01) FMADD (aa2, bb1, cc02, cc02) FMADD (aa5, bb2, cc03, cc03) FMADD (aa2, bb2, cc04, cc04) FMADD (aa5, bb3, cc05, cc05) LDF [BO + 64 * SIZE], b1 FMADD (aa2, bb3, cc06, cc06) LDF [BO + 57 * SIZE], b2 FMADD (aa5, bb4, cc07, cc07) LDF 
[BO + 58 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 59 * SIZE], b4 FMADD (aa5, bb5, cc09, cc09) LDF [AO + 14 * SIZE], a3 FMADD (aa2, bb5, cc10, cc10) LDF [AO + 15 * SIZE], a4 FMADD (aa5, bb6, cc11, cc11) add BO, 64 * SIZE, BO FMADD (aa2, bb6, cc12, cc12) add AO, 16 * SIZE, AO FMADD (aa5, bb7, cc13, cc13) LDF [BO - 4 * SIZE], b5 FMADD (aa2, bb7, cc14, cc14) LDF [BO - 3 * SIZE], b6 FMADD (aa5, bb8, cc15, cc15) LDF [BO - 2 * SIZE], b7 FMADD (aa2, bb8, cc16, cc16) LDF [BO - 1 * SIZE], b8 FMADD (aa3, bb9, cc01, cc01) FMADD (aa4, bb9, cc02, cc02) FMADD (aa3, bb2, cc03, cc03) FMADD (aa4, bb2, cc04, cc04) FMADD (aa3, bb3, cc05, cc05) LDF [BO + 8 * SIZE], b9 FMADD (aa4, bb3, cc06, cc06) LDF [BO + 1 * SIZE], b2 FMADD (aa3, bb4, cc07, cc07) LDF [BO + 2 * SIZE], b3 FMADD (aa4, bb4, cc08, cc08) LDF [BO + 3 * SIZE], b4 FMADD (aa3, bb5, cc09, cc09) LDF [AO + 8 * SIZE], a5 /****/ FMADD (aa4, bb5, cc10, cc10) LDF [AO + 1 * SIZE], a2 FMADD (aa3, bb6, cc11, cc11) FMADD (aa4, bb6, cc12, cc12) FMADD (aa3, bb7, cc13, cc13) LDF [BO + 4 * SIZE], b5 FMADD (aa4, bb7, cc14, cc14) LDF [BO + 5 * SIZE], b6 FMADD (aa3, bb8, cc15, cc15) LDF [BO + 6 * SIZE], b7 FMADD (aa4, bb8, cc16, cc16) bg,pt %icc, .LL13 LDF [BO + 7 * SIZE], b8 .align 4 .LL15: #if defined(LT) || defined(RN) and KK, 7, L #else sub K, KK, L and L, 7, L #endif cmp L, 0 ble,a,pn %icc, .LL18 nop .align 4 .LL17: FMADD (aa1, bb1, cc01, cc01) add L, -1, L FMADD (aa2, bb1, cc02, cc02) nop FMADD (aa1, bb2, cc03, cc03) LDF [BO + 8 * SIZE], b1 FMADD (aa2, bb2, cc04, cc04) LDF [BO + 9 * SIZE], b2 FMADD (aa1, bb3, cc05, cc05) cmp L, 0 FMADD (aa2, bb3, cc06, cc06) nop FMADD (aa1, bb4, cc07, cc07) LDF [BO + 10 * SIZE], b3 FMADD (aa2, bb4, cc08, cc08) LDF [BO + 11 * SIZE], b4 FMADD (aa1, bb5, cc09, cc09) nop FMADD (aa2, bb5, cc10, cc10) nop FMADD (aa1, bb6, cc11, cc11) LDF [BO + 12 * SIZE], b5 FMADD (aa2, bb6, cc12, cc12) LDF [BO + 13 * SIZE], b6 FMADD (aa1, bb7, cc13, cc13) add AO, 2 * SIZE, AO FMADD (aa2, bb7, cc14, cc14) add BO, 8 * 
SIZE, BO FMADD (aa1, bb8, cc15, cc15) LDF [AO + 0 * SIZE], a1 FMADD (aa2, bb8, cc16, cc16) LDF [AO + 1 * SIZE], a2 LDF [BO + 6 * SIZE], b7 bg,pt %icc, .LL17 LDF [BO + 7 * SIZE], b8 nop .align 4 .LL18: #if defined(LN) || defined(RT) #ifdef LN sub KK, 2, TEMP1 #else sub KK, 8, TEMP1 #endif sll TEMP1, BASE_SHIFT + 1, TEMP2 sll TEMP1, BASE_SHIFT + 3, TEMP1 add AORIG, TEMP2, AO add B, TEMP1, BO #endif #if defined(LN) || defined(LT) LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c03, c03 FSUB a3, c05, c05 FSUB a4, c07, c07 FSUB b1, c09, c09 FSUB b2, c11, c11 FSUB b3, c13, c13 FSUB b4, c15, c15 LDF [BO + 8 * SIZE], a1 LDF [BO + 9 * SIZE], a2 LDF [BO + 10 * SIZE], a3 LDF [BO + 11 * SIZE], a4 LDF [BO + 12 * SIZE], b1 LDF [BO + 13 * SIZE], b2 LDF [BO + 14 * SIZE], b3 LDF [BO + 15 * SIZE], b4 FSUB a1, c02, c02 FSUB a2, c04, c04 FSUB a3, c06, c06 FSUB a4, c08, c08 FSUB b1, c10, c10 FSUB b2, c12, c12 FSUB b3, c14, c14 FSUB b4, c16, c16 #else LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 2 * SIZE], a3 LDF [AO + 3 * SIZE], a4 LDF [AO + 4 * SIZE], b1 LDF [AO + 5 * SIZE], b2 LDF [AO + 6 * SIZE], b3 LDF [AO + 7 * SIZE], b4 FSUB a1, c01, c01 FSUB a2, c02, c02 FSUB a3, c03, c03 FSUB a4, c04, c04 FSUB b1, c05, c05 FSUB b2, c06, c06 FSUB b3, c07, c07 FSUB b4, c08, c08 LDF [AO + 8 * SIZE], a1 LDF [AO + 9 * SIZE], a2 LDF [AO + 10 * SIZE], a3 LDF [AO + 11 * SIZE], a4 LDF [AO + 12 * SIZE], b1 LDF [AO + 13 * SIZE], b2 LDF [AO + 14 * SIZE], b3 LDF [AO + 15 * SIZE], b4 FSUB a1, c09, c09 FSUB a2, c10, c10 FSUB a3, c11, c11 FSUB a4, c12, c12 FSUB b1, c13, c13 FSUB b2, c14, c14 FSUB b3, c15, c15 FSUB b4, c16, c16 #endif #ifdef LN LDF [AO + 3 * SIZE], a1 LDF [AO + 2 * SIZE], a2 LDF [AO + 0 * SIZE], a3 FMUL a1, c02, c02 FMUL a1, c04, c04 FMUL a1, c06, c06 FMUL a1, c08, c08 FMUL a1, c10, c10 FMUL a1, c12, c12 FMUL a1, 
c14, c14 FMUL a1, c16, c16 FNMSUB (aa2, cc02, cc01, cc01) FNMSUB (aa2, cc04, cc03, cc03) FNMSUB (aa2, cc06, cc05, cc05) FNMSUB (aa2, cc08, cc07, cc07) FNMSUB (aa2, cc10, cc09, cc09) FNMSUB (aa2, cc12, cc11, cc11) FNMSUB (aa2, cc14, cc13, cc13) FNMSUB (aa2, cc16, cc15, cc15) FMUL a3, c01, c01 FMUL a3, c03, c03 FMUL a3, c05, c05 FMUL a3, c07, c07 FMUL a3, c09, c09 FMUL a3, c11, c11 FMUL a3, c13, c13 FMUL a3, c15, c15 #endif #ifdef LT LDF [AO + 0 * SIZE], a1 LDF [AO + 1 * SIZE], a2 LDF [AO + 3 * SIZE], a3 FMUL a1, c01, c01 FMUL a1, c03, c03 FMUL a1, c05, c05 FMUL a1, c07, c07 FMUL a1, c09, c09 FMUL a1, c11, c11 FMUL a1, c13, c13 FMUL a1, c15, c15 FNMSUB (aa2, cc01, cc02, cc02) FNMSUB (aa2, cc03, cc04, cc04) FNMSUB (aa2, cc05, cc06, cc06) FNMSUB (aa2, cc07, cc08, cc08) FNMSUB (aa2, cc09, cc10, cc10) FNMSUB (aa2, cc11, cc12, cc12) FNMSUB (aa2, cc13, cc14, cc14) FNMSUB (aa2, cc15, cc16, cc16) FMUL a3, c02, c02 FMUL a3, c04, c04 FMUL a3, c06, c06 FMUL a3, c08, c08 FMUL a3, c10, c10 FMUL a3, c12, c12 FMUL a3, c14, c14 FMUL a3, c16, c16 #endif #ifdef RN LDF [BO + 0 * SIZE], a1 LDF [BO + 1 * SIZE], a2 LDF [BO + 2 * SIZE], a3 LDF [BO + 3 * SIZE], a4 LDF [BO + 4 * SIZE], b1 LDF [BO + 5 * SIZE], b2 LDF [BO + 6 * SIZE], b3 LDF [BO + 7 * SIZE], b4 FMUL a1, c01, c01 FMUL a1, c02, c02 FNMSUB (aa2, cc01, cc03, cc03) FNMSUB (aa2, cc02, cc04, cc04) FNMSUB (aa3, cc01, cc05, cc05) FNMSUB (aa3, cc02, cc06, cc06) FNMSUB (aa4, cc01, cc07, cc07) FNMSUB (aa4, cc02, cc08, cc08) FNMSUB (bb1, cc01, cc09, cc09) FNMSUB (bb1, cc02, cc10, cc10) FNMSUB (bb2, cc01, cc11, cc11) FNMSUB (bb2, cc02, cc12, cc12) FNMSUB (bb3, cc01, cc13, cc13) FNMSUB (bb3, cc02, cc14, cc14) FNMSUB (bb4, cc01, cc15, cc15) FNMSUB (bb4, cc02, cc16, cc16) LDF [BO + 9 * SIZE], a1 LDF [BO + 10 * SIZE], a2 LDF [BO + 11 * SIZE], a3 LDF [BO + 12 * SIZE], a4 LDF [BO + 13 * SIZE], b1 LDF [BO + 14 * SIZE], b2 LDF [BO + 15 * SIZE], b3 FMUL a1, c03, c03 FMUL a1, c04, c04 FNMSUB (aa2, cc03, cc05, cc05) FNMSUB (aa2, cc04, cc06, cc06) 
FNMSUB (aa3, cc03, cc07, cc07) FNMSUB (aa3, cc04, cc08, cc08) FNMSUB (aa4, cc03, cc09, cc09) FNMSUB (aa4, cc04, cc10, cc10) FNMSUB (bb1, cc03, cc11, cc11) FNMSUB (bb1, cc04, cc12, cc12) FNMSUB (bb2, cc03, cc13, cc13) FNMSUB (bb2, cc04, cc14, cc14) FNMSUB (bb3, cc03, cc15, cc15) FNMSUB (bb3, cc04, cc16, cc16) LDF [BO + 18 * SIZE], a1 LDF [BO + 19 * SIZE], a2 LDF [BO + 20 * SIZE], a3 LDF [BO + 21 * SIZE], a4 LDF [BO + 22 * SIZE], b1 LDF [BO + 23 * SIZE], b2 FMUL a1, c05, c05 FMUL a1, c06, c06 FNMSUB (aa2, cc05, cc07, cc07) FNMSUB (aa2, cc06, cc08, cc08) FNMSUB (aa3, cc05, cc09, cc09) FNMSUB (aa3, cc06, cc10, cc10) FNMSUB (aa4, cc05, cc11, cc11) FNMSUB (aa4, cc06, cc12, cc12) FNMSUB (bb1, cc05, cc13, cc13) FNMSUB (bb1, cc06, cc14, cc14) FNMSUB (bb2, cc05, cc15, cc15) FNMSUB (bb2, cc06, cc16, cc16) LDF [BO + 27 * SIZE], a1 LDF [BO + 28 * SIZE], a2 LDF [BO + 29 * SIZE], a3 LDF [BO + 30 * SIZE], a4 LDF [BO + 31 * SIZE], b1 FMUL a1, c07, c07 FMUL a1, c08, c08 FNMSUB (aa2, cc07, cc09, cc09) FNMSUB (aa2, cc08, cc10, cc10) FNMSUB (aa3, cc07, cc11, cc11) FNMSUB (aa3, cc08, cc12, cc12) FNMSUB (aa4, cc07, cc13, cc13) FNMSUB (aa4, cc08, cc14, cc14) FNMSUB (bb1, cc07, cc15, cc15) FNMSUB (bb1, cc08, cc16, cc16) LDF [BO + 36 * SIZE], a1 LDF [BO + 37 * SIZE], a2 LDF [BO + 38 * SIZE], a3 LDF [BO + 39 * SIZE], a4 FMUL a1, c09, c09 FMUL a1, c10, c10 FNMSUB (aa2, cc09, cc11, cc11) FNMSUB (aa2, cc10, cc12, cc12) FNMSUB (aa3, cc09, cc13, cc13) FNMSUB (aa3, cc10, cc14, cc14) FNMSUB (aa4, cc09, cc15, cc15) FNMSUB (aa4, cc10, cc16, cc16) LDF [BO + 45 * SIZE], a1 LDF [BO + 46 * SIZE], a2 LDF [BO + 47 * SIZE], a3 FMUL a1, c11, c11 FMUL a1, c12, c12 FNMSUB (aa2, cc11, cc13, cc13) FNMSUB (aa2, cc12, cc14, cc14) FNMSUB (aa3, cc11, cc15, cc15) FNMSUB (aa3, cc12, cc16, cc16) LDF [BO + 54 * SIZE], a1 LDF [BO + 55 * SIZE], a2 FMUL a1, c13, c13 FMUL a1, c14, c14 FNMSUB (aa2, cc13, cc15, cc15) FNMSUB (aa2, cc14, cc16, cc16) LDF [BO + 63 * SIZE], a1 FMUL a1, c15, c15 FMUL a1, c16, c16 #endif #ifdef RT 
LDF [BO + 63 * SIZE], a1 LDF [BO + 62 * SIZE], a2 LDF [BO + 61 * SIZE], a3 LDF [BO + 60 * SIZE], a4 LDF [BO + 59 * SIZE], b1 LDF [BO + 58 * SIZE], b2 LDF [BO + 57 * SIZE], b3 LDF [BO + 56 * SIZE], b4 FMUL a1, c16, c16 FMUL a1, c15, c15 FNMSUB (aa2, cc16, cc14, cc14) FNMSUB (aa2, cc15, cc13, cc13) FNMSUB (aa3, cc16, cc12, cc12) FNMSUB (aa3, cc15, cc11, cc11) FNMSUB (aa4, cc16, cc10, cc10) FNMSUB (aa4, cc15, cc09, cc09) FNMSUB (bb1, cc16, cc08, cc08) FNMSUB (bb1, cc15, cc07, cc07) FNMSUB (bb2, cc16, cc06, cc06) FNMSUB (bb2, cc15, cc05, cc05) FNMSUB (bb3, cc16, cc04, cc04) FNMSUB (bb3, cc15, cc03, cc03) FNMSUB (bb4, cc16, cc02, cc02) FNMSUB (bb4, cc15, cc01, cc01) LDF [BO + 54 * SIZE], a1 LDF [BO + 53 * SIZE], a2 LDF [BO + 52 * SIZE], a3 LDF [BO + 51 * SIZE], a4 LDF [BO + 50 * SIZE], b1 LDF [BO + 49 * SIZE], b2 LDF [BO + 48 * SIZE], b3 FMUL a1, c14, c14 FMUL a1, c13, c13 FNMSUB (aa2, cc14, cc12, cc12) FNMSUB (aa2, cc13, cc11, cc11) FNMSUB (aa3, cc14, cc10, cc10) FNMSUB (aa3, cc13, cc09, cc09) FNMSUB (aa4, cc14, cc08, cc08) FNMSUB (aa4, cc13, cc07, cc07) FNMSUB (bb1, cc14, cc06, cc06) FNMSUB (bb1, cc13, cc05, cc05) FNMSUB (bb2, cc14, cc04, cc04) FNMSUB (bb2, cc13, cc03, cc03) FNMSUB (bb3, cc14, cc02, cc02) FNMSUB (bb3, cc13, cc01, cc01) LDF [BO + 45 * SIZE], a1 LDF [BO + 44 * SIZE], a2 LDF [BO + 43 * SIZE], a3 LDF [BO + 42 * SIZE], a4 LDF [BO + 41 * SIZE], b1 LDF [BO + 40 * SIZE], b2 FMUL a1, c12, c12 FMUL a1, c11, c11 FNMSUB (aa2, cc12, cc10, cc10) FNMSUB (aa2, cc11, cc09, cc09) FNMSUB (aa3, cc12, cc08, cc08) FNMSUB (aa3, cc11, cc07, cc07) FNMSUB (aa4, cc12, cc06, cc06) FNMSUB (aa4, cc11, cc05, cc05) FNMSUB (bb1, cc12, cc04, cc04) FNMSUB (bb1, cc11, cc03, cc03) FNMSUB (bb2, cc12, cc02, cc02) FNMSUB (bb2, cc11, cc01, cc01) LDF [BO + 36 * SIZE], a1 LDF [BO + 35 * SIZE], a2 LDF [BO + 34 * SIZE], a3 LDF [BO + 33 * SIZE], a4 LDF [BO + 32 * SIZE], b1 FMUL a1, c10, c10 FMUL a1, c09, c09 FNMSUB (aa2, cc10, cc08, cc08) FNMSUB (aa2, cc09, cc07, cc07) FNMSUB (aa3, cc10, cc06, 
cc06) FNMSUB (aa3, cc09, cc05, cc05) FNMSUB (aa4, cc10, cc04, cc04) FNMSUB (aa4, cc09, cc03, cc03) FNMSUB (bb1, cc10, cc02, cc02) FNMSUB (bb1, cc09, cc01, cc01) LDF [BO + 27 * SIZE], a1 LDF [BO + 26 * SIZE], a2 LDF [BO + 25 * SIZE], a3 LDF [BO + 24 * SIZE], a4 FMUL a1, c08, c08 FMUL a1, c07, c07 FNMSUB (aa2, cc08, cc06, cc06) FNMSUB (aa2, cc07, cc05, cc05) FNMSUB (aa3, cc08, cc04, cc04) FNMSUB (aa3, cc07, cc03, cc03) FNMSUB (aa4, cc08, cc02, cc02) FNMSUB (aa4, cc07, cc01, cc01) LDF [BO + 18 * SIZE], a1 LDF [BO + 17 * SIZE], a2 LDF [BO + 16 * SIZE], a3 FMUL a1, c06, c06 FMUL a1, c05, c05 FNMSUB (aa2, cc06, cc04, cc04) FNMSUB (aa2, cc05, cc03, cc03) FNMSUB (aa3, cc06, cc02, cc02) FNMSUB (aa3, cc05, cc01, cc01) LDF [BO + 9 * SIZE], a1 LDF [BO + 8 * SIZE], a2 FMUL a1, c04, c04 FMUL a1, c03, c03 FNMSUB (aa2, cc04, cc02, cc02) FNMSUB (aa2, cc03, cc01, cc01) LDF [BO + 0 * SIZE], a1 FMUL a1, c02, c02 FMUL a1, c01, c01 #endif #ifdef LN add C1, -2 * SIZE, C1 add C2, -2 * SIZE, C2 add C3, -2 * SIZE, C3 add C4, -2 * SIZE, C4 add C5, -2 * SIZE, C5 add C6, -2 * SIZE, C6 add C7, -2 * SIZE, C7 add C8, -2 * SIZE, C8 #endif #if defined(LN) || defined(LT) STF c01, [BO + 0 * SIZE] STF c03, [BO + 1 * SIZE] STF c05, [BO + 2 * SIZE] STF c07, [BO + 3 * SIZE] STF c09, [BO + 4 * SIZE] STF c11, [BO + 5 * SIZE] STF c13, [BO + 6 * SIZE] STF c15, [BO + 7 * SIZE] STF c02, [BO + 8 * SIZE] STF c04, [BO + 9 * SIZE] STF c06, [BO + 10 * SIZE] STF c08, [BO + 11 * SIZE] STF c10, [BO + 12 * SIZE] STF c12, [BO + 13 * SIZE] STF c14, [BO + 14 * SIZE] STF c16, [BO + 15 * SIZE] #else STF c01, [AO + 0 * SIZE] STF c02, [AO + 1 * SIZE] STF c03, [AO + 2 * SIZE] STF c04, [AO + 3 * SIZE] STF c05, [AO + 4 * SIZE] STF c06, [AO + 5 * SIZE] STF c07, [AO + 6 * SIZE] STF c08, [AO + 7 * SIZE] STF c09, [AO + 8 * SIZE] STF c10, [AO + 9 * SIZE] STF c11, [AO + 10 * SIZE] STF c12, [AO + 11 * SIZE] STF c13, [AO + 12 * SIZE] STF c14, [AO + 13 * SIZE] STF c15, [AO + 14 * SIZE] STF c16, [AO + 15 * SIZE] #endif STF c01, [C1 + 0 * 
SIZE]
/* The chunk opens inside the write-back of the 2-row, 8-column tile.
   The remaining accumulators c02..c16 are stored to the eight column
   pointers C1..C8, two consecutive elements per column.
   NOTE(review): LDF, STF, FCLR, FMADD, FNMSUB, FSUB and FMUL are macros
   defined outside this chunk. Comments below assume FMADD(a, b, c, d)
   gives d = a * b + c, FNMSUB(a, b, c, d) gives d = c - a * b, and that
   FSUB and FMUL follow the SPARC rs1, rs2, rd operand order. Confirm. */
	STF c02, [C1 + 1 * SIZE]
	STF c03, [C2 + 0 * SIZE]
	STF c04, [C2 + 1 * SIZE]
	STF c05, [C3 + 0 * SIZE]
	STF c06, [C3 + 1 * SIZE]
	STF c07, [C4 + 0 * SIZE]
	STF c08, [C4 + 1 * SIZE]
	STF c09, [C5 + 0 * SIZE]
	STF c10, [C5 + 1 * SIZE]
	STF c11, [C6 + 0 * SIZE]
	STF c12, [C6 + 1 * SIZE]
	STF c13, [C7 + 0 * SIZE]
	STF c14, [C7 + 1 * SIZE]
	STF c15, [C8 + 0 * SIZE]
	STF c16, [C8 + 1 * SIZE]
/* Every variant except LN steps the column pointers forward by the two
   rows just written. */
#ifndef LN
	add C1, 2 * SIZE, C1
	add C2, 2 * SIZE, C2
	add C3, 2 * SIZE, C3
	add C4, 2 * SIZE, C4
	add C5, 2 * SIZE, C5
	add C6, 2 * SIZE, C6
	add C7, 2 * SIZE, C7
	add C8, 2 * SIZE, C8
#endif
/* RT: AORIG advances by K << (BASE_SHIFT + 1) bytes. */
#ifdef RT
	sll K, BASE_SHIFT + 1, TEMP1
	add AORIG, TEMP1, AORIG
#endif
/* LT and RN: skip the K - KK steps that were not consumed, scaled by
   2 (AO) and by 8 (BO) elements per step. */
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 3, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 2, KK
#endif
#ifdef LN
	sub KK, 2, KK
#endif
/* Next pair of rows: loop to .LL12 (defined before this chunk) while
   I is still positive. */
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL12
	nop /* delay slot */
	.align 4
/* .LL20: single leftover row against 8 columns. Skipped entirely when
   bit 0 of M is clear. */
.LL20:
	and M, 1, I
	cmp I, 0
	ble,pn %icc, .LL29
	nop /* delay slot */
/* Position AO and BO. LT and RN start at the head of B. LN and RT start
   KK steps in (1 element per step in A, 8 per step in B), and LN first
   moves AORIG back by K elements. */
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 0, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 0, TEMP1
	sll KK, BASE_SHIFT + 3, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload four A values (four consecutive K steps) and the first row of
   eight B values, interleaved with clearing of the eight accumulators
   used by this tile (odd-numbered c01..c15). */
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [BO + 0 * SIZE], b1
	FCLR (cc01)
	LDF [BO + 1 * SIZE], b2
	FCLR (cc03)
	LDF [BO + 2 * SIZE], b3
	FCLR (cc05)
	LDF [BO + 3 * SIZE], b4
	FCLR (cc07)
	LDF [BO + 4 * SIZE], b5
	FCLR (cc09)
	LDF [BO + 5 * SIZE], b6
	FCLR (cc11)
	LDF [BO + 6 * SIZE], b7
	FCLR (cc13)
	LDF [BO + 7 * SIZE], b8
	FCLR (cc15)
/* Trip count of the 4-way unrolled loop: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL25
	LDF [BO + 8 * SIZE], b9 /* delay slot, runs on both paths */
	.align 4
/* .LL23: four K steps per pass. Each FMADD is paired with the load that
   refills a B register for a later step. Column 0 alternates between b1
   and b9 so its next value can be fetched one full step ahead. */
.LL23:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add L, -1, L
	/* step 0: a1 against B row 0 */
	FMADD (aa1, bb1, cc01, cc01)
	LDF [BO + 16 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 9 * SIZE], b2
	FMADD (aa1, bb3, cc05, cc05)
	LDF [BO + 10 * SIZE], b3
	FMADD (aa1, bb4, cc07, cc07)
	LDF [BO + 11 * SIZE], b4
	FMADD (aa1, bb5, cc09, cc09)
	LDF [BO + 12 * SIZE], b5
	FMADD (aa1, bb6, cc11, cc11)
	LDF [BO + 13 * SIZE], b6
	FMADD (aa1, bb7, cc13, cc13)
	LDF [BO + 14 * SIZE], b7
	FMADD (aa1, bb8, cc15, cc15)
	LDF [BO + 15 * SIZE], b8
	/* step 1: a2 against B row 1 (offsets 8..15) */
	FMADD (aa2, bb9, cc01, cc01)
	LDF [BO + 24 * SIZE], b9
	FMADD (aa2, bb2, cc03, cc03)
	LDF [BO + 17 * SIZE], b2
	FMADD (aa2, bb3, cc05, cc05)
	LDF [BO + 18 * SIZE], b3
	FMADD (aa2, bb4, cc07, cc07)
	LDF [BO + 19 * SIZE], b4
	FMADD (aa2, bb5, cc09, cc09)
	LDF [BO + 20 * SIZE], b5
	FMADD (aa2, bb6, cc11, cc11)
	LDF [BO + 21 * SIZE], b6
	FMADD (aa2, bb7, cc13, cc13)
	LDF [BO + 22 * SIZE], b7
	FMADD (aa2, bb8, cc15, cc15)
	LDF [BO + 23 * SIZE], b8
	/* a1 and a2 are free now, refill them for the next pass */
	LDF [AO + 4 * SIZE], a1
	LDF [AO + 5 * SIZE], a2
	/* step 2: a3 against B row 2 (offsets 16..23) */
	FMADD (aa3, bb1, cc01, cc01)
	LDF [BO + 32 * SIZE], b1
	FMADD (aa3, bb2, cc03, cc03)
	LDF [BO + 25 * SIZE], b2
	FMADD (aa3, bb3, cc05, cc05)
	LDF [BO + 26 * SIZE], b3
	FMADD (aa3, bb4, cc07, cc07)
	LDF [BO + 27 * SIZE], b4
	FMADD (aa3, bb5, cc09, cc09)
	LDF [BO + 28 * SIZE], b5
	FMADD (aa3, bb6, cc11, cc11)
	LDF [BO + 29 * SIZE], b6
	FMADD (aa3, bb7, cc13, cc13)
	LDF [BO + 30 * SIZE], b7
	FMADD (aa3, bb8, cc15, cc15)
	LDF [BO + 31 * SIZE], b8
	/* step 3: a4 against B row 3 (offsets 24..31) */
	FMADD (aa4, bb9, cc01, cc01)
	LDF [BO + 40 * SIZE], b9
	FMADD (aa4, bb2, cc03, cc03)
	LDF [BO + 33 * SIZE], b2
	FMADD (aa4, bb3, cc05, cc05)
	LDF [BO + 34 * SIZE], b3
	FMADD (aa4, bb4, cc07, cc07)
	LDF [BO + 35 * SIZE], b4
	FMADD (aa4, bb5, cc09, cc09)
	LDF [BO + 36 * SIZE], b5
	FMADD (aa4, bb6, cc11, cc11)
	LDF [BO + 37 * SIZE], b6
	FMADD (aa4, bb7, cc13, cc13)
	LDF [BO + 38 * SIZE], b7
	FMADD (aa4, bb8, cc15, cc15)
	LDF [BO + 39 * SIZE], b8
	LDF [AO + 6 * SIZE], a3
	LDF [AO + 7 * SIZE], a4
	add AO, 4 * SIZE, AO
	cmp L, 0
	bg,pt %icc, .LL23
	add BO, 32 * SIZE, BO /* delay slot: BO moves 4 rows of 8 */
	.align 4
/* .LL25: the remaining 0..3 K steps. */
.LL25:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL28
	nop /* annulled delay slot */
	.align 4
/* .LL27: one K step per pass, a1 against the eight B values b1..b8. */
.LL27:
	FMADD (aa1, bb1, cc01, cc01)
	LDF [BO + 8 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 9 * SIZE], b2
	FMADD (aa1, bb3, cc05, cc05)
	LDF [BO + 10 * SIZE], b3
	FMADD (aa1, bb4, cc07, cc07)
	LDF [BO + 11 * SIZE], b4
	FMADD (aa1, bb5, cc09, cc09)
	LDF [BO + 12 * SIZE], b5
	FMADD (aa1, bb6, cc11, cc11)
	LDF [BO + 13 * SIZE], b6
	FMADD (aa1, bb7, cc13, cc13)
	LDF [BO + 14 * SIZE], b7
	FMADD (aa1, bb8, cc15, cc15)
	LDF [BO + 15 * SIZE], b8
	LDF [AO + 1 * SIZE], a1
	add AO, 1 * SIZE, AO
	add L, -1, L
	cmp L, 0
	bg,pt %icc, .LL27
	add BO, 8 * SIZE, BO /* delay slot */
	.align 4
/* .LL28: solve step for the 1x8 tile. For LN and RT, AO and BO are first
   rewound to the block at step KK - 1 (LN) or KK - 8 (RT). */
.LL28:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 1, TEMP1
#else
	sub KK, 8, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 3, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
/* Form (stored value - accumulated product). LN and LT read the stored
   values from BO, RN and RT read them from AO. */
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	LDF [BO + 4 * SIZE], b1
	LDF [BO + 5 * SIZE], b2
	LDF [BO + 6 * SIZE], b3
	LDF [BO + 7 * SIZE], b4
	FSUB a1, c01, c01
	FSUB a2, c03, c03
	FSUB a3, c05, c05
	FSUB a4, c07, c07
	FSUB b1, c09, c09
	FSUB b2, c11, c11
	FSUB b3, c13, c13
	FSUB b4, c15, c15
#else
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [AO + 4 * SIZE], b1
	LDF [AO + 5 * SIZE], b2
	LDF [AO + 6 * SIZE], b3
	LDF [AO + 7 * SIZE], b4
	FSUB a1, c01, c01
	FSUB a2, c03, c03
	FSUB a3, c05, c05
	FSUB a4, c07, c07
	FSUB b1, c09, c09
	FSUB b2, c11, c11
	FSUB b3, c13, c13
	FSUB b4, c15, c15
#endif
/* LN and LT: the triangular block is a single element, so every column
   is scaled by AO[0].
   NOTE(review): a multiply rather than a divide suggests the packed
   diagonal already holds reciprocals. Confirm against the packing code. */
#if defined(LN) || defined(LT)
	LDF [AO + 0 * SIZE], a1
	FMUL a1, c01, c01
	FMUL a1, c03, c03
	FMUL a1, c05, c05
	FMUL a1, c07, c07
	FMUL a1, c09, c09
	FMUL a1, c11, c11
	FMUL a1, c13, c13
	FMUL a1, c15, c15
#endif
/* RN: forward sweep over the 8x8 block at BO. Row i is read starting at
   its diagonal entry (offsets 0, 9, 18, 27, 36, 45, 54, 63). The column
   is scaled by the diagonal entry and then eliminated from every later
   column. */
#ifdef RN
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	LDF [BO + 4 * SIZE], b1
	LDF [BO + 5 * SIZE], b2
	LDF [BO + 6 * SIZE], b3
	LDF [BO + 7 * SIZE], b4
	FMUL a1, c01, c01
	FNMSUB (aa2, cc01, cc03, cc03)
	FNMSUB (aa3, cc01, cc05, cc05)
	FNMSUB (aa4, cc01, cc07, cc07)
	FNMSUB (bb1, cc01, cc09, cc09)
	FNMSUB (bb2, cc01, cc11, cc11)
	FNMSUB (bb3, cc01, cc13, cc13)
	FNMSUB (bb4, cc01, cc15, cc15)
	LDF [BO + 9 * SIZE], a1
	LDF [BO + 10 * SIZE], a2
	LDF [BO + 11 * SIZE], a3
	LDF [BO + 12 * SIZE], a4
	LDF [BO + 13 * SIZE], b1
	LDF [BO + 14 * SIZE], b2
	LDF [BO + 15 * SIZE], b3
	FMUL a1, c03, c03
	FNMSUB (aa2, cc03, cc05, cc05)
	FNMSUB (aa3, cc03, cc07, cc07)
	FNMSUB (aa4, cc03, cc09, cc09)
	FNMSUB (bb1, cc03, cc11, cc11)
	FNMSUB (bb2, cc03, cc13, cc13)
	FNMSUB (bb3, cc03, cc15, cc15)
	LDF [BO + 18 * SIZE], a1
	LDF [BO + 19 * SIZE], a2
	LDF [BO + 20 * SIZE], a3
	LDF [BO + 21 * SIZE], a4
	LDF [BO + 22 * SIZE], b1
	LDF [BO + 23 * SIZE], b2
	FMUL a1, c05, c05
	FNMSUB (aa2, cc05, cc07, cc07)
	FNMSUB (aa3, cc05, cc09, cc09)
	FNMSUB (aa4, cc05, cc11, cc11)
	FNMSUB (bb1, cc05, cc13, cc13)
	FNMSUB (bb2, cc05, cc15, cc15)
	LDF [BO + 27 * SIZE], a1
	LDF [BO + 28 * SIZE], a2
	LDF [BO + 29 * SIZE], a3
	LDF [BO + 30 * SIZE], a4
	LDF [BO + 31 * SIZE], b1
	FMUL a1, c07, c07
	FNMSUB (aa2, cc07, cc09, cc09)
	FNMSUB (aa3, cc07, cc11, cc11)
	FNMSUB (aa4, cc07, cc13, cc13)
	FNMSUB (bb1, cc07, cc15, cc15)
	LDF [BO + 36 * SIZE], a1
	LDF [BO + 37 * SIZE], a2
	LDF [BO + 38 * SIZE], a3
	LDF [BO + 39 * SIZE], a4
	FMUL a1, c09, c09
	FNMSUB (aa2, cc09, cc11, cc11)
	FNMSUB (aa3, cc09, cc13, cc13)
	FNMSUB (aa4, cc09, cc15, cc15)
	LDF [BO + 45 * SIZE], a1
	LDF [BO + 46 * SIZE], a2
	LDF [BO + 47 * SIZE], a3
	FMUL a1, c11, c11
	FNMSUB (aa2, cc11, cc13, cc13)
	FNMSUB (aa3, cc11, cc15, cc15)
	LDF [BO + 54 * SIZE], a1
	LDF [BO + 55 * SIZE], a2
	FMUL a1, c13, c13
	FNMSUB (aa2, cc13, cc15, cc15)
	LDF [BO + 63 * SIZE], a1
	FMUL a1, c15, c15
#endif
/* RT: the mirror-image backward sweep. It starts at the last diagonal
   entry (offset 63) and reads each row leftwards from its diagonal,
   eliminating the solved column from every earlier one. */
#ifdef RT
	LDF [BO + 63 * SIZE], a1
	LDF [BO + 62 * SIZE], a2
	LDF [BO + 61 * SIZE], a3
	LDF [BO + 60 * SIZE], a4
	LDF [BO + 59 * SIZE], b1
	LDF [BO + 58 * SIZE], b2
	LDF [BO + 57 * SIZE], b3
	LDF [BO + 56 * SIZE], b4
	FMUL a1, c15, c15
	FNMSUB (aa2, cc15, cc13, cc13)
	FNMSUB (aa3, cc15, cc11, cc11)
	FNMSUB (aa4, cc15, cc09, cc09)
	FNMSUB (bb1, cc15, cc07, cc07)
	FNMSUB (bb2, cc15, cc05, cc05)
	FNMSUB (bb3, cc15, cc03, cc03)
	FNMSUB (bb4, cc15, cc01, cc01)
	LDF [BO + 54 * SIZE], a1
	LDF [BO + 53 * SIZE], a2
	LDF [BO + 52 * SIZE], a3
	LDF [BO + 51 * SIZE], a4
	LDF [BO + 50 * SIZE], b1
	LDF [BO + 49 * SIZE], b2
	LDF [BO + 48 * SIZE], b3
	FMUL a1, c13, c13
	FNMSUB (aa2, cc13, cc11, cc11)
	FNMSUB (aa3, cc13, cc09, cc09)
	FNMSUB (aa4, cc13, cc07, cc07)
	FNMSUB (bb1, cc13, cc05, cc05)
	FNMSUB (bb2, cc13, cc03, cc03)
	FNMSUB (bb3, cc13, cc01, cc01)
	LDF [BO + 45 * SIZE], a1
	LDF [BO + 44 * SIZE], a2
	LDF [BO + 43 * SIZE], a3
	LDF [BO + 42 * SIZE], a4
	LDF [BO + 41 * SIZE], b1
	LDF [BO + 40 * SIZE], b2
	FMUL a1, c11, c11
	FNMSUB (aa2, cc11, cc09, cc09)
	FNMSUB (aa3, cc11, cc07, cc07)
	FNMSUB (aa4, cc11, cc05, cc05)
	FNMSUB (bb1, cc11, cc03, cc03)
	FNMSUB (bb2, cc11, cc01, cc01)
	LDF [BO + 36 * SIZE], a1
	LDF [BO + 35 * SIZE], a2
	LDF [BO + 34 * SIZE], a3
	LDF [BO + 33 * SIZE], a4
	LDF [BO + 32 * SIZE], b1
	FMUL a1, c09, c09
	FNMSUB (aa2, cc09, cc07, cc07)
	FNMSUB (aa3, cc09, cc05, cc05)
	FNMSUB (aa4, cc09, cc03, cc03)
	FNMSUB (bb1, cc09, cc01, cc01)
	LDF [BO + 27 * SIZE], a1
	LDF [BO + 26 * SIZE], a2
	LDF [BO + 25 * SIZE], a3
	LDF [BO + 24 * SIZE], a4
	FMUL a1, c07, c07
	FNMSUB (aa2, cc07, cc05, cc05)
	FNMSUB (aa3, cc07, cc03, cc03)
	FNMSUB (aa4, cc07, cc01, cc01)
	LDF [BO + 18 * SIZE], a1
	LDF [BO + 17 * SIZE], a2
	LDF [BO + 16 * SIZE], a3
	FMUL a1, c05, c05
	FNMSUB (aa2, cc05, cc03, cc03)
	FNMSUB (aa3, cc05, cc01, cc01)
	LDF [BO + 9 * SIZE], a1
	LDF [BO + 8 * SIZE], a2
	FMUL a1, c03, c03
	FNMSUB (aa2, cc03, cc01, cc01)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c01, c01
#endif
/* LN walks the rows of C downwards, so the column pointers step back one
   row before the store. */
#ifdef LN
	add C1, -1 * SIZE, C1
	add C2, -1 * SIZE, C2
	add C3, -1 * SIZE, C3
	add C4, -1 * SIZE, C4
	add C5, -1 * SIZE, C5
	add C6, -1 * SIZE, C6
	add C7, -1 * SIZE, C7
	add C8, -1 * SIZE, C8
#endif
/* Write the solved values back into the packed buffer they were read
   from (BO for LN and LT, AO otherwise). */
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
	STF c03, [BO + 1 * SIZE]
	STF c05, [BO + 2 * SIZE]
	STF c07, [BO + 3 * SIZE]
	STF c09, [BO + 4 * SIZE]
	STF c11, [BO + 5 * SIZE]
	STF c13, [BO + 6 * SIZE]
	STF c15, [BO + 7 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
	STF c03, [AO + 1 * SIZE]
	STF c05, [AO + 2 * SIZE]
	STF c07, [AO + 3 * SIZE]
	STF c09, [AO + 4 * SIZE]
	STF c11, [AO + 5 * SIZE]
	STF c13, [AO + 6 * SIZE]
	STF c15, [AO + 7 * SIZE]
#endif
/* NOTE(review): LDF, STF, FCLR, FMADD, FNMSUB, FSUB and FMUL are macros
   defined outside this chunk. Comments below assume FMADD(a, b, c, d)
   gives d = a * b + c, FNMSUB(a, b, c, d) gives d = c - a * b, and that
   FSUB and FMUL follow the SPARC rs1, rs2, rd operand order. Confirm. */
/* Store the eight results of the single-row tile, one per column
   pointer C1..C8. */
	STF c01, [C1 + 0 * SIZE]
	STF c03, [C2 + 0 * SIZE]
	STF c05, [C3 + 0 * SIZE]
	STF c07, [C4 + 0 * SIZE]
	STF c09, [C5 + 0 * SIZE]
	STF c11, [C6 + 0 * SIZE]
	STF c13, [C7 + 0 * SIZE]
	STF c15, [C8 + 0 * SIZE]
#ifdef RT
	sll K, BASE_SHIFT + 0, TEMP1
	add AORIG, TEMP1, AORIG
#endif
/* LT and RN: skip the K - KK unconsumed steps (1 element per step in AO,
   8 per step in BO). */
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 3, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 1, KK
#endif
#ifdef LN
	sub KK, 1, KK
#endif
	.align 4
/* .LL29: end of one 8-column panel. B is advanced (LN: by K * 8
   elements, LT and RN: to where BO stopped), KK moves by 8 for RN and RT,
   then the panel loop repeats at .LL11 (defined before this chunk)
   while J is still positive. */
.LL29:
#ifdef LN
	sll K, BASE_SHIFT + 3, TEMP1
	add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
	mov BO, B
#endif
#ifdef RN
	add KK, 8, KK
#endif
#ifdef RT
	sub KK, 8, KK
#endif
	add J, -1, J
	cmp J, 0
	bg,pt %icc, .LL11
	nop /* delay slot */
	.align 4
/* .LL30: 4-column panel, run only when bit 2 of N is set. */
.LL30:
	and N, 4, J
	cmp J, 0
	ble,pn %icc, .LL50
	nop /* delay slot */
/* RT consumes B from the end, so it steps B back by K * 4 elements. */
#ifdef RT
	sll K, BASE_SHIFT + 2, TEMP1
	sub B, TEMP1, B
#endif
/* Column pointers. Forward variants take C1..C4 upwards from C and leave
   C just past the panel. RT takes them downwards and leaves C equal to
   C1 (C2 - LDC). */
#ifndef RT
	mov C, C1
	add C, LDC, C2
	add C2, LDC, C3
	add C3, LDC, C4
	add C4, LDC, C
#else
	sub C, LDC, C4
	sub C4, LDC, C3
	sub C3, LDC, C2
	sub C2, LDC, C1
	sub C2, LDC, C
#endif
/* Per-panel reset of KK for the left-side variants. */
#ifdef LN
	add M, OFFSET, KK
#endif
#ifdef LT
	mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
	mov A, AORIG
#else
	mov A, AO
#endif
/* I = M / 2 row pairs. */
	sra M, 1, I
	cmp I, 0
	ble,pn %icc, .LL40
	nop /* delay slot */
	.align 4
/* .LL32: one 2-row by 4-column tile. */
.LL32:
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 1, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 1, TEMP1
	sll KK, BASE_SHIFT + 2, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload the first A pair and nine B values, clear the eight
   accumulators c01..c08 and prefetch the four C destinations. */
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [BO + 0 * SIZE], b1
	LDF [BO + 1 * SIZE], b2
	LDF [BO + 2 * SIZE], b3
	LDF [BO + 3 * SIZE], b4
	LDF [BO + 4 * SIZE], b5
	LDF [BO + 5 * SIZE], b6
	FCLR (cc01)
	LDF [BO + 6 * SIZE], b7
	FCLR (cc02)
	LDF [BO + 7 * SIZE], b8
	FCLR (cc03)
	LDF [BO + 8 * SIZE], b9
	FCLR (cc04)
	prefetch [C1 + 2 * SIZE], 3
	FCLR (cc05)
	prefetch [C2 + 2 * SIZE], 3
	FCLR (cc06)
	prefetch [C3 + 2 * SIZE], 3
	FCLR (cc07)
	prefetch [C4 + 2 * SIZE], 3
	FCLR (cc08)
/* Trip count of the 4-way unrolled loop: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL35
	nop /* delay slot */
	.align 4
/* .LL33: four K steps per pass, FMADDs interleaved with loads and the
   loop bookkeeping. The condition codes set by the cmp in the middle of
   the body stay valid up to the branch because nothing after it writes
   the integer condition codes. AO and BO are advanced mid-body, so the
   later loads use offsets relative to the new pointers. */
.LL33:
	/* step 0: (a1, a2) against b1..b4 */
	FMADD (aa1, bb1, cc01, cc01)
	LDF [AO + 2 * SIZE], a3
	FMADD (aa2, bb1, cc02, cc02)
	LDF [AO + 3 * SIZE], a4
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 16 * SIZE], b1
	FMADD (aa2, bb2, cc04, cc04)
	LDF [BO + 9 * SIZE], b2
	FMADD (aa1, bb3, cc05, cc05)
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	FMADD (aa2, bb3, cc06, cc06)
	add L, -1, L
	FMADD (aa1, bb4, cc07, cc07)
	LDF [BO + 10 * SIZE], b3
	FMADD (aa2, bb4, cc08, cc08)
	LDF [BO + 11 * SIZE], b4
	/* step 1: (a3, a4) against b5..b8 */
	FMADD (aa3, bb5, cc01, cc01)
	LDF [AO + 4 * SIZE], a1
	FMADD (aa4, bb5, cc02, cc02)
	LDF [AO + 5 * SIZE], a2
	FMADD (aa3, bb6, cc03, cc03)
	LDF [BO + 12 * SIZE], b5
	FMADD (aa4, bb6, cc04, cc04)
	LDF [BO + 13 * SIZE], b6
	FMADD (aa3, bb7, cc05, cc05)
	cmp L, 0
	FMADD (aa4, bb7, cc06, cc06)
	add AO, 8 * SIZE, AO
	FMADD (aa3, bb8, cc07, cc07)
	LDF [BO + 14 * SIZE], b7
	FMADD (aa4, bb8, cc08, cc08)
	LDF [BO + 15 * SIZE], b8
	/* step 2: (a1, a2) against b9, b2, b3, b4 */
	FMADD (aa1, bb9, cc01, cc01)
	LDF [AO - 2 * SIZE], a3
	FMADD (aa2, bb9, cc02, cc02)
	LDF [AO - 1 * SIZE], a4
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 24 * SIZE], b9
	FMADD (aa2, bb2, cc04, cc04)
	LDF [BO + 17 * SIZE], b2
	FMADD (aa1, bb3, cc05, cc05)
	add BO, 16 * SIZE, BO
	FMADD (aa2, bb3, cc06, cc06)
	nop
	FMADD (aa1, bb4, cc07, cc07)
	LDF [BO + 2 * SIZE], b3
	FMADD (aa2, bb4, cc08, cc08)
	LDF [BO + 3 * SIZE], b4
	/* step 3: (a3, a4) against b5..b8 */
	FMADD (aa3, bb5, cc01, cc01)
	LDF [AO + 0 * SIZE], a1
	FMADD (aa4, bb5, cc02, cc02)
	LDF [AO + 1 * SIZE], a2
	FMADD (aa3, bb6, cc03, cc03)
	LDF [BO + 4 * SIZE], b5
	FMADD (aa4, bb6, cc04, cc04)
	LDF [BO + 5 * SIZE], b6
	FMADD (aa3, bb7, cc05, cc05)
	nop
	FMADD (aa4, bb7, cc06, cc06)
	LDF [BO + 6 * SIZE], b7
	FMADD (aa3, bb8, cc07, cc07)
	FMADD (aa4, bb8, cc08, cc08)
	bg,pt %icc, .LL33
	LDF [BO + 7 * SIZE], b8 /* delay slot */
	.align 4
/* .LL35: the remaining 0..3 K steps. */
.LL35:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL38
	nop /* annulled delay slot */
	.align 4
/* .LL37: one K step per pass. */
.LL37:
	FMADD (aa1, bb1, cc01, cc01)
	add L, -1, L
	FMADD (aa2, bb1, cc02, cc02)
	LDF [BO + 4 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	add AO, 2 * SIZE, AO
	FMADD (aa2, bb2, cc04, cc04)
	LDF [BO + 5 * SIZE], b2
	FMADD (aa1, bb3, cc05, cc05)
	cmp L, 0
	FMADD (aa2, bb3, cc06, cc06)
	LDF [BO + 6 * SIZE], b3
	FMADD (aa1, bb4, cc07, cc07)
	LDF [AO + 0 * SIZE], a1
	FMADD (aa2, bb4, cc08, cc08)
	LDF [AO + 1 * SIZE], a2
	LDF [BO + 7 * SIZE], b4
	bg,pt %icc, .LL37
	add BO, 4 * SIZE, BO /* delay slot */
	.align 4
/* .LL38: solve step for the 2x4 tile. For LN and RT, AO and BO are first
   rewound to the block at step KK - 2 (LN) or KK - 4 (RT). */
.LL38:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 2, TEMP1
#else
	sub KK, 4, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 2, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
/* Form (stored value - accumulated product). The BO copy is laid out
   row by row (c01 c03 c05 c07, then c02 c04 c06 c08), the AO copy
   column by column (c01 c02, c03 c04, ...). */
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	LDF [BO + 4 * SIZE], b1
	LDF [BO + 5 * SIZE], b2
	LDF [BO + 6 * SIZE], b3
	LDF [BO + 7 * SIZE], b4
	FSUB a1, c01, c01
	FSUB a2, c03, c03
	FSUB a3, c05, c05
	FSUB a4, c07, c07
	FSUB b1, c02, c02
	FSUB b2, c04, c04
	FSUB b3, c06, c06
	FSUB b4, c08, c08
#else
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [AO + 4 * SIZE], b1
	LDF [AO + 5 * SIZE], b2
	LDF [AO + 6 * SIZE], b3
	LDF [AO + 7 * SIZE], b4
	FSUB a1, c01, c01
	FSUB a2, c02, c02
	FSUB a3, c03, c03
	FSUB a4, c04, c04
	FSUB b1, c05, c05
	FSUB b2, c06, c06
	FSUB b3, c07, c07
	FSUB b4, c08, c08
#endif
/* LN: 2x2 block at AO solved from the second row upwards, using entries
   3, 2 and 0.
   NOTE(review): diagonal entries are applied with a multiply, which
   suggests they are stored as reciprocals. Confirm. */
#ifdef LN
	LDF [AO + 3 * SIZE], a1
	LDF [AO + 2 * SIZE], a2
	LDF [AO + 0 * SIZE], a3
	FMUL a1, c02, c02
	FMUL a1, c04, c04
	FMUL a1, c06, c06
	FMUL a1, c08, c08
	FNMSUB (aa2, cc02, cc01, cc01)
	FNMSUB (aa2, cc04, cc03, cc03)
	FNMSUB (aa2, cc06, cc05, cc05)
	FNMSUB (aa2, cc08, cc07, cc07)
	FMUL a3, c01, c01
	FMUL a3, c03, c03
	FMUL a3, c05, c05
	FMUL a3, c07, c07
#endif
/* LT: same 2x2 block solved from the first row downwards, using entries
   0, 1 and 3. */
#ifdef LT
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 3 * SIZE], a3
	FMUL a1, c01, c01
	FMUL a1, c03, c03
	FMUL a1, c05, c05
	FMUL a1, c07, c07
	FNMSUB (aa2, cc01, cc02, cc02)
	FNMSUB (aa2, cc03, cc04, cc04)
	FNMSUB (aa2, cc05, cc06, cc06)
	FNMSUB (aa2, cc07, cc08, cc08)
	FMUL a3, c02, c02
	FMUL a3, c04, c04
	FMUL a3, c06, c06
	FMUL a3, c08, c08
#endif
/* RN: forward sweep over the 4x4 block at BO, rows read from their
   diagonal entries at offsets 0, 5, 10, 15. Both tile rows are updated
   together. */
#ifdef RN
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	FMUL a1, c01, c01
	FMUL a1, c02, c02
	FNMSUB (aa2, cc01, cc03, cc03)
	FNMSUB (aa2, cc02, cc04, cc04)
	FNMSUB (aa3, cc01, cc05, cc05)
	FNMSUB (aa3, cc02, cc06, cc06)
	FNMSUB (aa4, cc01, cc07, cc07)
	FNMSUB (aa4, cc02, cc08, cc08)
	LDF [BO + 5 * SIZE], a1
	LDF [BO + 6 * SIZE], a2
	LDF [BO + 7 * SIZE], a3
	FMUL a1, c03, c03
	FMUL a1, c04, c04
	FNMSUB (aa2, cc03, cc05, cc05)
	FNMSUB (aa2, cc04, cc06, cc06)
	FNMSUB (aa3, cc03, cc07, cc07)
	FNMSUB (aa3, cc04, cc08, cc08)
	LDF [BO + 10 * SIZE], a1
	LDF [BO + 11 * SIZE], a2
	FMUL a1, c05, c05
	FMUL a1, c06, c06
	FNMSUB (aa2, cc05, cc07, cc07)
	FNMSUB (aa2, cc06, cc08, cc08)
	LDF [BO + 15 * SIZE], a1
	FMUL a1, c07, c07
	FMUL a1, c08, c08
#endif
/* RT: backward sweep, starting at offset 15 and reading each row
   leftwards from its diagonal. */
#ifdef RT
	LDF [BO + 15 * SIZE], a1
	LDF [BO + 14 * SIZE], a2
	LDF [BO + 13 * SIZE], a3
	LDF [BO + 12 * SIZE], a4
	FMUL a1, c08, c08
	FMUL a1, c07, c07
	FNMSUB (aa2, cc08, cc06, cc06)
	FNMSUB (aa2, cc07, cc05, cc05)
	FNMSUB (aa3, cc08, cc04, cc04)
	FNMSUB (aa3, cc07, cc03, cc03)
	FNMSUB (aa4, cc08, cc02, cc02)
	FNMSUB (aa4, cc07, cc01, cc01)
	LDF [BO + 10 * SIZE], a1
	LDF [BO + 9 * SIZE], a2
	LDF [BO + 8 * SIZE], a3
	FMUL a1, c06, c06
	FMUL a1, c05, c05
	FNMSUB (aa2, cc06, cc04, cc04)
	FNMSUB (aa2, cc05, cc03, cc03)
	FNMSUB (aa3, cc06, cc02, cc02)
	FNMSUB (aa3, cc05, cc01, cc01)
	LDF [BO + 5 * SIZE], a1
	LDF [BO + 4 * SIZE], a2
	FMUL a1, c04, c04
	FMUL a1, c03, c03
	FNMSUB (aa2, cc04, cc02, cc02)
	FNMSUB (aa2, cc03, cc01, cc01)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c02, c02
	FMUL a1, c01, c01
#endif
/* LN steps the column pointers back two rows before storing. */
#ifdef LN
	add C1, -2 * SIZE, C1
	add C2, -2 * SIZE, C2
	add C3, -2 * SIZE, C3
	add C4, -2 * SIZE, C4
#endif
/* Write the solved tile back into the packed buffer it was read from. */
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
	STF c03, [BO + 1 * SIZE]
	STF c05, [BO + 2 * SIZE]
	STF c07, [BO + 3 * SIZE]
	STF c02, [BO + 4 * SIZE]
	STF c04, [BO + 5 * SIZE]
	STF c06, [BO + 6 * SIZE]
	STF c08, [BO + 7 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
	STF c02, [AO + 1 * SIZE]
	STF c03, [AO + 2 * SIZE]
	STF c04, [AO + 3 * SIZE]
	STF c05, [AO + 4 * SIZE]
	STF c06, [AO + 5 * SIZE]
	STF c07, [AO + 6 * SIZE]
	STF c08, [AO + 7 * SIZE]
#endif
/* And into C, two rows per column. */
	STF c01, [C1 + 0 * SIZE]
	STF c02, [C1 + 1 * SIZE]
	STF c03, [C2 + 0 * SIZE]
	STF c04, [C2 + 1 * SIZE]
	STF c05, [C3 + 0 * SIZE]
	STF c06, [C3 + 1 * SIZE]
	STF c07, [C4 + 0 * SIZE]
	STF c08, [C4 + 1 * SIZE]
#ifndef LN
	add C1, 2 * SIZE, C1
	add C2, 2 * SIZE, C2
	add C3, 2 * SIZE, C3
	add C4, 2 * SIZE, C4
#endif
#ifdef RT
	sll K, BASE_SHIFT + 1, TEMP1
	add AORIG, TEMP1, AORIG
#endif
/* LT and RN: skip the K - KK unconsumed steps (2 elements per step in
   AO, 4 per step in BO). */
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 2, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 2, KK
#endif
#ifdef LN
	sub KK, 2, KK
#endif
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL32
	nop /* delay slot */
/* .LL40: single leftover row against 4 columns, skipped when bit 0 of M
   is clear. */
.LL40:
	and M, 1, I
	cmp I, 0
	ble,pn %icc, .LL49
	nop /* delay slot */
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 0, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 0, TEMP1
	sll KK, BASE_SHIFT + 2, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload four A values (four K steps) and nine B values, clear the
   four accumulators c01, c03, c05, c07. */
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [BO + 0 * SIZE], b1
	LDF [BO + 1 * SIZE], b2
	LDF [BO + 2 * SIZE], b3
	LDF [BO + 3 * SIZE], b4
	LDF [BO + 4 * SIZE], b5
	LDF [BO + 5 * SIZE], b6
	FCLR (cc01)
	LDF [BO + 6 * SIZE], b7
	FCLR (cc03)
	LDF [BO + 7 * SIZE], b8
	FCLR (cc05)
	LDF [BO + 8 * SIZE], b9
	FCLR (cc07)
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL45
	nop /* delay slot */
/* .LL43: four K steps per pass for the 1x4 tile. */
.LL43:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add L, -1, L
	/* step 0: a1 against b1..b4 */
	FMADD (aa1, bb1, cc01, cc01)
	LDF [BO + 16 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 9 * SIZE], b2
	FMADD (aa1, bb3, cc05, cc05)
	LDF [BO + 10 * SIZE], b3
	FMADD (aa1, bb4, cc07, cc07)
	LDF [BO + 11 * SIZE], b4
	LDF [AO + 4 * SIZE], a1
	cmp L, 0
	/* step 1: a2 against b5..b8 */
	FMADD (aa2, bb5, cc01, cc01)
	LDF [BO + 12 * SIZE], b5
	FMADD (aa2, bb6, cc03, cc03)
	LDF [BO + 13 * SIZE], b6
	FMADD (aa2, bb7, cc05, cc05)
	LDF [BO + 14 * SIZE], b7
	FMADD (aa2, bb8, cc07, cc07)
	LDF [BO + 15 * SIZE], b8
	LDF [AO + 5 * SIZE], a2
	add AO, 4 * SIZE, AO
	/* step 2: a3 against b9, b2, b3, b4 */
	FMADD (aa3, bb9, cc01, cc01)
	LDF [BO + 24 * SIZE], b9
	FMADD (aa3, bb2, cc03, cc03)
	LDF [BO + 17 * SIZE], b2
	FMADD (aa3, bb3, cc05, cc05)
	LDF [BO + 18 * SIZE], b3
	FMADD (aa3, bb4, cc07, cc07)
	LDF [BO + 19 * SIZE], b4
	LDF [AO + 2 * SIZE], a3
	add BO, 16 * SIZE, BO
	/* step 3: a4 against b5..b8 */
	FMADD (aa4, bb5, cc01, cc01)
	LDF [BO + 4 * SIZE], b5
	FMADD (aa4, bb6, cc03, cc03)
	LDF [BO + 5 * SIZE], b6
	FMADD (aa4, bb7, cc05, cc05)
	LDF [BO + 6 * SIZE], b7
	FMADD (aa4, bb8, cc07, cc07)
	LDF [BO + 7 * SIZE], b8
	bg,pt %icc, .LL43
	LDF [AO + 3 * SIZE], a4 /* delay slot */
	.align 4
/* .LL45: the remaining 0..3 K steps. */
.LL45:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL48
	nop /* annulled delay slot */
	.align 4
/* .LL47: one K step per pass. */
.LL47:
	FMADD (aa1, bb1, cc01, cc01)
	LDF [BO + 4 * SIZE], b1
	add L, -1, L
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 5 * SIZE], b2
	add AO, 1 * SIZE, AO
	FMADD (aa1, bb3, cc05, cc05)
	LDF [BO + 6 * SIZE], b3
	cmp L, 0
	FMADD (aa1, bb4, cc07, cc07)
	LDF [BO + 7 * SIZE], b4
	add BO, 4 * SIZE, BO
	bg,pt %icc, .LL47
	LDF [AO + 0 * SIZE], a1 /* delay slot, AO already advanced */
	.align 4
/* .LL48: solve step for the 1x4 tile. For LN and RT, AO and BO are first
   rewound to the block at step KK - 1 (LN) or KK - 4 (RT). */
.LL48:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 1, TEMP1
#else
	sub KK, 4, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 2, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	FSUB a1, c01, c01
	FSUB a2, c03, c03
	FSUB a3, c05, c05
	FSUB a4, c07, c07
#else
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	FSUB a1, c01, c01
	FSUB a2, c03, c03
	FSUB a3, c05, c05
	FSUB a4, c07, c07
#endif
/* LN and LT: single-element triangular block, scale by AO[0]. */
#if defined(LN) || defined(LT)
	LDF [AO + 0 * SIZE], a1
	FMUL a1, c01, c01
	FMUL a1, c03, c03
	FMUL a1, c05, c05
	FMUL a1, c07, c07
#endif
/* RN: forward sweep over the 4x4 block at BO (this conditional is
   closed further down, past the end of this edit). */
#ifdef RN
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	FMUL a1, c01, c01
	FNMSUB (aa2, cc01, cc03, cc03)
	FNMSUB (aa3, cc01, cc05, cc05)
	FNMSUB (aa4, cc01, cc07, cc07)
	LDF [BO + 5 * SIZE], a1
/* This span opens inside the RN branch of the solve for the single-row,
   4-column tile: the second row of the 4x4 block at BO is being loaded
   (its diagonal entry, offset 5, is already in a1).
   NOTE(review): LDF, STF, FCLR, FMADD, FNMSUB, FSUB and FMUL are macros
   defined outside this chunk. Comments below assume FMADD(a, b, c, d)
   gives d = a * b + c, FNMSUB(a, b, c, d) gives d = c - a * b, and that
   FSUB and FMUL follow the SPARC rs1, rs2, rd operand order. Confirm. */
	LDF [BO + 6 * SIZE], a2
	LDF [BO + 7 * SIZE], a3
	FMUL a1, c03, c03
	FNMSUB (aa2, cc03, cc05, cc05)
	FNMSUB (aa3, cc03, cc07, cc07)
	LDF [BO + 10 * SIZE], a1
	LDF [BO + 11 * SIZE], a2
	FMUL a1, c05, c05
	FNMSUB (aa2, cc05, cc07, cc07)
	LDF [BO + 15 * SIZE], a1
	FMUL a1, c07, c07
#endif
/* RT: backward sweep over the same 4x4 block, starting at offset 15 and
   reading each row leftwards from its diagonal. */
#ifdef RT
	LDF [BO + 15 * SIZE], a1
	LDF [BO + 14 * SIZE], a2
	LDF [BO + 13 * SIZE], a3
	LDF [BO + 12 * SIZE], a4
	FMUL a1, c07, c07
	FNMSUB (aa2, cc07, cc05, cc05)
	FNMSUB (aa3, cc07, cc03, cc03)
	FNMSUB (aa4, cc07, cc01, cc01)
	LDF [BO + 10 * SIZE], a1
	LDF [BO + 9 * SIZE], a2
	LDF [BO + 8 * SIZE], a3
	FMUL a1, c05, c05
	FNMSUB (aa2, cc05, cc03, cc03)
	FNMSUB (aa3, cc05, cc01, cc01)
	LDF [BO + 5 * SIZE], a1
	LDF [BO + 4 * SIZE], a2
	FMUL a1, c03, c03
	FNMSUB (aa2, cc03, cc01, cc01)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c01, c01
#endif
/* LN steps the column pointers back one row before storing. */
#ifdef LN
	add C1, -1 * SIZE, C1
	add C2, -1 * SIZE, C2
	add C3, -1 * SIZE, C3
	add C4, -1 * SIZE, C4
#endif
/* Write the solved values back into the packed buffer they were read
   from, then into C. */
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
	STF c03, [BO + 1 * SIZE]
	STF c05, [BO + 2 * SIZE]
	STF c07, [BO + 3 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
	STF c03, [AO + 1 * SIZE]
	STF c05, [AO + 2 * SIZE]
	STF c07, [AO + 3 * SIZE]
#endif
	STF c01, [C1 + 0 * SIZE]
	STF c03, [C2 + 0 * SIZE]
	STF c05, [C3 + 0 * SIZE]
	STF c07, [C4 + 0 * SIZE]
#ifdef RT
	sll K, BASE_SHIFT + 0, TEMP1
	add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 2, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 1, KK
#endif
#ifdef LN
	sub KK, 1, KK
#endif
	.align 4
/* .LL49: end of the 4-column panel. B is advanced (LN: by K * 4
   elements, LT and RN: to where BO stopped) and KK moves by 4 for RN and
   RT. There is no loop here, the panel runs at most once. */
.LL49:
#ifdef LN
	sll K, BASE_SHIFT + 2, TEMP1
	add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
	mov BO, B
#endif
#ifdef RN
	add KK, 4, KK
#endif
#ifdef RT
	sub KK, 4, KK
#endif
	.align 4
/* .LL50: 2-column panel, run only when bit 1 of N is set. */
.LL50:
	and N, 2, J
	cmp J, 0
	ble,pn %icc, .LL70
	nop /* delay slot */
/* RT consumes B from the end, so it steps B back by K * 2 elements. */
#ifdef RT
	sll K, BASE_SHIFT + 1, TEMP1
	sub B, TEMP1, B
#endif
/* Column pointers. Forward variants take C1, C2 upwards from C and leave
   C just past the panel. RT takes them downwards and leaves C equal to
   C1 (C2 - LDC). */
#ifndef RT
	mov C, C1
	add C, LDC, C2
	add C2, LDC, C
#else
	sub C, LDC, C2
	sub C2, LDC, C1
	sub C2, LDC, C
#endif
/* Per-panel reset of KK for the left-side variants. */
#ifdef LN
	add M, OFFSET, KK
#endif
#ifdef LT
	mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
	mov A, AORIG
#else
	mov A, AO
#endif
/* I = M / 2 row pairs. */
	sra M, 1, I
	cmp I, 0
	ble,pn %icc, .LL60
	nop /* delay slot */
	.align 4
/* .LL52: one 2-row by 2-column tile. */
.LL52:
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 1, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 1, TEMP1
	sll KK, BASE_SHIFT + 1, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload two K steps of A and four K steps of B. c01..c08 are all
   cleared although the loops below only accumulate into c01..c04. */
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [BO + 0 * SIZE], b1
	LDF [BO + 1 * SIZE], b2
	LDF [BO + 2 * SIZE], b3
	FCLR (cc01)
	LDF [BO + 3 * SIZE], b4
	FCLR (cc02)
	LDF [BO + 4 * SIZE], b5
	FCLR (cc03)
	LDF [BO + 5 * SIZE], b6
	FCLR (cc04)
	LDF [BO + 6 * SIZE], b7
	FCLR (cc05)
	LDF [BO + 7 * SIZE], b8
	FCLR (cc06)
	prefetch [C1 + 2 * SIZE], 3
	FCLR (cc07)
	prefetch [C2 + 2 * SIZE], 3
	FCLR (cc08)
/* Trip count of the 4-way unrolled loop: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL55
	nop /* delay slot */
	.align 4
/* .LL53: four K steps per pass. (a1, a2) serve the even steps and
   (a3, a4) the odd ones, each reloaded as soon as its step is done. */
.LL53:
	FMADD (aa1, bb1, cc01, cc01)
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	FMADD (aa2, bb1, cc02, cc02)
	LDF [BO + 8 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [AO + 4 * SIZE], a1
	FMADD (aa2, bb2, cc04, cc04)
	LDF [AO + 5 * SIZE], a2
	FMADD (aa3, bb3, cc01, cc01)
	LDF [BO + 9 * SIZE], b2
	FMADD (aa4, bb3, cc02, cc02)
	LDF [BO + 10 * SIZE], b3
	FMADD (aa3, bb4, cc03, cc03)
	LDF [AO + 6 * SIZE], a3
	FMADD (aa4, bb4, cc04, cc04)
	LDF [AO + 7 * SIZE], a4
	FMADD (aa1, bb5, cc01, cc01)
	LDF [BO + 11 * SIZE], b4
	FMADD (aa2, bb5, cc02, cc02)
	LDF [BO + 12 * SIZE], b5
	FMADD (aa1, bb6, cc03, cc03)
	LDF [AO + 8 * SIZE], a1
	FMADD (aa2, bb6, cc04, cc04)
	LDF [AO + 9 * SIZE], a2
	FMADD (aa3, bb7, cc01, cc01)
	LDF [BO + 13 * SIZE], b6
	FMADD (aa4, bb7, cc02, cc02)
	LDF [BO + 14 * SIZE], b7
	FMADD (aa3, bb8, cc03, cc03)
	LDF [AO + 10 * SIZE], a3
	FMADD (aa4, bb8, cc04, cc04)
	LDF [AO + 11 * SIZE], a4
	add AO, 8 * SIZE, AO
	add L, -1, L
	add BO, 8 * SIZE, BO
	cmp L, 0
	bg,pt %icc, .LL53
	LDF [BO + 7 * SIZE], b8 /* delay slot, BO already advanced */
	.align 4
/* .LL55: the remaining 0..3 K steps. */
.LL55:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL58
	nop /* annulled delay slot */
	.align 4
/* .LL57: one K step per pass. */
.LL57:
	FMADD (aa1, bb1, cc01, cc01)
	add L, -1, L
	FMADD (aa2, bb1, cc02, cc02)
	LDF [BO + 2 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [AO + 2 * SIZE], a1
	FMADD (aa2, bb2, cc04, cc04)
	LDF [AO + 3 * SIZE], a2
	add AO, 2 * SIZE, AO
	cmp L, 0
	add BO, 2 * SIZE, BO
	bg,pt %icc, .LL57
	LDF [BO + 1 * SIZE], b2 /* delay slot, BO already advanced */
	.align 4
/* .LL58: solve step for the 2x2 tile. Both arms of the inner
   conditional rewind by the same amount (KK - 2) because the tile is
   square. */
.LL58:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 2, TEMP1
#else
	sub KK, 2, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 1, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
/* Form (stored value - accumulated product). The BO copy is laid out
   row by row (c01 c03, c02 c04), the AO copy column by column. */
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	LDF [BO + 2 * SIZE], a3
	LDF [BO + 3 * SIZE], a4
	FSUB a1, c01, c01
	FSUB a2, c03, c03
	FSUB a3, c02, c02
	FSUB a4, c04, c04
#else
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	FSUB a1, c01, c01
	FSUB a2, c02, c02
	FSUB a3, c03, c03
	FSUB a4, c04, c04
#endif
/* LN: 2x2 block at AO solved from the second row upwards.
   NOTE(review): diagonal entries are applied with a multiply, which
   suggests they are stored as reciprocals. Confirm. */
#ifdef LN
	LDF [AO + 3 * SIZE], a1
	LDF [AO + 2 * SIZE], a2
	LDF [AO + 0 * SIZE], a3
	FMUL a1, c02, c02
	FMUL a1, c04, c04
	FNMSUB (aa2, cc02, cc01, cc01)
	FNMSUB (aa2, cc04, cc03, cc03)
	FMUL a3, c01, c01
	FMUL a3, c03, c03
#endif
/* LT: same block solved from the first row downwards. */
#ifdef LT
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 3 * SIZE], a3
	FMUL a1, c01, c01
	FMUL a1, c03, c03
	FNMSUB (aa2, cc01, cc02, cc02)
	FNMSUB (aa2, cc03, cc04, cc04)
	FMUL a3, c02, c02
	FMUL a3, c04, c04
#endif
/* RN: forward sweep over the 2x2 block at BO (entries 0, 1, 3). */
#ifdef RN
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	FMUL a1, c01, c01
	FMUL a1, c02, c02
	FNMSUB (aa2, cc01, cc03, cc03)
	FNMSUB (aa2, cc02, cc04, cc04)
	LDF [BO + 3 * SIZE], a1
	FMUL a1, c03, c03
	FMUL a1, c04, c04
#endif
/* RT: backward sweep over the 2x2 block at BO (entries 3, 2, 0). */
#ifdef RT
	LDF [BO + 3 * SIZE], a1
	LDF [BO + 2 * SIZE], a2
	FMUL a1, c04, c04
	FMUL a1, c03, c03
	FNMSUB (aa2, cc04, cc02, cc02)
	FNMSUB (aa2, cc03, cc01, cc01)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c02, c02
	FMUL a1, c01, c01
#endif
#ifdef LN
	add C1, -2 * SIZE, C1
	add C2, -2 * SIZE, C2
#endif
/* Write the solved tile back into the packed buffer and into C. */
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
	STF c03, [BO + 1 * SIZE]
	STF c02, [BO + 2 * SIZE]
	STF c04, [BO + 3 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
	STF c02, [AO + 1 * SIZE]
	STF c03, [AO + 2 * SIZE]
	STF c04, [AO + 3 * SIZE]
#endif
	STF c01, [C1 + 0 * SIZE]
	STF c02, [C1 + 1 * SIZE]
	STF c03, [C2 + 0 * SIZE]
	STF c04, [C2 + 1 * SIZE]
#ifndef LN
	add C1, 2 * SIZE, C1
	add C2, 2 * SIZE, C2
#endif
#ifdef RT
	sll K, BASE_SHIFT + 1, TEMP1
	add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 1, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 2, KK
#endif
#ifdef LN
	sub KK, 2, KK
#endif
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL52
	nop /* delay slot */
	.align 4
/* .LL60: single leftover row against 2 columns, skipped when bit 0 of M
   is clear. */
.LL60:
	and M, 1, I
	cmp I, 0
	ble,pn %icc, .LL69
	nop /* delay slot */
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 0, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 0, TEMP1
	sll KK, BASE_SHIFT + 1, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload four K steps of A and of B, clear accumulators c01 and c03. */
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [BO + 0 * SIZE], b1
	LDF [BO + 1 * SIZE], b2
	LDF [BO + 2 * SIZE], b3
	LDF [BO + 3 * SIZE], b4
	LDF [BO + 4 * SIZE], b5
	LDF [BO + 5 * SIZE], b6
	LDF [BO + 6 * SIZE], b7
	FCLR (cc01)
	LDF [BO + 7 * SIZE], b8
	FCLR (cc03)
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL65
	nop /* delay slot */
	.align 4
/* .LL63: four K steps per pass for the 1x2 tile. */
.LL63:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add L, -1, L
	FMADD (aa1, bb1, cc01, cc01)
	LDF [BO + 8 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 9 * SIZE], b2
	LDF [AO + 4 * SIZE], a1
	cmp L, 0
	FMADD (aa2, bb3, cc01, cc01)
	LDF [BO + 10 * SIZE], b3
	FMADD (aa2, bb4, cc03, cc03)
	LDF [BO + 11 * SIZE], b4
	LDF [AO + 5 * SIZE], a2
	add AO, 4 * SIZE, AO
	FMADD (aa3, bb5, cc01, cc01)
	LDF [BO + 12 * SIZE], b5
	FMADD (aa3, bb6, cc03, cc03)
	LDF [BO + 13 * SIZE], b6
	LDF [AO + 2 * SIZE], a3
	add BO, 8 * SIZE, BO
	FMADD (aa4, bb7, cc01, cc01)
	LDF [BO + 6 * SIZE], b7
	FMADD (aa4, bb8, cc03, cc03)
	LDF [BO + 7 * SIZE], b8
	bg,pt %icc, .LL63
	LDF [AO + 3 * SIZE], a4 /* delay slot */
	.align 4
/* .LL65: the remaining 0..3 K steps. */
.LL65:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL68
	nop /* annulled delay slot */
	.align 4
/* .LL67: one K step per pass. */
.LL67:
	FMADD (aa1, bb1, cc01, cc01)
	LDF [BO + 2 * SIZE], b1
	FMADD (aa1, bb2, cc03, cc03)
	LDF [BO + 3 * SIZE], b2
	LDF [AO + 1 * SIZE], a1
	add L, -1, L
	add AO, 1 * SIZE, AO
	cmp L, 0
	bg,pt %icc, .LL67
	add BO, 2 * SIZE, BO /* delay slot */
	.align 4
/* .LL68: solve step for the 1x2 tile. For LN and RT, AO and BO are first
   rewound to the block at step KK - 1 (LN) or KK - 2 (RT). */
.LL68:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 1, TEMP1
#else
	sub KK, 2, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 1, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	FSUB a1, c01, c01
	FSUB a2, c03, c03
#else
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	FSUB a1, c01, c01
	FSUB a2, c03, c03
#endif
/* LN and LT: single-element triangular block, scale by AO[0]. */
#if defined(LN) || defined(LT)
	LDF [AO + 0 * SIZE], a1
	FMUL a1, c01, c01
	FMUL a1, c03, c03
#endif
#ifdef RN
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	FMUL a1, c01, c01
	FNMSUB (aa2, cc01, cc03, cc03)
	LDF [BO + 3 * SIZE], a1
	FMUL a1, c03, c03
#endif
#ifdef RT
	LDF [BO + 3 * SIZE], a1
	LDF [BO + 2 * SIZE], a2
	FMUL a1, c03, c03
	FNMSUB (aa2, cc03, cc01, cc01)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c01, c01
#endif
#ifdef LN
	add C1, -1 * SIZE, C1
	add C2, -1 * SIZE, C2
#endif
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
	STF c03, [BO + 1 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
	STF c03, [AO + 1 * SIZE]
#endif
	STF c01, [C1 + 0 * SIZE]
	STF c03, [C2 + 0 * SIZE]
#ifdef RT
	sll K, BASE_SHIFT + 0, TEMP1
	add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 1, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 1, KK
#endif
#ifdef LN
	sub KK, 1, KK
#endif
	.align 4
/* .LL69: end of the 2-column panel. B is advanced (LN: by K * 2
   elements, LT and RN: to where BO stopped) and KK moves by 2 for RN and
   RT. */
.LL69:
#ifdef LN
	sll K, BASE_SHIFT + 1, TEMP1
	add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
	mov BO, B
#endif
#ifdef RN
	add KK, 2, KK
#endif
#ifdef RT
	sub KK, 2, KK
#endif
/* .LL70: final single-column panel, run only when bit 0 of N is set.
   Otherwise control goes straight to .LL999.
   NOTE(review): LDF, STF, FCLR, FMADD, FNMSUB, FSUB and FMUL are macros
   defined outside this chunk. Comments below assume FMADD(a, b, c, d)
   gives d = a * b + c, FNMSUB(a, b, c, d) gives d = c - a * b, and that
   FSUB and FMUL follow the SPARC rs1, rs2, rd operand order. Confirm. */
	.align 4
.LL70:
	and N, 1, J
	cmp J, 0
	ble,pn %icc, .LL999
	nop /* delay slot */
/* RT consumes B from the end, so it steps B back by K elements. */
#ifdef RT
	sll K, BASE_SHIFT, TEMP1
	sub B, TEMP1, B
#endif
/* Column pointer. Forward variants use C as C1 and then move C one
   column up. RT uses the column below C and moves C down to it. */
#ifndef RT
	mov C, C1
	add C1, LDC, C
#else
	sub C, LDC, C1
	sub C, LDC, C
#endif
/* Per-panel reset of KK for the left-side variants. */
#ifdef LN
	add M, OFFSET, KK
#endif
#ifdef LT
	mov OFFSET, KK
#endif
#if defined(LN) || defined(RT)
	mov A, AORIG
#else
	mov A, AO
#endif
/* I = M / 2 row pairs. */
	sra M, 1, I
	cmp I, 0
	ble,pn %icc, .LL80
	nop /* delay slot */
	.align 4
/* .LL72: one 2-row by 1-column tile. */
.LL72:
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 1, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 1, TEMP1
	sll KK, BASE_SHIFT + 0, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload two K steps of A (a1, a2 and a3, a4) and four K steps of B,
   clear the two accumulators c01 and c02. */
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 2 * SIZE], a3
	LDF [AO + 3 * SIZE], a4
	LDF [BO + 0 * SIZE], b1
	LDF [BO + 1 * SIZE], b2
	LDF [BO + 2 * SIZE], b3
	FCLR (cc01)
	LDF [BO + 3 * SIZE], b4
	FCLR (cc02)
	prefetch [C1 + 2 * SIZE], 3
/* Trip count of the 4-way unrolled loop: KK / 4 for LT and RN,
   (K - KK) / 4 otherwise. */
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL75
	nop /* delay slot */
/* .LL73: four K steps per pass. AO and BO are advanced mid-body, so the
   loads after each add use offsets relative to the new pointer. */
.LL73:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add L, -1, L
	FMADD (aa1, bb1, cc01, cc01)
	LDF [AO + 4 * SIZE], a1
	FMADD (aa2, bb1, cc02, cc02)
	LDF [AO + 5 * SIZE], a2
	LDF [BO + 4 * SIZE], b1
	cmp L, 0
	FMADD (aa3, bb2, cc01, cc01)
	LDF [AO + 6 * SIZE], a3
	FMADD (aa4, bb2, cc02, cc02)
	LDF [AO + 7 * SIZE], a4
	LDF [BO + 5 * SIZE], b2
	add BO, 4 * SIZE, BO
	FMADD (aa1, bb3, cc01, cc01)
	LDF [AO + 8 * SIZE], a1
	FMADD (aa2, bb3, cc02, cc02)
	LDF [AO + 9 * SIZE], a2
	LDF [BO + 2 * SIZE], b3
	add AO, 8 * SIZE, AO
	FMADD (aa3, bb4, cc01, cc01)
	LDF [AO + 2 * SIZE], a3
	FMADD (aa4, bb4, cc02, cc02)
	LDF [AO + 3 * SIZE], a4
	bg,pt %icc, .LL73
	LDF [BO + 3 * SIZE], b4 /* delay slot */
	.align 4
/* .LL75: the remaining 0..3 K steps. */
.LL75:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL78
	nop /* annulled delay slot */
	.align 4
/* .LL77: one K step per pass. */
.LL77:
	FMADD (aa1, bb1, cc01, cc01)
	LDF [AO + 2 * SIZE], a1
	FMADD (aa2, bb1, cc02, cc02)
	LDF [AO + 3 * SIZE], a2
	LDF [BO + 1 * SIZE], b1
	add L, -1, L
	add AO, 2 * SIZE, AO
	cmp L, 0
	bg,pt %icc, .LL77
	add BO, 1 * SIZE, BO /* delay slot */
	.align 4
/* .LL78: solve step for the 2x1 tile. For LN and RT, AO and BO are first
   rewound to the block at step KK - 2 (LN) or KK - 1 (RT). */
.LL78:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 2, TEMP1
#else
	sub KK, 1, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 0, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
/* Form (stored value - accumulated product), reading the stored values
   from BO for LN and LT and from AO otherwise. */
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	LDF [BO + 1 * SIZE], a2
	FSUB a1, c01, c01
	FSUB a2, c02, c02
#else
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	FSUB a1, c01, c01
	FSUB a2, c02, c02
#endif
/* LN: 2x2 block at AO solved from the second row upwards (entries 3, 2
   and 0).
   NOTE(review): diagonal entries are applied with a multiply, which
   suggests they are stored as reciprocals. Confirm. */
#ifdef LN
	LDF [AO + 3 * SIZE], a1
	LDF [AO + 2 * SIZE], a2
	LDF [AO + 0 * SIZE], a3
	FMUL a1, c02, c02
	FNMSUB (aa2, cc02, cc01, cc01)
	FMUL a3, c01, c01
#endif
/* LT: same block solved from the first row downwards (entries 0, 1 and
   3). */
#ifdef LT
	LDF [AO + 0 * SIZE], a1
	LDF [AO + 1 * SIZE], a2
	LDF [AO + 3 * SIZE], a3
	FMUL a1, c01, c01
	FNMSUB (aa2, cc01, cc02, cc02)
	FMUL a3, c02, c02
#endif
/* RN and RT: the triangular block is a single element, so both rows are
   simply scaled by BO[0]. */
#if defined(RN) || defined(RT)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c01, c01
	FMUL a1, c02, c02
#endif
#ifdef LN
	add C1, -2 * SIZE, C1
#endif
/* Write the solved pair back into the packed buffer and into C. */
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
	STF c02, [BO + 1 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
	STF c02, [AO + 1 * SIZE]
#endif
	STF c01, [C1 + 0 * SIZE]
	STF c02, [C1 + 1 * SIZE]
#ifndef LN
	add C1, 2 * SIZE, C1
#endif
#ifdef RT
	sll K, BASE_SHIFT + 1, TEMP1
	add AORIG, TEMP1, AORIG
#endif
/* LT and RN: skip the K - KK unconsumed steps (2 elements per step in
   AO, 1 per step in BO). */
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 1, TEMP2
	sll TEMP1, BASE_SHIFT + 0, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 2, KK
#endif
#ifdef LN
	sub KK, 2, KK
#endif
	add I, -1, I
	cmp I, 0
	bg,pt %icc, .LL72
	nop /* delay slot */
	.align 4
/* .LL80: the last 1x1 tile, skipped when bit 0 of M is clear. */
.LL80:
	and M, 1, I
	cmp I, 0
	ble,pn %icc, .LL89
	nop /* delay slot */
#if defined(LT) || defined(RN)
	mov B, BO
#else
#ifdef LN
	sll K, BASE_SHIFT + 0, TEMP1
	sub AORIG, TEMP1, AORIG
#endif
	sll KK, BASE_SHIFT + 0, TEMP1
	sll KK, BASE_SHIFT + 0, TEMP2
	add AORIG, TEMP1, AO
	add B, TEMP2, BO
#endif
/* Preload four K steps of A and of B. */
	LDF [AO + 0 * SIZE], a1
	LDF [BO + 0 * SIZE], b1
	LDF [AO + 1 * SIZE], a2
	LDF [BO + 1 * SIZE], b2
	LDF [AO + 2 * SIZE], a3
	LDF [BO + 2 * SIZE], b3
	LDF [AO + 3 * SIZE], a4
	LDF [BO + 3 * SIZE], b4
#if defined(LT) || defined(RN)
	sra KK, 2, L
#else
	sub K, KK, L
	sra L, 2, L
#endif
	cmp L, 0
	ble,pn %icc, .LL85
	FCLR (cc01) /* delay slot, the accumulator is cleared on both paths */
	.align 4
/* .LL83: four K steps per pass, all accumulating into c01. */
.LL83:
	prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY
	add L, -1, L
	FMADD (aa1, bb1, cc01, cc01)
	LDF [AO + 4 * SIZE], a1
	LDF [BO + 4 * SIZE], b1
	FMADD (aa2, bb2, cc01, cc01)
	LDF [AO + 5 * SIZE], a2
	LDF [BO + 5 * SIZE], b2
	FMADD (aa3, bb3, cc01, cc01)
	LDF [AO + 6 * SIZE], a3
	LDF [BO + 6 * SIZE], b3
	FMADD (aa4, bb4, cc01, cc01)
	LDF [AO + 7 * SIZE], a4
	LDF [BO + 7 * SIZE], b4
	add AO, 4 * SIZE, AO
	cmp L, 0
	bg,pt %icc, .LL83
	add BO, 4 * SIZE, BO /* delay slot */
	.align 4
/* .LL85: the remaining 0..3 K steps. */
.LL85:
#if defined(LT) || defined(RN)
	and KK, 3, L
#else
	sub K, KK, L
	and L, 3, L
#endif
	cmp L, 0
	ble,a,pn %icc, .LL88
	nop /* annulled delay slot */
	.align 4
/* .LL87: one K step per pass. */
.LL87:
	FMADD (aa1, bb1, cc01, cc01)
	LDF [AO + 1 * SIZE], a1
	LDF [BO + 1 * SIZE], b1
	add AO, 1 * SIZE, AO
	add L, -1, L
	cmp L, 0
	bg,pt %icc, .LL87
	add BO, 1 * SIZE, BO /* delay slot */
	.align 4
/* .LL88: solve step for the 1x1 tile. Both arms of the inner
   conditional rewind by KK - 1 because the tile is 1x1. */
.LL88:
#if defined(LN) || defined(RT)
#ifdef LN
	sub KK, 1, TEMP1
#else
	sub KK, 1, TEMP1
#endif
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 0, TEMP1
	add AORIG, TEMP2, AO
	add B, TEMP1, BO
#endif
#if defined(LN) || defined(LT)
	LDF [BO + 0 * SIZE], a1
	FSUB a1, c01, c01
#else
	LDF [AO + 0 * SIZE], a1
	FSUB a1, c01, c01
#endif
/* Scale by the single triangular entry: AO[0] for the left-side
   variants, BO[0] for the right-side ones. */
#if defined(LN) || defined(LT)
	LDF [AO + 0 * SIZE], a1
	FMUL a1, c01, c01
#endif
#if defined(RN) || defined(RT)
	LDF [BO + 0 * SIZE], a1
	FMUL a1, c01, c01
#endif
#ifdef LN
	add C1, -1 * SIZE, C1
#endif
#if defined(LN) || defined(LT)
	STF c01, [BO + 0 * SIZE]
#else
	STF c01, [AO + 0 * SIZE]
#endif
	STF c01, [C1 + 0 * SIZE]
#ifdef RT
	sll K, BASE_SHIFT + 0, TEMP1
	add AORIG, TEMP1, AORIG
#endif
#if defined(LT) || defined(RN)
	sub K, KK, TEMP1
	sll TEMP1, BASE_SHIFT + 0, TEMP2
	sll TEMP1, BASE_SHIFT + 0, TEMP1
	add AO, TEMP2, AO
	add BO, TEMP1, BO
#endif
#ifdef LT
	add KK, 1, KK
#endif
#ifdef LN
	sub KK, 1, KK
#endif
	.align 4
/* .LL89: end of the single-column panel. B is advanced (LN: by K
   elements, LT and RN: to where BO stopped) and KK moves by 1 for RN and
   RT. */
.LL89:
#ifdef LN
	sll K, BASE_SHIFT, TEMP1
	add B, TEMP1, B
#endif
#if defined(LT) || defined(RN)
	mov BO, B
#endif
#ifdef RN
	add KK, 1, KK
#endif
#ifdef RT
	sub KK, 1, KK
#endif
	.align 4
/* .LL999: common exit point of the panel code. What follows continues
   past the end of this chunk. */
.LL999:
#ifdef TRMMKERNEL
#ifndef
__64BIT__ ld [%sp + STACK_START + 8], %g1 ld [%sp + STACK_START + 12], %g2 ld [%sp + STACK_START + 16], %g3 ld [%sp + STACK_START + 20], %g4 #else ldx [%sp + STACK_START + 32], %g1 ldx [%sp + STACK_START + 40], %g2 ldx [%sp + STACK_START + 48], %g3 ldx [%sp + STACK_START + 56], %g4 #endif #endif return %i7 + 8 clr %o0 EPILOGUE