/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

3. Neither the name of the OpenBLAS project nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* Abdelrauf (quickwritereader@googlemail.com)
* BLASTEST    : OK
* CTEST       : OK
* TEST        : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************
* Macros for N=4, M=16                                               *
*********************************************************************/

.macro LOAD4x16_1
    LOAD4x16 1
.endm

.macro LOAD4x16_0
    LOAD4x16 0
.endm

/* Load one k-step of the 16x4 tile: sixteen doubles of packed A into
   vs0-vs7 and four doubles of packed B into vs24/vs26, with the
   doubleword-swapped copies in vs25/vs27; \Zero==1 also clears the
   vs32-vs63 accumulators. */
.macro LOAD4x16  Zero
    lxv vs24, 0(BO)
    lxv vs26, 16(BO)
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs27, vs26, vs26, 2
    lxv vs0, 0(AO)
    lxv vs1, 16(AO)
    lxv vs2, 32(AO)
    lxv vs3, 48(AO)
    lxv vs4, 64(AO)
    lxv vs5, 80(AO)
    lxv vs6, 96(AO)
    lxv vs7, 112(AO)
.if \Zero==1
    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33
    xxlxor vs34, vs34, vs34
    xxlxor vs35, vs35, vs35
    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37
    xxlxor vs38, vs38, vs38
    xxlxor vs39, vs39, vs39
    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41
    xxlxor vs42, vs42, vs42
    xxlxor vs43, vs43, vs43
    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45
    xxlxor vs46, vs46, vs46
    xxlxor vs47, vs47, vs47
    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49
    xxlxor vs50, vs50, vs50
    xxlxor vs51, vs51, vs51
    xxlxor vs52, vs52, vs52
    xxlxor vs53, vs53, vs53
    xxlxor vs54, vs54, vs54
    xxlxor vs55, vs55, vs55
    xxlxor vs56, vs56, vs56
    xxlxor vs57, vs57, vs57
    xxlxor vs58, vs58, vs58
    xxlxor vs59, vs59, vs59
    xxlxor vs60, vs60, vs60
    xxlxor vs61, vs61, vs61
    xxlxor vs62, vs62, vs62
    xxlxor vs63, vs63, vs63
.endif
.endm

/* Byte displacements: \ind iterations times unit_size (8-byte doubles)
   times the element count per iteration, plus a fixed offset. */
#define unit_size 8
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)

.macro KERNEL4x16_L1_L2  Index,IsLast
    KERNEL4x16_L1_L2_I AO,BO, 0,0,0,
\Index,\IsLast,0 .endm .macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete .if \First ==1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 .else xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endif lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) .if \First ==1 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 .else xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 .endif lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) xxpermdi vs29, vs28, vs28,2 xxpermdi vs31, vs30, vs30,2 .if \First ==1 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 .else xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 .endif lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) .if \First ==1 xvmuldp vs52, vs4, vs26 xvmuldp vs53, vs5, vs26 xvmuldp vs54, vs6, vs26 xvmuldp vs55, vs7, vs26 .else xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 .endif lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) .if \First ==1 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 xvmuldp vs60, vs4, vs27 xvmuldp vs61, vs5, vs27 xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 .else xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 .endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 .if \Complete==0 lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) .endif xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 .if \Complete==0 lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) lxv vs26, DISP8(\Index,48 
+\OffsetB)(\BREG) xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 .endif xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 .if \Complete==0 lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) .endif xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 .if \Complete==0 lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) .endif xvmaddadp vs52, vs12, vs30 xvmaddadp vs53, vs13, vs30 xvmaddadp vs54, vs14, vs30 xvmaddadp vs55, vs15, vs30 .if \Complete==0 lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) .endif xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 xvmaddadp vs60, vs12, vs31 xvmaddadp vs61, vs13, vs31 xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 .if \IsLast==1 .if \Complete==1 addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) .else addi \AREG, \AREG, DISP32(\Index,256) addi \BREG, \BREG, DISP8(\Index,64) .endif .endif .endm .macro KERNEL4x16 First lxv vs24, 0(BO) lxv vs26, 16(BO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 lxv vs0, 0(AO) lxv vs1, 16(AO) lxv vs2, 32(AO) lxv vs3, 48(AO) lxv vs4, 64(AO) lxv vs5, 80(AO) lxv vs6, 96(AO) lxv vs7, 112(AO) addi BO, BO, 32 addi AO, AO, 128 .if \First==1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 xvmuldp vs52, vs4, vs26 xvmuldp vs53, vs5, vs26 xvmuldp vs54, vs6, vs26 xvmuldp vs55, vs7, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 xvmuldp vs60, vs4, vs27 xvmuldp vs61, vs5, vs27 xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 .else xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 xvmaddadp vs52, vs4, vs26 xvmaddadp vs53, vs5, vs26 xvmaddadp vs54, vs6, vs26 xvmaddadp vs55, vs7, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 xvmaddadp vs60, vs4, vs27 xvmaddadp vs61, vs5, vs27 xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 .endif .endm .macro SAVE4x16_REGS add C2, CO, LDC add C3, C2, LDC add C4, C3, LDC .endm .macro SAVE4x16 #ifndef TRMMKERNEL lxv vs0, 0(CO) lxv vs2, 16(CO) lxv vs4, 32(CO) lxv vs6, 48(CO) #endif xxpermdi vs8, vs40,vs32,1 xxpermdi vs9 ,vs32,vs40,1 #ifndef TRMMKERNEL lxv vs24, 64(CO) lxv vs26, 80(CO) lxv vs28, 96(CO) lxv vs30, 112(CO) #endif xxpermdi vs10, vs41,vs33,1 xxpermdi 
vs11 ,vs33,vs41,1 #ifndef TRMMKERNEL lxv vs1, 0(C2) lxv vs3, 16(C2) lxv vs5, 32(C2) lxv vs7, 48(C2) #endif xxpermdi vs12, vs42,vs34,1 xxpermdi vs13 ,vs34,vs42,1 #ifndef TRMMKERNEL lxv vs25, 64(C2) lxv vs27, 80(C2) #endif xxpermdi vs14, vs43,vs35,1 xxpermdi vs15 ,vs35,vs43,1 #ifndef TRMMKERNEL lxv vs29, 96(C2) lxv vs31, 112(C2) #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs8, alpha_r xvmaddadp vs1, vs9, alpha_r xvmaddadp vs2, vs10, alpha_r xvmaddadp vs3, vs11, alpha_r #else xvmuldp vs0, vs8, alpha_r xvmuldp vs1, vs9, alpha_r xvmuldp vs2, vs10, alpha_r xvmuldp vs3, vs11, alpha_r #endif xxpermdi vs8, vs44,vs36,1 xxpermdi vs9 ,vs36,vs44,1 xxpermdi vs10, vs45,vs37,1 xxpermdi vs11 ,vs37,vs45,1 #ifndef TRMMKERNEL xvmaddadp vs4, vs12, alpha_r xvmaddadp vs5, vs13, alpha_r xvmaddadp vs6, vs14, alpha_r xvmaddadp vs7, vs15, alpha_r #else xvmuldp vs4, vs12, alpha_r xvmuldp vs5, vs13, alpha_r xvmuldp vs6, vs14, alpha_r xvmuldp vs7, vs15, alpha_r #endif xxpermdi vs12, vs46,vs38,1 xxpermdi vs13 ,vs38,vs46,1 xxpermdi vs14, vs47,vs39,1 xxpermdi vs15 ,vs39,vs47,1 #ifndef TRMMKERNEL xvmaddadp vs24, vs8, alpha_r xvmaddadp vs25, vs9, alpha_r xvmaddadp vs26, vs10, alpha_r xvmaddadp vs27, vs11, alpha_r xvmaddadp vs28, vs12, alpha_r xvmaddadp vs29, vs13, alpha_r xvmaddadp vs30, vs14, alpha_r xvmaddadp vs31, vs15, alpha_r #else xvmuldp vs24, vs8, alpha_r xvmuldp vs25, vs9, alpha_r xvmuldp vs26, vs10, alpha_r xvmuldp vs27, vs11, alpha_r xvmuldp vs28, vs12, alpha_r xvmuldp vs29, vs13, alpha_r xvmuldp vs30, vs14, alpha_r xvmuldp vs31, vs15, alpha_r #endif stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) stxv vs24, 64(CO) stxv vs26, 80(CO) stxv vs28, 96(CO) stxv vs30, 112(CO) stxv vs1, 0(C2) stxv vs3, 16(C2) stxv vs5, 32(C2) stxv vs7, 48(C2) stxv vs25, 64(C2) stxv vs27, 80(C2) stxv vs29, 96(C2) stxv vs31, 112(C2) #ifndef TRMMKERNEL lxv vs0, 0(C3) lxv vs2, 16(C3) lxv vs4, 32(C3) lxv vs6, 48(C3) #endif xxpermdi vs8, vs56,vs48,1 xxpermdi vs9 ,vs48,vs56,1 #ifndef TRMMKERNEL lxv vs24, 64(C3) lxv vs26, 80(C3) #endif xxpermdi vs10, vs57,vs49,1 xxpermdi vs11 ,vs49,vs57,1 #ifndef TRMMKERNEL lxv vs28, 96(C3) lxv vs30, 112(C3) #endif xxpermdi vs12, vs58,vs50,1 xxpermdi vs13 ,vs50,vs58,1 #ifndef TRMMKERNEL lxv vs1, 0(C4) lxv vs3, 16(C4) #endif xxpermdi vs14, vs59,vs51,1 xxpermdi vs15 ,vs51,vs59,1 #ifndef TRMMKERNEL lxv vs5, 32(C4) lxv vs7, 48(C4) lxv vs25, 64(C4) lxv vs27, 80(C4) lxv vs29, 96(C4) lxv vs31, 112(C4) #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs8, alpha_r xvmaddadp vs1, vs9, alpha_r xvmaddadp vs2, vs10, alpha_r xvmaddadp vs3, vs11, alpha_r #else xvmuldp vs0, vs8, alpha_r xvmuldp vs1, vs9, alpha_r xvmuldp vs2, vs10, alpha_r xvmuldp vs3, vs11, alpha_r #endif xxpermdi vs8, vs60,vs52,1 xxpermdi vs9 ,vs52,vs60,1 xxpermdi vs10, vs61,vs53,1 xxpermdi vs11 ,vs53,vs61,1 #ifndef TRMMKERNEL xvmaddadp vs4, vs12, alpha_r xvmaddadp vs5, vs13, alpha_r xvmaddadp vs6, vs14, alpha_r xvmaddadp vs7, vs15, alpha_r #else xvmuldp vs4, vs12, alpha_r xvmuldp vs5, vs13, alpha_r xvmuldp vs6, vs14, alpha_r xvmuldp vs7, vs15, alpha_r #endif xxpermdi vs12, vs62,vs54,1 xxpermdi vs13 ,vs54,vs62,1 xxpermdi vs14, vs63,vs55,1 xxpermdi vs15 ,vs55,vs63,1 #ifndef TRMMKERNEL xvmaddadp vs24, vs8, alpha_r xvmaddadp vs25, vs9, alpha_r xvmaddadp vs26, vs10, alpha_r xvmaddadp vs27, vs11, alpha_r xvmaddadp vs28, vs12, alpha_r xvmaddadp vs29, vs13, alpha_r xvmaddadp vs30, vs14, alpha_r xvmaddadp vs31, vs15, alpha_r #else xvmuldp vs24, vs8, alpha_r xvmuldp vs25, vs9, alpha_r xvmuldp vs26, vs10, alpha_r xvmuldp vs27, vs11, alpha_r xvmuldp vs28, vs12, 
alpha_r xvmuldp vs29, vs13, alpha_r xvmuldp vs30, vs14, alpha_r xvmuldp vs31, vs15, alpha_r #endif stxv vs0, 0(C3) stxv vs2, 16(C3) stxv vs4, 32(C3) stxv vs6, 48(C3) stxv vs24, 64(C3) stxv vs26, 80(C3) stxv vs28, 96(C3) stxv vs30, 112(C3) stxv vs1, 0(C4) stxv vs3, 16(C4) stxv vs5, 32(C4) stxv vs7, 48(C4) stxv vs25, 64(C4) stxv vs27, 80(C4) stxv vs29, 96(C4) stxv vs31, 112(C4) addi CO, CO, 128 .endm /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ .macro LOAD4x8_1 LOAD4x8 1 .endm .macro LOAD4x8_0 LOAD4x8 0 .endm .macro LOAD4x8 Zero lxv vs24, 0(BO) lxv vs26, 16(BO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 lxv vs0, 0(AO) lxv vs1, 16(AO) lxv vs2, 32(AO) lxv vs3, 48(AO) .if \Zero==1 xxlxor vs32,vs32,vs32 xxlxor vs33,vs33,vs33 xxlxor vs34,vs34,vs34 xxlxor vs35,vs35,vs35 xxlxor vs40, vs40, vs40 xxlxor vs41, vs41, vs41 xxlxor vs42, vs42, vs42 xxlxor vs43, vs43, vs43 xxlxor vs48, vs48, vs48 xxlxor vs49, vs49, vs49 xxlxor vs50, vs50, vs50 xxlxor vs51, vs51, vs51 xxlxor vs56, vs56, vs56 xxlxor vs57, vs57, vs57 xxlxor vs58, vs58, vs58 xxlxor vs59, vs59, vs59 .endif .endm .macro KERNEL4x8_L1_L2 Index,IsLast KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 .endm .macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 .endm .macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete lxv vs8, DISP16(\Index,0+\OffsetA)(AO) lxv vs9, DISP16(\Index,16+\OffsetA)(AO) .if \First ==1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 .else xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endif lxv vs10, DISP16(\Index,32+\OffsetA)(AO) lxv vs11, DISP16(\Index,48+\OffsetA)(AO) .if \First ==1 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 .else lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 .endif xxpermdi vs29, vs28, vs28,2 xxpermdi vs31, vs30, vs30,2 .if \First ==1 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 .else xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 .endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 .if \Complete==0 lxv vs0, DISP16(\Index,64+\OffsetA)(AO) lxv vs1, DISP16(\Index,80+\OffsetA)(AO) .endif xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 .if \Complete==0 lxv vs2, DISP16(\Index,96+\OffsetA)(AO) lxv vs3, DISP16(\Index,112+\OffsetA)(AO) .endif xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs50, vs10, vs30 xvmaddadp vs51, vs11, vs30 .if \Complete==0 lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) .endif xvmaddadp vs56, vs8, vs31 
xvmaddadp vs57, vs9, vs31 xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 .if \Complete==0 xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 .endif .if \IsLast==1 .if \Complete==1 addi AO, AO, DISP16(\Index,64+\OffsetA) addi BO, BO, DISP8(\Index,32+\OffsetB) .else addi AO, AO, DISP16(\Index,128) addi BO, BO, DISP8(\Index,64) .endif .endif .endm .macro KERNEL4x8 First lxv vs24, 0(BO) lxv vs26, 16(BO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs27, vs26, vs26,2 lxv vs0, 0(AO) lxv vs1, 16(AO) lxv vs2, 32(AO) lxv vs3, 48(AO) addi BO, BO, 32 addi AO, AO, 64 .if \First==1 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs50, vs2, vs26 xvmuldp vs51, vs3, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 .else xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs50, vs2, vs26 xvmaddadp vs51, vs3, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 .endif .endm .macro SAVE4x8 add T2, CO, LDC add T3, T2, LDC add T4, T3, LDC #ifndef TRMMKERNEL lxv vs0, 0(CO) lxv vs2, 16(CO) #endif xxpermdi vs8, vs40,vs32,1 xxpermdi vs9 ,vs32,vs40,1 #ifndef TRMMKERNEL lxv vs4, 32(CO) lxv vs6, 48(CO) #endif xxpermdi vs10, vs41,vs33,1 xxpermdi vs11 ,vs33,vs41,1 #ifndef TRMMKERNEL lxv vs1, 0(T2) lxv vs3, 16(T2) #endif xxpermdi vs12, vs42,vs34,1 xxpermdi vs13 ,vs34,vs42,1 #ifndef TRMMKERNEL lxv vs5, 32(T2) lxv vs7, 48(T2) #endif xxpermdi vs14, vs43,vs35,1 xxpermdi vs15 ,vs35,vs43,1 #ifndef TRMMKERNEL xvmaddadp vs0, vs8, alpha_r xvmaddadp vs1, vs9, alpha_r xvmaddadp vs2, vs10, alpha_r xvmaddadp vs3, vs11, alpha_r xvmaddadp vs4, vs12, alpha_r xvmaddadp vs5, vs13, alpha_r xvmaddadp vs6, vs14, alpha_r xvmaddadp vs7, vs15, alpha_r #else xvmuldp vs0, vs8, alpha_r xvmuldp vs1, vs9, alpha_r xvmuldp vs2, vs10, alpha_r xvmuldp vs3, vs11, alpha_r xvmuldp vs4, vs12, alpha_r xvmuldp vs5, vs13, alpha_r xvmuldp vs6, vs14, alpha_r xvmuldp vs7, vs15, alpha_r #endif stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) stxv vs1, 0(T2) stxv vs3, 16(T2) stxv vs5, 32(T2) stxv vs7, 48(T2) xxpermdi vs8, vs56,vs48,1 xxpermdi vs9 ,vs48,vs56,1 #ifndef TRMMKERNEL lxv vs0, 0(T3) lxv vs2, 16(T3) #endif xxpermdi vs10, vs57,vs49,1 xxpermdi vs11 ,vs49,vs57,1 #ifndef TRMMKERNEL lxv vs4, 32(T3) lxv vs6, 48(T3) #endif xxpermdi vs12, vs58,vs50,1 xxpermdi vs13 ,vs50,vs58,1 #ifndef TRMMKERNEL lxv vs1, 0(T4) lxv vs3, 16(T4) #endif xxpermdi vs14, vs59,vs51,1 xxpermdi vs15 ,vs51,vs59,1 #ifndef TRMMKERNEL lxv vs5, 32(T4) lxv vs7, 48(T4) xvmaddadp vs0, vs8, alpha_r xvmaddadp vs1, vs9, alpha_r xvmaddadp vs2, vs10, alpha_r xvmaddadp vs3, vs11, alpha_r xvmaddadp vs4, vs12, alpha_r xvmaddadp vs5, vs13, alpha_r xvmaddadp vs6, vs14, alpha_r xvmaddadp vs7, vs15, alpha_r #else xvmuldp vs0, vs8, alpha_r xvmuldp vs1, vs9, alpha_r xvmuldp vs2, vs10, alpha_r xvmuldp vs3, vs11, alpha_r xvmuldp vs4, vs12, alpha_r xvmuldp vs5, vs13, alpha_r xvmuldp vs6, vs14, alpha_r xvmuldp vs7, vs15, alpha_r #endif stxv vs0, 0(T3) stxv vs2, 16(T3) stxv vs4, 32(T3) stxv vs6, 48(T3) stxv vs1, 0(T4) stxv vs3, 16(T4) stxv vs5, 32(T4) stxv vs7, 48(T4) addi CO, 
CO, 64 .endm /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ .macro LOAD4x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 .endm .macro KERNEL4x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 .endm .macro KERNEL4x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 .endm .macro KERNEL4x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 .endm .macro KERNEL4x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs49, vs9, vs30 xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 .endm .macro KERNEL4x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs49, vs1, vs26 xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 .endm .macro KERNEL4x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 32 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs49, vs1, vs26 xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 .endm .macro SAVE4x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r xvmaddadp vs1, vs49, alpha_r #else xvmuldp vs0, vs48, alpha_r xvmuldp vs1, vs49, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r xvmaddadp vs9, vs57, alpha_r #else xvmuldp vs8, vs56, alpha_r xvmuldp vs9, vs57, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 addi 
CO, CO, 32 .endm /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ .macro LOAD4x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 .endm .macro KERNEL4x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs56, vs0, vs27 .endm .macro KERNEL4x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO lxvdsx vs30, o16, BO lxvdsx vs31, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs56, vs0, vs27 .endm .macro KERNEL4x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x2_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs48, vs8, vs30 xvmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs48, vs0, vs26 xvmuldp vs56, vs0, vs27 .endm .macro KERNEL4x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO lxvdsx vs26, o16, BO lxvdsx vs27, o24, BO addi AO, AO, 16 addi BO, BO, 32 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs48, vs0, vs26 xvmaddadp vs56, vs0, vs27 .endm .macro SAVE4x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r #else xvmuldp vs8, vs40, alpha_r #endif stxvd2x vs8, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs48, alpha_r #else xvmuldp vs0, vs48, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs56, alpha_r #else xvmuldp vs8, vs56, alpha_r #endif stxvd2x vs8, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ .macro LOAD4x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 .endm .macro KERNEL4x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO lxsdx vs30, o16, BO lxsdx vs31, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 xsmuldp vs48, vs0, vs26 xsmuldp vs56, vs0, vs27 .endm .macro KERNEL4x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO lxsdx vs30, o16, BO lxsdx vs31, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 xsmaddadp vs48, vs0, vs26 xsmaddadp vs56, vs0, vs27 .endm .macro KERNEL4x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 xsmaddadp vs48, vs8, vs30 xsmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x1_E2 
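/* Tail step of the unrolled N=4,M=1 loop: consume the vs8/vs28-vs31
   operands prefetched by the previous step; no further loads and no
   pointer updates are issued here. */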
xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 xsmaddadp vs48, vs8, vs30 xsmaddadp vs56, vs8, vs31 .endm .macro KERNEL4x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 xsmuldp vs48, vs0, vs26 xsmuldp vs56, vs0, vs27 .endm .macro KERNEL4x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO lxsdx vs26, o16, BO lxsdx vs27, o24, BO addi AO, AO, 8 addi BO, BO, 32 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 xsmaddadp vs48, vs0, vs26 xsmaddadp vs56, vs0, vs27 .endm .macro SAVE4x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs40, alpha_r #else xsmuldp vs8, vs40, alpha_r #endif stxsdx vs8, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs48, alpha_r #else xsmuldp vs0, vs48, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs56, alpha_r #else xsmuldp vs8, vs56, alpha_r #endif stxsdx vs8, 0, T1 addi CO, CO, 8 .endm /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ .macro LOAD2x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 .endm .macro KERNEL2x16_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 .endm .macro KERNEL2x16_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 .endm .macro KERNEL2x16_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp 
vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 .endm .macro KERNEL2x16_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 xvmaddadp vs44, vs12, vs29 xvmaddadp vs45, vs13, vs29 xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 .endm .macro KERNEL2x16_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 xvmuldp vs44, vs4, vs25 xvmuldp vs45, vs5, vs25 xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 .endm .macro KERNEL2x16_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 64 addi BO, BO, 16 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 xvmaddadp vs44, vs4, vs25 xvmaddadp vs45, vs5, vs25 xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 .endm .macro SAVE2x16 mr T1, CO addi T2, T1, 64 #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 lxvd2x vs4, 0, T2 lxvd2x vs5, o16, T2 lxvd2x vs6, o32, T2 lxvd2x vs7, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r xvmaddadp vs4, vs36, alpha_r xvmaddadp vs5, vs37, alpha_r xvmaddadp vs6, vs38, alpha_r xvmaddadp vs7, vs39, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r xvmuldp vs4, vs36, alpha_r xvmuldp vs5, vs37, alpha_r xvmuldp vs6, vs38, alpha_r xvmuldp vs7, vs39, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 stxvd2x vs4, 0, T2 stxvd2x vs5, o16, T2 stxvd2x vs6, o32, T2 stxvd2x vs7, o48, T2 add T1, T1, LDC add T2, T2, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 lxvd2x vs10, o32, T1 lxvd2x vs11, o48, T1 lxvd2x vs12, 0, T2 lxvd2x vs13, o16, T2 lxvd2x vs14, o32, T2 lxvd2x vs15, o48, T2 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r xvmaddadp vs10, vs42, alpha_r xvmaddadp vs11, vs43, alpha_r xvmaddadp vs12, vs44, alpha_r xvmaddadp vs13, vs45, alpha_r xvmaddadp vs14, vs46, alpha_r xvmaddadp vs15, vs47, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r xvmuldp vs10, vs42, alpha_r xvmuldp vs11, vs43, alpha_r xvmuldp vs12, vs44, alpha_r xvmuldp vs13, vs45, alpha_r 
    xvmuldp vs14, vs46, alpha_r
    xvmuldp vs15, vs47, alpha_r
#endif
    stxvd2x vs8, 0, T1
    stxvd2x vs9, o16, T1
    stxvd2x vs10, o32, T1
    stxvd2x vs11, o48, T1
    stxvd2x vs12, 0, T2
    stxvd2x vs13, o16, T2
    stxvd2x vs14, o32, T2
    stxvd2x vs15, o48, T2
    addi CO, CO, 128
.endm

/*********************************************************************
* Macros for N=2, M=8                                                *
*********************************************************************/

.macro LOAD2x8_1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    lxvdsx vs25, o8, BO
    addi AO, AO, 64
    addi BO, BO, 16
.endm

.macro KERNEL2x8_I1
    lxvd2x vs8, 0, AO
    lxvd2x vs9, o16, AO
    lxvd2x vs10, o32, AO
    lxvd2x vs11, o48, AO
    lxvdsx vs28, 0, BO
    lxvdsx vs29, o8, BO
    addi AO, AO, 64
    addi BO, BO, 16
    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24
    xvmuldp vs34, vs2, vs24
    xvmuldp vs35, vs3, vs24
    xvmuldp vs40, vs0, vs25
    xvmuldp vs41, vs1, vs25
    xvmuldp vs42, vs2, vs25
    xvmuldp vs43, vs3, vs25
.endm

.macro KERNEL2x8_1
    lxvd2x vs8, 0, AO
    lxvd2x vs9, o16, AO
    lxvd2x vs10, o32, AO
    lxvd2x vs11, o48, AO
    lxvdsx vs28, 0, BO
    lxvdsx vs29, o8, BO
    addi AO, AO, 64
    addi BO, BO, 16
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25
    xvmaddadp vs42, vs2, vs25
    xvmaddadp vs43, vs3, vs25
.endm

.macro KERNEL2x8_2
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    lxvdsx vs25, o8, BO
    addi AO, AO, 64
    addi BO, BO, 16
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
    xvmaddadp vs40, vs8, vs29
    xvmaddadp vs41, vs9, vs29
    xvmaddadp vs42, vs10, vs29
    xvmaddadp vs43, vs11, vs29
.endm

.macro KERNEL2x8_E2
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
    xvmaddadp vs40, vs8, vs29
    xvmaddadp vs41, vs9, vs29
    xvmaddadp vs42, vs10, vs29
    xvmaddadp vs43, vs11, vs29
.endm

.macro KERNEL2x8_SUBI1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    lxvdsx vs25, o8, BO
    addi AO, AO, 64
    addi BO, BO, 16
    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24
    xvmuldp vs34, vs2, vs24
    xvmuldp vs35, vs3, vs24
    xvmuldp vs40, vs0, vs25
    xvmuldp vs41, vs1, vs25
    xvmuldp vs42, vs2, vs25
    xvmuldp vs43, vs3, vs25
.endm

.macro KERNEL2x8_SUB1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    lxvdsx vs25, o8, BO
    addi AO, AO, 64
    addi BO, BO, 16
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25
    xvmaddadp vs42, vs2, vs25
    xvmaddadp vs43, vs3, vs25
.endm

.macro SAVE2x8
    mr T1, CO
#ifndef TRMMKERNEL
    lxvd2x vs0, 0, T1
    lxvd2x vs1, o16, T1
    lxvd2x vs2, o32, T1
    lxvd2x vs3, o48, T1
#endif
#ifndef TRMMKERNEL
    xvmaddadp vs0, vs32, alpha_r
    xvmaddadp vs1, vs33, alpha_r
    xvmaddadp vs2, vs34, alpha_r
    xvmaddadp vs3, vs35, alpha_r
#else
    xvmuldp vs0, vs32, alpha_r
    xvmuldp vs1, vs33, alpha_r
    xvmuldp vs2, vs34, alpha_r
    xvmuldp vs3, vs35, alpha_r
#endif
    stxvd2x vs0, 0, T1
    stxvd2x vs1, o16, T1
    stxvd2x vs2, o32, T1
    stxvd2x vs3, o48, T1
    add T1, T1, LDC
#ifndef TRMMKERNEL
    lxvd2x vs8, 0, T1
    lxvd2x vs9, o16, T1
    lxvd2x vs10, o32, T1
    lxvd2x vs11, o48, T1
#endif
#ifndef TRMMKERNEL
    xvmaddadp vs8, vs40, alpha_r
    xvmaddadp vs9, vs41, alpha_r
    xvmaddadp vs10, vs42, alpha_r
    xvmaddadp vs11, vs43, alpha_r
#else
    xvmuldp vs8, vs40, alpha_r
    xvmuldp vs9, vs41, alpha_r
    xvmuldp vs10, vs42, alpha_r
    xvmuldp vs11,
vs43, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ .macro LOAD2x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 .endm .macro KERNEL2x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 .endm .macro KERNEL2x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 .endm .macro KERNEL2x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 .endm .macro KERNEL2x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 .endm .macro KERNEL2x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 .endm .macro KERNEL2x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 32 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 .endm .macro SAVE2x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 lxvd2x vs9, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r xvmaddadp vs9, vs41, alpha_r #else xvmuldp vs8, vs40, alpha_r xvmuldp vs9, vs41, alpha_r #endif stxvd2x vs8, 0, T1 stxvd2x vs9, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ .macro LOAD2x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 .endm .macro KERNEL2x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 .endm .macro KERNEL2x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO lxvdsx vs29, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 .endm .macro KERNEL2x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x2_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmuldp vs32, vs0, vs24 xvmuldp vs40, vs0, vs25 .endm .macro KERNEL2x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO lxvdsx vs25, o8, BO addi AO, AO, 16 addi BO, BO, 16 xvmaddadp vs32, vs0, vs24 xvmaddadp vs40, vs0, vs25 .endm .macro SAVE2x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif 
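/* First C column of the 2x2 tile: scale accumulator vs32 by alpha and,
   unless building the TRMM kernel, add it to the C values just loaded. */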
#ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxvd2x vs8, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs8, vs40, alpha_r #else xvmuldp vs8, vs40, alpha_r #endif stxvd2x vs8, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ .macro LOAD2x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 .endm .macro KERNEL2x1_I1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 .endm .macro KERNEL2x1_1 lxsdx vs8, 0, AO lxsdx vs28, 0, BO lxsdx vs29, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 .endm .macro KERNEL2x1_2 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x1_E2 xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 .endm .macro KERNEL2x1_SUBI1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmuldp vs32, vs0, vs24 xsmuldp vs40, vs0, vs25 .endm .macro KERNEL2x1_SUB1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO lxsdx vs25, o8, BO addi AO, AO, 8 addi BO, BO, 16 xsmaddadp vs32, vs0, vs24 xsmaddadp vs40, vs0, vs25 .endm .macro SAVE2x1 mr T1, CO #ifndef TRMMKERNEL lxsdx vs0, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs0, vs32, alpha_r #else xsmuldp vs0, vs32, alpha_r #endif stxsdx vs0, 0, T1 add T1, T1, LDC #ifndef TRMMKERNEL lxsdx vs8, 0, T1 #endif #ifndef TRMMKERNEL xsmaddadp vs8, vs40, alpha_r #else xsmuldp vs8, vs40, alpha_r #endif stxsdx vs8, 0, T1 addi CO, CO, 8 .endm /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ .macro LOAD1x16_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 .endm .macro KERNEL1x16_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 xvmuldp vs36, vs4, vs24 xvmuldp vs37, vs5, vs24 xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 .endm .macro KERNEL1x16_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvd2x vs10, o32, AO lxvd2x vs11, o48, AO lxvdsx vs28, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs12, 0, AO lxvd2x vs13, o16, AO lxvd2x vs14, o32, AO lxvd2x vs15, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 xvmaddadp vs36, vs4, vs24 xvmaddadp vs37, vs5, vs24 xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 .endm .macro KERNEL1x16_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 lxvd2x vs4, 0, AO lxvd2x vs5, o16, AO lxvd2x vs6, o32, AO lxvd2x vs7, o48, AO addi AO, AO, 64 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 xvmaddadp vs36, vs12, vs28 xvmaddadp vs37, vs13, vs28 xvmaddadp 
vs38, vs14, vs28
    xvmaddadp vs39, vs15, vs28
.endm

.macro KERNEL1x16_E2
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
    xvmaddadp vs36, vs12, vs28
    xvmaddadp vs37, vs13, vs28
    xvmaddadp vs38, vs14, vs28
    xvmaddadp vs39, vs15, vs28
.endm

.macro KERNEL1x16_SUBI1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
    lxvd2x vs4, 0, AO
    lxvd2x vs5, o16, AO
    lxvd2x vs6, o32, AO
    lxvd2x vs7, o48, AO
    addi AO, AO, 64
    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24
    xvmuldp vs34, vs2, vs24
    xvmuldp vs35, vs3, vs24
    xvmuldp vs36, vs4, vs24
    xvmuldp vs37, vs5, vs24
    xvmuldp vs38, vs6, vs24
    xvmuldp vs39, vs7, vs24
.endm

.macro KERNEL1x16_SUB1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
    lxvd2x vs4, 0, AO
    lxvd2x vs5, o16, AO
    lxvd2x vs6, o32, AO
    lxvd2x vs7, o48, AO
    addi AO, AO, 64
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
    xvmaddadp vs36, vs4, vs24
    xvmaddadp vs37, vs5, vs24
    xvmaddadp vs38, vs6, vs24
    xvmaddadp vs39, vs7, vs24
.endm

.macro SAVE1x16
    mr T1, CO
    addi T2, T1, 64
#ifndef TRMMKERNEL
    lxvd2x vs0, 0, T1
    lxvd2x vs1, o16, T1
    lxvd2x vs2, o32, T1
    lxvd2x vs3, o48, T1
    lxvd2x vs4, 0, T2
    lxvd2x vs5, o16, T2
    lxvd2x vs6, o32, T2
    lxvd2x vs7, o48, T2
#endif
#ifndef TRMMKERNEL
    xvmaddadp vs0, vs32, alpha_r
    xvmaddadp vs1, vs33, alpha_r
    xvmaddadp vs2, vs34, alpha_r
    xvmaddadp vs3, vs35, alpha_r
    xvmaddadp vs4, vs36, alpha_r
    xvmaddadp vs5, vs37, alpha_r
    xvmaddadp vs6, vs38, alpha_r
    xvmaddadp vs7, vs39, alpha_r
#else
    xvmuldp vs0, vs32, alpha_r
    xvmuldp vs1, vs33, alpha_r
    xvmuldp vs2, vs34, alpha_r
    xvmuldp vs3, vs35, alpha_r
    xvmuldp vs4, vs36, alpha_r
    xvmuldp vs5, vs37, alpha_r
    xvmuldp vs6, vs38, alpha_r
    xvmuldp vs7, vs39, alpha_r
#endif
    stxvd2x vs0, 0, T1
    stxvd2x vs1, o16, T1
    stxvd2x vs2, o32, T1
    stxvd2x vs3, o48, T1
    stxvd2x vs4, 0, T2
    stxvd2x vs5, o16, T2
    stxvd2x vs6, o32, T2
    stxvd2x vs7, o48, T2
    addi CO, CO, 128
.endm

/*********************************************************************
* Macros for N=1, M=8                                                *
*********************************************************************/

.macro LOAD1x8_1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
.endm

.macro KERNEL1x8_I1
    lxvd2x vs8, 0, AO
    lxvd2x vs9, o16, AO
    lxvd2x vs10, o32, AO
    lxvd2x vs11, o48, AO
    lxvdsx vs28, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24
    xvmuldp vs34, vs2, vs24
    xvmuldp vs35, vs3, vs24
.endm

.macro KERNEL1x8_1
    lxvd2x vs8, 0, AO
    lxvd2x vs9, o16, AO
    lxvd2x vs10, o32, AO
    lxvd2x vs11, o48, AO
    lxvdsx vs28, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
.endm

.macro KERNEL1x8_2
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
.endm

.macro KERNEL1x8_E2
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
.endm

.macro KERNEL1x8_SUBI1
    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO
    lxvd2x vs2, o32, AO
    lxvd2x vs3, o48, AO
    lxvdsx vs24, 0, BO
    addi AO, AO, 64
    addi BO, BO, 8
    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24
    xvmuldp vs34, vs2, vs24
    xvmuldp vs35, vs3, vs24
.endm .macro KERNEL1x8_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvd2x vs2, o32, AO lxvd2x vs3, o48, AO lxvdsx vs24, 0, BO addi AO, AO, 64 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 .endm .macro SAVE1x8 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 lxvd2x vs2, o32, T1 lxvd2x vs3, o48, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r xvmaddadp vs2, vs34, alpha_r xvmaddadp vs3, vs35, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r xvmuldp vs2, vs34, alpha_r xvmuldp vs3, vs35, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 stxvd2x vs2, o32, T1 stxvd2x vs3, o48, T1 addi CO, CO, 64 .endm /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ .macro LOAD1x4_1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 .endm .macro KERNEL1x4_I1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 .endm .macro KERNEL1x4_1 lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO lxvdsx vs28, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 .endm .macro KERNEL1x4_2 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 .endm .macro KERNEL1x4_E2 xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 .endm .macro KERNEL1x4_SUBI1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 .endm .macro KERNEL1x4_SUB1 lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO lxvdsx vs24, 0, BO addi AO, AO, 32 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 .endm .macro SAVE1x4 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 lxvd2x vs1, o16, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r xvmaddadp vs1, vs33, alpha_r #else xvmuldp vs0, vs32, alpha_r xvmuldp vs1, vs33, alpha_r #endif stxvd2x vs0, 0, T1 stxvd2x vs1, o16, T1 addi CO, CO, 32 .endm /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ .macro LOAD1x2_1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 .endm .macro KERNEL1x2_I1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 .endm .macro KERNEL1x2_1 lxvd2x vs8, 0, AO lxvdsx vs28, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 .endm .macro KERNEL1x2_2 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x2_E2 xvmaddadp vs32, vs8, vs28 .endm .macro KERNEL1x2_SUBI1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmuldp vs32, vs0, vs24 .endm .macro KERNEL1x2_SUB1 lxvd2x vs0, 0, AO lxvdsx vs24, 0, BO addi AO, AO, 16 addi BO, BO, 8 xvmaddadp vs32, vs0, vs24 .endm .macro SAVE1x2 mr T1, CO #ifndef TRMMKERNEL lxvd2x vs0, 0, T1 #endif #ifndef TRMMKERNEL xvmaddadp vs0, vs32, alpha_r #else xvmuldp vs0, vs32, alpha_r #endif stxvd2x vs0, 0, T1 addi CO, CO, 16 .endm /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ .macro LOAD1x1_1 lxsdx vs0, 0, AO lxsdx vs24, 0, BO 
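    /* advance past the single double just loaded from each of A and B */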
    addi AO, AO, 8
    addi BO, BO, 8
.endm

.macro KERNEL1x1_I1
    lxsdx vs8, 0, AO
    lxsdx vs28, 0, BO
    addi AO, AO, 8
    addi BO, BO, 8
    xsmuldp vs32, vs0, vs24
.endm

.macro KERNEL1x1_1
    lxsdx vs8, 0, AO
    lxsdx vs28, 0, BO
    addi AO, AO, 8
    addi BO, BO, 8
    xsmaddadp vs32, vs0, vs24
.endm

.macro KERNEL1x1_2
    lxsdx vs0, 0, AO
    lxsdx vs24, 0, BO
    addi AO, AO, 8
    addi BO, BO, 8
    xsmaddadp vs32, vs8, vs28
.endm

.macro KERNEL1x1_E2
    xsmaddadp vs32, vs8, vs28
.endm

.macro KERNEL1x1_SUBI1
    lxsdx vs0, 0, AO
    lxsdx vs24, 0, BO
    addi AO, AO, 8
    addi BO, BO, 8
    xsmuldp vs32, vs0, vs24
.endm

.macro KERNEL1x1_SUB1
    lxsdx vs0, 0, AO
    lxsdx vs24, 0, BO
    addi AO, AO, 8
    addi BO, BO, 8
    xsmaddadp vs32, vs0, vs24
.endm

.macro SAVE1x1
    mr T1, CO
#ifndef TRMMKERNEL
    lxsdx vs0, 0, T1
#endif
#ifndef TRMMKERNEL
    xsmaddadp vs0, vs32, alpha_r
#else
    xsmuldp vs0, vs32, alpha_r
#endif
    stxsdx vs0, 0, T1
    addi CO, CO, 8
.endm

/****************************TRMM POINTER REFRESH MACROS*************************/

/* REG1 = REG2 * SHIFT_VAL * 8, i.e. convert a per-iteration element
   count (SHIFT_VAL doubles) into a byte offset. */
.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
    slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==8
    slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==4
    slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==2
    slwi \REG1, \REG2, 4
.elseif \SHIFT_VAL==1
    slwi \REG1, \REG2, 3
.endif
.endm

/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
//     ptrbb = bb;
// #else
//     ptrba += off*C_A;
//     ptrbb  = bb + off*C_B;
// #endif
*/
.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* ptrbb = bb; */
    mr \PTR_B, \B_VAL                 /* refresh BPOINT */
#else
    /* ptrba += off*C_A;  ptrbb = bb + off*C_B; */
    SHIFT_REG T4, \OFF_VAL, \C_B      /* byte offset of the B values */
    SHIFT_REG T2, \OFF_VAL, \C_A      /* byte offset of the A values */
    add \PTR_B, \B_VAL, T4            /* advance BO */
    add \PTR_A, \PTR_A, T2            /* advance AO */
#endif
.endm

/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
//     temp = bk - off;
// #elif defined(LEFT)
//     temp = off + INCR_A;    // number of values in A
// #else
//     temp = off + INCR_B;    // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK  TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    /* temp = bk - off; */
    sub \TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
    /* temp = off + INCR_A; // number of values in A */
    addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
    /* temp = off + INCR_B; // number of values in B */
    addi \TEMP_BK, \OFF_VAL, \INCR_B
#endif
.endm

/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
//     temp = bk - off;
// #ifdef LEFT
//     temp -= C_A;    // number of values in A
// #else
//     temp -= C_B;    // number of values in B
// #endif
//     ptrba += temp*C_A;
//     ptrbb += temp*C_B;
// #endif
// #ifdef LEFT
//     off += C_A;    // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE  TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* temp = bk - off; */
    sub \TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
    /* temp -= C_A; // number of values in A */
    addi \TEMP_BK, \TEMP_BK, -\C_A
#else
    /* temp -= C_B; // number of values in B */
    addi \TEMP_BK, \TEMP_BK, -\C_B
#endif
    /* ptrba += temp*C_A;  ptrbb += temp*C_B; */
    SHIFT_REG T4, \TEMP_BK, \C_A
    SHIFT_REG T2, \TEMP_BK, \C_B
    add \PTR_A, \PTR_A, T4            /* ptrba += temp*C_A */
    add \PTR_B, \PTR_B, T2            /* ptrbb += temp*C_B */
#endif
#ifdef LEFT
    /* off += C_A; // number of values in A */
    addi \OFF_VAL, \OFF_VAL, \C_A
#endif
.endm
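
/* For reference only, in the same commented-C style as the TRMM notes
   above: a sketch of the update that every SAVEmxn macro performs,
   ignoring the register-level lane interleaving that the xxpermdi
   shuffles undo before the stores.  The names save_tile and acc are
   illustrative, not part of this kernel; acc stands for the vs32-vs63
   accumulators built up by the KERNELmxn macros, grouped by column of C.
//
// static void save_tile(double *c0, long ldc, double alpha,
//                       int m, int n, const double *acc) {
//     for (int j = 0; j < n; j++)
//         for (int i = 0; i < m; i++)
// #ifndef TRMMKERNEL
//             c0[i + j * ldc] += alpha * acc[j * m + i]; // GEMM: C = alpha*A*B + C
// #else
//             c0[i + j * ldc]  = alpha * acc[j * m + i]; // TRMM: C = alpha*A*B
// #endif
// }
*/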