/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/07 Saar
*	BLASTEST	: OK
*	CTEST		: OK
*	TEST		: OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

/* Incoming arguments in registers r0 - r3. */
#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_A	r2
#define	OLD_LDA	r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* B is read from the word just above the seven registers pushed in the
 * prologue (fp = sp + 24 after the push, so [fp, #4] is the stack word
 * that was at the top of the stack on entry).
 * A is a spill slot inside the STACKSIZE area reserved below fp. */
#define B	[fp, #4 ]
#define A	[fp, #-248 ]

#define	M	r0
#define	N	r1
#define	M4	r2

#define	LDA	r5

#define	AO1	r6
#define	BO1	r7
#define	BO2	r8

#define I	r4
#define	J	r12

/* Prefetch distance in bytes used by the pld hints in COPY2x2. */
#define A_PRE	256

/**************************************************************************************
* Macro definitions
*
* Every macro moves 64-bit VFP registers (d0 ...) from the source pointer AO1
* to one of the destination pointers and then advances both pointers.
* One element is 16 bytes (two doubles), matching the "SIZE * 2" scaling used
* by the shifts in the main routine.
*
*   COPYixj : i = elements taken along AO1, j = lines (AO1 and AO1 + LDA) read.
*   Full pairs are written through BO1, which is advanced by M4 bytes.
*   The single leftover element is written through BO2, which advances linearly.
*   All macros using two lines clobber r3.
**************************************************************************************/
.macro COPY2x2

	pld	[ AO1, #A_PRE  ]
	vldmia.f64	AO1, { d0 - d3 }		// 2 elements from the first line

	add	r3, AO1, LDA				// r3 = same position in the next line
	pld	[ r3, #A_PRE  ]
	vldmia.f64	r3, { d4 - d7 }			// 2 elements from the second line

	vstmia.f64	BO1, { d0 - d7 }		// 64 contiguous bytes to BO1
	add	AO1, AO1, #32
	add	BO1, BO1, M4

.endm

.macro COPY1x2

	vldmia.f64	AO1, { d0 -d1 }			// 1 element from the first line

	add	r3, AO1, LDA
	vldmia.f64	r3, { d2 - d3 }			// 1 element from the second line

	vstmia.f64	BO2, { d0 - d3 }		// 32 contiguous bytes to BO2
	add	AO1, AO1, #16
	add	BO2, BO2, #32

.endm

/*************************************************************************************************************************/
.macro COPY2x1

	vldmia.f64	AO1, { d0 - d3 }		// 2 elements from the single line

	vstmia.f64	BO1, { d0 - d3 }		// 32 contiguous bytes to BO1
	add	AO1, AO1, #32
	add	BO1, BO1, M4

.endm

.macro COPY1x1

	vldmia.f64	AO1, { d0 - d1 }		// 1 element from the single line

	vstmia.f64	BO2, { d0 - d1 }		// 16 contiguous bytes to BO2
	add	AO1, AO1, #16
	add	BO2, BO2, #16

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* Copy routine (entry/exit symbols come from PROLOGUE / EPILOGUE in common.h).
*
* In  : r0 = M, r1 = N, r2 = A, r3 = LDA (in elements), B = word at [fp, #4]
* Out : r0 = 0
*
* Outer loop : M / 2 passes over two lines of A (A advances by 2 * LDA),
*              then one pass over a single line when M is odd.
* Inner loop : N / 2 two-element copies through BO1, then one single-element
*              copy through BO2 when N is odd.
* BO2 starts at B + (N & -2) * M * 16 bytes.
*
* Saved/restored : r4 - r9, fp, d8 - d15.  Used without saving : r0 - r3, r12,
* d0 - d7.  No other routine is called.
*
* Flag handling : TST and shift-with-S instructions do not write the V flag, so
* an LE branch directly after them would depend on the V value left by the
* caller.  Counts are therefore tested with an explicit cmp against 0 and the
* parity tests branch on Z only (beq).
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE				// reserve stack

	str	OLD_A, A					// store A

	lsl	LDA, OLD_LDA, #4				// lda = lda * SIZE * 2

	sub	r4, fp, #128
	vstm	r4, { d8 - d15} 				// store floating point registers

	lsl	r4 , M, #4					// r4 = M * SIZE * 2

	ldr	r3, B

	and	BO2 , N , #-2					// BO2 = N rounded down to even

	mul	BO2, BO2, r4					// BO2 = (N & -2) * M * SIZE * 2

	add	BO2 , BO2, r3					// BO2 = B + byte offset of the tail area

	lsl	M4, M, #5					// M4 = M * 2 * SIZE * 2

zgemm_tcopy_L2_BEGIN:

	asr	J, M, #1					// J = M / 2
	cmp	J, #0						// cmp defines N, Z and V for ble
	ble	zgemm_tcopy_L1_BEGIN

zgemm_tcopy_L2_M2_BEGIN:

	ldr	AO1, A						// AO1 = A
	lsl	r3, LDA, #1					// r3 = 2 * LDA
	add	r3, r3 , AO1					// A = A + 2 * LDA
	str	r3, A						// store A

	ldr	BO1, B
	add	r3, BO1, #64					// B = B + 4 * SIZE *2
	str	r3, B

	asr	I, N, #1					// I = N / 2
	cmp	I, #0
	ble	zgemm_tcopy_L2_M2_60

zgemm_tcopy_L2_M2_40:

	COPY2x2
	subs	I, I, #1
	bne	zgemm_tcopy_L2_M2_40

zgemm_tcopy_L2_M2_60:

	tst	N , #1						// odd N: one element left per line
	beq	zgemm_tcopy_L2_M2_END

	COPY1x2


zgemm_tcopy_L2_M2_END:

	subs	J , J, #1					// j--
	bne	zgemm_tcopy_L2_M2_BEGIN

/*********************************************************************************************/

zgemm_tcopy_L1_BEGIN:

	tst	M, #1						// odd M: one line left
	beq	zgemm_tcopy_L999


zgemm_tcopy_L1_M2_BEGIN:

	ldr	AO1, A						// AO1 = A
	add	r3, LDA , AO1					// A = A + 1 * LDA
	str	r3, A						// store A

	ldr	BO1, B
	add	r3, BO1, #32					// B = B + 2 * SIZE *2
	str	r3, B

	asr	I, N, #1					// I = N / 2
	cmp	I, #0
	ble	zgemm_tcopy_L1_M2_60


zgemm_tcopy_L1_M2_40:

	COPY2x1
	subs	I, I, #1
	bne	zgemm_tcopy_L1_M2_40

zgemm_tcopy_L1_M2_60:

	tst	N , #1						// odd N: one element left
	beq	zgemm_tcopy_L1_M2_END

	COPY1x1


zgemm_tcopy_L1_M2_END:



zgemm_tcopy_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}					// restore floating point registers

	mov	r0, #0						// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
