/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/05 Saar
* 	BLASTEST 		: OK
* 	CTEST			: OK
* 	TEST			: OK
*
**************************************************************************************/

/**************************************************************************************
* Single-precision "ncopy" packing kernel, 4-column panels, ARM VFP.
*
* Inputs as read by the code below:
*	r0		M	number of 4-byte elements copied from each column
*	r1		N	number of columns
*	r2		A	source pointer; column j starts at A + j * lda * 4 bytes
*	r3		lda	column stride in elements (scaled by 4 on entry)
*	[sp] on entry	B	destination pointer (first stack argument)
*
* Output layout written through B (pointer post-incremented by every store):
*	- for each group of 4 columns: for each row i, the 4 values
*	  a(i,j), a(i,j+1), a(i,j+2), a(i,j+3) are stored contiguously;
*	- if N has bit 1 set: one group of 2 columns, stored pairwise per row;
*	- if N has bit 0 set: one final column copied straight through.
*
* Returns 0 in r0.
*
* The exported symbol name is produced by the PROLOGUE macro from common.h
* and is not visible in this file.
*
* All loop-guard branches are written so that they only depend on flags the
* immediately preceding instruction defines.  Flag-setting shifts and
* tst/ands leave V untouched, and "ble" reads V, so those combinations would
* otherwise depend on whatever V value the caller left behind.
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_A	r2
#define	OLD_LDA	r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*
* Frame layout (fp = entry sp - 4 after the push below):
*	[fp, #4]	first stack argument (B)
*	[fp, #-24]	lowest pushed register (r4)
*	[fp, #-260]	spill slot for lda in bytes
*	[fp, #-280]	sp after reserving STACKSIZE
*******************************************************/

#define LDA	[fp, #-260 ]

#define B	[fp, #4 ]

#define M	r0
#define N	r1
#define A	r2

#define	BO	r5

#define	AO1	r6
#define	AO2	r7
#define	AO3	r8
#define	AO4	r9

#define I	r3
#define	J	r12

#define A_PRE	192

/**************************************************************************************
* Macro definitions
*
* COPYrxc copies r rows from each of c columns (AO1..AOc), interleaving the
* columns per row, stores the result at BO and advances BO and every used
* column pointer.  The pointer increments are spread between the loads and
* stores; they do not affect the data that is copied.
**************************************************************************************/

/* 4 rows x 4 columns: 16 floats in, 16 floats out (64 bytes). */
.macro COPY4x4

	flds	s0 , [ AO1, #0  ]		// row 0 of columns 0..3 -> s0..s3
	flds	s1 , [ AO2, #0  ]
	flds	s2 , [ AO3, #0  ]
	flds	s3 , [ AO4, #0  ]

	flds	s4 , [ AO1, #4  ]		// rows 1..3 of column 0 -> s4, s8, s12
	flds	s8 , [ AO1, #8  ]
	flds	s12, [ AO1, #12 ]

	flds	s5 , [ AO2, #4  ]		// rows 1..3 of column 1 -> s5, s9, s13
	add	AO1, AO1, #16
	flds	s9 , [ AO2, #8  ]
	flds	s13, [ AO2, #12 ]

	flds	s6 , [ AO3, #4  ]		// rows 1..3 of column 2 -> s6, s10, s14
	add	AO2, AO2, #16
	flds	s10, [ AO3, #8  ]
	flds	s14, [ AO3, #12 ]

	flds	s7 , [ AO4, #4  ]		// rows 1..3 of column 3 -> s7, s11, s15
	add	AO3, AO3, #16
	flds	s11, [ AO4, #8  ]
	flds	s15, [ AO4, #12 ]

	vstmia.f32 BO!, { s0 - s3 }		// row 0
	add	AO4, AO4, #16
	vstmia.f32 BO!, { s4 - s7 }		// row 1
	vstmia.f32 BO!, { s8 - s15 }		// rows 2 and 3

.endm

/* 1 row x 4 columns: 4 floats. */
.macro COPY1x4

	flds	s0 , [ AO1, #0  ]
	flds	s1 , [ AO2, #0  ]
	add	AO1, AO1, #4
	flds	s2 , [ AO3, #0  ]
	add	AO2, AO2, #4
	flds	s3 , [ AO4, #0  ]

	add	AO3, AO3, #4
	vstmia.f32 BO!, { s0 - s3 }
	add	AO4, AO4, #4

.endm

/* 4 rows x 2 columns: column 0 in even registers, column 1 in odd ones. */
.macro COPY4x2

	flds	s0 , [ AO1, #0  ]
	flds	s2 , [ AO1, #4  ]
	flds	s4 , [ AO1, #8  ]
	flds	s6 , [ AO1, #12 ]

	flds	s1 , [ AO2, #0  ]
	flds	s3 , [ AO2, #4  ]
	add	AO1, AO1, #16
	flds	s5 , [ AO2, #8  ]
	flds	s7 , [ AO2, #12 ]

	vstmia.f32 BO!, { s0 - s7 }
	add	AO2, AO2, #16

.endm


/* 1 row x 2 columns: 2 floats. */
.macro COPY1x2

	flds	s0 , [ AO1, #0  ]
	flds	s1 , [ AO2, #0  ]
	add	AO1, AO1, #4

	vstmia.f32 BO!, { s0 - s1 }
	add	AO2, AO2, #4

.endm

/* 4 rows x 1 column: straight copy of 4 floats. */
.macro COPY4x1

	flds	s0 , [ AO1, #0  ]
	flds	s1 , [ AO1, #4  ]
	flds	s2 , [ AO1, #8  ]
	flds	s3 , [ AO1, #12 ]

	vstmia.f32 BO!, { s0 - s3 }
	add	AO1, AO1, #16

.endm


/* 1 row x 1 column: straight copy of 1 float. */
.macro COPY1x1

	flds	s0 , [ AO1, #0  ]

	vstmia.f32 BO!, { s0 }
	add	AO1, AO1, #4

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24				// fp = entry sp - 4
	sub	sp, sp, #STACKSIZE			// reserve stack


	lsl	r3, r3, #2				// lda = lda * 4 (bytes per column)
	str	r3, LDA					// r3 is reused as I below

	sub	r4, fp, #128
	vstm	r4, { s8 - s15} 			// store floating point registers

	ldr	BO, B

/*********************************************************************************************/
/* Panels of 4 columns                                                                       */
/*********************************************************************************************/

sgemm_ncopy_L4_BEGIN:

	asr	J, N, #2				// J = N / 4
	cmp	J, #0					// cmp defines N, Z, C and V for ble
	ble	sgemm_ncopy_L2_BEGIN

sgemm_ncopy_L4_M4_BEGIN:

	mov	AO1, A					// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	AO3, AO2, r4
	add	AO4, AO3, r4
	add	A  , AO4, r4				// A = A + 4 * LDA

	asr	I, M, #2				// I = M / 4
	cmp	I, #0
	ble	sgemm_ncopy_L4_M4_40

	/* Unrolled twice; the prefetches are issued once per two 4x4 tiles. */
sgemm_ncopy_L4_M4_20:

	pld	[ AO1, #A_PRE ]
	pld	[ AO2, #A_PRE ]
	pld	[ AO3, #A_PRE ]
	pld	[ AO4, #A_PRE ]
	COPY4x4

	subs	I , I , #1
	ble	sgemm_ncopy_L4_M4_40			// odd tile count: leave after first half

	COPY4x4

	subs	I , I , #1
	bne	sgemm_ncopy_L4_M4_20


sgemm_ncopy_L4_M4_40:

	ands	I, M , #3				// remaining rows (M mod 4)
	beq	sgemm_ncopy_L4_M4_END

sgemm_ncopy_L4_M4_60:

	COPY1x4

	subs	I , I , #1
	bne	sgemm_ncopy_L4_M4_60


sgemm_ncopy_L4_M4_END:

	subs	J , J, #1				// j--
	bne	sgemm_ncopy_L4_M4_BEGIN



/*********************************************************************************************/
/* Panel of 2 columns (only if bit 1 of N is set)                                            */
/*********************************************************************************************/

sgemm_ncopy_L2_BEGIN:

	tst	N, #3					// no leftover columns at all?
	beq	sgemm_ncopy_L999

	tst	N, #2
	beq	sgemm_ncopy_L1_BEGIN

sgemm_ncopy_L2_M4_BEGIN:

	mov	AO1, A					// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	A  , AO2, r4 				// A = A + 2 * LDA

	asr	I, M, #2				// I = M / 4
	cmp	I, #0
	ble	sgemm_ncopy_L2_M4_40

sgemm_ncopy_L2_M4_20:

	COPY4x2

	subs	I , I , #1
	bne	sgemm_ncopy_L2_M4_20


sgemm_ncopy_L2_M4_40:

	ands	I, M , #3				// remaining rows (M mod 4)
	beq	sgemm_ncopy_L2_M4_END

sgemm_ncopy_L2_M4_60:

	COPY1x2

	subs	I , I , #1
	bne	sgemm_ncopy_L2_M4_60


sgemm_ncopy_L2_M4_END:


/*********************************************************************************************/
/* Final single column (only if bit 0 of N is set)                                           */
/*********************************************************************************************/

sgemm_ncopy_L1_BEGIN:

	tst	N, #1
	beq	sgemm_ncopy_L999


sgemm_ncopy_L1_M4_BEGIN:

	mov	AO1, A					// AO1 = A
	ldr	r4 , LDA
	add	A  , AO1, r4 				// A = A + 1 * LDA

	asr	I, M, #2				// I = M / 4
	cmp	I, #0
	ble	sgemm_ncopy_L1_M4_40

sgemm_ncopy_L1_M4_20:

	COPY4x1

	subs	I , I , #1
	bne	sgemm_ncopy_L1_M4_20


sgemm_ncopy_L1_M4_40:

	ands	I, M , #3				// remaining rows (M mod 4)
	beq	sgemm_ncopy_L1_M4_END

sgemm_ncopy_L1_M4_60:

	COPY1x1

	subs	I , I , #1
	bne	sgemm_ncopy_L1_M4_60


sgemm_ncopy_L1_M4_END:



sgemm_ncopy_L999:

	sub	r3, fp, #128
	vldm	r3, { s8 - s15}				// restore floating point registers

	movs	r0, #0					// set return value
	sub	sp, fp, #24				// drop the STACKSIZE reservation
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
