/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/





// subroutine
//
// Scale the 8x4 accumulator held in q4-q11 by alpha and, unless beta compares
// equal to 0.0, add beta times an 8x4 block read from C.
// C is read as four groups of 8 consecutive floats (two 16-byte loads each),
// with consecutive groups r7 bytes apart.
//
// input arguments:
// r4   <- alpha (address of a float)
// r5   <- beta (address of a float)
// r6   <- C
// r7   <- ldc*sizeof(float)
// q4-q7   <- accumulator, first four floats of groups 0-3
// q8-q11  <- accumulator, last four floats of groups 0-3
//
// macro argument (MACRO_LEVEL>=1 only):
// lc_zero <- label of a word holding 0.0
//
// output arguments:
// q4-q11  <- alpha*accumulator, plus beta*C when beta != 0.0
//
// clobbers: s8, s9, s10 (d4 and the low half of d5), the condition flags, and,
// only when C is read, q0, q1, r8 and r6 (r6 is advanced by 4*r7)

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_8X4_LIB lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_8x4_lib, %function
inner_scale_ab_8x4_lib:
#elif defined(OS_MAC)
_inner_scale_ab_8x4_lib:
#endif
#endif

	flds		s8, [r4, #0] // alpha -> d4[0]
	flds		s9, [r5, #0] // beta  -> d4[1]

#if MACRO_LEVEL>=1
	flds		s10, \lc_zero // 0.0
#else
	flds		s10, 99b // 0.0
#endif

	// accumulator *= alpha; the beta == 0.0 compare is interleaved with the multiplies
	vmul.f32	q4, q4, d4[0]
	vmul.f32	q5, q5, d4[0]
	vmul.f32	q6, q6, d4[0]
	vmul.f32	q7, q7, d4[0]
	fcmpes		s9, s10
	vmul.f32	q8, q8, d4[0]
	vmul.f32	q9, q9, d4[0]
	vmul.f32	q10, q10, d4[0]
	vmul.f32	q11, q11, d4[0]
	fmstat		// copy the floating-point compare flags to the core flags

	beq		0f // end: beta == 0.0, C is not read at all

	add		r8, r6, #16 // r8 walks the second 16 bytes of each group

	// accumulator += beta * C, one r7-strided group per step
	vld1.64		{d0, d1}, [r6], r7
	vld1.64		{d2, d3}, [r8], r7
	vmla.f32	q4, q0, d4[1]
	vmla.f32	q8, q1, d4[1]
	vld1.64		{d0, d1}, [r6], r7
	vld1.64		{d2, d3}, [r8], r7
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q9, q1, d4[1]
	vld1.64		{d0, d1}, [r6], r7
	vld1.64		{d2, d3}, [r8], r7
	vmla.f32	q6, q0, d4[1]
	vmla.f32	q10, q1, d4[1]
	vld1.64		{d0, d1}, [r6], r7
	vld1.64		{d2, d3}, [r8], r7
	vmla.f32	q7, q0, d4[1]
	vmla.f32	q11, q1, d4[1]

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_ab_8x4_lib, .-inner_scale_ab_8x4_lib
#endif
#endif





// subroutine
//
// Store the 8x4 accumulator held in q4-q11 to D as four groups of 8
// consecutive floats (two 16-byte stores each), consecutive groups r5 bytes
// apart.
//
// input arguments:
// r4   <- D
// r5   <- ldd*sizeof(float)
// q4-q7   <- first four floats of groups 0-3
// q8-q11  <- last four floats of groups 0-3
//
// output arguments:
//
// clobbers: r6, and r4 (advanced by 4*r5)

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X4_LIB
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x4_lib, %function
inner_store_8x4_lib:
#elif defined(OS_MAC)
// label must match the `bl _inner_store_8x4_lib` issued by the kernel below
_inner_store_8x4_lib:
#endif
#endif

	add		r6, r4, #16 // r6 walks the second 16 bytes of each group

	vst1.64		{d8, d9}, [r4], r5
	vst1.64		{d16, d17}, [r6], r5
	vst1.64		{d10, d11}, [r4], r5
	vst1.64		{d18, d19}, [r6], r5
	vst1.64		{d12, d13}, [r4], r5
	vst1.64		{d20, d21}, [r6], r5
	vst1.64		{d14, d15}, [r4], r5
	vst1.64		{d22, d23}, [r6], r5

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_store_8x4_lib, .-inner_store_8x4_lib
#endif
#endif





	.align 3
99: // 0
	.word 0
	.word 0





//                                  r0        r1            r2        r3       sp+0      sp+4         sp+8      sp+12    sp+16     sp+20
// void kernel_sgemm_nt_8x4_lib44cc(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int ldc, float *D, int ldd)
//
// Single-precision 8x4 kernel: zeroes the accumulator q4-q11, runs the
// gemm_add_nt inner kernel on A and B, scales the result by alpha, adds
// beta*C when beta != 0.0, and stores the result to D.
// NOTE(review): presumably D = alpha * A * B^T + beta * C; the inner kernel
// is defined outside this file, so confirm there.
// NOTE(review): the stack arguments are read relative to fp, so PROLOGUE
// (defined outside this file) is assumed to leave fp pointing at the first
// stack argument - confirm against its definition.

//	.p2align 4,,15
#if defined(OS_LINUX)
	.global	kernel_sgemm_nt_8x4_lib44cc
	.type	kernel_sgemm_nt_8x4_lib44cc, %function
kernel_sgemm_nt_8x4_lib44cc:
#elif defined(OS_MAC)
// the exported symbol must be the underscore-prefixed label that is defined here
	.global	_kernel_sgemm_nt_8x4_lib44cc
_kernel_sgemm_nt_8x4_lib44cc:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr	d8, 99b
	vldr	d9, 99b
	vmov	q5, q4
	vmov	q6, q4
	vmov	q7, q4
	vmov	q8, q4
	vmov	q9, q4
	vmov	q10, q4
	vmov	q11, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r2 // A
	mov		r6, r3 // sda
	lsl		r6, r6, #4 // 4*sizeof(float)*sda
	ldr		r7, [fp, #0] // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_8x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_8x4_lib4
#endif
#endif



	// call inner blend for generic alpha and beta
	mov		r4, r1 // alpha
	ldr		r5, [fp, #4] // beta
	ldr		r6, [fp, #8] // C
	ldr		r7, [fp, #12] // ldc
	lsl		r7, r7, #2 // sizeof(float)*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X4_LIB 99f
#else
#if defined(OS_LINUX)
	bl	inner_scale_ab_8x4_lib
#elif defined(OS_MAC)
	bl	_inner_scale_ab_8x4_lib
#endif
#endif



	// store n
	ldr		r4, [fp, #16] // D
	ldr		r5, [fp, #20] // ldd
	lsl		r5, r5, #2 // sizeof(float)*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_8X4_LIB
#else
#if defined(OS_LINUX)
	bl	inner_store_8x4_lib
#elif defined(OS_MAC)
	bl	_inner_store_8x4_lib
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_sgemm_nt_8x4_lib44cc, .-kernel_sgemm_nt_8x4_lib44cc
#endif





	.align 3
99: // 0
	.word 0
	.word 0






