/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/05 Saar
* 	BLASTEST 		: OK
* 	CTEST			: OK
* 	TEST			: OK
*
**************************************************************************************/

/**************************************************************************************
* Packing copy routine, two columns at a time, for 16-byte elements (two
* doubles each; presumably double-precision complex, going by the zgemm_
* label prefix).
*
* Target  : 32-bit ARM with VFP, GNU assembler syntax, preprocessed by cpp.
* Inputs  : r0 = M   (row count)
*           r1 = N   (column count)
*           r2 = A   (source pointer)
*           r3 = lda (column stride in elements; scaled by 16 to bytes below)
*           B  = destination pointer, read from the first stack argument word
* Output  : r0 = 0
* Uses    : r3, r4, r5, r6, r7, r12, d0 - d7, condition flags
*           (r4 - r9, fp and d8 - d15 are saved on entry and restored on exit)
*
* Order of the data written to B:
*   - for every pair of columns (c, c+1):
*       for every pair of rows (r, r+1):   A[r,c] A[r,c+1] A[r+1,c] A[r+1,c+1]
*       for a leftover odd row r:          A[r,c] A[r,c+1]
*   - for a leftover odd column c:         the M elements of that column in order
*
* Note on the loop guards: asrs, ands and tst leave the V flag unchanged,
* while the LE condition tests Z == 1 or N != V. Before the first subs has
* executed, V still holds whatever the caller left behind, so a signed
* condition directly after those instructions is not reliable. The guards
* below therefore use an explicit cmp against zero (which defines V) for the
* shifted counts, and beq for the single-bit remainder tests.
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_A	r2
#define	OLD_LDA	r3


/******************************************************
* Stack frame (fp = sp after the push, plus 24):
*
* [fp, #4]               first stack argument (B)
* [fp, #-24] - [fp, #0]  saved r4 - r9, fp
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* [fp, #-260]            spill slot for lda in bytes
*******************************************************/

#define	LDA	[fp, #-260 ]

#define	B	[fp, #4 ]

#define M	r0
#define N	r1
#define A	r2

#define	BO	r5

#define	AO1	r6
#define	AO2	r7

#define I	r3
#define	J	r12

#define A_PRE	256

/**************************************************************************************
* Macro definitions
**************************************************************************************/

// Two rows of two columns: loads 32 bytes from AO1 and 32 bytes from AO2 and
// stores them to BO interleaved per row (AO1 row, AO2 row, AO1 row, AO2 row).
// Advances AO1 and AO2 by 32 bytes and BO by 64 bytes. Uses d0 - d7.
.macro COPY2x2

	pld	[ AO1, #A_PRE  ]
	pld	[ AO2, #A_PRE  ]
	fldd	d0 , [ AO1, #0  ]
	fldd	d1 , [ AO1, #8  ]
	fldd	d4 , [ AO1, #16 ]
	fldd	d5 , [ AO1, #24 ]

	fldd	d2 , [ AO2, #0  ]
	fldd	d3 , [ AO2, #8  ]
	add	AO1, AO1, #32
	fldd	d6 , [ AO2, #16 ]
	fldd	d7 , [ AO2, #24 ]

	vstmia.f64	BO!, { d0 - d7 }
	add	AO2, AO2, #32

.endm


// One row of two columns: 16 bytes from AO1 followed by 16 bytes from AO2.
// Advances AO1 and AO2 by 16 bytes and BO by 32 bytes. Uses d0 - d3.
.macro COPY1x2

	fldd	d0 , [ AO1, #0  ]
	fldd	d1 , [ AO1, #8  ]
	fldd	d2 , [ AO2, #0  ]
	fldd	d3 , [ AO2, #8  ]

	add	AO1, AO1, #16
	vstmia.f64	BO!, { d0 - d3 }
	add	AO2, AO2, #16

.endm

// Two rows of one column: 32 contiguous bytes from AO1.
// Advances AO1 and BO by 32 bytes. Uses d0 - d3.
.macro COPY2x1

	fldd	d0 , [ AO1, #0  ]
	fldd	d1 , [ AO1, #8  ]
	fldd	d2 , [ AO1, #16 ]
	fldd	d3 , [ AO1, #24 ]

	vstmia.f64	BO!, { d0 - d3 }
	add	AO1, AO1, #32

.endm


// One row of one column: 16 bytes from AO1.
// Advances AO1 and BO by 16 bytes. Uses d0 - d1.
.macro COPY1x1

	fldd	d0 , [ AO1, #0  ]
	fldd	d1 , [ AO1, #8  ]

	vstmia.f64	BO!, { d0 - d1 }
	add	AO1, AO1, #16

.endm





/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE				// reserve stack


	lsl	r3, r3, #4					// lda = lda * 8 * 2
	str	r3, LDA						// r3 is reused as I below

	sub	r4, fp, #128
	vstm	r4, { d8 - d15} 				// store floating point registers

	ldr	BO, B

/*********************************************************************************************/

zgemm_ncopy_L2_BEGIN:

	asr	J, N, #1					// J = N / 2
	cmp	J, #0						// defines N, Z and V for ble
	ble	zgemm_ncopy_L1_BEGIN

zgemm_ncopy_L2_M2_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	A  , AO2, r4 					// A = A + 2 * LDA

	asr	I, M, #1					// I = M / 2
	cmp	I, #0						// defines N, Z and V for ble
	ble	zgemm_ncopy_L2_M2_40

zgemm_ncopy_L2_M2_20:

	COPY2x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_20


zgemm_ncopy_L2_M2_40:

	ands	I, M , #1					// I = M % 2, result is 0 or 1
	beq	zgemm_ncopy_L2_M2_END

zgemm_ncopy_L2_M2_60:

	COPY1x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_60


zgemm_ncopy_L2_M2_END:

	subs    J , J, #1					// j--
	bne	zgemm_ncopy_L2_M2_BEGIN


/*********************************************************************************************/

zgemm_ncopy_L1_BEGIN:

	tst	N, #1						// odd column left over?
	beq	zgemm_ncopy_L999


zgemm_ncopy_L1_M2_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	A  , AO1, r4 					// A = A + 1 * LDA

	asr	I, M, #1					// I = M / 2
	cmp	I, #0						// defines N, Z and V for ble
	ble	zgemm_ncopy_L1_M2_40

zgemm_ncopy_L1_M2_20:

	COPY2x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_20


zgemm_ncopy_L1_M2_40:

	ands	I, M , #1					// I = M % 2, result is 0 or 1
	beq	zgemm_ncopy_L1_M2_END

zgemm_ncopy_L1_M2_60:

	COPY1x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_60


zgemm_ncopy_L1_M2_END:



zgemm_ncopy_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}					// restore floating point registers

	movs	r0, #0						// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
