/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/14 Saar
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*
**************************************************************************************/

/**************************************************************************************
* SWAP kernel for ARM VFP: exchanges the elements of vector X with those of vector Y.
*
* One source serves four variants selected by the preprocessor:
*   - DOUBLE  selects 64-bit (d registers) instead of 32-bit (s registers) elements
*   - COMPLEX makes every element a pair of values (twice the element size)
*
* Register roles (see the defines below):
*   r0  = N      element count
*   r3  = X      pointer to the first vector
*   r1  = Y      pointer to the second vector
*   r2  = INC_X  stride of X in elements, scaled to bytes on the strided path
*   r4  = INC_Y  stride of Y in elements, scaled to bytes on the strided path
*   r12 = I      loop counter
*
* Arguments:
*   N is taken from r0 as received.
*   When __ARM_PCS_VFP is defined, X is taken from r3 as received and
*   INC_X, Y, INC_Y are read from the caller stack.
*   When __ARM_PCS_VFP is not defined, X is read from the caller stack as well,
*   at an offset that depends on COMPLEX/DOUBLE.
*   NOTE(review): the offsets presumably skip floating-point dummy arguments that
*   the base ABI places in core registers / on the stack - confirm against the
*   C prototype of the caller.
*
* Returns: r0 = 0.
* Saves/restores: r4, fp.   Scratch: r1-r3, r12, s0-s7 / d0-d7, flags.
* Nothing is done when N <= 0, INC_X == 0 or INC_Y == 0.
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256		// not referenced in this file

/* Offsets are relative to fp, which the prologue sets to the value sp had on entry. */

#if !defined(__ARM_PCS_VFP)

#if !defined(COMPLEX)

#if !defined(DOUBLE)
#define OLD_X		[fp, #0 ]
#define OLD_INC_X	[fp, #4 ]
#define OLD_Y		[fp, #8 ]
#define OLD_INC_Y	[fp, #12 ]
#else
#define OLD_X		[fp, #8 ]
#define OLD_INC_X	[fp, #12]
#define OLD_Y		[fp, #16]
#define OLD_INC_Y	[fp, #20]
#endif

#else	//COMPLEX

#if !defined(DOUBLE)
#define OLD_X		[fp, #4 ]
#define OLD_INC_X	[fp, #8 ]
#define OLD_Y		[fp, #12 ]
#define OLD_INC_Y	[fp, #16 ]
#else
#define OLD_X		[fp, #16]
#define OLD_INC_X	[fp, #20]
#define OLD_Y		[fp, #24]
#define OLD_INC_Y	[fp, #28]
#endif

#endif	// !defined(COMPLEX)

#else	// defined(__ARM_PCS_VFP): X is already in r3
#define OLD_INC_X	[fp, #0 ]
#define OLD_Y		[fp, #4 ]
#define OLD_INC_Y	[fp, #8 ]
#endif


#define	N	r0
#define	Y	r1
#define	INC_X	r2
#define	X	r3
#define	INC_Y	r4

#define	I	r12

#define	X_PRE	512		// prefetch distance in bytes used by pld

/**************************************************************************************
* Macro definitions
*
* KERNEL_F4 : swap 4 elements, both pointers advance by write-back (contiguous path)
* KERNEL_F1 : swap 1 element,  both pointers advance by write-back (contiguous path)
* KERNEL_S1 : swap 1 element,  pointers advance by INC_X / INC_Y bytes (strided path)
*
* Every macro loads from both vectors before storing to either one.
**************************************************************************************/

/*****************************************************************************************/



#if !defined(COMPLEX)

#if defined(DOUBLE)

/* real, double precision */

.macro KERNEL_F4

	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	vldmia.f64	X,  { d0 - d3 }
	vldmia.f64	Y,  { d4 - d7 }
	vstmia.f64	Y!, { d0 - d3 }
	vstmia.f64	X!, { d4 - d7}

.endm


.macro KERNEL_F1

	vldmia.f64	X,  { d0 }
	vldmia.f64	Y,  { d4 }
	vstmia.f64	Y!, { d0 }
	vstmia.f64	X!, { d4 }

.endm

.macro KERNEL_S1

	vldmia.f64	X,  { d0 }
	vldmia.f64	Y,  { d4 }
	vstmia.f64	Y,  { d0 }
	vstmia.f64	X,  { d4 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

.endm

#else

/* real, single precision (prefetch is issued by the caller loop, not by KERNEL_F4) */

.macro KERNEL_F4

	vldmia.f32	X,  { s0 - s3 }
	vldmia.f32	Y,  { s4 - s7 }
	vstmia.f32	Y!, { s0 - s3 }
	vstmia.f32	X!, { s4 - s7}

.endm


.macro KERNEL_F1

	vldmia.f32	X,  { s0 }
	vldmia.f32	Y,  { s4 }
	vstmia.f32	Y!, { s0 }
	vstmia.f32	X!, { s4 }

.endm

.macro KERNEL_S1

	vldmia.f32	X,  { s0 }
	vldmia.f32	Y,  { s4 }
	vstmia.f32	Y,  { s0 }
	vstmia.f32	X,  { s4 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

.endm


#endif

#else

#if defined(DOUBLE)

/* complex, double precision: 4 elements = 8 doubles, moved in two rounds of 4 */

.macro KERNEL_F4

	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	vldmia.f64	X,  { d0 - d3 }
	vldmia.f64	Y,  { d4 - d7 }
	vstmia.f64	Y!, { d0 - d3 }
	vstmia.f64	X!, { d4 - d7}

	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	vldmia.f64	X,  { d0 - d3 }
	vldmia.f64	Y,  { d4 - d7 }
	vstmia.f64	Y!, { d0 - d3 }
	vstmia.f64	X!, { d4 - d7}

.endm

.macro KERNEL_F1

	vldmia.f64	X,  { d0 - d1 }
	vldmia.f64	Y,  { d4 - d5 }
	vstmia.f64	Y!, { d0 - d1 }
	vstmia.f64	X!, { d4 - d5 }

.endm

.macro KERNEL_S1

	vldmia.f64	X,  { d0 - d1 }
	vldmia.f64	Y,  { d4 - d5 }
	vstmia.f64	Y,  { d0 - d1 }
	vstmia.f64	X,  { d4 - d5 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

.endm


#else

/* complex, single precision: 4 elements = 8 floats, moved in two rounds of 4 */

.macro KERNEL_F4

	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	vldmia.f32	X,  { s0 - s3 }
	vldmia.f32	Y,  { s4 - s7 }
	vstmia.f32	Y!, { s0 - s3 }
	vstmia.f32	X!, { s4 - s7}

	vldmia.f32	X,  { s0 - s3 }
	vldmia.f32	Y,  { s4 - s7 }
	vstmia.f32	Y!, { s0 - s3 }
	vstmia.f32	X!, { s4 - s7}

.endm

.macro KERNEL_F1

	vldmia.f32	X,  { s0 - s1 }
	vldmia.f32	Y,  { s4 - s5 }
	vstmia.f32	Y!, { s0 - s1 }
	vstmia.f32	X!, { s4 - s5 }

.endm

.macro KERNEL_S1

	vldmia.f32	X,  { s0 - s1 }
	vldmia.f32	Y,  { s4 - s5 }
	vstmia.f32	Y,  { s0 - s1 }
	vstmia.f32	X,  { s4 - s5 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

.endm



#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5
	push	{r4 , fp}
	add	fp, sp, #8			// fp = sp at entry, base for the OLD_* stack arguments

#if !defined(__ARM_PCS_VFP)
	ldr	X, OLD_X
#endif
	ldr	INC_X , OLD_INC_X
	ldr	Y, OLD_Y
	ldr	INC_Y , OLD_INC_Y


	cmp	N, #0				// nothing to do for N <= 0
	ble	swap_kernel_L999

	cmp	INC_X, #0			// nothing to do for a zero stride
	beq	swap_kernel_L999

	cmp	INC_Y, #0
	beq	swap_kernel_L999

	cmp	INC_X, #1			// any non-unit stride takes the strided path
	bne	swap_kernel_S_BEGIN

	cmp	INC_Y, #1
	bne	swap_kernel_S_BEGIN


/* Contiguous path: INC_X == 1 and INC_Y == 1 */

swap_kernel_F_BEGIN:


	// N > 0 here, so I >= 0. ASRS does not update the V flag, so only Z is
	// tested; a signed BLE would depend on V left behind by an earlier CMP.
	asrs	I, N, #2			// I = N / 4
	beq	swap_kernel_F1

	.align 5

swap_kernel_F4:

#if !defined(COMPLEX) && !defined(DOUBLE)
	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
#endif

	// Loop body is unrolled twice; the first exit check handles an odd I.
	KERNEL_F4

	subs	I, I, #1
	ble	swap_kernel_F1

	KERNEL_F4

	subs	I, I, #1
	bne	swap_kernel_F4

swap_kernel_F1:

	// ANDS does not update the V flag either: test Z only.
	ands	I, N, #3			// I = N % 4 leftover elements
	beq	swap_kernel_L999

swap_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	swap_kernel_F10

	b	swap_kernel_L999

/* Strided path: convert the element strides to byte strides first */

swap_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4		// INC_X * SIZE * 2
	lsl	INC_Y, INC_Y, #4		// INC_Y * SIZE * 2
#else
	lsl	INC_X, INC_X, #3		// INC_X * SIZE * 2
	lsl	INC_Y, INC_Y, #3		// INC_Y * SIZE * 2
#endif

#else

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		// INC_X * SIZE
	lsl	INC_Y, INC_Y, #3		// INC_Y * SIZE
#else
	lsl	INC_X, INC_X, #2		// INC_X * SIZE
	lsl	INC_Y, INC_Y, #2		// INC_Y * SIZE
#endif

#endif


	// Same reasoning as on the contiguous path: V is stale after ASRS
	// (it comes from CMP INC_X/INC_Y, #1), so branch on Z alone.
	asrs	I, N, #2			// I = N / 4
	beq	swap_kernel_S1

	.align 5

swap_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	bne	swap_kernel_S4

swap_kernel_S1:

	ands	I, N, #3			// I = N % 4 leftover elements
	beq	swap_kernel_L999

swap_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	swap_kernel_S10


swap_kernel_L999:

	mov	r0, #0				// set return value

	sub	sp, fp, #8			// undo the prologue: sp back to the saved {r4, fp}
	pop	{r4,fp}
	bx	lr

	EPILOGUE
