/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* Packing ("ncopy") routine, AArch64, GNU as syntax, run through the C preprocessor.
*
* Inputs (AAPCS64 integer argument registers):
*   x0 = M    number of 8-byte elements copied from each source vector
*   x1 = N    number of source vectors (spaced LDA elements apart)
*   x2 = A00  source base pointer
*   x3 = LDA  distance between consecutive source vectors, in elements
*             (scaled to bytes with a shift by 3 on entry)
*   x4 = B00  destination pointer, written strictly sequentially
* Output:
*   x0 = 0
*
* Layout produced: the N source vectors are consumed in panels of 8, then (for the
* remainder) one panel of 4, one of 2 and one of 1. Inside a panel of width W the
* destination receives, for each of the M positions in turn, the W elements found
* at that position in the W vectors, stored contiguously.
* NOTE(review): this matches packing columns of a column-major matrix; confirm
* against the caller.
*
* Clobbers: x0, x2-x14, v0-v31 (d8-d15 are saved and restored), flags.
*
* The register names below are preprocessor macros, so the bare tokens M, N, I and J
* must not be used for anything else in this file.
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define	M	x0
#define	N	x1
#define	A00	x2
#define	LDA	x3
#define	B00	x4

#define	A01	x5
#define	A02	x6
#define	A03	x7
#define	A04	x8
#define	A05	x9
#define	A06	x10
#define	A07	x11
#define	A08	x12

#define	I	x13
#define	J	x14

/* TEMP1 and TEMP2 are currently unused. */
#define	TEMP1	x15
#define	TEMP2	x16

/* Only referenced by the disabled prefetch hints inside the COPY macros. */
#define A_PREFETCH	2560

/* Stack frame: four 16-byte slots holding d8-d15. */
#define FRAME_SIZE	(4 * 16)

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * SAVE_REGS / RESTORE_REGS
 *
 * The routine only uses x0-x14 and v0-v31. Of those, AAPCS64 makes the low 64 bits
 * of v8-v15 callee-saved, so d8-d15 are the only registers that need spilling.
 * x18 (platform register) and x19-x28 are never modified and are deliberately left
 * untouched; in particular x18 is never written.
 * sp stays 16-byte aligned because FRAME_SIZE is a multiple of 16.
 */
.macro SAVE_REGS
	sub	sp, sp, #FRAME_SIZE
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
.endm

.macro RESTORE_REGS
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	add	sp, sp, #FRAME_SIZE
.endm

/*************************************************************************************/

/*
 * COPY8x8: 8 positions from each of 8 source vectors (A01..A08) -> 512 bytes at B00.
 * Implemented as two 4-position halves. Advances A01..A08 by 64 bytes each and
 * B00 by 512 bytes.
 */
.macro COPY8x8
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	COPY4x8
	COPY4x8
.endm

/*
 * COPY4x8: 4 positions from each of 8 source vectors -> 256 bytes at B00.
 * Each ldp fetches 4 consecutive elements of one vector; the ins instructions
 * scatter them so that v16-v19 hold position 0 of all 8 vectors, v20-v23
 * position 1, v24-v27 position 2 and v28-v31 position 3.
 * Advances A01..A08 by 32 bytes each and B00 by 256 bytes.
 * Clobbers v0-v31 (including q8-q15, whose low halves are saved by SAVE_REGS).
 */
.macro COPY4x8
	ldp	q0, q1, [A01], #32
	ins	v16.d[0], v0.d[0]
	ins	v20.d[0], v0.d[1]
	ins	v24.d[0], v1.d[0]
	ins	v28.d[0], v1.d[1]

	ldp	q2, q3, [A02], #32
	ins	v16.d[1], v2.d[0]
	ins	v20.d[1], v2.d[1]
	ins	v24.d[1], v3.d[0]
	ins	v28.d[1], v3.d[1]

	ldp	q4, q5, [A03], #32
	ins	v17.d[0], v4.d[0]
	ins	v21.d[0], v4.d[1]
	ins	v25.d[0], v5.d[0]
	ins	v29.d[0], v5.d[1]

	ldp	q6, q7, [A04], #32
	ins	v17.d[1], v6.d[0]
	ins	v21.d[1], v6.d[1]
	ins	v25.d[1], v7.d[0]
	ins	v29.d[1], v7.d[1]

	ldp	q8, q9, [A05], #32
	ins	v18.d[0], v8.d[0]
	ins	v22.d[0], v8.d[1]
	ins	v26.d[0], v9.d[0]
	ins	v30.d[0], v9.d[1]

	ldp	q10, q11, [A06], #32
	ins	v18.d[1], v10.d[0]
	ins	v22.d[1], v10.d[1]
	ins	v26.d[1], v11.d[0]
	ins	v30.d[1], v11.d[1]

	ldp	q12, q13, [A07], #32
	ins	v19.d[0], v12.d[0]
	ins	v23.d[0], v12.d[1]
	ins	v27.d[0], v13.d[0]
	ins	v31.d[0], v13.d[1]

	ldp	q14, q15, [A08], #32
	ins	v19.d[1], v14.d[0]
	ins	v23.d[1], v14.d[1]
	ins	v27.d[1], v15.d[0]
	ins	v31.d[1], v15.d[1]

	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [B00]
	add	B00, B00, #64

	st1	{v20.2d, v21.2d, v22.2d, v23.2d}, [B00]
	add	B00, B00, #64

	st1	{v24.2d, v25.2d, v26.2d, v27.2d}, [B00]
	add	B00, B00, #64

	st1	{v28.2d, v29.2d, v30.2d, v31.2d}, [B00]
	add	B00, B00, #64
.endm

/*
 * COPY1x8: 1 position from each of 8 source vectors -> 64 bytes at B00.
 * Advances A01..A08 by 8 bytes each and B00 by 64 bytes. Clobbers v0-v7.
 */
.macro COPY1x8
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8

	st1	{v0.1d, v1.1d, v2.1d, v3.1d}, [B00]
	add	B00, B00, #32
	st1	{v4.1d, v5.1d, v6.1d, v7.1d}, [B00]
	add	B00, B00, #32

.endm


/*************************************************************************************/

/*
 * COPY8x4: 8 positions from each of 4 source vectors (A01..A04) -> 256 bytes at B00.
 * Done as two groups of 4 positions; the first group goes through v8-v15, the
 * second through v24-v31.
 * Advances A01..A04 by 64 bytes each and B00 by 256 bytes.
 * Clobbers v0-v31.
 */
.macro COPY8x4
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	ins	v8.d[0], v0.d[0]
	ins	v10.d[0], v0.d[1]
	ins	v12.d[0], v1.d[0]
	ins	v14.d[0], v1.d[1]

	ldp	q2, q3, [A02], #32
	ins	v8.d[1], v2.d[0]
	ins	v10.d[1], v2.d[1]
	ins	v12.d[1], v3.d[0]
	ins	v14.d[1], v3.d[1]

	ldp	q4, q5, [A03], #32
	ins	v9.d[0], v4.d[0]
	ins	v11.d[0], v4.d[1]
	ins	v13.d[0], v5.d[0]
	ins	v15.d[0], v5.d[1]

	ldp	q6, q7, [A04], #32
	ins	v9.d[1], v6.d[0]
	ins	v11.d[1], v6.d[1]
	ins	v13.d[1], v7.d[0]
	ins	v15.d[1], v7.d[1]

	st1	{v8.2d, v9.2d, v10.2d, v11.2d}, [B00]
	add	B00, B00, #64

	st1	{v12.2d, v13.2d, v14.2d, v15.2d}, [B00]
	add	B00, B00, #64

	ldp	q16, q17, [A01], #32
	ins	v24.d[0], v16.d[0]
	ins	v26.d[0], v16.d[1]
	ins	v28.d[0], v17.d[0]
	ins	v30.d[0], v17.d[1]

	ldp	q18, q19, [A02], #32
	ins	v24.d[1], v18.d[0]
	ins	v26.d[1], v18.d[1]
	ins	v28.d[1], v19.d[0]
	ins	v30.d[1], v19.d[1]

	ldp	q20, q21, [A03], #32
	ins	v25.d[0], v20.d[0]
	ins	v27.d[0], v20.d[1]
	ins	v29.d[0], v21.d[0]
	ins	v31.d[0], v21.d[1]

	ldp	q22, q23, [A04], #32
	ins	v25.d[1], v22.d[0]
	ins	v27.d[1], v22.d[1]
	ins	v29.d[1], v23.d[0]
	ins	v31.d[1], v23.d[1]

	st1	{v24.2d, v25.2d, v26.2d, v27.2d}, [B00]
	add	B00, B00, #64

	st1	{v28.2d, v29.2d, v30.2d, v31.2d}, [B00]
	add	B00, B00, #64
.endm

/*
 * COPY1x4: 1 position from each of 4 source vectors -> 32 bytes at B00.
 * Advances A01..A04 by 8 bytes each and B00 by 32 bytes. Clobbers v0-v3.
 */
.macro COPY1x4
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8

	st1	{v0.1d, v1.1d, v2.1d, v3.1d}, [B00]
	add	B00, B00, #32
.endm

/*************************************************************************************/

/*
 * COPY8x2: 8 positions from each of 2 source vectors (A01, A02) -> 128 bytes at B00.
 * v8..v15 each end up holding one position: lane 0 from A01, lane 1 from A02.
 * Advances A01 and A02 by 64 bytes each and B00 by 128 bytes.
 * Clobbers v0-v15.
 */
.macro COPY8x2
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A01], #32

	ins	v8.d[0], v0.d[0]
	ins	v9.d[0], v0.d[1]
	ins	v10.d[0], v1.d[0]
	ins	v11.d[0], v1.d[1]
	ins	v12.d[0], v2.d[0]
	ins	v13.d[0], v2.d[1]
	ins	v14.d[0], v3.d[0]
	ins	v15.d[0], v3.d[1]

	ldp	q4, q5, [A02], #32
	ldp	q6, q7, [A02], #32

	ins	v8.d[1], v4.d[0]
	ins	v9.d[1], v4.d[1]
	ins	v10.d[1], v5.d[0]
	ins	v11.d[1], v5.d[1]
	ins	v12.d[1], v6.d[0]
	ins	v13.d[1], v6.d[1]
	ins	v14.d[1], v7.d[0]
	ins	v15.d[1], v7.d[1]

	st1	{v8.2d, v9.2d, v10.2d, v11.2d}, [B00]
	add	B00, B00, #64
	st1	{v12.2d, v13.2d, v14.2d, v15.2d}, [B00]
	add	B00, B00, #64
.endm


/*
 * COPY1x2: 1 position from each of 2 source vectors -> 16 bytes at B00.
 * Advances A01 and A02 by 8 bytes each and B00 by 16 bytes. Clobbers v0, v1.
 */
.macro COPY1x2
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ldr	d0, [A01], #8
	ldr	d1, [A02], #8

	stp	d0, d1, [B00]
	add	B00, B00, #16
.endm

/*************************************************************************************/

/*
 * COPY8x1: straight copy of 8 consecutive elements (64 bytes) from A01 to B00.
 * Advances both pointers by 64 bytes. Clobbers v0-v3.
 */
.macro COPY8x1
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A01], #32
	stp	q0, q1, [B00], #32
	stp	q2, q3, [B00], #32
.endm


/*
 * COPY1x1: copy a single element (8 bytes) from A01 to B00.
 * Advances both pointers by 8 bytes. Clobbers v0.
 */
.macro COPY1x1
	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldr	d0, [A01], #8
	str	d0, [B00], #8
.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	// NOTE(review): PROLOGUE comes from common.h; if it emits the entry label,
	// this alignment pads after the label rather than aligning it -- confirm.
	.align 5

	SAVE_REGS

	lsl	LDA, LDA, #3					// LDA = LDA * SIZE

/*
 * Panels of 8 source vectors: J = N / 8 iterations.
 */
.Ldgemm_ncopy_L8_BEGIN:

	asr	J, N, #3					// J = N / 8
	cmp 	J, #0
	ble	.Ldgemm_ncopy_L4_BEGIN

.Ldgemm_ncopy_L8_M8_BEGIN:

	// Set up one pointer per source vector and step A00 past the panel.
	mov	A01, A00
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A05, A04, LDA
	add	A06, A05, LDA
	add	A07, A06, LDA
	add	A08, A07, LDA
	add	A00, A08, LDA


	asr	I, M, #3					// I = M / 8
	cmp	I, #0
	ble	.Ldgemm_ncopy_L8_M8_40

.Ldgemm_ncopy_L8_M8_20:

	COPY8x8

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L8_M8_20


.Ldgemm_ncopy_L8_M8_40:

	and	I, M , #7					// I = M % 8 (leftover positions)
	cmp	I, #0
	ble	.Ldgemm_ncopy_L8_M8_END

.Ldgemm_ncopy_L8_M8_60:

	COPY1x8

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L8_M8_60


.Ldgemm_ncopy_L8_M8_END:

	subs	J , J, #1					// j--
	bne	.Ldgemm_ncopy_L8_M8_BEGIN

/*********************************************************************************************/

/*
 * Remainder: at most one panel of 4 source vectors (bit 2 of N).
 */
.Ldgemm_ncopy_L4_BEGIN:

	tst	N, #7						// no vectors left after the 8-wide panels?
	beq	.Ldgemm_ncopy_L999

	tst	N, #4
	beq	.Ldgemm_ncopy_L2_BEGIN

.Ldgemm_ncopy_L4_M8_BEGIN:

	mov	A01, A00
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A00, A04, LDA

	asr	I, M, #3					// I = M / 8
	cmp	I, #0
	ble	.Ldgemm_ncopy_L4_M8_40

.Ldgemm_ncopy_L4_M8_20:

	COPY8x4

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L4_M8_20


.Ldgemm_ncopy_L4_M8_40:

	and	I, M , #7					// I = M % 8
	cmp	I, #0
	ble	.Ldgemm_ncopy_L4_M8_END

.Ldgemm_ncopy_L4_M8_60:

	COPY1x4

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L4_M8_60


.Ldgemm_ncopy_L4_M8_END:


/*********************************************************************************************/

/*
 * Remainder: at most one panel of 2 source vectors (bit 1 of N).
 */
.Ldgemm_ncopy_L2_BEGIN:

	tst	N, #3						// nothing left below the 4-wide panel?
	beq	.Ldgemm_ncopy_L999

	tst	N, #2
	beq	.Ldgemm_ncopy_L1_BEGIN

.Ldgemm_ncopy_L2_M8_BEGIN:
	mov	A01, A00
	add	A02, A01, LDA
	add	A00, A02, LDA

	asr	I, M, #3					// I = M / 8
	cmp 	I, #0
	ble	.Ldgemm_ncopy_L2_M8_40

.Ldgemm_ncopy_L2_M8_20:

	COPY8x2

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L2_M8_20


.Ldgemm_ncopy_L2_M8_40:

	and	I, M , #7					// I = M % 8
	cmp	I, #0
	ble	.Ldgemm_ncopy_L2_M8_END

.Ldgemm_ncopy_L2_M8_60:

	COPY1x2

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L2_M8_60


.Ldgemm_ncopy_L2_M8_END:


/*********************************************************************************************/

/*
 * Remainder: at most one single source vector (bit 0 of N).
 */
.Ldgemm_ncopy_L1_BEGIN:

	tst	N, #1
	beq	.Ldgemm_ncopy_L999


.Ldgemm_ncopy_L1_M8_BEGIN:

	mov	A01, A00

	asr	I, M, #3					// I = M / 8
	cmp	I, #0
	ble	.Ldgemm_ncopy_L1_M8_40

.Ldgemm_ncopy_L1_M8_20:

	COPY8x1

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L1_M8_20


.Ldgemm_ncopy_L1_M8_40:

	and	I, M , #7					// I = M % 8
	cmp	I, #0
	ble	.Ldgemm_ncopy_L1_M8_END

.Ldgemm_ncopy_L1_M8_60:

	COPY1x1

	subs	I , I , #1
	bne	.Ldgemm_ncopy_L1_M8_60


.Ldgemm_ncopy_L1_M8_END:

.Ldgemm_ncopy_L999:

	mov	x0, #0						// return 0
	RESTORE_REGS
	ret

	EPILOGUE