/*****************************************************************************
* Copyright (C) 2000-2001 Andre McCurdy  <armccurdy@yahoo.co.uk>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
*****************************************************************************
*
* Notes:
*
*
*****************************************************************************
*
* $Id: imdct_l_arm.S,v 1.7 2001/03/25 20:03:34 rob Rel $
*
*  2001/03/24:  Andre McCurdy <armccurdy@yahoo.co.uk>
*    - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
*
*  2000/09/20:  Robert Leslie <rob@mars.org>
*    - Added a global symbol with leading underscore per suggestion of
*      Simon Burge to support linking with the a.out format.
*
*  2000/09/15:  Robert Leslie <rob@mars.org>
*    - Fixed a small bug where flags were changed before a conditional branch.
*
*  2000/09/15:  Andre McCurdy <armccurdy@yahoo.co.uk>
*    - Applied Nicolas Pitre's rounding optimisation in all remaining places.
*
*  2000/09/09:  Nicolas Pitre <nico@cam.org>
*    - Optimized rounding + scaling operations.
*
*  2000/08/09:  Andre McCurdy <armccurdy@yahoo.co.uk>
*    - Original created.
*
****************************************************************************/


/*
   III_imdct_l / _III_imdct_l

   Syntax / target: GNU as, ARM (32-bit) state, run through the C
   preprocessor.  Because of the preprocessor, comments introduced by the
   assembler comment character must not contain apostrophes or quotes.

   The function computes 18 results (x0..x8 and x18..x26) as sums of
   products of the 18 inputs with the Knn constants, derives the other 18
   by symmetry, and multiplies the results by the WLnn window constants
   according to the block type.  Every 64 bit sum or product is reduced to
   a 32 bit result by taking bits [59..28], rounded using bit 27.

   On entry:

      r0 = pointer to 18 element input array
      r1 = pointer to 36 element output array
      r2 = windowing block type

   Block type handling:

      BLOCK_MODE_STOP  : x[0..5] are zeroed, x[6..11] use the short window
                         constants, x[12..17] are stored unwindowed,
                         x[18..35] are handled as for a normal block.
      BLOCK_MODE_START : x[0..17] are handled as for a normal block,
                         x[18..23] are left unwindowed, x[24..29] use the
                         short window constants, x[30..35] are zeroed.
      any other value  : handled as a normal block.

   On exit:

      All 36 words of the output array have been written.
      r4 - r11 and sp are restored.  r0 - r3, r12 and the flags are
      destroyed.  No return value is produced.


   Stack frame created during execution of the function:

   Initial   Holds:
   Stack
   pointer
   minus:

       0
       4     lr
       8     r11
      12     r10
      16     r9
      20     r8
      24     r7
      28     r6
      32     r5
      36     r4

      40     r2 : windowing block type

      44     ct00 high
      48     ct00 low
      52     ct01 high
      56     ct01 low
      60     ct04 high
      64     ct04 low
      68     ct06 high
      72     ct06 low
      76     ct05 high
      80     ct05 low
      84     ct03 high
      88     ct03 low
      92    -ct05 high
      96    -ct05 low
     100    -ct07 high
     104    -ct07 low
     108     ct07 high
     112     ct07 low
     116     ct02 high
     120     ct02 low
*/

#define BLOCK_MODE_NORMAL   0
#define BLOCK_MODE_START    1
#define BLOCK_MODE_STOP     3


/* Byte offsets of the 18 input words */

#define X0   0x00
#define X1   0x04
#define X2   0x08
#define X3   0x0C
#define X4   0x10
#define X5   0x14
#define X6   0x18
#define X7   0x1c
#define X8   0x20
#define X9   0x24
#define X10  0x28
#define X11  0x2c
#define X12  0x30
#define X13  0x34
#define X14  0x38
#define X15  0x3c
#define X16  0x40
#define X17  0x44

/* Byte offsets of the 36 output words */

#define x0   0x00
#define x1   0x04
#define x2   0x08
#define x3   0x0C
#define x4   0x10
#define x5   0x14
#define x6   0x18
#define x7   0x1c
#define x8   0x20
#define x9   0x24
#define x10  0x28
#define x11  0x2c
#define x12  0x30
#define x13  0x34
#define x14  0x38
#define x15  0x3c
#define x16  0x40
#define x17  0x44
#define x18  0x48
#define x19  0x4c
#define x20  0x50
#define x21  0x54
#define x22  0x58
#define x23  0x5c
#define x24  0x60
#define x25  0x64
#define x26  0x68
#define x27  0x6c
#define x28  0x70
#define x29  0x74
#define x30  0x78
#define x31  0x7c
#define x32  0x80
#define x33  0x84
#define x34  0x88
#define x35  0x8c

/* IMDCT multiplier constants */

#define K00  0x0ffc19fd
#define K01  0x00b2aa3e
#define K02  0x0fdcf549
#define K03  0x0216a2a2
#define K04  0x0f9ee890
#define K05  0x03768962
#define K06  0x0f426cb5
#define K07  0x04cfb0e2
#define K08  0x0ec835e8
#define K09  0x061f78aa
#define K10  0x0e313245
#define K11  0x07635284
#define K12  0x0d7e8807
#define K13  0x0898c779
#define K14  0x0cb19346
#define K15  0x09bd7ca0
#define K16  0x0bcbe352
#define K17  0x0acf37ad

#define minus_K02 0xf0230ab7

/* Long block window constants */

#define WL0  0x00b2aa3e
#define WL1  0x0216a2a2
#define WL2  0x03768962
#define WL3  0x04cfb0e2
#define WL4  0x061f78aa
#define WL5  0x07635284
#define WL6  0x0898c779
#define WL7  0x09bd7ca0
#define WL8  0x0acf37ad
#define WL9  0x0bcbe352
#define WL10 0x0cb19346
#define WL11 0x0d7e8807
#define WL12 0x0e313245
#define WL13 0x0ec835e8
#define WL14 0x0f426cb5
#define WL15 0x0f9ee890
#define WL16 0x0fdcf549
#define WL17 0x0ffc19fd


@*****************************************************************************


    .text
    .align

    .global III_imdct_l
    .global _III_imdct_l

III_imdct_l:
_III_imdct_l:

    stmdb   sp!, { r2, r4 - r11, lr }   @ all callee saved regs, plus arg3

    ldr     r4, =K08                    @ r4 =  K08
    ldr     r5, =K09                    @ r5 =  K09
    ldr     r8, [r0, #X4]               @ r8 =  X4
    ldr     r9, [r0, #X13]              @ r9 =  X13
    rsb     r6, r4, #0                  @ r6 = -K08
    rsb     r7, r5, #0                  @ r7 = -K09

    smull   r2, r3, r4, r8              @ r2..r3  = (X4 * K08)
    smlal   r2, r3, r5, r9              @ r2..r3  = (X4 * K08) + (X13 *  K09) = ct01

    smull   r10, lr, r8, r5             @ r10..lr = (X4 * K09)
    smlal   r10, lr, r9, r6             @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00

    ldr     r8, [r0, #X7]               @ r8 = X7
    ldr     r9, [r0, #X16]              @ r9 = X16

    stmdb   sp!, { r2, r3, r10, lr }    @ stack ct00_h, ct00_l, ct01_h, ct01_l

    add     r8, r8, r9                  @ r8 = (X7 + X16)
    ldr     r9, [r0, #X1]               @ r9 = X1

    smlal   r2, r3, r6, r8              @ r2..r3  = ct01 + ((X7 + X16) * -K08)
    smlal   r2, r3, r7, r9              @ r2..r3 += (X1 * -K09)

    ldr     r7, [r0, #X10]              @ r7 = X10

    rsbs    r10, r10, #0
    rsc     lr, lr, #0                  @ r10..lr  = -ct00

    smlal   r2, r3, r5, r7              @ r2..r3  += (X10 * K09) = ct06

    smlal   r10, lr, r9, r6             @ r10..lr  = -ct00 + ( X1 * -K08)
    smlal   r10, lr, r8, r5             @ r10..lr += ((X7 + X16) * K09)
    smlal   r10, lr, r7, r4             @ r10..lr += ( X10 * K08) = ct04

    stmdb   sp!, { r2, r3, r10, lr }    @ stack ct04_h, ct04_l, ct06_h, ct06_l

    @----

    ldr     r7, [r0, #X0]
    ldr     r8, [r0, #X11]
    ldr     r9, [r0, #X12]
    sub     r7, r7, r8
    sub     r7, r7, r9                  @ r7 = (X0 - X11 - X12) = ct14

    ldr     r9,  [r0, #X3]
    ldr     r8,  [r0, #X8]
    ldr     r11, [r0, #X15]
    sub     r8, r8, r9
    add     r8, r8, r11                 @ r8 = (X8 - X3 + X15) = ct16

    add     r11, r7, r8                 @ r11 = ct14 + ct16 = ct18

    smlal   r2, r3, r6, r11             @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)

    ldr     r6,  [r0, #X2]
    ldr     r9,  [r0, #X9]
    ldr     r12, [r0, #X14]
    sub     r6, r6, r9
    sub     r6, r6, r12                 @ r6 = (X2 - X9 - X14) = ct15

    ldr     r9,  [r0, #X5]
    ldr     r12, [r0, #X6]
    sub     r9, r9, r12
    ldr     r12, [r0, #X17]
    sub     r9, r9, r12                 @ r9 = (X5 - X6 - X17) = ct17

    add     r12, r9, r6                 @ r12 = ct15 + ct17 = ct19

    smlal   r2, r3, r5, r12             @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)

    smlal   r10, lr, r11, r5            @ r10..lr = ct04 + (ct18 * K09)
    smlal   r10, lr, r12, r4            @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)

    movs    r2, r2, lsr #28             @ carry = bit 27 (used for rounding)
    adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
    str     r2, [r1, #x22]              @ store result x22

    movs    r10, r10, lsr #28
    adc     r10, r10, lr, lsl #4        @ r10 = bits[59..28] of r10..lr
    str     r10, [r1, #x4]              @ store result x4

    @----

    ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)

    @ r2..r3 = ct06
    @ r4..r5 = ct04
    @ r6     = ct15
    @ r7     = ct14
    @ r8     = ct16
    @ r9     = ct17
    @ r10    = .
    @ r11    = .
    @ r12    = .
    @ lr     = .

    ldr     r10, =K03                   @ r10 = K03
    ldr     lr,  =K15                   @ lr  = K15

    smlal   r2, r3, r10, r7             @ r2..r3 = ct06 + (ct14 * K03)
    smlal   r4, r5,  lr, r7             @ r4..r5 = ct04 + (ct14 * K15)

    ldr     r12, =K14                   @ r12 =  K14
    rsb     r10, r10, #0                @ r10 = -K03

    smlal   r2, r3,  lr, r6             @ r2..r3 += (ct15 *  K15)
    smlal   r4, r5, r10, r6             @ r4..r5 += (ct15 * -K03)
    smlal   r2, r3, r12, r8             @ r2..r3 += (ct16 *  K14)

    ldr     r11, =minus_K02             @ r11 = -K02
    rsb     r12, r12, #0                @ r12 = -K14

    smlal   r4, r5, r12, r9             @ r4..r5 += (ct17 * -K14)
    smlal   r2, r3, r11, r9             @ r2..r3 += (ct17 * -K02)
    smlal   r4, r5, r11, r8             @ r4..r5 += (ct16 * -K02)

    movs    r2, r2, lsr #28
    adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
    str     r2, [r1, #x7]               @ store result x7

    movs    r4, r4, lsr #28
    adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5
    str     r4, [r1, #x1]               @ store result x1

    @----

    ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)

    @ r2..r3 = ct06
    @ r4..r5 = ct04
    @ r6     = ct15
    @ r7     = ct14
    @ r8     = ct16
    @ r9     = ct17
    @ r10    = -K03
    @ r11    = -K02
    @ r12    = -K14
    @ lr     =  K15

    rsbs    r2, r2, #0
    rsc     r3, r3, #0                  @ r2..r3 = -ct06

    smlal   r2, r3, r12, r7             @ r2..r3  = -ct06 + (ct14 * -K14)
    smlal   r2, r3, r10, r8             @ r2..r3 += (ct16 * -K03)

    smlal   r4, r5, r12, r6             @ r4..r5  =  ct04 + (ct15 * -K14)
    smlal   r4, r5, r10, r9             @ r4..r5 += (ct17 * -K03)
    smlal   r4, r5,  lr, r8             @ r4..r5 += (ct16 *  K15)
    smlal   r4, r5, r11, r7             @ r4..r5 += (ct14 * -K02)

    rsb     lr, lr, #0                  @ lr  = -K15
    rsb     r11, r11, #0                @ r11 =  K02

    smlal   r2, r3,  lr, r9             @ r2..r3 += (ct17 * -K15)
    smlal   r2, r3, r11, r6             @ r2..r3 += (ct15 *  K02)

    movs    r4, r4, lsr #28
    adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5
    str     r4, [r1, #x25]              @ store result x25

    movs    r2, r2, lsr #28
    adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
    str     r2, [r1, #x19]              @ store result x19

    @----

    ldr     r2, [sp, #16]               @ r2 = ct01_l
    ldr     r3, [sp, #20]               @ r3 = ct01_h

    ldr     r6, [r0, #X1]
    ldr     r8, [r0, #X7]
    ldr     r9, [r0, #X10]
    ldr     r7, [r0, #X16]

    rsbs    r2, r2, #0
    rsc     r3, r3, #0                  @ r2..r3 = -ct01

    mov     r4, r2
    mov     r5, r3                      @ r4..r5 = -ct01

    @ r2..r3 = -ct01
    @ r4..r5 = -ct01
    @ r6     =  X1
    @ r7     =  X16
    @ r8     =  X7
    @ r9     =  X10
    @ r10    = -K03
    @ r11    =  K02
    @ r12    = -K14
    @ lr     = -K15

    smlal   r4, r5, r12, r7             @ r4..r5  = -ct01 + (X16 * -K14)
    smlal   r2, r3,  lr, r9             @ r2..r3  = -ct01 + (X10 * -K15)

    smlal   r4, r5, r10, r8             @ r4..r5 += (X7  * -K03)
    smlal   r2, r3, r10, r7             @ r2..r3 += (X16 * -K03)

    smlal   r4, r5, r11, r9             @ r4..r5 += (X10 *  K02)
    smlal   r2, r3, r12, r8             @ r2..r3 += (X7  * -K14)

    rsb     lr, lr, #0                  @ lr  =  K15
    rsb     r11, r11, #0                @ r11 = -K02

    smlal   r4, r5,  lr, r6             @ r4..r5 += (X1  *  K15) = ct05
    smlal   r2, r3, r11, r6             @ r2..r3 += (X1  * -K02) = ct03

    stmdb   sp!, { r2, r3, r4, r5 }     @ stack ct05_h, ct05_l, ct03_h, ct03_l

    rsbs    r4, r4, #0
    rsc     r5, r5, #0                  @ r4..r5 = -ct05

    stmdb   sp!, { r4, r5 }             @ stack -ct05_h, -ct05_l

    ldr     r2, [sp, #48]               @ r2 = ct00_l
    ldr     r3, [sp, #52]               @ r3 = ct00_h

    rsb     r10, r10, #0                @ r10 = K03

    rsbs    r4, r2, #0
    rsc     r5, r3, #0                  @ r4..r5 = -ct00

    @ r2..r3 =  ct00
    @ r4..r5 = -ct00
    @ r6     =  X1
    @ r7     =  X16
    @ r8     =  X7
    @ r9     =  X10
    @ r10    =  K03
    @ r11    = -K02
    @ r12    = -K14
    @ lr     =  K15

    smlal   r4, r5, r10, r6             @ r4..r5  = -ct00 + (X1  * K03)
    smlal   r2, r3, r10, r9             @ r2..r3  =  ct00 + (X10 * K03)

    smlal   r4, r5, r12, r9             @ r4..r5 += (X10 * -K14)
    smlal   r2, r3, r12, r6             @ r2..r3 += (X1  * -K14)

    smlal   r4, r5, r11, r7             @ r4..r5 += (X16 * -K02)
    smlal   r4, r5,  lr, r8             @ r4..r5 += (X7  *  K15) = ct07

    rsb     lr, lr, #0                  @ lr  = -K15
    rsb     r11, r11, #0                @ r11 =  K02

    smlal   r2, r3, r11, r8             @ r2..r3 += (X7  *  K02)
    smlal   r2, r3,  lr, r7             @ r2..r3 += (X16 * -K15) = ct02

    rsbs    r6, r4, #0
    rsc     r7, r5, #0                  @ r6..r7 = -ct07

    stmdb   sp!, { r2 - r7 }            @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l


    @----
    @ Table driven part: each pass of the loop consumes one 13 word element
    @ of imdct36_long_karray and produces one output word.
    @----

    adr     r2, imdct36_long_karray     @ r2 = base address of Knn array (pc relative, PIC safe)


loop:
    ldr     r12, [r0, #X0]

    ldmia   r2!, { r5 - r11 }           @ first 7 words from Karray element

    smull   r3, r4,  r5, r12            @ sum =  (Kxx * X0)
    ldr     r12, [r0, #X2]
    ldr     r5,  [r0, #X3]
    smlal   r3, r4,  r6, r12            @ sum += (Kxx * X2)
    ldr     r12, [r0, #X5]
    ldr     r6,  [r0, #X6]
    smlal   r3, r4,  r7, r5             @ sum += (Kxx * X3)
    smlal   r3, r4,  r8, r12            @ sum += (Kxx * X5)
    ldr     r12, [r0, #X8]
    ldr     r5,  [r0, #X9]
    smlal   r3, r4,  r9, r6             @ sum += (Kxx * X6)
    smlal   r3, r4, r10, r12            @ sum += (Kxx * X8)
    smlal   r3, r4, r11, r5             @ sum += (Kxx * X9)

    ldmia   r2!, { r5 - r10 }           @ final 6 words from Karray element

    ldr     r11, [r0, #X11]
    ldr     r12, [r0, #X12]
    smlal   r3, r4,  r5, r11            @ sum += (Kxx * X11)
    ldr     r11, [r0, #X14]
    ldr     r5,  [r0, #X15]
    smlal   r3, r4,  r6, r12            @ sum += (Kxx * X12)
    smlal   r3, r4,  r7, r11            @ sum += (Kxx * X14)
    ldr     r11, [r0, #X17]
    smlal   r3, r4,  r8, r5             @ sum += (Kxx * X15)
    smlal   r3, r4,  r9, r11            @ sum += (Kxx * X17)

    add     r5, sp, r10, lsr #16        @ create index back into stack for required ctxx

    ldmia   r5, { r6, r7 }              @ r6..r7 = ctxx

    mov     r8, r10, lsl #16            @ push ctxx index off the top end

    adds    r3, r3, r6                  @ add low words
    adc     r4, r4, r7                  @ add high words, with carry
    movs    r3, r3, lsr #28
    adc     r3, r3, r4, lsl #4          @ r3 = bits[59..28] of r3..r4

    str     r3, [r1, r8, lsr #24]       @ push completion flag off the bottom end

    movs    r8, r8, lsl #8              @ push result location index off the top end
    beq     loop                        @ loop back if completion flag not set
    b       imdct_l_windowing           @ branch to windowing stage if looping finished


    @----
    @ Each element is 12 multipliers, applied in order to
    @ X0, X2, X3, X5, X6, X8, X9, X11, X12, X14, X15, X17,
    @ followed by one control word:
    @
    @   bits [31..16] = byte offset from sp of the ctNN pair added to the sum
    @                   (0x00 ct02, 0x08 ct07, 0x10 -ct07,
    @                    0x18 -ct05, 0x20 ct03, 0x28 ct05)
    @   bits [15..8]  = byte offset of the result in the output array
    @   bits [7..0]   = non zero in the final element only (ends the loop)
    @----

imdct36_long_karray:

    .word   K17, -K13,  K10, -K06, -K05,  K01, -K00,  K04, -K07,  K11,  K12, -K16, 0x00000000   @  ct02 -> x0
    .word   K13,  K07,  K16,  K01,  K10, -K05,  K04, -K11,  K00, -K17,  K06, -K12, 0x00200800   @  ct03 -> x2
    .word   K11,  K17,  K05,  K12, -K01,  K06, -K07,  K00, -K13,  K04, -K16,  K10, 0x00200c00   @  ct03 -> x3
    .word   K07,  K00, -K12,  K05, -K16, -K10,  K11, -K17,  K04,  K13,  K01,  K06, 0x00001400   @  ct02 -> x5
    .word   K05,  K10, -K00, -K17,  K07, -K13,  K12,  K06, -K16,  K01, -K11, -K04, 0x00181800   @ -ct05 -> x6
    .word   K01,  K05, -K07, -K11,  K13,  K17, -K16, -K12,  K10,  K06, -K04, -K00, 0x00102000   @ -ct07 -> x8
    .word  -K16,  K12, -K11,  K07,  K04, -K00, -K01,  K05, -K06,  K10,  K13, -K17, 0x00284800   @  ct05 -> x18
    .word  -K12,  K06,  K17, -K00, -K11,  K04,  K05, -K10,  K01,  K16, -K07, -K13, 0x00085000   @  ct07 -> x20
    .word  -K10,  K16,  K04, -K13, -K00,  K07,  K06, -K01, -K12, -K05,  K17,  K11, 0x00105400   @ -ct07 -> x21
    .word  -K06, -K01,  K13,  K04,  K17, -K11, -K10, -K16, -K05,  K12,  K00,  K07, 0x00185c00   @ -ct05 -> x23
    .word  -K04, -K11, -K01,  K16,  K06,  K12,  K13, -K07, -K17, -K00, -K10, -K05, 0x00006000   @  ct02 -> x24
    .word  -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801   @  ct03 -> x26, last


    @----
    @-------------------------------------------------------------------------
    @----

imdct_l_windowing:

    ldr     r11, [sp, #80]              @ fetch function parameter 3 from out of the stack
    ldmia   r1!, { r0, r2 - r9 }        @ load 9 words from x0, update pointer

    @ r0     = x0
    @ r1     = &x[9]
    @ r2     = x1
    @ r3     = x2
    @ r4     = x3
    @ r5     = x4
    @ r6     = x5
    @ r7     = x6
    @ r8     = x7
    @ r9     = x8
    @ r10    = .
    @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
    @ r12    = .
    @ lr     = .

    cmp     r11, #BLOCK_MODE_STOP       @ setup flags
    rsb     r10, r0, #0                 @ r10 = -x0 (DONT change flags !!)
    beq     stop_block_x0_to_x17


    @ start and normal blocks are treated the same for x[0]..x[17]

normal_block_x0_to_x17:

    ldr     r12, =WL9                   @ r12 = window_l[9]

    rsb     r0, r9, #0                  @ r0  = -x8
    rsb     r9, r2, #0                  @ r9  = -x1
    rsb     r2, r8, #0                  @ r2  = -x7
    rsb     r8, r3, #0                  @ r8  = -x2
    rsb     r3, r7, #0                  @ r3  = -x6
    rsb     r7, r4, #0                  @ r7  = -x3
    rsb     r4, r6, #0                  @ r4  = -x5
    rsb     r6, r5, #0                  @ r6  = -x4

    @ r0     = -x8
    @ r1     = &x[9]
    @ r2     = -x7
    @ r3     = -x6
    @ r4     = -x5
    @ r5     = .
    @ r6     = -x4
    @ r7     = -x3
    @ r8     = -x2
    @ r9     = -x1
    @ r10    = -x0
    @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
    @ r12    = window_l[9]
    @ lr     = .

    smull   r5, lr, r12, r0             @ r5..lr = (window_l[9]  * (x[9]  == -x[8]))
    ldr     r12, =WL10                  @ r12 = window_l[10]
    movs    r5, r5, lsr #28
    adc     r0, r5, lr, lsl #4          @ r0 = bits[59..28] of windowed x9

    smull   r5, lr, r12, r2             @ r5..lr = (window_l[10] * (x[10] == -x[7]))
    ldr     r12, =WL11                  @ r12 = window_l[11]
    movs    r5, r5, lsr #28
    adc     r2, r5, lr, lsl #4          @ r2 = bits[59..28] of windowed x10

    smull   r5, lr, r12, r3             @ r5..lr = (window_l[11] * (x[11] == -x[6]))
    ldr     r12, =WL12                  @ r12 = window_l[12]
    movs    r5, r5, lsr #28
    adc     r3, r5, lr, lsl #4          @ r3 = bits[59..28] of windowed x11

    smull   r5, lr, r12, r4             @ r5..lr = (window_l[12] * (x[12] == -x[5]))
    ldr     r12, =WL13                  @ r12 = window_l[13]
    movs    r5, r5, lsr #28
    adc     r4, r5, lr, lsl #4          @ r4 = bits[59..28] of windowed x12

    smull   r5, lr, r12, r6             @ r5..lr = (window_l[13] * (x[13] == -x[4]))
    ldr     r12, =WL14                  @ r12 = window_l[14]
    movs    r5, r5, lsr #28
    adc     r6, r5, lr, lsl #4          @ r6 = bits[59..28] of windowed x13

    smull   r5, lr, r12, r7             @ r5..lr = (window_l[14] * (x[14] == -x[3]))
    ldr     r12, =WL15                  @ r12 = window_l[15]
    movs    r5, r5, lsr #28
    adc     r7, r5, lr, lsl #4          @ r7 = bits[59..28] of windowed x14

    smull   r5, lr, r12, r8             @ r5..lr = (window_l[15] * (x[15] == -x[2]))
    ldr     r12, =WL16                  @ r12 = window_l[16]
    movs    r5, r5, lsr #28
    adc     r8, r5, lr, lsl #4          @ r8 = bits[59..28] of windowed x15

    smull   r5, lr, r12, r9             @ r5..lr = (window_l[16] * (x[16] == -x[1]))
    ldr     r12, =WL17                  @ r12 = window_l[17]
    movs    r5, r5, lsr #28
    adc     r9, r5, lr, lsl #4          @ r9 = bits[59..28] of windowed x16

    smull   r5, lr, r12, r10            @ r5..lr = (window_l[17] * (x[17] == -x[0]))
    ldr     r12, =WL0                   @ r12 = window_l[0]
    movs    r5, r5, lsr #28
    adc     r10, r5, lr, lsl #4         @ r10 = bits[59..28] of windowed x17


    stmia   r1,  { r0, r2 - r4, r6 - r10 }      @ store windowed x[9] .. x[17]
    ldmdb   r1!, { r0, r2 - r9 }                @ load 9 words downto (and including) x0


    smull   r10, lr, r12, r0            @ r10..lr = (window_l[0] * x[0])
    ldr     r12, =WL1                   @ r12 = window_l[1]
    movs    r10, r10, lsr #28
    adc     r0, r10, lr, lsl #4         @ r0 = bits[59..28] of windowed x0

    smull   r10, lr, r12, r2            @ r10..lr = (window_l[1] * x[1])
    ldr     r12, =WL2                   @ r12 = window_l[2]
    movs    r10, r10, lsr #28
    adc     r2, r10, lr, lsl #4         @ r2 = bits[59..28] of windowed x1

    smull   r10, lr, r12, r3            @ r10..lr = (window_l[2] * x[2])
    ldr     r12, =WL3                   @ r12 = window_l[3]
    movs    r10, r10, lsr #28
    adc     r3, r10, lr, lsl #4         @ r3 = bits[59..28] of windowed x2

    smull   r10, lr, r12, r4            @ r10..lr = (window_l[3] * x[3])
    ldr     r12, =WL4                   @ r12 = window_l[4]
    movs    r10, r10, lsr #28
    adc     r4, r10, lr, lsl #4         @ r4 = bits[59..28] of windowed x3

    smull   r10, lr, r12, r5            @ r10..lr = (window_l[4] * x[4])
    ldr     r12, =WL5                   @ r12 = window_l[5]
    movs    r10, r10, lsr #28
    adc     r5, r10, lr, lsl #4         @ r5 = bits[59..28] of windowed x4

    smull   r10, lr, r12, r6            @ r10..lr = (window_l[5] * x[5])
    ldr     r12, =WL6                   @ r12 = window_l[6]
    movs    r10, r10, lsr #28
    adc     r6, r10, lr, lsl #4         @ r6 = bits[59..28] of windowed x5

    smull   r10, lr, r12, r7            @ r10..lr = (window_l[6] * x[6])
    ldr     r12, =WL7                   @ r12 = window_l[7]
    movs    r10, r10, lsr #28
    adc     r7, r10, lr, lsl #4         @ r7 = bits[59..28] of windowed x6

    smull   r10, lr, r12, r8            @ r10..lr = (window_l[7] * x[7])
    ldr     r12, =WL8                   @ r12 = window_l[8]
    movs    r10, r10, lsr #28
    adc     r8, r10, lr, lsl #4         @ r8 = bits[59..28] of windowed x7

    smull   r10, lr, r12, r9            @ r10..lr = (window_l[8] * x[8])
    movs    r10, r10, lsr #28
    adc     r9, r10, lr, lsl #4         @ r9 = bits[59..28] of windowed x8

    stmia   r1, { r0, r2 - r9 }         @ store windowed x[0] .. x[8]

    cmp     r11, #BLOCK_MODE_START
    beq     start_block_x18_to_x35


    @----
    @ normal and stop blocks are treated the same for x[18]..x[35]
    @ (on entry r1 = &x[0])
    @----


normal_block_x18_to_x35:

    ldr     r11, =WL3                   @ r11 = window_l[3]
    ldr     r12, =WL4                   @ r12 = window_l[4]

    add     r1, r1, #(18*4)             @ r1 = &x[18]

    ldmia   r1!, { r0, r2 - r4, r6 - r10 }      @ load 9 words from x18, update pointer

    @ r0     = x18
    @ r1     = &x[27]
    @ r2     = x19
    @ r3     = x20
    @ r4     = x21
    @ r5     = .
    @ r6     = x22
    @ r7     = x23
    @ r8     = x24
    @ r9     = x25
    @ r10    = x26
    @ r11    = window_l[3]
    @ r12    = window_l[4]
    @ lr     = .

    smull   r5, lr, r12, r6             @ r5..lr = (window_l[4] * (x[22] == x[31]))
    movs    r5, r5, lsr #28
    adc     r5, r5, lr, lsl #4          @ r5 = bits[59..28] of windowed x31

    smull   r6, lr, r11, r4             @ r6..lr = (window_l[3] * (x[21] == x[32]))
    ldr     r12, =WL5                   @ r12 = window_l[5]
    movs    r6, r6, lsr #28
    adc     r6, r6, lr, lsl #4          @ r6 = bits[59..28] of windowed x32

    smull   r4, lr, r12, r7             @ r4..lr = (window_l[5] * (x[23] == x[30]))
    ldr     r11, =WL1                   @ r11 = window_l[1]
    ldr     r12, =WL2                   @ r12 = window_l[2]
    movs    r4, r4, lsr #28
    adc     r4, r4, lr, lsl #4          @ r4 = bits[59..28] of windowed x30

    smull   r7, lr, r12, r3             @ r7..lr = (window_l[2] * (x[20] == x[33]))
    ldr     r12, =WL6                   @ r12 = window_l[6]
    movs    r7, r7, lsr #28
    adc     r7, r7, lr, lsl #4          @ r7 = bits[59..28] of windowed x33

    smull   r3, lr, r12, r8             @ r3..lr = (window_l[6] * (x[24] == x[29]))
    movs    r3, r3, lsr #28
    adc     r3, r3, lr, lsl #4          @ r3 = bits[59..28] of windowed x29

    smull   r8, lr, r11, r2             @ r8..lr = (window_l[1] * (x[19] == x[34]))
    ldr     r12, =WL7                   @ r12 = window_l[7]
    ldr     r11, =WL8                   @ r11 = window_l[8]
    movs    r8, r8, lsr #28
    adc     r8, r8, lr, lsl #4          @ r8 = bits[59..28] of windowed x34

    smull   r2, lr, r12, r9             @ r2..lr = (window_l[7] * (x[25] == x[28]))
    ldr     r12, =WL0                   @ r12 = window_l[0]
    movs    r2, r2, lsr #28
    adc     r2, r2, lr, lsl #4          @ r2 = bits[59..28] of windowed x28

    smull   r9, lr, r12, r0             @ r9..lr = (window_l[0] * (x[18] == x[35]))
    movs    r9, r9, lsr #28
    adc     r9, r9, lr, lsl #4          @ r9 = bits[59..28] of windowed x35

    smull   r0, lr, r11, r10            @ r0..lr = (window_l[8] * (x[26] == x[27]))
    ldr     r11, =WL16                  @ r11 = window_l[16]
    ldr     r12, =WL17                  @ r12 = window_l[17]
    movs    r0, r0, lsr #28
    adc     r0, r0, lr, lsl #4          @ r0 = bits[59..28] of windowed x27


    stmia   r1,  { r0, r2 - r9 }        @ store windowed x[27] .. x[35]
    ldmdb   r1!, { r0, r2 - r9 }        @ load 9 words downto (and including) x18


    smull   r10, lr, r12, r0            @ r10..lr = (window_l[17] * x[18])
    movs    r10, r10, lsr #28
    adc     r0, r10, lr, lsl #4         @ r0 = bits[59..28] of windowed x18

    smull   r10, lr, r11, r2            @ r10..lr = (window_l[16] * x[19])
    ldr     r11, =WL14                  @ r11 = window_l[14]
    ldr     r12, =WL15                  @ r12 = window_l[15]
    movs    r10, r10, lsr #28
    adc     r2, r10, lr, lsl #4         @ r2 = bits[59..28] of windowed x19

    smull   r10, lr, r12, r3            @ r10..lr = (window_l[15] * x[20])
    movs    r10, r10, lsr #28
    adc     r3, r10, lr, lsl #4         @ r3 = bits[59..28] of windowed x20

    smull   r10, lr, r11, r4            @ r10..lr = (window_l[14] * x[21])
    ldr     r11, =WL12                  @ r11 = window_l[12]
    ldr     r12, =WL13                  @ r12 = window_l[13]
    movs    r10, r10, lsr #28
    adc     r4, r10, lr, lsl #4         @ r4 = bits[59..28] of windowed x21

    smull   r10, lr, r12, r5            @ r10..lr = (window_l[13] * x[22])
    movs    r10, r10, lsr #28
    adc     r5, r10, lr, lsl #4         @ r5 = bits[59..28] of windowed x22

    smull   r10, lr, r11, r6            @ r10..lr = (window_l[12] * x[23])
    ldr     r11, =WL10                  @ r11 = window_l[10]
    ldr     r12, =WL11                  @ r12 = window_l[11]
    movs    r10, r10, lsr #28
    adc     r6, r10, lr, lsl #4         @ r6 = bits[59..28] of windowed x23

    smull   r10, lr, r12, r7            @ r10..lr = (window_l[11] * x[24])
    movs    r10, r10, lsr #28
    adc     r7, r10, lr, lsl #4         @ r7 = bits[59..28] of windowed x24

    smull   r10, lr, r11, r8            @ r10..lr = (window_l[10] * x[25])
    ldr     r12, =WL9                   @ r12 = window_l[9]
    movs    r10, r10, lsr #28
    adc     r8, r10, lr, lsl #4         @ r8 = bits[59..28] of windowed x25

    smull   r10, lr, r12, r9            @ r10..lr = (window_l[9] * x[26])

    movs    r10, r10, lsr #28
    adc     r9, r10, lr, lsl #4         @ r9 = bits[59..28] of windowed x26

    stmia   r1, { r0, r2 - r9 }         @ store windowed x[18] .. x[26]

    @----
    @ NB there are 2 possible exits from this function - this is only one of them
    @----

    add     sp, sp, #(21*4)             @ return stack frame (20 ctxx words + saved r2)
    ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return

    @----


stop_block_x0_to_x17:

    @ r0     =  x0
    @ r1     = &x[9]
    @ r2     =  x1
    @ r3     =  x2
    @ r4     =  x3
    @ r5     =  x4
    @ r6     =  x5
    @ r7     =  x6
    @ r8     =  x7
    @ r9     =  x8
    @ r10    = -x0
    @ r11    =  window mode: (0 == normal), (1 == start block), (3 == stop block)
    @ r12    =  .
    @ lr     =  .

    rsb     r0, r6, #0                  @ r0 = -x5
    rsb     r6, r2, #0                  @ r6 = -x1
    rsb     r2, r5, #0                  @ r2 = -x4
    rsb     r5, r3, #0                  @ r5 = -x2
    rsb     r3, r4, #0                  @ r3 = -x3

    add     r1, r1, #(3*4)                      @ r1 = &x[12]
    stmia   r1, { r0, r2, r3, r5, r6, r10 }     @ store unchanged x[12] .. x[17]

    ldr     r0, =WL1                    @ r0 = window_l[1] == window_s[0]

    rsb     r10, r9, #0                 @ r10 = -x8
    rsb     r12, r8, #0                 @ r12 = -x7
    rsb     lr,  r7, #0                 @ lr  = -x6

    @ r0     =  WL1
    @ r1     = &x[12]
    @ r2     =  .
    @ r3     =  .
    @ r4     =  .
    @ r5     =  .
    @ r6     =  .
    @ r7     =  x6
    @ r8     =  x7
    @ r9     =  x8
    @ r10    = -x8
    @ r11    =  window mode: (0 == normal), (1 == start block), (3 == stop block)
    @ r12    = -x7
    @ lr     = -x6

    smull   r5, r6, r0, r7              @ r5..r6 = (window_l[1] * x[6])
    ldr     r2, =WL4                    @ r2 = window_l[4] == window_s[1]
    movs    r5, r5, lsr #28
    adc     r7, r5, r6, lsl #4          @ r7 = bits[59..28] of windowed x6

    smull   r5, r6, r2, r8              @ r5..r6 = (window_l[4] * x[7])
    ldr     r3, =WL7                    @ r3 = window_l[7] == window_s[2]
    movs    r5, r5, lsr #28
    adc     r8, r5, r6, lsl #4          @ r8 = bits[59..28] of windowed x7

    smull   r5, r6, r3, r9              @ r5..r6 = (window_l[7] * x[8])
    ldr     r4, =WL10                   @ r4 = window_l[10] == window_s[3]
    movs    r5, r5, lsr #28
    adc     r9, r5, r6, lsl #4          @ r9 = bits[59..28] of windowed x8

    smull   r5, r6, r4, r10             @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
    ldr     r0, =WL13                   @ r0 = window_l[13] == window_s[4]
    movs    r5, r5, lsr #28
    adc     r10, r5, r6, lsl #4         @ r10 = bits[59..28] of windowed x9

    smull   r5, r6, r0, r12             @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
    ldr     r2, =WL16                   @ r2 = window_l[16] == window_s[5]
    movs    r5, r5, lsr #28
    adc     r12, r5, r6, lsl #4         @ r12 = bits[59..28] of windowed x10

    smull   r5, r6, r2, lr              @ r5..r6 = (window_l[16] * (x[11] == -x[6]))

    mov     r0, #0                      @ r0 = 0 (mov leaves the flags alone)

    movs    r5, r5, lsr #28
    adc     lr, r5, r6, lsl #4          @ lr = bits[59..28] of windowed x11

    stmdb   r1!, { r7 - r10, r12, lr }  @ store windowed x[6] .. x[11], r1 = &x[6]

    mov     r5, #0
    mov     r6, #0
    mov     r2, #0
    mov     r3, #0
    mov     r4, #0

    stmdb   r1!, { r0, r2 - r6 }        @ store zeroes to x[0] .. x[5], r1 = &x[0]

    b       normal_block_x18_to_x35


    @----


start_block_x18_to_x35:

    ldr     r4, =WL1                    @ r4 = window_l[1] == window_s[0]

    add     r1, r1, #(24*4)             @ r1 = &x[24]

    ldmia   r1, { r0, r2, r3 }          @ load 3 words from x24, dont update pointer

    @ r0     = x24
    @ r1     = &x[24]
    @ r2     = x25
    @ r3     = x26
    @ r4     = WL1
    @ r5     = WL4   (loaded below)
    @ r6     = WL7   (loaded below)
    @ r7     = WL10  (loaded below)
    @ r8     = WL13  (loaded below)
    @ r9     = WL16  (loaded below)
    @ r10    = .
    @ r11    = .
    @ r12    = .
    @ lr     = .

    ldr     r5, =WL4                    @ r5 = window_l[4] == window_s[1]

    smull   r10, r11, r4, r0            @ r10..r11 = (window_l[1] * (x[24] == x[29]))
    ldr     r6, =WL7                    @ r6 = window_l[7] == window_s[2]
    movs    r10, r10, lsr #28
    adc     lr, r10, r11, lsl #4        @ lr = bits[59..28] of windowed x29

    smull   r10, r11, r5, r2            @ r10..r11 = (window_l[4] * (x[25] == x[28]))
    ldr     r7, =WL10                   @ r7 = window_l[10] == window_s[3]
    movs    r10, r10, lsr #28
    adc     r12, r10, r11, lsl #4       @ r12 = bits[59..28] of windowed x28

    smull   r10, r11, r6, r3            @ r10..r11 = (window_l[7] * (x[26] == x[27]))
    ldr     r8, =WL13                   @ r8 = window_l[13] == window_s[4]
    movs    r10, r10, lsr #28
    adc     r4, r10, r11, lsl #4        @ r4 = bits[59..28] of windowed x27

    smull   r10, r11, r7, r3            @ r10..r11 = (window_l[10] * x[26])
    ldr     r9, =WL16                   @ r9 = window_l[16] == window_s[5]
    movs    r10, r10, lsr #28
    adc     r3, r10, r11, lsl #4        @ r3 = bits[59..28] of windowed x26

    smull   r10, r11, r8, r2            @ r10..r11 = (window_l[13] * x[25])
    mov     r5, #0                      @ r5 = 0 (mov leaves the flags alone)
    movs    r10, r10, lsr #28
    adc     r2, r10, r11, lsl #4        @ r2 = bits[59..28] of windowed x25

    smull   r10, r11, r9, r0            @ r10..r11 = (window_l[16] * x[24])
    mov     r6, #0
    movs    r10, r10, lsr #28
    adc     r0, r10, r11, lsl #4        @ r0 = bits[59..28] of windowed x24

    stmia   r1!, { r0, r2, r3, r4, r12, lr }    @ store windowed x[24] .. x[29]

    mov     r7, #0
    mov     r8, #0
    mov     r9, #0
    mov     r10, #0

    stmia   r1!, { r5 - r10 }           @ store zeroes to x[30] .. x[35]

    @----
    @ NB there are 2 possible exits from this function - this is only one of them
    @----

    add     sp, sp, #(21*4)             @ return stack frame (20 ctxx words + saved r2)
    ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return

    @----
    @END
    @----