############################################################################
##                           **** WAVPACK ****                            ##
##                  Hybrid Lossless Wavefile Compressor                   ##
##                Copyright (c) 1998 - 2015 Conifer Software.             ##
##                          All Rights Reserved.                          ##
##      Distributed under the BSD Software License (see license.txt)      ##
############################################################################

        .intel_syntax noprefix
        .text

        .globl  _unpack_decorr_stereo_pass_cont_x64win
        .globl  _unpack_decorr_mono_pass_cont_x64win

        .globl  unpack_decorr_stereo_pass_cont_x64win
        .globl  unpack_decorr_mono_pass_cont_x64win

        .globl  _unpack_decorr_stereo_pass_cont_x64
        .globl  _unpack_decorr_mono_pass_cont_x64

        .globl  unpack_decorr_stereo_pass_cont_x64
        .globl  unpack_decorr_mono_pass_cont_x64

# This is an assembly optimized version of the following WavPack function:
#
# void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
#                                      int32_t *buffer,
#                                      int32_t sample_count,
#                                      int32_t long_math);
#
# It performs a single pass of stereo decorrelation on the provided buffer.
# Note that this version of the function requires that up to 8 previous
# stereo samples are visible and correct. In other words, it ignores the
# "samples_*" fields in the decorr_pass structure and gets the history data
# directly from the buffer. It does, however, return the appropriate history
# samples to the decorr_pass structure before returning.
#
# The "long_math" argument is used to specify that a 32-bit multiply is
# not enough for the "apply_weight" operation (although in this case it
# would only apply to the -1 and -2 terms because the MMX code does not have
# this limitation) but we ignore the parameter and use the overflow detection
# of the "imul" instruction to switch automatically to the "long_math" loop.
#
# This is written to work on an X86-64 processor (also called the AMD64)
# running in 64-bit mode and generally uses the MMX extensions to improve
# the performance by processing both stereo channels together. Unfortunately
# this is not easily used for terms -1 and -2, so these terms are handled
# sequentially with regular assembler code.
#
# This version has entry points for both the System V ABI and the Windows
# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
# non-volatile registers for both ABIs on the stack and allocates another
# 8 bytes on the stack to store the dpp pointer. Note that it does NOT
# provide unwind data for the Windows ABI (the unpack_x64.asm module for
# MSVC does). The arguments are passed in registers:
#
#   System V  Windows
#     rdi       rcx         struct decorr_pass *dpp
#     rsi       rdx         int32_t *buffer
#     edx       r8          int32_t sample_count
#     ecx       r9          int32_t long_math
#
# A sample_count that is zero or negative makes the function return without
# touching the buffer or the decorr_pass structure.
#
# decorr_pass fields as they are accessed here (byte offsets from dpp):
#
#     [dpp+0]   term
#     [dpp+4]   delta
#     [dpp+8]   weight_A
#     [dpp+12]  weight_B
#     [dpp+16]  samples_A []
#     [dpp+48]  samples_B []
#
# registers after entry:
#
#   rdi         bptr
#   rsi         eptr
#
# stack usage:
#
#   [rsp+0] = *dpp
#

_unpack_decorr_stereo_pass_cont_x64win:
unpack_decorr_stereo_pass_cont_x64win:
        push    rbp
        push    rbx
        push    rdi
        push    rsi
        sub     rsp, 8
        mov     rdi, rcx                    # copy params from win regs to Linux regs
        mov     rsi, rdx                    # so we can leave following code similar
        mov     rdx, r8
        mov     rcx, r9
        jmp     entry                       # jump into common portion

_unpack_decorr_stereo_pass_cont_x64:
unpack_decorr_stereo_pass_cont_x64:
        push    rbp
        push    rbx
        push    rdi
        push    rsi
        sub     rsp, 8

entry:  mov     [rsp], rdi                  # store dpp* at [rsp]
        and     edx, edx                    # set flags from sample_count; the 32-bit write also
                                            # zero-extends it into rdx for the scaled index below
                                            # (do not replace with "test", which leaves rdx as is)
        jle     done                        # if sample_count is zero or negative, do nothing

        mov     rdi, rsi                    # rdi = bptr
        lea     rsi, [rdi+rdx*8]            # rsi = eptr (8 bytes per stereo sample)

        mov     rax, [rsp]                  # get term from dpp struct & vector to handler
        mov     eax, [rax]
        cmp     al, 17
        je      term_17_entry
        cmp     al, 18
        je      term_18_entry
        cmp     al, -1
        je      term_minus_1_entry
        cmp     al, -2
        je      term_minus_2_entry
        cmp     al, -3
        je      term_minus_3_entry

#
# registers in default term loop:
#
#   rbx         term * -8 (for indexing correlation sample)
#   rdi         bptr
#   rsi         eptr
#
#   mm0, mm1    scratch
#   mm2         original sample values
#   mm3         correlation sample
#   mm4         zero (for pcmpeqd)
#   mm5         weights
#   mm6         delta
#   mm7         512 (for rounding)
#

default_term_entry:
        imul    rbx, rax, -8                # set RBX to term * -8
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     rdx, [rsp]                  # set RDX to *dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
        pxor    mm4, mm4                    # mm4 = zero (for pcmpeqd)
        jmp     default_term_loop

        .balign 64
default_term_loop:
        movq    mm3, [rdi+rbx]              # mm3 = sam_AB
        movq    mm1, mm3
        movq    mm0, mm3
        paddd   mm1, mm1
        psrld   mm0, 15                     # mm0 = sam_AB >> 15 (high part for PMADDWD)
        psrlw   mm1, 1                      # mm1 = low 15 bits of sam_AB in each low word
        pmaddwd mm0, mm5
        pmaddwd mm1, mm5
        movq    mm2, [rdi]                  # mm2 = left_right
        pslld   mm0, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm0, mm2
        paddd   mm0, mm1                    # add shifted sums
        movq    [rdi], mm0                  # store result
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     rdi, 8
        pcmpeqd mm2, mm4                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm4                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pxor    mm5, mm0                    # xor / add / xor = add or subtract delta by sign
        paddw   mm5, mm2                    # and add to weight_AB
        pxor    mm5, mm0
        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      default_term_loop

        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     rdx, [rsp]                  # point to dpp
        movq    [rdx+8], mm5                # put weight_AB back
        emms

        mov     ecx, [rdx]                  # ecx = dpp->term

default_store_samples:                      # copy the last "term" stereo samples, newest first
        dec     ecx
        sub     rdi, 8                      # back up one full sample
        mov     eax, [rdi+4]
        mov     [rdx+rcx*4+48], eax         # store samples_B [ecx]
        mov     eax, [rdi]
        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
        test    ecx, ecx
        jnz     default_store_samples
        jmp     done

#
# registers in term 17 & 18 loops:
#
#   rdi         bptr
#   rsi         eptr
#
#   mm0, mm1    scratch
#   mm2         original sample values
#   mm3         correlation samples
#   mm4         last calculated values (so we don't need to reload)
#   mm5         weights
#   mm6         delta
#   mm7         512 (for rounding)
#

term_17_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     rdx, [rsp]                  # set RDX to *dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
        movq    mm4, [rdi-8]                # preload last calculated values in mm4
        jmp     term_17_loop

        .balign 64
term_17_loop:
        paddd   mm4, mm4
        psubd   mm4, [rdi-16]               # mm4 = 2 * previous sample - the one before it
        movq    mm3, mm4                    # mm3 = sam_AB
        movq    mm1, mm3
        paddd   mm1, mm1
        psrld   mm4, 15
        psrlw   mm1, 1
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [rdi]                  # mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    # add shifted sums
        movq    mm0, mm3
        movq    [rdi], mm4                  # store result
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     rdi, 8
        pxor    mm1, mm1                    # mm1 = zero
        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pxor    mm5, mm0
        paddw   mm5, mm2                    # and add to weight_AB
        pxor    mm5, mm0
        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      term_17_loop
        jmp     term_1718_exit              # terms 17 & 18 treat samples_AB[] the same

term_18_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     rdx, [rsp]                  # set RDX to *dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
        movq    mm4, [rdi-8]                # preload last calculated values in mm4
        jmp     term_18_loop

        .balign 64
term_18_loop:
        movq    mm3, mm4
        psubd   mm3, [rdi-16]
        psrad   mm3, 1
        paddd   mm3, mm4                    # mm3 = sam_AB
        movq    mm1, mm3
        movq    mm4, mm3
        paddd   mm1, mm1
        psrld   mm4, 15
        psrlw   mm1, 1
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [rdi]                  # mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    # add shifted sums
        movq    mm0, mm3
        movq    [rdi], mm4                  # store result
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     rdi, 8
        pxor    mm1, mm1                    # mm1 = zero
        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pxor    mm5, mm0
        paddw   mm5, mm2                    # and add to weight_AB
        pxor    mm5, mm0
        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      term_18_loop

term_1718_exit:
        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     rdx, [rsp]                  # point to dpp
        movq    [rdx+8], mm5                # put weight_AB back
        emms

        mov     eax, [rdi-4]                # dpp->samples_B [0] = bptr [-1];
        mov     [rdx+48], eax
        mov     eax, [rdi-8]                # dpp->samples_A [0] = bptr [-2];
        mov     [rdx+16], eax
        mov     eax, [rdi-12]               # dpp->samples_B [1] = bptr [-3];
        mov     [rdx+52], eax
        mov     eax, [rdi-16]               # dpp->samples_A [1] = bptr [-4];
        mov     [rdx+20], eax
        jmp     done

#
# registers in term -1 & -2 loops:
#
#   eax,ebx,edx scratch
#   ecx         weight_A
#   ebp         weight_B
#   rdi         bptr
#   rsi         eptr
#   r8d         delta
#
# Each loop first tries a 32-bit "imul"; if that overflows (OF set) control
# moves to the matching "long" loop, which uses the 64-bit product in edx:eax.
# The weight update clips the weight to the range -1024 to 1024.
#

term_minus_1_entry:
        cld                                 # we use stosd
        mov     rdx, [rsp]                  # point to dpp
        mov     ecx, [rdx+8]                # ecx = weight_A
        mov     ebp, [rdx+12]               # ebp = weight_B
        mov     r8d, [rdx+4]                # r8d = delta
        mov     eax, [rdi-4]                # eax = bptr [-1] (first correlation sample)
        jmp     term_minus_1_loop

        .balign 64
term_minus_1_loop:
        mov     ebx, eax                    # ebx = correlation sample
        imul    eax, ecx
        mov     edx, [rdi]                  # (mov leaves OF from the imul intact)
        jo      OV11
        sar     eax, 10                     # CF = bit shifted out last, i.e. the rounding bit
        adc     eax, edx
        stosd                               # store result, bptr++
        test    ebx, ebx
        je      L182
        test    edx, edx
        je      L182
        xor     ebx, edx
        sar     ebx, 31                     # ebx = sign (sam ^ original value): 0 or -1
        xor     ecx, ebx
        add     ecx, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L183
        mov     ecx, edx                    # clip weight_A
L183:   xor     ecx, ebx
L182:   mov     ebx, eax                    # value just stored is the next correlation sample
        imul    eax, ebp
        mov     edx, [rdi]
        jo      OV12
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        je      L187
        test    edx, edx
        je      L187
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L188
        mov     ebp, edx                    # clip weight_B
L188:   xor     ebp, ebx
L187:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      term_minus_1_loop
        jmp     term_minus_1_done

OV11:   mov     eax, ebx                    # restore previous sample into eax
        jmp     long_term_minus_1_loop

OV12:   mov     eax, ebx                    # restore previous sample into eax
        jmp     L282

        .balign 64
long_term_minus_1_loop:
        mov     ebx, eax
        imul    ecx                         # edx:eax = sam * weight_A
        shl     edx, 22                     # combine halves into (product >> 10), with the
        shr     eax, 10                     # rounding bit left in CF by the shr
        adc     eax, edx
        mov     edx, [rdi]
        add     eax, edx
        stosd
        test    ebx, ebx
        je      L282
        test    edx, edx
        je      L282
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L283
        mov     ecx, edx
L283:   xor     ecx, ebx
L282:   mov     ebx, eax
        imul    ebp                         # edx:eax = sam * weight_B
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [rdi]
        add     eax, edx
        stosd
        test    ebx, ebx
        je      L287
        test    edx, edx
        je      L287
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L288
        mov     ebp, edx
L288:   xor     ebp, ebx
L287:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      long_term_minus_1_loop

term_minus_1_done:
        mov     rdx, [rsp]                  # point to dpp
        mov     [rdx+8], ecx                # store weights back
        mov     [rdx+12], ebp
        mov     eax, [rdi-4]                # dpp->samples_A [0] = bptr [-1];
        mov     [rdx+16], eax
        jmp     done

term_minus_2_entry:
        mov     rdx, [rsp]                  # point to dpp
        mov     ecx, [rdx+8]                # ecx = weight_A
        mov     ebp, [rdx+12]               # ebp = weight_B
        mov     r8d, [rdx+4]                # r8d = delta
        mov     eax, [rdi-8]                # eax = bptr [-2] (first correlation sample)
        jmp     term_minus_2_loop

        .balign 64
term_minus_2_loop:
        mov     ebx, eax
        imul    eax, ebp
        mov     edx, [rdi+4]
        jo      OV21
        sar     eax, 10
        adc     eax, edx
        mov     [rdi+4], eax                # bptr [1] is processed first for term -2
        test    ebx, ebx
        je      L194
        test    edx, edx
        je      L194
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L195
        mov     ebp, edx
L195:   xor     ebp, ebx
L194:   mov     ebx, eax
        imul    eax, ecx
        mov     edx, [rdi]
        jo      OV22
        sar     eax, 10
        adc     eax, edx
        mov     [rdi], eax
        test    ebx, ebx
        je      L199
        test    edx, edx
        je      L199
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L200
        mov     ecx, edx
L200:   xor     ecx, ebx
L199:   add     rdi, 8
        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      term_minus_2_loop
        jmp     term_minus_2_done

OV21:   mov     eax, ebx                    # restore previous sample into eax
        jmp     long_term_minus_2_loop

OV22:   mov     eax, ebx                    # restore previous sample into eax
        jmp     L294

        .balign 64
long_term_minus_2_loop:
        mov     ebx, eax
        imul    ebp
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [rdi+4]
        add     eax, edx
        mov     [rdi+4], eax
        test    ebx, ebx
        je      L294
        test    edx, edx
        je      L294
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L295
        mov     ebp, edx
L295:   xor     ebp, ebx
L294:   mov     ebx, eax
        imul    ecx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [rdi]
        add     eax, edx
        mov     [rdi], eax
        test    ebx, ebx
        je      L299
        test    edx, edx
        je      L299
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L300
        mov     ecx, edx
L300:   xor     ecx, ebx
L299:   add     rdi, 8
        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      long_term_minus_2_loop

term_minus_2_done:
        mov     rdx, [rsp]                  # point to dpp
        mov     [rdx+8], ecx                # store weights back
        mov     [rdx+12], ebp
        mov     eax, [rdi-8]                # dpp->samples_B [0] = bptr [-2];
        mov     [rdx+48], eax
        jmp     done

#
# registers in term -3 loop:
#
#   rdi         bptr
#   rsi         eptr
#
#   mm0, mm1    scratch
#   mm2         original sample values
#   mm3         correlation samples
#   mm4         last calculated values (so we don't need to reload)
#   mm5         weights
#   mm6         delta
#   mm7         512 (for rounding)
#

term_minus_3_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     rdx, [rsp]                  # set RDX to *dpp
        mov     eax, [rdx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
        movq    mm4, [rdi-8]                # preload last calculated values in mm4
        jmp     term_minus_3_loop

        .balign 64
term_minus_3_loop:
        movq    mm3, mm4
        psrlq   mm3, 32
        punpckldq mm3, mm4                  # mm3 = sam_AB (previous pair with halves swapped)
        movq    mm1, mm3
        movq    mm4, mm3
        pslld   mm1, 1
        psrld   mm4, 15
        psrlw   mm1, 1
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [rdi]                  # mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    # add shifted sums
        movq    [rdi], mm4                  # store result
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     rdi, 8
        pxor    mm1, mm1                    # mm1 = zero
        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pcmpeqd mm1, mm1                    # mm1 = -1 ...
        psubd   mm1, mm7
        psubd   mm1, mm7
        psubd   mm1, mm0                    # ... - 1024 - sign: bias for the saturating add
        pxor    mm5, mm0
        paddw   mm5, mm1                    # bias the weight so that PADDUSW saturates
        paddusw mm5, mm2                    # and add to weight_AB (clipped at 1024 + sign)
        psubw   mm5, mm1                    # remove the bias again
        pxor    mm5, mm0
        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      term_minus_3_loop

        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     rdx, [rsp]                  # point to dpp
        movq    [rdx+8], mm5                # put weight_AB back
        emms

        mov     edx, [rdi-4]                # dpp->samples_A [0] = bptr [-1];
        mov     rax, [rsp]
        mov     [rax+16], edx
        mov     edx, [rdi-8]                # dpp->samples_B [0] = bptr [-2];
        mov     [rax+48], edx

done:   add     rsp, 8                      # common epilogue for every stereo path
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rbp
        ret

#######################################################################################################################
#
# This is the mono version of the above function. It does not use MMX and does not handle negative terms.
#
# void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
#                                    int32_t *buffer,
#                                    int32_t sample_count,
#                                    int32_t long_math);
# arguments on entry:
#
#   System V  Windows
#     rdi       rcx         struct decorr_pass *dpp
#     rsi       rdx         int32_t *buffer
#     edx       r8          int32_t sample_count
#     ecx       r9          int32_t long_math
#
# The long_math argument is not read; each loop starts with a 32-bit "imul"
# and moves to its "long" twin (64-bit product in edx:eax) on overflow.
#
# A sample_count that is zero or negative makes the function return without
# touching the buffer or the decorr_pass structure.
#
# registers after entry:
#
#   rdi         bptr
#   rsi         eptr
#
# stack usage:
#
#   [rsp+0] = *dpp
#

_unpack_decorr_mono_pass_cont_x64win:
unpack_decorr_mono_pass_cont_x64win:
        push    rbp
        push    rbx
        push    rdi
        push    rsi
        sub     rsp, 8

        mov     rdi, rcx                    # copy params from win regs to Linux regs
        mov     rsi, rdx                    # so we can leave following code similar
        mov     rdx, r8
        mov     rcx, r9
        jmp     mentry                      # jump into common portion

_unpack_decorr_mono_pass_cont_x64:
unpack_decorr_mono_pass_cont_x64:
        push    rbp
        push    rbx
        push    rdi
        push    rsi
        sub     rsp, 8

mentry: mov     [rsp], rdi                  # store dpp* into [rsp]
        and     edx, edx                    # set flags from sample_count; the 32-bit write also
                                            # zero-extends it into rdx for the scaled index below
                                            # (do not replace with "test", which leaves rdx as is)
        jle     mono_done                   # if sample_count is zero or negative, do nothing

        cld                                 # we use stosd
        mov     rdi, rsi                    # rdi = bptr
        lea     rsi, [rdi+rdx*4]            # rsi = eptr (4 bytes per mono sample)

        mov     rax, [rsp]                  # get term from dpp struct & vector to handler
        mov     eax, [rax]
        cmp     al, 17
        je      mono_17_entry
        cmp     al, 18
        je      mono_18_entry

#
# registers during default term processing loop:
#   rdi         active buffer pointer
#   rsi         end of buffer pointer
#   r8d         delta
#   ecx         weight_A
#   rbx         term * -4
#   eax,edx     scratch
#

default_mono_entry:
        imul    rbx, rax, -4                # set rbx to term * -4 for decorrelation index
        mov     rdx, [rsp]
        mov     ecx, [rdx+8]                # ecx = weight, r8d = delta
        mov     r8d, [rdx+4]
        jmp     default_mono_loop

#
# registers during processing loop for terms 17 & 18:
#   rdi         active buffer pointer
#   rsi         end of buffer pointer
#   r8d         delta
#   ecx         weight_A
#   ebp         previously calculated value
#   ebx         calculated correlation sample
#   eax,edx     scratch
#

mono_17_entry:
        mov     rdx, [rsp]                  # rdx = dpp*
        mov     ecx, [rdx+8]                # ecx = weight, r8d = delta
        mov     r8d, [rdx+4]
        mov     ebp, [rdi-4]                # ebp = bptr [-1]
        jmp     mono_17_loop

mono_18_entry:
        mov     rdx, [rsp]                  # rdx = dpp*
        mov     ecx, [rdx+8]                # ecx = weight, r8d = delta
        mov     r8d, [rdx+4]
        mov     ebp, [rdi-4]                # ebp = bptr [-1]
        jmp     mono_18_loop

        .balign 64
default_mono_loop:
        mov     eax, [rdi+rbx]              # eax = correlation sample, bptr [-term]
        imul    eax, ecx
        mov     edx, [rdi]                  # (mov leaves OF from the imul intact)
        jo      long_default_mono_loop
        sar     eax, 10                     # CF = bit shifted out last, i.e. the rounding bit
        adc     eax, edx
        mov     [rdi], eax
        mov     eax, [rdi+rbx]
        add     rdi, 4
        test    edx, edx
        je      L100
        test    eax, eax
        je      L100
        xor     eax, edx
        cdq                                 # edx = sign (sam ^ original value): 0 or -1
        xor     ecx, edx
        add     ecx, r8d
        xor     ecx, edx
L100:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      default_mono_loop
        jmp     default_mono_done

        .balign 64
long_default_mono_loop:
        mov     eax, [rdi+rbx]
        imul    ecx                         # edx:eax = sam * weight
        shl     edx, 22                     # combine halves into (product >> 10), with the
        shr     eax, 10                     # rounding bit left in CF by the shr
        adc     eax, edx
        mov     edx, [rdi]
        add     eax, edx
        mov     [rdi], eax
        mov     eax, [rdi+rbx]
        add     rdi, 4
        test    edx, edx
        je      L101
        test    eax, eax
        je      L101
        xor     eax, edx
        cdq
        xor     ecx, edx
        add     ecx, r8d
        xor     ecx, edx
L101:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      long_default_mono_loop

default_mono_done:
        mov     rdx, [rsp]                  # rdx = dpp*
        mov     [rdx+8], ecx                # store weight_A back
        mov     ecx, [rdx]                  # ecx = dpp->term

default_mono_store_samples:                 # copy the last "term" samples, newest first
        dec     ecx
        sub     rdi, 4                      # back up one full sample
        mov     eax, [rdi]
        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
        test    ecx, ecx
        jnz     default_mono_store_samples
        jmp     mono_done

        .balign 64
mono_17_loop:
        lea     ebx, [rbp+rbp]              # 64-bit address form: same 32-bit result, no 0x67 prefix
        sub     ebx, [rdi-8]                # ebx = sam = 2 * bptr [-1] - bptr [-2]
        mov     eax, ecx
        imul    eax, ebx
        mov     edx, [rdi]
        jo      long_mono_17_loop
        sar     eax, 10
        adc     eax, edx
        stosd                               # store result, bptr++
        test    ebx, ebx
        mov     ebp, eax                    # (mov keeps the flags from the test)
        je      L117
        test    edx, edx
        je      L117
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        xor     ecx, ebx
L117:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      mono_17_loop
        jmp     mono_1718_exit

        .balign 64
long_mono_17_loop:
        lea     ebx, [rbp+rbp]
        sub     ebx, [rdi-8]
        mov     eax, ecx
        imul    ebx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [rdi]
        add     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L217
        test    edx, edx
        je      L217
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        xor     ecx, ebx
L217:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      long_mono_17_loop
        jmp     mono_1718_exit

        .balign 64
mono_18_loop:
        lea     ebx, [rbp+rbp*2]            # 64-bit address form: same 32-bit result, no 0x67 prefix
        sub     ebx, [rdi-8]
        sar     ebx, 1                      # ebx = sam = (3 * bptr [-1] - bptr [-2]) >> 1
        mov     eax, ecx
        imul    eax, ebx
        mov     edx, [rdi]
        jo      long_mono_18_loop
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L118
        test    edx, edx
        je      L118
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        xor     ecx, ebx
L118:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      mono_18_loop
        jmp     mono_1718_exit

        .balign 64
long_mono_18_loop:
        lea     ebx, [rbp+rbp*2]
        sub     ebx, [rdi-8]
        sar     ebx, 1
        mov     eax, ecx
        imul    ebx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [rdi]
        add     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L218
        test    edx, edx
        je      L218
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, r8d
        xor     ecx, ebx
L218:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
        jb      long_mono_18_loop

mono_1718_exit:
        mov     rdx, [rsp]                  # rdx = dpp*
        mov     [rdx+8], ecx                # store weight_A back
        mov     eax, [rdi-4]                # dpp->samples_A [0] = bptr [-1];
        mov     [rdx+16], eax
        mov     eax, [rdi-8]                # dpp->samples_A [1] = bptr [-2];
        mov     [rdx+20], eax

mono_done:                                  # common epilogue for every mono path
        add     rsp, 8
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rbp
        ret

#ifdef __ELF__
        .section .note.GNU-stack,"",@progbits
#endif

#ifdef __midipix__
        .section .got$unpack_decorr_mono_pass_cont_x64win,"r"
        .global __imp_unpack_decorr_mono_pass_cont_x64win
__imp_unpack_decorr_mono_pass_cont_x64win:
        .quad   unpack_decorr_mono_pass_cont_x64win
        .linkonce discard

        .section .got$unpack_decorr_stereo_pass_cont_x64win,"r"
        .global __imp_unpack_decorr_stereo_pass_cont_x64win
__imp_unpack_decorr_stereo_pass_cont_x64win:
        .quad   unpack_decorr_stereo_pass_cont_x64win
        .linkonce discard
#endif