; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA

; GOGO-no-coda
;       Copyright (C) 1999 shigeo
;       special thanks to URURI

; Syntax: NASM (Intel operand order), 32-bit x87 FPU code.
; The macros proc/endproc, arg, local, alloc, pushd/popd, sp(), externdef,
; segment_data/segment_code and the register aliases r0..r6 all come from
; nasm.h, which is not part of this file.
; NOTE(review): r6 is presumably the alias for ebp (ebp is saved by pushd
; and is the only saved register without an obvious alias use) - confirm
; against nasm.h.

%include "nasm.h"

        externdef costab_fft
        externdef sintab_fft

        segment_data
        align 32
D_1_41421       dd      1.41421356      ;multiplier applied to gi[k2]/gi[k3] in the first stage
D_1_0           dd      1.0             ;initial c1
D_0_5           dd      0.5             ;not referenced in this file
D_0_25          dd      0.25            ;not referenced in this file
D_0_0005        dd      0.0005          ;not referenced in this file
D_0_0           dd      0.0             ;initial s1

        segment_code

;-----------------------------------------------------------------------
;void fht(float *fz, int n);
;
; In-place transform of the float array at fz, done in passes.  Each pass
; runs a 4-point butterfly over fi[0],fi[k1],fi[k2],fi[k3] and
; gi[0],gi[k1],gi[k2],gi[k3] (.do2), followed by rotations with the
; recurrence-generated (c1,s1)/(c2,s2) pairs for i = 1 .. kx-1 (.for/.do3).
; k1, k3 and kx are multiplied by 4 after every pass; the procedure
; returns once k1*fsize >= n.
;
; In:    fz  pointer to the float data (read and written)
;        n   number of floats; also used to build the end pointer fn
; Reads: costab_fft[k], sintab_fft[k] with k = 2, 4, 6, ... (one per pass)
;
; Register roles (offsets are in bytes, fsize = 4):
;        r0 = fi          r1 = gi
;        r2 = k3*fsize    r3 = k1*fsize    r4 = kx*fsize
;        r5 = i*fsize     r6 = fn = fz + n (loop end pointer)
;
; Stack comments list the x87 stack from st0 (leftmost) downwards.
;-----------------------------------------------------------------------
proc    fht_FPU

%$fz    arg     4
%$n     arg     4

%$k     local   4

%$f0    local   4
%$f1    local   4
%$f2    local   4
%$f3    local   4

%$g0    local   4
%$g1    local   4
%$g2    local   4
%$g3    local   4

%$s1    local   4
%$c1    local   4
%$s2    local   4
%$c2    local   4

%$t_s   local   4
%$t_c   local   4
        alloc

        pushd   ebp, ebx, esi, edi

fht_FPU_1st_part:

fht_FPU_2nd_part:

fht_FPU_3rd_part:

.do_init:
        mov     r3, 16                  ;k1*fsize = 4*fsize = k4
        mov     r4, 8                   ;kx = k1/2
        mov     r2, 48                  ;k3*fsize
        mov     dword [sp(%$k)], 2      ;k = 2
        mov     r0, [sp(%$fz)]          ;fi
        ; r6 bounds the .do2/.do3 loops below and must be set before they
        ; run; nothing else in this procedure writes it.
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ;fn = fz + n
        lea     r1, [r0+8]              ;gi = fi + kx

.do:
.do2:
        ;f
        fld     dword [r0]
        fsub    dword [r0+r3]

        fld     dword [r0]
        fadd    dword [r0+r3]

        fld     dword [r0+r3*2]
        fsub    dword [r0+r2]

        fld     dword [r0+r3*2]
        fadd    dword [r0+r2]           ;f2 f3 f0 f1

        fld     st2                     ;f0 f2 f3 f0 f1
        fadd    st0, st1
        fstp    dword [r0]              ;fi[0]

        fld     st3                     ;f1 f2 f3 f0 f1
        fadd    st0, st2
        fstp    dword [r0+r3]           ;fi[k1]

        fsubr   st0, st2                ;f0-f2 f3 f0 f1
        fstp    dword [r0+r3*2]         ;fi[k2]

        fsubr   st0, st2                ;f1-f3 f0 f1
        fstp    dword [r0+r2]           ;fi[k3]
        fcompp                          ;discard the two remaining entries (f0 f1)

        ;g
        fld     dword [r1]
        fsub    dword [r1+r3]

        fld     dword [r1]
        fadd    dword [r1+r3]

        fld     dword [D_1_41421]
        fmul    dword [r1+r2]

        fld     dword [D_1_41421]
        fmul    dword [r1+r3*2]         ;g2 g3 g0 g1

        fld     st2                     ;g0 g2 g3 g0 g1
        fadd    st0, st1
        fstp    dword [r1]              ;gi[0]

        fld     st3                     ;g1 g2 g3 g0 g1
        fadd    st0, st2
        fstp    dword [r1+r3]           ;gi[k1]

        fsubr   st0, st2                ;g0-g2 g3 g0 g1
        fstp    dword [r1+r3*2]         ;gi[k2]

        fsubr   st0, st2                ;g1-g3 g0 g1
        fstp    dword [r1+r2]           ;gi[k3]
        fcompp                          ;discard the two remaining entries (g0 g1)

        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6
        jb      .do2                    ;while (fi < fn)


        ; Fetch this pass's step pair and reset the recurrence to (1, 0).
        mov     r0, [sp(%$k)]
        fld     dword [costab_fft +r0*4]
        fstp    dword [sp(%$t_c)]
        fld     dword [sintab_fft +r0*4]
        fstp    dword [sp(%$t_s)]
        fld     dword [D_1_0]
        fstp    dword [sp(%$c1)]
        fld     dword [D_0_0]
        fstp    dword [sp(%$s1)]

.for_init:
        mov     r5, 4                   ;i = 1*fsize

.for:
        ; (c1, s1) <- (c1*t_c - s1*t_s, c1*t_s + s1*t_c)
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_c)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_s)]
        fsubp   st1, st0                ;c1

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_s)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_c)]
        faddp   st1, st0                ;s1 c1

        ; c2 = c1*c1 - s1*s1, s2 = 2*c1*s1
        fld     st1
        fmul    st0, st0                ;c1c1 s1 c1
        fld     st1
        fmul    st0, st0                ;s1s1 c1c1 s1 c1
        fsubp   st1, st0                ;c2 s1 c1
        fstp    dword [sp(%$c2)]        ;s1 c1

        fld     st1                     ;c1 s1 c1
        fmul    st0, st1                ;c1s1 s1 c1
        fadd    st0, st0                ;s2 s1 c1
        fstp    dword [sp(%$s2)]        ;s1 c1

        fstp    dword [sp(%$s1)]        ;c1
        fstp    dword [sp(%$c1)]        ;(empty)

        mov     r0, [sp(%$fz)]
        add     r0, r5                  ;r0 = fi
        mov     r1, [sp(%$fz)]
        add     r1, r3
        sub     r1, r5                  ;r1 = gi

.do3:
        fld     dword [sp(%$s2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r3]
        fsubp   st1, st0                ;b = s2*fi[k1] - c2*gi[k1]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r3]
        faddp   st1, st0                ;a = c2*fi[k1] + s2*gi[k1]     b

        fld     dword [r0]
        fsub    st0, st1                ;f1 a b
        fstp    dword [sp(%$f1)]        ;a b

        fadd    dword [r0]              ;f0 b
        fstp    dword [sp(%$f0)]        ;b

        fld     dword [r1]
        fsub    st0, st1                ;g1 b
        fstp    dword [sp(%$g1)]        ;b

        fadd    dword [r1]              ;g0
        fstp    dword [sp(%$g0)]        ;(empty)


        fld     dword [sp(%$s2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r2]
        fsubp   st1, st0                ;b = s2*fi[k3] - c2*gi[k3]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r2]
        faddp   st1, st0                ;a = c2*fi[k3] + s2*gi[k3]     b

        fld     dword [r0+r3*2]
        fsub    st0, st1                ;f3 a b
        fstp    dword [sp(%$f3)]        ;a b

        fadd    dword [r0+r3*2]         ;f2 b
        fstp    dword [sp(%$f2)]        ;b

        fld     dword [r1+r3*2]
        fsub    st0, st1                ;g3 b
        fstp    dword [sp(%$g3)]        ;b

        fadd    dword [r1+r3*2]         ;g2
        fstp    dword [sp(%$g2)]        ;(empty)


        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g3)]
        fsubp   st1, st0                ;b = s1*f2 - c1*g3

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g3)]
        faddp   st1, st0                ;a = c1*f2 + s1*g3     b

        fld     dword [sp(%$f0)]
        fsub    st0, st1                ;fi[k2] a b
        fstp    dword [r0+r3*2]

        fadd    dword [sp(%$f0)]        ;fi[0] b
        fstp    dword [r0]

        fld     dword [sp(%$g1)]
        fsub    st0, st1                ;gi[k3] b
        fstp    dword [r1+r2]

        fadd    dword [sp(%$g1)]        ;gi[k1]
        fstp    dword [r1+r3]


        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f3)]
        fsubp   st1, st0                ;b = c1*g2 - s1*f3

        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f3)]
        faddp   st1, st0                ;a = s1*g2 + c1*f3     b

        fld     dword [sp(%$g0)]
        fsub    st0, st1                ;gi[k2] a b
        fstp    dword [r1+r3*2]

        fadd    dword [sp(%$g0)]        ;gi[0] b
        fstp    dword [r1]

        fld     dword [sp(%$f1)]
        fsub    st0, st1                ;fi[k3] b
        fstp    dword [r0+r2]

        fadd    dword [sp(%$f1)]        ;fi[k1]
        fstp    dword [r0+r3]


        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6
        jb near .do3                    ;while (fi < fn)

        add     r5, 4                   ;i++
        cmp     r5, r4
        jb near .for                    ;while (i < kx)

        cmp     r3, [sp(%$n)]           ;k1*fsize is numerically k4 in elements
        jae     .exit

        add     dword [sp(%$k)], 2      ;k += 2;
        lea     r3, [r3*4]              ;k1 *= 4
        lea     r2, [r2*4]              ;k3 *= 4
        lea     r4, [r4*4]              ;kx *= 4
        mov     r0, [sp(%$fz)]          ;fi
        lea     r1, [r0+r4]             ;gi = fi + kx
        jmp     .do

.exit:
        popd    ebp, ebx, esi, edi
endproc

;*************************************************************

;-----------------------------------------------------------------------
;void fht_FPU_FXCH(float *fz, int n);
;
; Same passes, arguments, locals and register roles as fht_FPU above
; (r0 = fi, r1 = gi, r2 = k3*fsize, r3 = k1*fsize, r4 = kx*fsize,
; r5 = i*fsize, r6 = fn = fz + n).  The difference visible here is the
; instruction scheduling: products are loaded in groups of four and fxch
; is used to reorder the x87 stack, so that results are stored in a
; burst instead of one at a time.
;
; Stack comments list the x87 stack from st0 (leftmost) downwards.
;-----------------------------------------------------------------------
proc    fht_FPU_FXCH

%$fz    arg     4
%$n     arg     4

%$k     local   4

%$f0    local   4
%$f1    local   4
%$f2    local   4
%$f3    local   4

%$g0    local   4
%$g1    local   4
%$g2    local   4
%$g3    local   4

%$s1    local   4
%$c1    local   4
%$s2    local   4
%$c2    local   4

%$t_s   local   4
%$t_c   local   4
        alloc

        pushd   ebp, ebx, esi, edi

fht_FPU_FXCH_1st_part:

fht_FPU_FXCH_2nd_part:

fht_FPU_FXCH_3rd_part:

.do_init:
        mov     r3, 16                  ;k1*fsize = 4*fsize = k4
        mov     r4, 8                   ;kx = k1/2
        mov     r2, 48                  ;k3*fsize
        mov     dword [sp(%$k)], 2      ;k = 2
        mov     r0, [sp(%$fz)]          ;fi
        ; r6 bounds the .do2/.do3 loops below and must be set before they
        ; run; nothing else in this procedure writes it.
        mov     r6, [sp(%$n)]
        lea     r6, [r0+r6*4]           ;fn = fz + n
        lea     r1, [r0+8]              ;gi = fi + kx

.do:
.do2:
        ;f
        fld     dword [r0]
        fsub    dword [r0+r3]
        fld     dword [r0]
        fadd    dword [r0+r3]

        fld     dword [r0+r3*2]
        fsub    dword [r0+r2]
        fld     dword [r0+r3*2]
        fadd    dword [r0+r2]           ;f2 f3 f0 f1

        fld     st3                     ;f1 f2 f3 f0 f1
        fld     st3                     ;f0 f1 f2 f3 f0 f1
        fxch    st5                     ;f1 f1 f2 f3 f0 f0
        fadd    st0, st3                ;f1+f3 f1 f2 f3 f0 f0
        fxch    st4                     ;f0 f1 f2 f3 f1+f3 f0
        fadd    st0, st2                ;f0+f2 f1 f2 f3 f1+f3 f0
        fxch    st3                     ;f3 f1 f2 f0+f2 f1+f3 f0
        fsubp   st1, st0                ;f1-f3 f2 f0+f2 f1+f3 f0
        fxch    st1                     ;f2 f1-f3 f0+f2 f1+f3 f0
        fsubp   st4, st0                ;f1-f3 f0+f2 f1+f3 f0-f2
        fxch    st2                     ;f1+f3 f0+f2 f1-f3 f0-f2

        fstp    dword [r0+r3]           ;fi[k1]
        fstp    dword [r0]              ;fi[0]
        fstp    dword [r0+r2]           ;fi[k3]
        fstp    dword [r0+r3*2]         ;fi[k2]

        ;g
        fld     dword [r1]
        fsub    dword [r1+r3]
        fld     dword [r1]
        fadd    dword [r1+r3]

        fld     dword [D_1_41421]
        fmul    dword [r1+r2]
        fld     dword [D_1_41421]
        fmul    dword [r1+r3*2]         ;g2 g3 g0 g1

        fld     st3                     ;g1 g2 g3 g0 g1
        fld     st3                     ;g0 g1 g2 g3 g0 g1
        fxch    st5                     ;g1 g1 g2 g3 g0 g0
        fadd    st0, st3                ;g1+g3 g1 g2 g3 g0 g0
        fxch    st4                     ;g0 g1 g2 g3 g1+g3 g0
        fadd    st0, st2                ;g0+g2 g1 g2 g3 g1+g3 g0
        fxch    st3                     ;g3 g1 g2 g0+g2 g1+g3 g0
        fsubp   st1, st0                ;g1-g3 g2 g0+g2 g1+g3 g0
        fxch    st1                     ;g2 g1-g3 g0+g2 g1+g3 g0
        fsubp   st4, st0                ;g1-g3 g0+g2 g1+g3 g0-g2
        fxch    st2                     ;g1+g3 g0+g2 g1-g3 g0-g2

        fstp    dword [r1+r3]           ;gi[k1]
        fstp    dword [r1]              ;gi[0]
        fstp    dword [r1+r2]           ;gi[k3]
        fstp    dword [r1+r3*2]         ;gi[k2]

        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6
        jb      .do2                    ;while (fi < fn)


        ; Fetch this pass's step pair and reset the recurrence to (1, 0).
        mov     r0, [sp(%$k)]
        fld     dword [costab_fft +r0*4]
        fld     dword [sintab_fft +r0*4]
        fld     dword [D_1_0]
        fld     dword [D_0_0]           ;0.0 1.0 sin cos
        fxch    st3                     ;cos 1.0 sin 0.0
        fstp    dword [sp(%$t_c)]       ;1.0 sin 0.0
        fxch    st1                     ;sin 1.0 0.0
        fstp    dword [sp(%$t_s)]       ;1.0 0.0
        fstp    dword [sp(%$c1)]        ;0.0
        fstp    dword [sp(%$s1)]        ;(empty)

.for_init:
        mov     r5, 4                   ;i = 1*fsize

.for:
        ; (c1, s1) <- (c1*t_c - s1*t_s, c1*t_s + s1*t_c)
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_c)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_s)]

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$t_s)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$t_c)]
        fxch    st2
        fsubp   st3, st0                ;c1
        faddp   st1, st0                ;s1 c1

        ; c2 = c1*c1 - s1*s1, s2 = 2*c1*s1
        fld     st1
        fxch    st2
        fmul    st0, st0                ;c1c1 s1 c1
        fld     st1
        fxch    st2
        fmul    st0, st0                ;s1s1 c1c1 s1 c1

        fxch    st3
        fst     dword [sp(%$c1)]        ;c1
        fxch    st2
        fst     dword [sp(%$s1)]        ;s1 c1c1 c1 s1s1

        fmulp   st2, st0                ;c1c1 c1s1 s1s1
        fsubrp  st2, st0                ;c1s1 c2
        fadd    st0, st0                ;s2 c2
        fxch    st1
        fstp    dword [sp(%$c2)]
        fstp    dword [sp(%$s2)]

        mov     r0, [sp(%$fz)]
        mov     r1, [sp(%$fz)]
        add     r0, r5                  ;r0 = fi
        add     r1, r3
        sub     r1, r5                  ;r1 = gi

.do3:
        fld     dword [sp(%$s2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r3]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r3]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r3]
        fxch    st2
        fsubp   st3, st0                ;b = s2*fi[k1] - c2*gi[k1]
        faddp   st1, st0                ;a = c2*fi[k1] + s2*gi[k1]     b

        fld     dword [r1]
        fsub    st0, st2                ;g1 a b
        fxch    st2
        fadd    dword [r1]              ;g0 a g1

        fld     dword [r0]
        fsub    st0, st2                ;f1 g0 a g1
        fxch    st2
        fadd    dword [r0]              ;f0 g0 f1 g1

        fxch    st3                     ;g1 g0 f1 f0
        fstp    dword [sp(%$g1)]
        fstp    dword [sp(%$g0)]
        fstp    dword [sp(%$f1)]
        fstp    dword [sp(%$f0)]


        fld     dword [sp(%$s2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$c2)]
        fmul    dword [r1+r2]

        fld     dword [sp(%$c2)]
        fmul    dword [r0+r2]
        fld     dword [sp(%$s2)]
        fmul    dword [r1+r2]
        fxch    st2
        fsubp   st3, st0                ;b = s2*fi[k3] - c2*gi[k3]
        faddp   st1, st0                ;a = c2*fi[k3] + s2*gi[k3]     b


        fld     dword [r1+r3*2]
        fsub    st0, st2                ;g3 a b
        fxch    st2
        fadd    dword [r1+r3*2]         ;g2 a g3

        fld     dword [r0+r3*2]
        fsub    st0, st2                ;f3 g2 a g3
        fxch    st2
        fadd    dword [r0+r3*2]         ;f2 g2 f3 g3

        fxch    st3                     ;g3 g2 f3 f2
        fstp    dword [sp(%$g3)]
        fstp    dword [sp(%$g2)]
        fstp    dword [sp(%$f3)]
        fstp    dword [sp(%$f2)]


        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g3)]

        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g3)]
        fxch    st2
        fsubp   st3, st0                ;b = s1*f2 - c1*g3
        faddp   st1, st0                ;a = c1*f2 + s1*g3     b

        fld     dword [sp(%$g1)]
        fsub    st0, st2                ;gi[k3] a b
        fxch    st2
        fadd    dword [sp(%$g1)]        ;gi[k1] a gi[k3]

        fld     dword [sp(%$f0)]
        fsub    st0, st2                ;fi[k2] gi[k1] a gi[k3]
        fxch    st2
        fadd    dword [sp(%$f0)]        ;fi[0] gi[k1] fi[k2] gi[k3]

        fxch    st3                     ;gi[k3] gi[k1] fi[k2] fi[0]
        fstp    dword [r1+r2]
        fstp    dword [r1+r3]
        fstp    dword [r0+r3*2]
        fstp    dword [r0]


        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$f3)]

        fld     dword [sp(%$s1)]
        fmul    dword [sp(%$g2)]
        fld     dword [sp(%$c1)]
        fmul    dword [sp(%$f3)]
        fxch    st2
        fsubp   st3, st0                ;b = c1*g2 - s1*f3
        faddp   st1, st0                ;a = s1*g2 + c1*f3     b

        fld     dword [sp(%$f1)]
        fsub    st0, st2                ;fi[k3] a b
        fxch    st2
        fadd    dword [sp(%$f1)]        ;fi[k1] a fi[k3]

        fld     dword [sp(%$g0)]
        fsub    st0, st2                ;gi[k2] fi[k1] a fi[k3]
        fxch    st2
        fadd    dword [sp(%$g0)]        ;gi[0] fi[k1] gi[k2] fi[k3]

        fxch    st3                     ;fi[k3] fi[k1] gi[k2] gi[0]
        fstp    dword [r0+r2]
        fstp    dword [r0+r3]
        fstp    dword [r1+r3*2]
        fstp    dword [r1]


        lea     r0, [r0+r3*4]           ;fi += k4
        lea     r1, [r1+r3*4]           ;gi += k4
        cmp     r0, r6
        jb near .do3                    ;while (fi < fn)

        add     r5, 4                   ;i++
        cmp     r5, r4
        jb near .for                    ;while (i < kx)

        cmp     r3, [sp(%$n)]           ;k1*fsize is numerically k4 in elements
        jae     .exit

        add     dword [sp(%$k)], 2      ;k += 2;
        lea     r3, [r3*4]              ;k1 *= 4
        lea     r2, [r2*4]              ;k3 *= 4
        lea     r4, [r4*4]              ;kx *= 4
        mov     r0, [sp(%$fz)]          ;fi
        lea     r1, [r0+r4]             ;gi = fi + kx
        jmp     .do

.exit:
        popd    ebp, ebx, esi, edi
endproc

        end