;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
; ps_mask  flips the sign bit of even-indexed floats; ps_mask2 of odd-indexed
; floats; ps_mask3 of element 3 only (used to negate one lane via xorps).
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_mask3 dd 0, 0, 0, 1<<31
; phi_sign patterns for the four sbr_hf_apply_noise_* variants below
ps_noise0 times 2 dd 1.0, 0.0,
ps_noise2 times 2 dd -1.0, 0.0
; two 16-byte rows selected by (kx & 1) << 4 in LOAD_NST; the third row
; repeats row 1 so that "ps_noise13+16" indexed the same way stays in bounds
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
 dd 0.0, -1.0, 0.0, 1.0
 dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg

SECTION .text

;------------------------------------------------------------------------------
; sbr_sum_square(x, n)
; Sums the squares of 2*n floats at r0 (n counts 8-byte/2-float elements:
; the main loop runs n>>3 times over 64 bytes, the tail n&7 halved over
; 16 bytes).  Result in xmm0 on x86-64; stored and reloaded through st0 on
; x86-32 (float return convention).
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov         r2d, r1d
    xorps       m0, m0              ; two independent accumulators to hide
    xorps       m1, m1              ; addps latency across the unrolled loop
    sar         r2, 3               ; r2 = number of 64-byte iterations
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7               ; leftover elements
    sar         r1, 1               ; 16-byte (4-float) steps remaining
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1                  ; dec before addps: addps leaves EFLAGS
    addps       m0, m2              ; untouched, so jnz still tests r1
    jnz         .endloop
.end:
    ; horizontal reduction of the 4-lane accumulator to a scalar
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    ; x86-32 returns floats in st0: spill to the r0 arg slot and fld it
    movss       r0m, m0
    fld         dword r0m
%endif
    RET

;------------------------------------------------------------------------------
; sbr_hf_g_filt
; r0 = output (interleaved pairs), r1 = X_high (stride STEP bytes per row,
; offset by 8*r4 = ixh pairs), r2 = g_filt gains, r3 = m_max, r4 = ixh.
; Writes r0[m] = g_filt[m] * X_high[m][ixh] (gain broadcast over the
; real/imag pair).  Main loop handles 4 rows per iteration, tail 1 row.
;------------------------------------------------------------------------------
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4]     ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC            ; r3 = m_max rounded down to multiple of 4
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3                  ; count up through negative offsets
    jz          .loop1
.loop4:
    movlps      m0, [r2 + 4*r3 + 0] ; two gains
    movlps      m1, [r2 + 4*r3 + 8] ; two more gains
    movlps      m2, [r1 + 0*STEP]   ; gather 4 consecutive X_high rows
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0              ; duplicate each gain over its re/im pair
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3               ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    ; bw arrives on the stack (x86-32) or in its home slot (Win64)
    movss       bw, BWm
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0           ; broadcast bw to all 4 lanes
    mulps       m2, bw              ; (a1[0] a1[1])*bw
    mulps       m1, bw              ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps       m2, bw              ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov         r2d, Sm
    mov         r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd      startq, startd
    movsxd      endq, endd
%endif
    sub         startq, endq        ; neg num of loops
    lea         X_highq, [X_highq + endq*2*4]
    lea         X_lowq,  [X_lowq  + endq*2*4 - 2*2*4]
    shl         startq, 3           ; offset from num loops

    mova        m0, [X_lowq + startq]
    ; split each alpha product into its two components; the q1111 halves get
    ; sign-flipped on even lanes (ps_mask) for the complex cross terms
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + startq + 8]  ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301       ; aAbB
    shufps      m7, m7, q2301       ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + startq + 16] ; CcDd (reused as next iter's input)
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova        [X_highq + startq], m7
    add         startq, 16
    jnz         .loop2
    RET

;------------------------------------------------------------------------------
; sbr_sum64x5(z)
; In-place: z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63
; (offsets in bytes: 256/512/768/1024 = 64 floats apart).
;------------------------------------------------------------------------------
cglobal sbr_sum64x5, 1,2,4,z
    lea         r1q, [zq+ 256]      ; end of the first 64-float block
.loop:
    mova        m0, [zq+   0]
    mova        m2, [zq+  16]
    mova        m1, [zq+ 256]
    mova        m3, [zq+ 272]
    addps       m0, [zq+ 512]
    addps       m2, [zq+ 528]
    addps       m1, [zq+ 768]
    addps       m3, [zq+ 784]
    addps       m0, [zq+1024]
    addps       m2, [zq+1040]
    addps       m0, m1
    addps       m2, m3
    mova        [zq], m0
    mova        [zq+16], m2
    add         zq, 32
    cmp         zq, r1q
    jne         .loop
    REP_RET

;------------------------------------------------------------------------------
; sbr_qmf_post_shuffle(W, z)
; Interleaves pairs from a reversed, negated tail of z (starting at z+60*4,
; walking down) with the head of z (walking up) into W.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea         r2q, [zq + (64-4)*4]
    mova        m3, [ps_neg]
.loop:
    mova        m1, [zq]
    xorps       m0, m3, [r2q]       ; negate the tail element
    shufps      m0, m0, m0, q0123   ; reverse its 4 lanes
    unpcklps    m2, m0, m1
    unpckhps    m0, m0, m1
    mova        [Wq +  0], m2
    mova        [Wq + 16], m0
    add         Wq, 32
    sub         r2q, 16
    add         zq, 16
    cmp         zq, r2q             ; stop when the two cursors meet
    jl          .loop
    REP_RET

;------------------------------------------------------------------------------
; sbr_neg_odd_64(z)
; Flips the sign of every odd-indexed float in z[0..63] (ps_mask2 carries the
; sign bit in lanes 1 and 3).
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea         r1q, [zq+256]       ; 64 floats = 256 bytes
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova        [zq+ 0], m0
    mova        [zq+16], m1
    mova        [zq+32], m2
    mova        [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
; Butterfly deinterleave: v[c] = src0[c] - rev(src1), v+64*4 side gets
; src1 + rev(src0).  The sse variant emulates pshufd via shufps.
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov         cq, 64*4-2*mmsize   ; walk src0/v down while src1/vrev walk up
    lea         vrevq, [vq + 64*4]
.loop:
    mova        m0, [src0q+cq]
    mova        m1, [src1q]
    mova        m4, [src0q+cq+mmsize]
    mova        m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd      m2, m0, q0123       ; lane reversal
    pshufd      m3, m1, q0123
    pshufd      m6, m4, q0123
    pshufd      m7, m5, q0123
%else
    shufps      m2, m0, m0, q0123
    shufps      m3, m1, m1, q0123
    shufps      m6, m4, m4, q0123
    shufps      m7, m5, m5, q0123
%endif
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova        [vrevq], m1
    mova        [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova        [vq+cq+mmsize], m4
    add         src1q, 2*mmsize
    add         vrevq, 2*mmsize
    sub         cq, 2*mmsize
    jge         .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

;------------------------------------------------------------------------------
; sbr_qmf_pre_shuffle(z)
; Builds the 64-float block at z+64*4 by interleaving z[1..] read backwards
; and negated (r1q side) with z[..32] read forwards (zq+r3q side); finally
; copies z[0..1] to the start of the output.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
    mov         r3q, OFFSET
    lea         r1q, [zq + (32+1)*4]
    lea         r2q, [zq + 64*4]
    mova        m5, [ps_neg]
.loop:
    movu        m0, [r1q]
    movu        m2, [r1q + mmsize]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m3, [zq + r3q + 4]

    pxor        m2, m5              ; negate the reversed stream
    pxor        m0, m5
    pshufd      m2, m2, q0123       ; reverse lanes
    pshufd      m0, m0, q0123
    SBUTTERFLY  dq, 2, 3, 4         ; interleave the two streams
    SBUTTERFLY  dq, 0, 1, 4
    mova        [r2q + 2*r3q + 0*mmsize], m2
    mova        [r2q + 2*r3q + 1*mmsize], m3
    mova        [r2q + 2*r3q + 2*mmsize], m0
    mova        [r2q + 2*r3q + 3*mmsize], m1
    add         r1q, 2*mmsize
    sub         r3q, 2*mmsize
    jge         .loop
    movq        m2, [zq]            ; copy z[0], z[1] to the output head
    movq        [r2q], m2
    REP_RET

; In PIC builds sbr_noise_table must be addressed through a register, which
; costs one extra GPR (NREGS); r6q on UNIX64 because r5q already holds m_max.
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

; Load the 16-byte phi_sign row %1 + (kxq pre-scaled offset) into m0,
; going through NOISE_TABLE as a base register when PIC requires it.
%macro LOAD_NST 1
%ifdef PIC
    lea         NOISE_TABLE, [%1]
    mova        m0, [kxq + NOISE_TABLE]
%else
    mova        m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise0]     ; phi_sign = (1, 0) pattern
    jmp         apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4              ; select ps_noise13 row by kx parity
    LOAD_NST    ps_noise13
    jmp         apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova        m0, [ps_noise2]     ; phi_sign = (-1, 0) pattern
    jmp         apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and         kxq, 1
    shl         kxq, 4
    LOAD_NST    ps_noise13+16       ; opposite-parity rows of ps_noise13

; Shared tail for the four entry points above; m0 holds the per-variant
; phi_sign.  Adds s_m[m]*phi_sign where s_m[m] != 0, otherwise
; q_filt[m]*sbr_noise_table[noise], into the complex pairs of Y.
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov         kxd, m_maxm         ; kx is dead now; reuse its reg as count
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn noiseq, noised
    dec         noiseq
    shl         countd, 2
%ifdef PIC
    lea         NOISE_TABLE, [sbr_noise_table]
%endif
    ; advance all pointers to their ends and index with a negative count
    lea         Yq, [Yq + 2*countq]
    add         s_mq, countq
    add         q_filtq, countq
    shl         noiseq, 3           ; byte offset into 8-byte table entries
    pxor        m5, m5
    neg         countq
.loop:
    mova        m1, [q_filtq + countq]
    movu        m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu        m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add         noiseq, 2*mmsize
    and         noiseq, 0x1ff<<3    ; wrap the noise index (511 entry mask)
    punpckhdq   m2, m1, m1          ; duplicate gains over re/im pairs
    punpckldq   m1, m1
    mulps       m1, m3              ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps       m2, m4              ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova        m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq   m4, m3, m3
    punpckldq   m3, m3
    pcmpeqd     m6, m3, m5          ; m6 = mask where s_m[m] == 0
    pcmpeqd     m7, m4, m5          ; m7 = mask where s_m[m] == 0
    mulps       m3, m0              ; s_m[m] * phi_sign
    mulps       m4, m0              ; s_m[m] * phi_sign
    pand        m1, m6              ; keep the noise term only where s_m == 0
    pand        m2, m7
    movu        m6, [Yq + 2*countq]
    movu        m7, [Yq + 2*countq + mmsize]
    addps       m3, m1
    addps       m4, m2
    addps       m6, m3
    addps       m7, m4
    movu        [Yq + 2*countq], m6
    movu        [Yq + 2*countq + mmsize], m7
    add         countq, mmsize
    jl          .loop
    RET

;------------------------------------------------------------------------------
; sbr_qmf_deint_neg(v, src)
; De-interleaves src into two streams: even lanes, negated, written forward
; into vrev; odd lanes written backward into v.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT 32*4
%define OFFSET 32*4
    mov         cq, -COUNT
    lea         vrevq, [vq + OFFSET + COUNT]
    add         vq, OFFSET-mmsize
    add         srcq, 2*COUNT
    mova        m3, [ps_neg]
.loop:
    mova        m0, [srcq + 2*cq + 0*mmsize]
    mova        m1, [srcq + 2*cq + 1*mmsize]
    shufps      m2, m0, m1, q2020   ; even lanes of both vectors
    shufps      m1, m0, q1313       ; odd lanes
    xorps       m2, m3              ; negate the even-lane stream
    mova        [vq], m1
    mova        [vrevq + cq], m2
    sub         vq, mmsize
    add         cq, mmsize
    jl          .loop
    REP_RET

; sbr_autocorrelate(x, phi): lag-0/1/2 complex autocorrelation of 38 complex
; samples x[0..37], results stored into phi.  Three lag accumulators are kept
; live across a 3x unrolled loop that rotates the roles of m0/m1/m2; the
; first/last terms are patched in outside the loop from [rsp] spill slots.
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov         cntq, 37*8
    add         xq, cntq
    neg         cntq                ; cnt walks -37*8 .. 0

%if cpuflag(sse3)
%define MOVH movsd
    movddup     m5, [xq+cntq]       ; x[0] duplicated in both halves
%else
%define MOVH movlps
    movlps      m5, [xq+cntq]
    movlhps     m5, m5
%endif
    MOVH        m7, [xq+cntq+8 ]
    MOVH        m1, [xq+cntq+16]
    shufps      m7, m7, q0110       ; re, im, im, re layout for cross products
    shufps      m1, m1, q0110
    mulps       m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps       m4, m5, m5 ; x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps       m5, m1 ; real_sum2 = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps      [rsp   ], m3        ; spill the x[0] terms; re-added after the
    movaps      [rsp+16], m4        ; loop to form the 0..36 variants
    add         cntq, 8

    MOVH        m2, [xq+cntq+16]
    movlhps     m7, m7
    shufps      m2, m2, q0110
    mulps       m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps       m4, m7, m2
    mulps       m7, m7 ; real_sum0 = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps       m5, m4 ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

align 16
.loop:
    ; unrolled 3x; m0/m1/m2 rotate through the roles of x[i], x[i+1], x[i+2]
    add         cntq, 8
    MOVH        m0, [xq+cntq+16]
    movlhps     m1, m1
    shufps      m0, m0, q0110
    mulps       m3, m1, m2
    mulps       m4, m1, m0
    mulps       m1, m1
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps       m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add         cntq, 8
    MOVH        m1, [xq+cntq+16]
    movlhps     m2, m2
    shufps      m1, m1, q0110
    mulps       m3, m2, m0
    mulps       m4, m2, m1
    mulps       m2, m2
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps       m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    add         cntq, 8
    MOVH        m2, [xq+cntq+16]
    movlhps     m0, m0
    shufps      m2, m2, q0110
    mulps       m3, m0, m1
    mulps       m4, m0, m2
    mulps       m0, m0
    addps       m6, m3 ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps       m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps       m7, m0 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
    jl          .loop

    ; epilogue: build the two end-point variants of sum0/sum1
    movlhps     m1, m1
    mulps       m2, m1
    mulps       m1, m1
    addps       m2, m6 ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
    addps       m1, m7 ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps       m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps       m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

    xorps       m2, [ps_mask3]      ; negate lane 3 before the horizontal adds
    xorps       m5, [ps_mask3]      ; so imag parts subtract correctly
    xorps       m6, [ps_mask3]
    HADDPS      m2, m5, m3
    HADDPS      m7, m6, m4
%if cpuflag(sse3)
    movshdup    m0, m1
%else
    movss       m0, m1
    shufps      m1, m1, q0001
%endif
    addss       m1, m0
    ; scatter results into the phi output structure (byte offsets)
    movaps      [phiq     ], m2
    movhps      [phiq+0x18], m7
    movss       [phiq+0x28], m7
    movss       [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE