/*
	decode.c: decoding samples...

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Michael Hipp
	altivec optimization by tmkk
*/

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

/* A macro for normal synth functions */
#define SYNTH_ALTIVEC(B0STEP) \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum = vec_madd(v1,v6,vzero); \
	vsum = vec_madd(v2,v7,vsum); \
	vsum = vec_madd(v3,v8,vsum); \
	vsum = vec_madd(v4,v9,vsum); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum2 = vec_madd(v1,v6,vzero); \
	vsum2 = vec_madd(v2,v7,vsum2); \
	vsum2 = vec_madd(v3,v8,vsum2); \
	vsum2 = vec_madd(v4,v9,vsum2); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum3 = vec_madd(v1,v6,vzero); \
	vsum3 = vec_madd(v2,v7,vsum3); \
	vsum3 = vec_madd(v3,v8,vsum3); \
	vsum3 = vec_madd(v4,v9,vsum3); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0); \
	\
	vsum4 = vec_madd(v1,v6,vzero); \
	vsum4 = vec_madd(v2,v7,vsum4); \
	vsum4 = vec_madd(v3,v8,vsum4); \
	vsum4 = vec_madd(v4,v9,vsum4); \
	\
	window += 32; \
	b0 += B0STEP; \
	\
	v1 = vec_mergeh(vsum,vsum3); \
	v2 = vec_mergeh(vsum2,vsum4); \
	v3 = vec_mergel(vsum,vsum3); \
	v4 = vec_mergel(vsum2,vsum4); \
	v5 = vec_mergeh(v1,v2); \
	v6 = vec_mergel(v1,v2); \
	v7 = vec_mergeh(v3,v4); \
	v8 = vec_mergel(v3,v4);
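
/*
	One expansion of SYNTH_ALTIVEC accumulates 16 window*b0 products per
	round into the four lanes of one vsum vector, and does four such rounds
	for four consecutive output samples (vsum..vsum4).  The merge network at
	the end transposes the 4x4 result so that v5..v8 hold the per-lane
	partial sums of all four samples; the callers fold them as
	(v5-v6)+(v7-v8) in the forward half and (v5+v6)+(v7+v8) in the backward
	half, yielding four finished samples per expansion.
*/
#if 0
/* Roughly equivalent scalar sketch of one SYNTH_ALTIVEC(16) expansion.
   Not part of the build; the helper name and signature are made up for
   illustration only. */
static void synth_altivec_sketch(const float *window, const float *b0,
                                 float partial[4][4])
{
	int sample, lane, t;
	for(sample=0; sample<4; ++sample) /* vsum, vsum2, vsum3, vsum4 */
	{
		for(lane=0; lane<4; ++lane)
		{
			float acc = 0.f;
			for(t=0; t<4; ++t) /* the four vec_madd steps of one round */
				acc += window[4*t+lane] * b0[4*t+lane];
			/* element `sample` of v5..v8 after the merge network */
			partial[lane][sample] = acc;
		}
		window += 32; /* only the first 16 of 32 window values are used */
		b0 += 16;     /* B0STEP */
	}
}
#endif
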
/* A macro for stereo synth functions */
#define SYNTH_STEREO_ALTIVEC(B0STEP) \
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum = vec_madd(v1,v6,vzero); \
	vsum5 = vec_madd(v1,v10,vzero); \
	vsum = vec_madd(v2,v7,vsum); \
	vsum5 = vec_madd(v2,v11,vsum5); \
	vsum = vec_madd(v3,v8,vsum); \
	vsum5 = vec_madd(v3,v12,vsum5); \
	vsum = vec_madd(v4,v9,vsum); \
	vsum5 = vec_madd(v4,v13,vsum5); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum2 = vec_madd(v1,v6,vzero); \
	vsum6 = vec_madd(v1,v10,vzero); \
	vsum2 = vec_madd(v2,v7,vsum2); \
	vsum6 = vec_madd(v2,v11,vsum6); \
	vsum2 = vec_madd(v3,v8,vsum2); \
	vsum6 = vec_madd(v3,v12,vsum6); \
	vsum2 = vec_madd(v4,v9,vsum2); \
	vsum6 = vec_madd(v4,v13,vsum6); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum3 = vec_madd(v1,v6,vzero); \
	vsum7 = vec_madd(v1,v10,vzero); \
	vsum3 = vec_madd(v2,v7,vsum3); \
	vsum7 = vec_madd(v2,v11,vsum7); \
	vsum3 = vec_madd(v3,v8,vsum3); \
	vsum7 = vec_madd(v3,v12,vsum7); \
	vsum3 = vec_madd(v4,v9,vsum3); \
	vsum7 = vec_madd(v4,v13,vsum7); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_ld(0,window); \
	v2 = vec_ld(16,window); \
	v3 = vec_ld(32,window); \
	v4 = vec_ld(48,window); \
	v5 = vec_ld(64,window); \
	v1 = vec_perm(v1,v2,vperm1); \
	v6 = vec_ld(0,b0l); \
	v10 = vec_ld(0,b0r); \
	v2 = vec_perm(v2,v3,vperm1); \
	v7 = vec_ld(16,b0l); \
	v11 = vec_ld(16,b0r); \
	v3 = vec_perm(v3,v4,vperm1); \
	v8 = vec_ld(32,b0l); \
	v12 = vec_ld(32,b0r); \
	v4 = vec_perm(v4,v5,vperm1); \
	v9 = vec_ld(48,b0l); \
	v13 = vec_ld(48,b0r); \
	\
	vsum4 = vec_madd(v1,v6,vzero); \
	vsum8 = vec_madd(v1,v10,vzero); \
	vsum4 = vec_madd(v2,v7,vsum4); \
	vsum8 = vec_madd(v2,v11,vsum8); \
	vsum4 = vec_madd(v3,v8,vsum4); \
	vsum8 = vec_madd(v3,v12,vsum8); \
	vsum4 = vec_madd(v4,v9,vsum4); \
	vsum8 = vec_madd(v4,v13,vsum8); \
	\
	window += 32; \
	b0l += B0STEP; \
	b0r += B0STEP; \
	\
	v1 = vec_mergeh(vsum,vsum3); \
	v5 = vec_mergeh(vsum5,vsum7); \
	v2 = vec_mergeh(vsum2,vsum4); \
	v6 = vec_mergeh(vsum6,vsum8); \
	v3 = vec_mergel(vsum,vsum3); \
	v7 = vec_mergel(vsum5,vsum7); \
	v4 = vec_mergel(vsum2,vsum4); \
	v8 = vec_mergel(vsum6,vsum8); \
	vsum = vec_mergeh(v1,v2); \
	vsum5 = vec_mergeh(v5,v6); \
	vsum2 = vec_mergel(v1,v2); \
	vsum6 = vec_mergel(v5,v6); \
	vsum3 = vec_mergeh(v3,v4); \
	vsum7 = vec_mergeh(v7,v8); \
	vsum4 = vec_mergel(v3,v4); \
	vsum8 = vec_mergel(v7,v8);
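
/*
	SYNTH_STEREO_ALTIVEC is the same four-round accumulation applied to both
	channels at once, sharing the window loads between b0l and b0r.  Its
	closing merge network performs two 4x4 transposes: the per-lane partials
	of four left samples end up in vsum..vsum4 and those of four right
	samples in vsum5..vsum8, which the callers fold the same way as in the
	mono case.
*/
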
int synth_1to1_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
	short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
#endif
	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4;
		vector float vsum,vsum2,vsum3,vsum4,vmin,vmax,vzero;
		vector signed int vclip;
		vector signed short vsample1,vsample2;
		vector unsigned int vshift;
		vclip = vec_xor(vclip,vclip);
		vzero = vec_xor(vzero,vzero);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
		vperm4 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm4 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = vec_round(vsum);
			v3 = (vector float)vec_cts(v3,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
			v5 = (vector float)vec_perm(v3,v4,vperm4);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
			v7 = (vector float)vec_perm(v5,v6,vperm3);
			v8 = (vector float)vec_perm(v6,v5,vperm3);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = vec_round(vsum);
			v3 = (vector float)vec_cts(v3,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm2);
			v5 = (vector float)vec_perm(v3,v4,vperm4);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm2);
			v7 = (vector float)vec_perm(v5,v6,vperm3);
			v8 = (vector float)vec_perm(v6,v5,vperm3);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

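		/*
			Clipping bookkeeping: vec_cmpgt/vec_cmplt yield all-ones lanes for
			out-of-range samples, the shift by 31 (vshift) reduces each mask
			lane to 0 or 1, and vec_sums keeps a running total in the last
			element of vclip, which is read back here through clip_tmp.
		*/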
		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	if(final) fr->buffer.fill += 128;

	return clip;
}

int synth_1to1_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	short *samples = (short *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}
#endif
	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmin,vmax,vzero;
		vector signed int vclip;
		vector unsigned int vshift;
		vector signed short vprev;
		vclip = vec_xor(vclip,vclip);
		vzero = vec_xor(vzero,vzero);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
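		/*
			The interleaved stereo output is written through an unaligned-store
			scheme: vperm2 (vec_lvsr) shifts each packed result vector to the
			store alignment, vprev carries the bytes that belong to the
			previous 16-byte block from one iteration to the next, and the
			conditional tail store after the loops flushes the last partial
			vector back into memory.
		*/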
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);

			v1 = vec_round(vsum);
			v2 = vec_round(vsum2);
			v1 = (vector float)vec_cts(v1,0);
			v2 = (vector float)vec_cts(v2,0);
			v3 = vec_mergeh(v1, v2);
			v4 = vec_mergel(v1, v2);
			v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
			v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
			vprev = (vector signed short)v5;
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			v3 = (vector float)vec_cmpgt(vsum2,vmax);
			v4 = (vector float)vec_cmplt(vsum2,vmin);
			vec_st((vector signed short)v6,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);

			v1 = vec_round(vsum);
			v2 = vec_round(vsum2);
			v1 = (vector float)vec_cts(v1,0);
			v2 = (vector float)vec_cts(v2,0);
			v3 = vec_mergeh(v1, v2);
			v4 = vec_mergel(v1, v2);
			v5 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v4);
			v6 = (vector float)vec_perm(vprev,(vector signed short)v5,vperm2);
			vprev = (vector signed short)v5;
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			v3 = (vector float)vec_cmpgt(vsum2,vmax);
			v4 = (vector float)vec_cmplt(vsum2,vmin);
			vec_st((vector signed short)v6,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v1, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v2, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v3, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v4, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,(vector signed short)v1,vperm2);
			vec_st((vector signed short)v2,0,samples);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	fr->buffer.fill += 128;

	return clip;
}

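/*
	The *_real_* and *_fltst_* variants below write 32-bit float output: the
	windowed sums are scaled by 1/32768 (vscale), so full scale maps to
	+/-1.0, and no clipping is detected or counted, hence they always
	return 0.
*/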
int synth_1to1_real_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
	real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
#endif
	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vscale,vzero;
		vector float vsample1,vsample2,vsample3;
		vzero = vec_xor(vzero, vzero);
#ifdef __APPLE__
		vscale = (vector float)(1.0f/32768.0f);
		vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
		vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
		vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
		vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
		vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
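		/*
			Mono output is interleaved with the other channel (samples advances
			by two floats per value, the channel slot was picked by samples++
			above), so each iteration is a read-modify-write: load the
			surrounding output vectors (vsample1..3), insert the four new
			samples at every other float slot via vperm4/vperm5, realign with
			vperm2/vperm3 and store the vectors back.
		*/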
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);
			vsum = vec_madd(vsum, vscale, vzero);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = vec_perm(vsample1, vsample2, vperm2);
			v2 = vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = vec_perm(vsample3, vsample2, vperm2);
			v4 = vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st(v5,31,samples);
			vec_st(v6,16,samples);
			vec_st(v7,0,samples);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);
			vsum = vec_madd(vsum, vscale, vzero);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = vec_perm(vsample1, vsample2, vperm2);
			v2 = vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = vec_perm(vsample3, vsample2, vperm2);
			v4 = vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st(v5,31,samples);
			vec_st(v6,16,samples);
			vec_st(v7,0,samples);
			samples += 8;
		}
	}
	if(final) fr->buffer.fill += 256;

	return 0;
}

int synth_1to1_fltst_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	real *samples = (real *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}
#endif
	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vscale,vzero;
		vector float vprev;
		vzero = vec_xor(vzero,vzero);
#ifdef __APPLE__
		vscale = (vector float)(1.0f/32768.0f);
#else
		vscale = (vector float){1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f,1.0f/32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);
			vsum = vec_madd(vsum, vscale, vzero);
			vsum2 = vec_madd(vsum2, vscale, vzero);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st(v3,0,samples);
			vec_st(v4,16,samples);
			samples += 8;
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			vsum = vec_add(vsum,vsum3);
			vsum2 = vec_add(vsum2,vsum4);
			vsum = vec_madd(vsum, vscale, vzero);
			vsum2 = vec_madd(vsum2, vscale, vzero);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st(v3,0,samples);
			vec_st(v4,16,samples);
			samples += 8;
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,v1,vperm2);
			vec_st(v2,0,samples);
		}
	}
	fr->buffer.fill += 256;

	return 0;
}

int synth_1to1_s32_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
{
	int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
#endif
	if(!channel)
	{
		fr->bo--;
		fr->bo &= 0xf;
		buf = fr->real_buffs[0];
	}
	else
	{
		samples++;
		buf = fr->real_buffs[1];
	}

	if(fr->bo & 0x1)
	{
		b0 = buf[0];
		bo1 = fr->bo;
		dct64_altivec(buf[1]+((fr->bo+1)&0xf),buf[0]+fr->bo,bandPtr);
	}
	else
	{
		b0 = buf[1];
		bo1 = fr->bo+1;
		dct64_altivec(buf[0]+fr->bo,buf[1]+fr->bo+1,bandPtr);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vmax,vmin,vzero;
		vector signed int vsample1,vsample2,vsample3;
		vector unsigned int vshift;
		vector signed int vclip;
		vzero = vec_xor(vzero, vzero);
		vclip = vec_xor(vclip, vclip);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.999f);
		vmin = (vector float)(-32768.0f);
		vperm4 = (vector unsigned char)(0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31);
		vperm5 = (vector unsigned char)(8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31);
#else
		vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm4 = (vector unsigned char){0,1,2,3,20,21,22,23,4,5,6,7,28,29,30,31};
		vperm5 = (vector unsigned char){8,9,10,11,20,21,22,23,12,13,14,15,28,29,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsl(0,samples);
		vperm3 = vec_lvsr(0,samples);
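		/*
			vec_cts(x,16) converts to signed 32 bit with 16 fraction bits,
			i.e. the value is scaled by 2^16 while converting (saturating on
			overflow), mapping the +/-32768.0 synth range onto the full
			32-bit range. The comparison bound is 32767.999f rather than
			32767.0f, so samples are counted as clipped only when they
			(nearly) exceed the 32-bit range, not the 16-bit one.
		*/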
		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(16);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			v1 = vec_add(vsum,v9);
			vsum = (vector float)vec_cts(v1,16);
			v8 = (vector float)vec_cmpgt(v1,vmax);
			v9 = (vector float)vec_cmplt(v1,vmin);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
			v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
			v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st((vector signed int)v5,31,samples);
			vec_st((vector signed int)v6,16,samples);
			vec_st((vector signed int)v7,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_ALTIVEC(-16);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			v1 = vec_add(vsum,v9);
			vsum = (vector float)vec_cts(v1,16);
			v8 = (vector float)vec_cmpgt(v1,vmax);
			v9 = (vector float)vec_cmplt(v1,vmin);

			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(16,samples);
			vsample3 = vec_ld(31,samples);
			v1 = (vector float)vec_perm(vsample1, vsample2, vperm2);
			v2 = (vector float)vec_perm(vsample2, vsample3, vperm2);
			v1 = vec_perm(vsum, v1, vperm4);
			v2 = vec_perm(vsum, v2, vperm5);
			v3 = (vector float)vec_perm(vsample3, vsample2, vperm2);
			v4 = (vector float)vec_perm(vsample2, vsample1, vperm2);
			v5 = vec_perm(v2, v3, vperm3);
			v6 = vec_perm(v1, v2, vperm3);
			v7 = vec_perm(v4, v1, vperm3);
			vec_st((vector signed int)v5,31,samples);
			vec_st((vector signed int)v6,16,samples);
			vec_st((vector signed int)v7,0,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v9, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v1,vclip);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	if(final) fr->buffer.fill += 256;

	return clip;
}

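/*
	The stereo 32-bit variant combines the interleave/unaligned-store scheme
	of synth_1to1_stereo_altivec with the fixed-point conversion and clip
	counting of synth_1to1_s32_altivec.
*/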
int synth_1to1_s32_stereo_altivec(real *bandPtr_l, real *bandPtr_r, mpg123_handle *fr)
{
	int32_t *samples = (int32_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0l, *b0r, **bufl, **bufr;
	int clip;
	int bo1;
#ifndef NO_EQUALIZER
	if(fr->have_eq_settings)
	{
		do_equalizer(bandPtr_l,0,fr->equalizer);
		do_equalizer(bandPtr_r,1,fr->equalizer);
	}
#endif
	fr->bo--;
	fr->bo &= 0xf;
	bufl = fr->real_buffs[0];
	bufr = fr->real_buffs[1];

	if(fr->bo & 0x1)
	{
		b0l = bufl[0];
		b0r = bufr[0];
		bo1 = fr->bo;
		dct64_altivec(bufl[1]+((fr->bo+1)&0xf),bufl[0]+fr->bo,bandPtr_l);
		dct64_altivec(bufr[1]+((fr->bo+1)&0xf),bufr[0]+fr->bo,bandPtr_r);
	}
	else
	{
		b0l = bufl[1];
		b0r = bufr[1];
		bo1 = fr->bo+1;
		dct64_altivec(bufl[0]+fr->bo,bufl[1]+fr->bo+1,bandPtr_l);
		dct64_altivec(bufr[0]+fr->bo,bufr[1]+fr->bo+1,bandPtr_r);
	}


	{
		register int j;
		real *window = fr->decwin + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13;
		vector unsigned char vperm1,vperm2;
		vector float vsum,vsum2,vsum3,vsum4,vsum5,vsum6,vsum7,vsum8,vmax,vmin,vzero;
		vector float vprev;
		vector unsigned int vshift;
		vector signed int vclip;
		vzero = vec_xor(vzero, vzero);
		vclip = vec_xor(vclip, vclip);
		vshift = vec_splat_u32(-1); /* 31 */
#ifdef __APPLE__
		vmax = (vector float)(32767.999f);
		vmin = (vector float)(-32768.0f);
#else
		vmax = (vector float){32767.999f,32767.999f,32767.999f,32767.999f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm2 = vec_lvsr(0,samples);
		vprev = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(16);

			vsum = vec_sub(vsum,vsum2);
			vsum2 = vec_sub(vsum5,vsum6);
			vsum3 = vec_sub(vsum3,vsum4);
			vsum4 = vec_sub(vsum7,vsum8);
			v1 = vec_add(vsum,vsum3);
			v2 = vec_add(vsum2,vsum4);
			vsum = (vector float)vec_cts(v1,16);
			vsum2 = (vector float)vec_cts(v2,16);
			v5 = (vector float)vec_cmpgt(v1,vmax);
			v6 = (vector float)vec_cmplt(v1,vmin);
			v7 = (vector float)vec_cmpgt(v2,vmax);
			v8 = (vector float)vec_cmplt(v2,vmin);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st((vector signed int)v3,0,samples);
			vec_st((vector signed int)v4,16,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		for (j=4;j;j--)
		{
			SYNTH_STEREO_ALTIVEC(-16);

			vsum = vec_add(vsum,vsum2);
			vsum2 = vec_add(vsum5,vsum6);
			vsum3 = vec_add(vsum3,vsum4);
			vsum4 = vec_add(vsum7,vsum8);
			v1 = vec_add(vsum,vsum3);
			v2 = vec_add(vsum2,vsum4);
			vsum = (vector float)vec_cts(v1,16);
			vsum2 = (vector float)vec_cts(v2,16);
			v5 = (vector float)vec_cmpgt(v1,vmax);
			v6 = (vector float)vec_cmplt(v1,vmin);
			v7 = (vector float)vec_cmpgt(v2,vmax);
			v8 = (vector float)vec_cmplt(v2,vmin);

			v1 = vec_mergeh(vsum, vsum2);
			v2 = vec_mergel(vsum, vsum2);
			v3 = vec_perm(vprev,v1,vperm2);
			v4 = vec_perm(v1,v2,vperm2);
			vprev = v2;
			vec_st((vector signed int)v3,0,samples);
			vec_st((vector signed int)v4,16,samples);
			samples += 8;

			v1 = (vector float)vec_sr((vector unsigned int)v5, vshift);
			v2 = (vector float)vec_sr((vector unsigned int)v6, vshift);
			v3 = (vector float)vec_sr((vector unsigned int)v7, vshift);
			v4 = (vector float)vec_sr((vector unsigned int)v8, vshift);
			v1 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			v2 = (vector float)vec_add((vector unsigned int)v3,(vector unsigned int)v4);
			vclip = vec_sums((vector signed int)v1,vclip);
			vclip = vec_sums((vector signed int)v2,vclip);
		}

		if((size_t)samples & 0xf)
		{
			v1 = (vector float)vec_perm(vec_ld(0,samples),vec_ld(0,samples),vec_lvsl(0,samples));
			v2 = (vector float)vec_perm(vprev,v1,vperm2);
			vec_st((vector signed int)v2,0,samples);
		}

		vec_st(vclip,0,clip_tmp);
		clip = clip_tmp[3];
	}
	fr->buffer.fill += 256;

	return clip;
}