/*
	synth_stereo_x86_64_accurate: SSE optimized synth for x86-64 (stereo specific, MPEG-compliant 16bit output version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Syntax: GNU as, AT&T operand order (source first, destination last),
	passed through the C preprocessor.

	Register roles. The incoming arguments are moved into these registers
	right after the prologue so that %ecx is free to serve as loop counter.
	The three "real" arrays are accessed as packed single-precision floats.
*/
#ifdef IS_MSABI
/* Microsoft x64: arguments arrive in %rcx, %rdx, %r8, %r9, then the stack. */
/* real *window;   (copied from %rcx; %rsi is callee-saved, hence the push) */
#define WINDOW %rsi
/* real *b0l; */
#define B0L %rdx
/* real *b0r; */
#define B0R %r8
/* short *samples; */
#define SAMPLES %r9
#else
/* System V AMD64: arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8. */
/* real *window; */
#define WINDOW %rdi
/* real *b0l; */
#define B0L %rsi
/* real *b0r; */
#define B0R %rdx
/* short *samples;   (copied from %rcx) */
#define SAMPLES %r8
#endif

/* Clamp limits, addressed through %r10 / %r11 (both point into maxmin_s16). */
#define XMMREG_MAX (%r10) /* {32767.0, 32767.0, 32767.0, 32767.0} */
#define XMMREG_MIN (%r11) /* {-32768.0, -32768.0, -32768.0, -32768.0} */
/* 16 bytes at the bottom of the frame: eight 16-bit clip counters. */
#define TEMP_CLIP (%rsp)

/*
	int synth_1to1_s_x86_64_accurate_asm(real *window, real *b0l, real *b0r, short *samples, int bo1);
	return value: number of clipped samples

	Two loops of four passes each. Every pass
	  - builds four dot products per channel (16 window floats times
	    16 b0 floats each),
	  - converts the eight results to 16-bit integers and stores them
	    interleaved (left, right, left, right, ...) as 16 bytes at samples,
	  - counts results above 32767.0 or below -32768.0.
	window is read with unaligned loads; b0l and b0r are used as direct
	memory operands of mulps and therefore have to be 16-byte aligned.
*/

#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN32
ASM_NAME(maxmin_s16):
	/* IEEE-754 single precision bit patterns */
	.long	0x46fffe00	/*  32767.0f */
	.long	0x46fffe00
	.long	0x46fffe00
	.long	0x46fffe00
	.long	0xc7000000	/* -32768.0f */
	.long	0xc7000000
	.long	0xc7000000
	.long	0xc7000000

	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_s_x86_64_accurate_asm)
ASM_NAME(synth_1to1_s_x86_64_accurate_asm):
#ifdef IS_MSABI /* xmm6-15 and %rsi are callee-saved here */
	movl		40(%rsp), %eax	/* bo1: 5th argument, above return address + 32-byte shadow space */
	pushq		%rsi
	subq		$176, %rsp	/* 16 bytes TEMP_CLIP + 10 xmm save slots; keeps %rsp 16-byte aligned */
	movaps		%xmm6, 16(%rsp)
	movaps		%xmm7, 32(%rsp)
	movaps		%xmm8, 48(%rsp)
	movaps		%xmm9, 64(%rsp)
	movaps		%xmm10, 80(%rsp)
	movaps		%xmm11, 96(%rsp)
	movaps		%xmm12, 112(%rsp)
	movaps		%xmm13, 128(%rsp)
	movaps		%xmm14, 144(%rsp)
	movaps		%xmm15, 160(%rsp)
#else
	subq		$24, %rsp	/* 16 bytes TEMP_CLIP + 8 bytes so that %rsp is 16-byte aligned */
#endif

	/* %r10 -> four times 32767.0, %r11 -> four times -32768.0 */
	leaq		ASM_NAME(maxmin_s16)(%rip), %r10
	leaq		16(%r10), %r11
	/* start with all clip counters at zero */
	xorps		%xmm0, %xmm0
	movaps		%xmm0, TEMP_CLIP

	/* %rax = (zero-extended 32-bit bo1) * 4; move the argument that sits in %rcx out of the way */
#ifdef IS_MSABI
	shlq		$32, %rax
	shrq		$30, %rax
	movq		%rcx, %rsi	/* window */
#else
	movq		%r8, %rax
	shlq		$32, %rax
	shrq		$30, %rax
	movq		%rcx, %r8	/* samples */
#endif
	/* WINDOW = window + 64 bytes - 4*bo1 bytes */
	leaq		64(WINDOW), WINDOW
	subq		%rax, WINDOW

	/* ---- first loop: 4 passes, b0l/b0r walk forward ---- */
	movl		$4, %ecx

	ALIGN16
1:
	/* rows 0 and 1: window at +0 and +128 bytes, b0 at +0 and +64 bytes */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm9
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	/* second copy of the window data for the right channel */
	movaps		%xmm8, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	movaps		%xmm9, %xmm12
	movaps		%xmm5, %xmm13
	movaps		%xmm6, %xmm14
	movaps		%xmm7, %xmm15
	/* left channel products */
	mulps		(B0L), %xmm8
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		64(B0L), %xmm9
	mulps		80(B0L), %xmm5
	mulps		96(B0L), %xmm6
	mulps		112(B0L), %xmm7
	/* right channel products */
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm10
	mulps		48(B0R), %xmm11
	mulps		64(B0R), %xmm12
	mulps		80(B0R), %xmm13
	mulps		96(B0R), %xmm14
	mulps		112(B0R), %xmm15

	/* fold each row down to one register of four partial sums */
	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm11, %xmm10
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm13, %xmm12
	addps		%xmm15, %xmm14
	addps		%xmm3, %xmm8	/* xmm8  = row 0, left */
	addps		%xmm6, %xmm9	/* xmm9  = row 1, left */
	addps		%xmm10, %xmm0
	addps		%xmm12, %xmm14
	movaps		%xmm0, %xmm12	/* xmm12 = row 0, right */
	movaps		%xmm14, %xmm13	/* xmm13 = row 1, right */
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R

	/* rows 2 and 3, same layout relative to the advanced pointers */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm11
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm10, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm14
	movaps		%xmm3, %xmm15
	mulps		(B0L), %xmm10
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm14
	mulps		48(B0R), %xmm15
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm15, %xmm14
	movaps		%xmm11, %xmm1
	movaps		%xmm5, %xmm2
	movaps		%xmm6, %xmm4
	movaps		%xmm7, %xmm15
	mulps		64(B0L), %xmm11
	mulps		80(B0L), %xmm5
	mulps		96(B0L), %xmm6
	mulps		112(B0L), %xmm7
	mulps		64(B0R), %xmm1
	mulps		80(B0R), %xmm2
	mulps		96(B0R), %xmm4
	mulps		112(B0R), %xmm15
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm1
	addps		%xmm15, %xmm4

	addps		%xmm3, %xmm10	/* xmm10 = row 2, left */
	addps		%xmm6, %xmm11	/* xmm11 = row 3, left */
	addps		%xmm0, %xmm14	/* xmm14 = row 2, right */
	addps		%xmm4, %xmm1
	movaps		%xmm1, %xmm15	/* xmm15 = row 3, right */
	leaq		256(WINDOW), WINDOW
	leaq		128(B0L), B0L
	leaq		128(B0R), B0R

	/*
		4x4 transpose of the left rows (xmm8..xmm11) and of the right rows
		(xmm12..xmm15): afterwards register k of each group holds lane k of
		all four rows.
	*/
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	movaps		%xmm12, %xmm4
	movaps		%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm12, %xmm6
	movaps		%xmm4, %xmm7
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	movlhps		%xmm14, %xmm12
	movhlps		%xmm6, %xmm14
	movlhps		%xmm5, %xmm4
	movhlps		%xmm7, %xmm5
	/* per row: lane0 - lane1 + lane2 - lane3 */
	subps		%xmm10, %xmm8
	subps		%xmm1, %xmm0
	subps		%xmm14, %xmm12
	subps		%xmm5, %xmm4
	addps		%xmm8, %xmm0	/* xmm0 = four left results */
	addps		%xmm12, %xmm4	/* xmm4 = four right results */

	/* masks: all-ones where result is not <= 32767.0 / where result < -32768.0 */
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm4, %xmm5
	movaps		%xmm4, %xmm6
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps		XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps		XMMREG_MIN, %xmm6
	/* float -> int32, interleave left/right, saturating pack to int16, store */
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	movaps		%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	packssdw	%xmm1, %xmm0
	movups		%xmm0, (SAMPLES)

	/* turn each mask lane into a 16-bit 0/1 and add it to the clip counters */
	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		TEMP_CLIP, %xmm2
	movaps		%xmm2, TEMP_CLIP

	leaq		16(SAMPLES), SAMPLES
	decl		%ecx
	jnz		1b

	/* ---- second loop: 4 passes, b0l/b0r walk backward, all lanes are added ---- */
	movl		$4, %ecx

	ALIGN16
1:
	/* rows 0 and 1: window at +0 and +128 bytes, b0 at +0 and -64 bytes */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm9
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	/* second copy of the window data for the right channel */
	movaps		%xmm8, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm10
	movaps		%xmm3, %xmm11
	movaps		%xmm9, %xmm12
	movaps		%xmm5, %xmm13
	movaps		%xmm6, %xmm14
	movaps		%xmm7, %xmm15
	/* left channel products */
	mulps		(B0L), %xmm8
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		-64(B0L), %xmm9
	mulps		-48(B0L), %xmm5
	mulps		-32(B0L), %xmm6
	mulps		-16(B0L), %xmm7
	/* right channel products */
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm10
	mulps		48(B0R), %xmm11
	mulps		-64(B0R), %xmm12
	mulps		-48(B0R), %xmm13
	mulps		-32(B0R), %xmm14
	mulps		-16(B0R), %xmm15

	/* fold each row down to one register of four partial sums */
	addps		%xmm1, %xmm8
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm11, %xmm10
	addps		%xmm5, %xmm9
	addps		%xmm7, %xmm6
	addps		%xmm13, %xmm12
	addps		%xmm15, %xmm14
	addps		%xmm3, %xmm8	/* xmm8  = row 0, left */
	addps		%xmm6, %xmm9	/* xmm9  = row 1, left */
	addps		%xmm10, %xmm0
	addps		%xmm12, %xmm14
	movaps		%xmm0, %xmm12	/* xmm12 = row 0, right */
	movaps		%xmm14, %xmm13	/* xmm13 = row 1, right */
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R

	/* rows 2 and 3, same layout relative to the moved pointers */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm1
	movups		32(WINDOW), %xmm2
	movups		48(WINDOW), %xmm3
	movups		128(WINDOW), %xmm11
	movups		144(WINDOW), %xmm5
	movups		160(WINDOW), %xmm6
	movups		176(WINDOW), %xmm7
	movaps		%xmm10, %xmm0
	movaps		%xmm1, %xmm4
	movaps		%xmm2, %xmm14
	movaps		%xmm3, %xmm15
	mulps		(B0L), %xmm10
	mulps		16(B0L), %xmm1
	mulps		32(B0L), %xmm2
	mulps		48(B0L), %xmm3
	mulps		(B0R), %xmm0
	mulps		16(B0R), %xmm4
	mulps		32(B0R), %xmm14
	mulps		48(B0R), %xmm15
	addps		%xmm1, %xmm10
	addps		%xmm2, %xmm3
	addps		%xmm4, %xmm0
	addps		%xmm15, %xmm14
	movaps		%xmm11, %xmm1
	movaps		%xmm5, %xmm2
	movaps		%xmm6, %xmm4
	movaps		%xmm7, %xmm15
	mulps		-64(B0L), %xmm11
	mulps		-48(B0L), %xmm5
	mulps		-32(B0L), %xmm6
	mulps		-16(B0L), %xmm7
	mulps		-64(B0R), %xmm1
	mulps		-48(B0R), %xmm2
	mulps		-32(B0R), %xmm4
	mulps		-16(B0R), %xmm15
	addps		%xmm5, %xmm11
	addps		%xmm7, %xmm6
	addps		%xmm2, %xmm1
	addps		%xmm15, %xmm4

	addps		%xmm3, %xmm10	/* xmm10 = row 2, left */
	addps		%xmm6, %xmm11	/* xmm11 = row 3, left */
	addps		%xmm0, %xmm14	/* xmm14 = row 2, right */
	addps		%xmm4, %xmm1
	movaps		%xmm1, %xmm15	/* xmm15 = row 3, right */
	leaq		256(WINDOW), WINDOW
	leaq		-128(B0L), B0L
	leaq		-128(B0R), B0R

	/* 4x4 transpose of the left rows (xmm8..xmm11) and the right rows (xmm12..xmm15) */
	movaps		%xmm8, %xmm0
	movaps		%xmm10, %xmm1
	movaps		%xmm12, %xmm4
	movaps		%xmm14, %xmm5
	unpcklps	%xmm9, %xmm8
	unpcklps	%xmm11, %xmm10
	unpckhps	%xmm9, %xmm0
	unpckhps	%xmm11, %xmm1
	unpcklps	%xmm13, %xmm12
	unpcklps	%xmm15, %xmm14
	unpckhps	%xmm13, %xmm4
	unpckhps	%xmm15, %xmm5
	movaps		%xmm8, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm12, %xmm6
	movaps		%xmm4, %xmm7
	movlhps		%xmm10, %xmm8
	movhlps		%xmm2, %xmm10
	movlhps		%xmm1, %xmm0
	movhlps		%xmm3, %xmm1
	movlhps		%xmm14, %xmm12
	movhlps		%xmm6, %xmm14
	movlhps		%xmm5, %xmm4
	movhlps		%xmm7, %xmm5
	/* per row: lane0 + lane1 + lane2 + lane3 (the first loop subtracts the odd lanes) */
	addps		%xmm10, %xmm8
	addps		%xmm1, %xmm0
	addps		%xmm14, %xmm12
	addps		%xmm5, %xmm4
	addps		%xmm8, %xmm0	/* xmm0 = four left results */
	addps		%xmm12, %xmm4	/* xmm4 = four right results */

	/* masks: all-ones where result is not <= 32767.0 / where result < -32768.0 */
	movaps		%xmm0, %xmm2
	movaps		%xmm0, %xmm3
	movaps		%xmm4, %xmm5
	movaps		%xmm4, %xmm6
	cmpnleps	XMMREG_MAX, %xmm2
	cmpltps		XMMREG_MIN, %xmm3
	cmpnleps	XMMREG_MAX, %xmm5
	cmpltps		XMMREG_MIN, %xmm6
	/* float -> int32, interleave left/right, saturating pack to int16, store */
	cvtps2dq	%xmm0, %xmm0
	cvtps2dq	%xmm4, %xmm4
	movaps		%xmm0, %xmm1
	unpcklps	%xmm4, %xmm0
	unpckhps	%xmm4, %xmm1
	packssdw	%xmm1, %xmm0
	movups		%xmm0, (SAMPLES)

	/* turn each mask lane into a 16-bit 0/1 and add it to the clip counters */
	packssdw	%xmm5, %xmm2
	packssdw	%xmm6, %xmm3
	psrlw		$15, %xmm2
	psrlw		$15, %xmm3
	paddw		%xmm3, %xmm2
	paddw		TEMP_CLIP, %xmm2
	movaps		%xmm2, TEMP_CLIP

	leaq		16(SAMPLES), SAMPLES
	decl		%ecx
	jnz		1b

	/* horizontal sum of the eight 16-bit clip counters into word 0 of xmm0 */
	movaps		TEMP_CLIP, %xmm4
	movhlps		%xmm4, %xmm0	/* only the low 64 bits of xmm0 matter from here on */
	paddw		%xmm4, %xmm0
	pshuflw		$0x55, %xmm0, %xmm1
	pshuflw		$0xaa, %xmm0, %xmm2
	pshuflw		$0xff, %xmm0, %xmm3
	paddw		%xmm1, %xmm0
	paddw		%xmm2, %xmm0
	paddw		%xmm3, %xmm0

	/* return value: the 16-bit total, zero-extended */
	movd		%xmm0, %eax
	andl		$0xffff, %eax

#ifdef IS_MSABI
	movaps		16(%rsp), %xmm6
	movaps		32(%rsp), %xmm7
	movaps		48(%rsp), %xmm8
	movaps		64(%rsp), %xmm9
	movaps		80(%rsp), %xmm10
	movaps		96(%rsp), %xmm11
	movaps		112(%rsp), %xmm12
	movaps		128(%rsp), %xmm13
	movaps		144(%rsp), %xmm14
	movaps		160(%rsp), %xmm15
	addq		$176, %rsp
	popq		%rsi
#else
	addq		$24, %rsp
#endif
	ret

NONEXEC_STACK