/*
	synth_stereo_x86_64: SSE optimized synth for x86-64 (stereo specific version)

	copyright 1995-2009 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases for the argument registers of the two supported ABIs.

	Microsoft x64:  args in rcx, rdx, r8, r9; 5th arg on the stack above the
	                32-byte shadow space.  window (rcx) is moved to r10 so the
	                aliases below stay call-clobbered scratch registers.
	System V AMD64: args in rdi, rsi, rdx, rcx, r8.  samples (rcx) is moved
	                to r9 (see function entry) so SAMPLES is the same register
	                under both ABIs.
*/
#ifdef IS_MSABI
/* short *window; */
#define WINDOW %r10
/* short *b0l; */
#define B0L %rdx
/* short *b0r; */
#define B0R %r8
/* short *samples; */
#define SAMPLES %r9
#else
/* short *window; */
#define WINDOW %rdi
/* short *b0l; */
#define B0L %rsi
/* short *b0r; */
#define B0R %rdx
/* short *samples; */
#define SAMPLES %r9
#endif

/* Running per-lane clip counter: 8 x 16-bit words, summed at the end. */
#define XMMREG_CLIP %xmm15
#define XMMREG_MAX %xmm14 /* {32767, 32767, 32767, 32767} */
#define XMMREG_MIN %xmm13 /* {-32769, -32769, -32769, -32769} : not -32768 because SSE doesn't have "less than" comparison... */
#define XMMREG_FULL %xmm12 /* {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF} */

/*
	int synth_1to1_s_x86_64_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
	return value: number of clipped samples

	Computes one stereo output frame block: for each of 16 groups of 4
	output sample pairs, forms 32-element dot products of the window with
	the left (b0l) and right (b0r) coefficient banks via pmaddwd, scales
	by >>13, packs to interleaved L/R 16-bit samples, and counts values
	that fell outside [-32768, 32767] before saturation.

	Clobbers: rax, rcx, xmm0-xmm15 (xmm6-xmm15 saved/restored under MS ABI).
*/

#ifndef __APPLE__
	.section .rodata
#else
	.data
#endif
	ALIGN32
/* Clip thresholds, loaded once into XMMREG_MAX / XMMREG_MIN at entry. */
ASM_NAME(maxmin_x86_64):
	.long 32767
	.long 32767
	.long 32767
	.long 32767
	.long -32769
	.long -32769
	.long -32769
	.long -32769
	.text
	ALIGN16
.globl ASM_NAME(synth_1to1_s_x86_64_asm)
ASM_NAME(synth_1to1_s_x86_64_asm):
#ifdef IS_MSABI /* should save xmm6-15 */
	movl 40(%rsp), %eax /* 5th argument; placed after 32-byte shadow space */
	subq $168, %rsp /* stack alignment + 10 xmm registers */
	movaps %xmm6, (%rsp)
	movaps %xmm7, 16(%rsp)
	movaps %xmm8, 32(%rsp)
	movaps %xmm9, 48(%rsp)
	movaps %xmm10, 64(%rsp)
	movaps %xmm11, 80(%rsp)
	movaps %xmm12, 96(%rsp)
	movaps %xmm13, 112(%rsp)
	movaps %xmm14, 128(%rsp)
	movaps %xmm15, 144(%rsp)
#endif

	/*
		rax = (uint32_t)bo1 * 2: shift left 32 then right 31 zeroes the
		upper half (discarding any garbage above the 32-bit int argument)
		and doubles the value in one go -- bo1 is an index into an array
		of shorts, so *2 converts it to a byte offset.
	*/
#ifdef IS_MSABI
	shlq $32, %rax
	shrq $31, %rax
	movq %rcx, %r10 /* WINDOW = window (free rcx for use as loop counter) */
#else
	movq %r8, %rax /* 5th argument bo1 */
	shlq $32, %rax
	shrq $31, %rax
	movq %rcx, %r9 /* SAMPLES = samples (free rcx for use as loop counter) */
#endif
	/* WINDOW = window + 16 shorts - bo1 shorts (32 - 2*bo1 bytes). */
	leaq 32(WINDOW), WINDOW
	subq %rax, WINDOW

	/* Load clip limits and clear the clip counter; FULL = all-ones mask. */
	leaq ASM_NAME(maxmin_x86_64)(%rip), %rax
	movaps (%rax), XMMREG_MAX
	movaps 16(%rax), XMMREG_MIN
	pxor XMMREG_CLIP, XMMREG_CLIP
	pcmpeqd XMMREG_FULL, XMMREG_FULL

	/* First half: 4 iterations, B0L/B0R walk forwards (+128 bytes each). */
	movl $4, %ecx

	ALIGN16
1:
	/*
		Load 4 window rows (8 shorts each, unaligned: WINDOW may be offset
		by bo1) and multiply-accumulate against the left- and right-channel
		coefficient banks.  Window rows are 64 bytes apart; each row is
		used twice (left copy in xmm0-3, right copy in xmm8-11).
	*/
	movups (WINDOW), %xmm0
	movups 16(WINDOW), %xmm1
	movups 64(WINDOW), %xmm2
	movups 80(WINDOW), %xmm3
	movups 128(WINDOW), %xmm4
	movups 144(WINDOW), %xmm5
	movups 192(WINDOW), %xmm6
	movups 208(WINDOW), %xmm7
	movaps %xmm0, %xmm8
	movaps %xmm1, %xmm9
	movaps %xmm2, %xmm10
	movaps %xmm3, %xmm11
	/* pmaddwd: 8 x (s16*s16) -> 4 x s32 pairwise sums per register. */
	pmaddwd (B0L), %xmm0
	pmaddwd 16(B0L), %xmm1
	pmaddwd 32(B0L), %xmm2
	pmaddwd 48(B0L), %xmm3
	pmaddwd (B0R), %xmm8
	pmaddwd 16(B0R), %xmm9
	pmaddwd 32(B0R), %xmm10
	pmaddwd 48(B0R), %xmm11
	paddd %xmm1, %xmm0
	paddd %xmm3, %xmm2
	paddd %xmm9, %xmm8
	paddd %xmm11, %xmm10
	movaps %xmm4, %xmm1
	movaps %xmm5, %xmm9
	movaps %xmm6, %xmm3
	movaps %xmm7, %xmm11
	pmaddwd 64(B0L), %xmm4
	pmaddwd 80(B0L), %xmm5
	pmaddwd 96(B0L), %xmm6
	pmaddwd 112(B0L), %xmm7
	pmaddwd 64(B0R), %xmm1
	pmaddwd 80(B0R), %xmm9
	pmaddwd 96(B0R), %xmm3
	pmaddwd 112(B0R), %xmm11
	paddd %xmm5, %xmm4
	paddd %xmm7, %xmm6
	/* Left partials now in xmm0,xmm2,xmm4,xmm6; right in xmm8,xmm10,xmm9,xmm11. */
	paddd %xmm1, %xmm9
	paddd %xmm3, %xmm11

	/*
		4x4 transpose-and-add (unpack + movlhps/movhlps) to finish the
		horizontal reduction: collapse the four partial-sum vectors per
		channel into one vector of four 32-bit dot products.
	*/
	movaps %xmm0, %xmm1
	movaps %xmm4, %xmm3
	movaps %xmm8, %xmm5
	movaps %xmm9, %xmm7
	punpckldq %xmm2, %xmm0
	punpckldq %xmm6, %xmm4
	punpckhdq %xmm2, %xmm1
	punpckhdq %xmm6, %xmm3
	punpckldq %xmm10, %xmm8
	punpckldq %xmm11, %xmm9
	punpckhdq %xmm10, %xmm5
	punpckhdq %xmm11, %xmm7
	movaps %xmm0, %xmm2
	movaps %xmm1, %xmm6
	movaps %xmm8, %xmm10
	movaps %xmm5, %xmm11
	movlhps %xmm4, %xmm0
	movhlps %xmm2, %xmm4
	movlhps %xmm3, %xmm1
	movhlps %xmm6, %xmm3
	movlhps %xmm9, %xmm8
	movhlps %xmm10, %xmm9
	movlhps %xmm7, %xmm5
	movhlps %xmm11, %xmm7
	paddd %xmm4, %xmm0
	paddd %xmm3, %xmm1
	paddd %xmm9, %xmm8
	paddd %xmm7, %xmm5
	paddd %xmm1, %xmm0
	paddd %xmm5, %xmm8
	/* Scale fixed-point accumulators down: xmm0 = 4 left, xmm8 = 4 right. */
	psrad $13, %xmm0
	psrad $13, %xmm8

	/*
		Interleave L/R dwords, saturate-pack to 8 x s16 and store:
		samples[0..7] = L0 R0 L1 R1 L2 R2 L3 R3.
		Copies are kept (xmm2-4) for clip detection on pre-pack values.
	*/
	movaps %xmm0, %xmm1
	movaps %xmm0, %xmm2
	movaps %xmm0, %xmm3
	movaps %xmm8, %xmm4
	punpckldq %xmm8, %xmm0
	punpckhdq %xmm8, %xmm1
	packssdw %xmm1, %xmm0
	movups %xmm0, (SAMPLES)

	/*
		Count clipped values: (v > 32767) gives the high-side mask;
		(v > -32769) inverted gives v <= -32769, i.e. v < -32768 --
		this is why MIN holds -32769 (pcmpgtd has no "less than").
		packssdw narrows the 0/-1 dword masks to words, psrlw $15
		turns each -1 into 1, and the words accumulate in XMMREG_CLIP.
	*/
	pcmpgtd XMMREG_MAX, %xmm2
	pcmpgtd XMMREG_MIN, %xmm3
	pcmpgtd XMMREG_MAX, %xmm4
	pcmpgtd XMMREG_MIN, %xmm8
	packssdw %xmm4, %xmm2
	packssdw %xmm8, %xmm3
	pxor XMMREG_FULL, %xmm3
	psrlw $15, %xmm2
	psrlw $15, %xmm3
	paddw %xmm3, %xmm2
	paddw %xmm2, XMMREG_CLIP

	/* Advance: 128 window shorts, 64 coefficient shorts, 8 output shorts. */
	leaq 256(WINDOW), WINDOW
	leaq 128(B0L), B0L
	leaq 128(B0R), B0R
	leaq 16(SAMPLES), SAMPLES

	decl %ecx
	jnz 1b

	/*
		Second half: 4 more iterations.  Identical structure, but the
		coefficient pointers now walk BACKWARDS (negative offsets,
		-128 per iteration) while WINDOW keeps advancing --
		NOTE(review): presumably this mirrors the symmetry of the
		synthesis window/coefficient tables; confirm against the C
		reference synth in mpg123.
	*/
	movl $4, %ecx

	ALIGN16
1:
	movups (WINDOW), %xmm0
	movups 16(WINDOW), %xmm1
	movups 64(WINDOW), %xmm2
	movups 80(WINDOW), %xmm3
	movups 128(WINDOW), %xmm4
	movups 144(WINDOW), %xmm5
	movups 192(WINDOW), %xmm6
	movups 208(WINDOW), %xmm7
	movaps %xmm0, %xmm8
	movaps %xmm1, %xmm9
	movaps %xmm2, %xmm10
	movaps %xmm3, %xmm11
	pmaddwd (B0L), %xmm0
	pmaddwd 16(B0L), %xmm1
	pmaddwd -32(B0L), %xmm2
	pmaddwd -16(B0L), %xmm3
	pmaddwd (B0R), %xmm8
	pmaddwd 16(B0R), %xmm9
	pmaddwd -32(B0R), %xmm10
	pmaddwd -16(B0R), %xmm11
	paddd %xmm1, %xmm0
	paddd %xmm3, %xmm2
	paddd %xmm9, %xmm8
	paddd %xmm11, %xmm10
	movaps %xmm4, %xmm1
	movaps %xmm5, %xmm9
	movaps %xmm6, %xmm3
	movaps %xmm7, %xmm11
	pmaddwd -64(B0L), %xmm4
	pmaddwd -48(B0L), %xmm5
	pmaddwd -96(B0L), %xmm6
	pmaddwd -80(B0L), %xmm7
	pmaddwd -64(B0R), %xmm1
	pmaddwd -48(B0R), %xmm9
	pmaddwd -96(B0R), %xmm3
	pmaddwd -80(B0R), %xmm11
	paddd %xmm5, %xmm4
	paddd %xmm7, %xmm6
	paddd %xmm1, %xmm9
	paddd %xmm3, %xmm11

	/* Same transpose-and-add reduction as in the first loop. */
	movaps %xmm0, %xmm1
	movaps %xmm4, %xmm3
	movaps %xmm8, %xmm5
	movaps %xmm9, %xmm7
	punpckldq %xmm2, %xmm0
	punpckldq %xmm6, %xmm4
	punpckhdq %xmm2, %xmm1
	punpckhdq %xmm6, %xmm3
	punpckldq %xmm10, %xmm8
	punpckldq %xmm11, %xmm9
	punpckhdq %xmm10, %xmm5
	punpckhdq %xmm11, %xmm7
	movaps %xmm0, %xmm2
	movaps %xmm1, %xmm6
	movaps %xmm8, %xmm10
	movaps %xmm5, %xmm11
	movlhps %xmm4, %xmm0
	movhlps %xmm2, %xmm4
	movlhps %xmm3, %xmm1
	movhlps %xmm6, %xmm3
	movlhps %xmm9, %xmm8
	movhlps %xmm10, %xmm9
	movlhps %xmm7, %xmm5
	movhlps %xmm11, %xmm7
	paddd %xmm4, %xmm0
	paddd %xmm3, %xmm1
	paddd %xmm9, %xmm8
	paddd %xmm7, %xmm5
	paddd %xmm1, %xmm0
	paddd %xmm5, %xmm8
	psrad $13, %xmm0
	psrad $13, %xmm8

	/* Interleave, saturate-pack and store 4 more stereo pairs. */
	movaps %xmm0, %xmm1
	movaps %xmm0, %xmm2
	movaps %xmm0, %xmm3
	movaps %xmm8, %xmm4
	punpckldq %xmm8, %xmm0
	punpckhdq %xmm8, %xmm1
	packssdw %xmm1, %xmm0
	movups %xmm0, (SAMPLES)

	/* Clip counting, identical to the first loop. */
	pcmpgtd XMMREG_MAX, %xmm2
	pcmpgtd XMMREG_MIN, %xmm3
	pcmpgtd XMMREG_MAX, %xmm4
	pcmpgtd XMMREG_MIN, %xmm8
	packssdw %xmm4, %xmm2
	packssdw %xmm8, %xmm3
	pxor XMMREG_FULL, %xmm3
	psrlw $15, %xmm2
	psrlw $15, %xmm3
	paddw %xmm3, %xmm2
	paddw %xmm2, XMMREG_CLIP

	leaq 256(WINDOW), WINDOW
	leaq -128(B0L), B0L
	leaq -128(B0R), B0R
	leaq 16(SAMPLES), SAMPLES

	decl %ecx
	jnz 1b

	/*
		Horizontal sum of the 8 word counters in XMMREG_CLIP:
		fold high qword onto low (movhlps+paddw), then broadcast words
		1..3 of the low qword (pshuflw) and add, leaving the total in
		word 0.  Max possible count is 8 per lane * 8 lanes, far below
		16-bit overflow.  Upper lanes of xmm0 hold garbage; the final
		andl keeps only word 0.
	*/
	movhlps XMMREG_CLIP, %xmm0
	paddw XMMREG_CLIP, %xmm0
	pshuflw $0x55, %xmm0, %xmm1
	pshuflw $0xaa, %xmm0, %xmm2
	pshuflw $0xff, %xmm0, %xmm3
	paddw %xmm1, %xmm0
	paddw %xmm2, %xmm0
	paddw %xmm3, %xmm0

	movd %xmm0, %eax
	andl $0xffff, %eax /* return value: total clipped samples */

#ifdef IS_MSABI
	/* Restore the callee-saved xmm registers required by the MS x64 ABI. */
	movaps (%rsp), %xmm6
	movaps 16(%rsp), %xmm7
	movaps 32(%rsp), %xmm8
	movaps 48(%rsp), %xmm9
	movaps 64(%rsp), %xmm10
	movaps 80(%rsp), %xmm11
	movaps 96(%rsp), %xmm12
	movaps 112(%rsp), %xmm13
	movaps 128(%rsp), %xmm14
	movaps 144(%rsp), %xmm15
	addq $168, %rsp
#endif
	ret

NONEXEC_STACK