/*
	synth_stereo_avx: AVX optimized synth for x86-64 (stereo specific version)

	copyright 1995-2013 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Register aliases for the pointer arguments, per ABI.

	Microsoft x64 passes args in rcx/rdx/r8/r9 (window/b0l/b0r/samples) with
	the 5th argument (bo1) on the stack; System V AMD64 passes them in
	rdi/rsi/rdx/rcx with bo1 in r8d.  In both variants the code below first
	copies whichever argument register would otherwise be displaced (rcx)
	into its alias, so the aliases are valid for the rest of the function.
*/
#ifdef IS_MSABI
/* short *window; */
#define WINDOW %r10
/* short *b0l; */
#define B0L %rdx
/* short *b0r; */
#define B0R %r8
/* short *samples; */
#define SAMPLES %r9
#else
/* short *window; */
#define WINDOW %rdi
/* short *b0l; */
#define B0L %rsi
/* short *b0r; */
#define B0R %rdx
/* short *samples; */
#define SAMPLES %r9
#endif

/*
	int synth_1to1_s_avx_asm(short *window, short *b0l, short *b0r, short *samples, int bo1);
	return value: number of clipped samples

	Computes one granule of stereo synthesis output: two loops of 4
	iterations each store 8 int16 values (4 interleaved L/R frames) per
	iteration, i.e. 64 shorts / 32 stereo frames total.  Each output sample
	is a 32-tap dot product of 16-bit window coefficients against the left
	(B0L) or right (B0R) filterbank values, accumulated in 32-bit lanes via
	vpmaddwd, summed down with an unpack/add network, scaled by >>13 and
	saturated to 16 bits.  Clipped (saturated) samples are counted and
	returned in eax.

	Register roles inside the loops:
	  rax   = 64, the byte stride between 32-short window blocks
	  ecx   = loop counter (4 + 4 iterations)
	  xmm14 = per-word clip counters, accumulated as negative counts
	  xmm0-xmm13 = scratch (window loads, madd products, partial sums)
*/

	ALIGN16
.globl ASM_NAME(synth_1to1_s_avx_asm)
ASM_NAME(synth_1to1_s_avx_asm):
#ifdef IS_MSABI /* should save xmm6-15 */
	/* Win64 prologue: xmm6-xmm15 are callee-saved under the Microsoft
	   ABI.  This function touches xmm0-xmm14, so spill xmm6-xmm14
	   (9 * 16 = 144 bytes); xmm15 is never written and needs no save. */
	push		%rbp
	mov		%rsp, %rbp
	sub		$144, %rsp
	movaps		%xmm6, (%rsp)
	movaps		%xmm7, 16(%rsp)
	movaps		%xmm8, 32(%rsp)
	movaps		%xmm9, 48(%rsp)
	movaps		%xmm10, 64(%rsp)
	movaps		%xmm11, 80(%rsp)
	movaps		%xmm12, 96(%rsp)
	movaps		%xmm13, 112(%rsp)
	movaps		%xmm14, 128(%rsp)
	/* rbp+0 saved rbp, +8 return address, +16..+47 shadow space,
	   so the 5th argument (bo1) lives at rbp+48. */
	movl 48(%rbp), %eax /* 5th argument; placed after 32-byte shadow space */
#endif

#ifdef IS_MSABI
	shl		$1, %eax	/* eax = 2*bo1: short index -> byte offset */
	mov		%rcx, WINDOW	/* free rcx; WINDOW alias (r10) now valid */
#else
	mov		%r8d, %eax	/* bo1 */
	shl		$1, %eax	/* eax = 2*bo1: short index -> byte offset */
	movq		%rcx, SAMPLES	/* free rcx; SAMPLES alias (r9) now valid */
#endif
	/* Start at window + 16 shorts - bo1 shorts. */
	add		$32, WINDOW
	sub		%rax, WINDOW

	mov		$64, %rax	/* byte stride: one 32-short window block */
	movl		$4, %ecx	/* first loop: 4 iterations */
	vpxor		%xmm14, %xmm14, %xmm14	/* clear clip counters */

	ALIGN16
1:
	/* First half: b0 pointers advance (add rax).  Load two 32-coefficient
	   window blocks (current and +64 bytes)... */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm9
	movups		(WINDOW,%rax), %xmm10
	movups		16(WINDOW,%rax), %xmm11
	/* ...and multiply-accumulate them against left and right filterbank
	   data: each vpmaddwd yields 4 dwords of paired int16 products. */
	vpmaddwd	(B0L), %xmm8, %xmm0
	vpmaddwd	16(B0L), %xmm9, %xmm1
	vpmaddwd	(B0R), %xmm8, %xmm2
	vpmaddwd	16(B0R), %xmm9, %xmm3
	vpmaddwd	32(B0L), %xmm10, %xmm4
	vpmaddwd	48(B0L), %xmm11, %xmm5
	vpmaddwd	32(B0R), %xmm10, %xmm6
	vpmaddwd	48(B0R), %xmm11, %xmm7
	/* Partial sums: xmm8/xmm9 = left samples 0/1, xmm0/xmm1 = right. */
	vpaddd		%xmm1, %xmm0, %xmm8
	vpaddd		%xmm3, %xmm2, %xmm0
	vpaddd		%xmm5, %xmm4, %xmm9
	vpaddd		%xmm7, %xmm6, %xmm1
	lea		(WINDOW,%rax,2), WINDOW	/* window += 128 bytes (2 blocks) */
	add		%rax, B0L
	add		%rax, B0R

	/* Same pattern for samples 2 and 3 of this iteration.  Note xmm10 and
	   xmm11 are reused as madd destinations after their loads are consumed. */
	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm11
	movups		(WINDOW,%rax), %xmm12
	movups		16(WINDOW,%rax), %xmm13
	vpmaddwd	(B0L), %xmm10, %xmm2
	vpmaddwd	16(B0L), %xmm11, %xmm3
	vpmaddwd	(B0R), %xmm10, %xmm4
	vpmaddwd	16(B0R), %xmm11, %xmm5
	vpmaddwd	32(B0L), %xmm12, %xmm6
	vpmaddwd	48(B0L), %xmm13, %xmm10
	vpmaddwd	32(B0R), %xmm12, %xmm7
	vpmaddwd	48(B0R), %xmm13, %xmm11
	vpaddd		%xmm3, %xmm2, %xmm2	/* left sample 2 */
	vpaddd		%xmm5, %xmm4, %xmm3	/* right sample 2 */
	vpaddd		%xmm6, %xmm10, %xmm4	/* left sample 3 */
	vpaddd		%xmm7, %xmm11, %xmm5	/* right sample 3 */
	lea		(WINDOW,%rax,2), WINDOW
	add		%rax, B0L
	add		%rax, B0R

	/* Horizontal reduction: interleave left/right dword lanes and add,
	   so each step halves the number of partial sums while putting the
	   final results in L/R-interleaved order. */
	vpunpckldq	%xmm0, %xmm8, %xmm6
	vpunpckhdq	%xmm0, %xmm8, %xmm0
	vpunpckldq	%xmm1, %xmm9, %xmm7
	vpunpckhdq	%xmm1, %xmm9, %xmm1
	vpaddd		%xmm6, %xmm0, %xmm0
	vpaddd		%xmm7, %xmm1, %xmm1
	vpunpckldq	%xmm3, %xmm2, %xmm6
	vpunpckhdq	%xmm3, %xmm2, %xmm2
	vpunpckldq	%xmm5, %xmm4, %xmm7
	vpunpckhdq	%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm2, %xmm2
	vpaddd		%xmm7, %xmm3, %xmm3

	vpunpcklqdq	%xmm1, %xmm0, %xmm4
	vpunpckhqdq	%xmm1, %xmm0, %xmm0
	vpunpcklqdq	%xmm3, %xmm2, %xmm5
	vpunpckhqdq	%xmm3, %xmm2, %xmm1
	vpaddd		%xmm0, %xmm4, %xmm0
	vpaddd		%xmm1, %xmm5, %xmm1
	/* Scale (arithmetic >>13) and saturate the 8 dwords to int16. */
	vpsrad		$13, %xmm0, %xmm0
	vpsrad		$13, %xmm1, %xmm1
	vpackssdw	%xmm1, %xmm0, %xmm2	/* xmm2 = saturated output words */
	/* Clip detection: rebuild the *unsaturated* low 16 bits of each dword
	   (mask via shl/shr 16, then vpackusdw, which cannot alter values
	   already in 0..65535) and compare word-wise against the saturated
	   pack.  Equal lanes (0xFFFF from vpcmpeqw) were not clipped; xor with
	   all-ones turns clipped lanes into 0xFFFF (= -1 per word), which is
	   accumulated into xmm14. */
	vpcmpeqd	%xmm3, %xmm3, %xmm3	/* xmm3 = all ones */
	vpslld		$16, %xmm0, %xmm0
	vpslld		$16, %xmm1, %xmm1
	vpsrld		$16, %xmm0, %xmm0
	vpsrld		$16, %xmm1, %xmm1
	vpackusdw	%xmm1, %xmm0, %xmm0
	vpcmpeqw	%xmm2, %xmm0, %xmm0
	vpxor		%xmm3, %xmm0, %xmm0
	vpaddw		%xmm0, %xmm14, %xmm14

	movups		%xmm2, (SAMPLES)	/* store 4 interleaved L/R frames */
	add		$16, SAMPLES
	dec		%ecx
	jnz		1b

	movl		$4, %ecx	/* second loop: 4 more iterations */

	ALIGN16
1:
	/* Second half of the window: identical structure, but the b0 pointers
	   now walk backwards (sub rax) and the second block of filterbank
	   data is addressed with negative offsets (-32/-16) — the mirrored
	   traversal of the synthesis window. */
	movups		(WINDOW), %xmm8
	movups		16(WINDOW), %xmm9
	movups		(WINDOW,%rax), %xmm10
	movups		16(WINDOW,%rax), %xmm11
	vpmaddwd	(B0L), %xmm8, %xmm0
	vpmaddwd	16(B0L), %xmm9, %xmm1
	vpmaddwd	(B0R), %xmm8, %xmm2
	vpmaddwd	16(B0R), %xmm9, %xmm3
	vpmaddwd	-32(B0L), %xmm10, %xmm4
	vpmaddwd	-16(B0L), %xmm11, %xmm5
	vpmaddwd	-32(B0R), %xmm10, %xmm6
	vpmaddwd	-16(B0R), %xmm11, %xmm7
	vpaddd		%xmm1, %xmm0, %xmm8
	vpaddd		%xmm3, %xmm2, %xmm0
	vpaddd		%xmm5, %xmm4, %xmm9
	vpaddd		%xmm7, %xmm6, %xmm1
	lea		(WINDOW,%rax,2), WINDOW
	sub		%rax, B0L
	sub		%rax, B0R

	movups		(WINDOW), %xmm10
	movups		16(WINDOW), %xmm11
	movups		(WINDOW,%rax), %xmm12
	movups		16(WINDOW,%rax), %xmm13
	vpmaddwd	(B0L), %xmm10, %xmm2
	vpmaddwd	16(B0L), %xmm11, %xmm3
	vpmaddwd	(B0R), %xmm10, %xmm4
	vpmaddwd	16(B0R), %xmm11, %xmm5
	vpmaddwd	-32(B0L), %xmm12, %xmm6
	vpmaddwd	-16(B0L), %xmm13, %xmm10
	vpmaddwd	-32(B0R), %xmm12, %xmm7
	vpmaddwd	-16(B0R), %xmm13, %xmm11
	vpaddd		%xmm3, %xmm2, %xmm2
	vpaddd		%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm10, %xmm4
	vpaddd		%xmm7, %xmm11, %xmm5
	lea		(WINDOW,%rax,2), WINDOW
	sub		%rax, B0L
	sub		%rax, B0R

	/* Horizontal reduction + scale + saturate + clip count: same network
	   as in the first loop (see comments there). */
	vpunpckldq	%xmm0, %xmm8, %xmm6
	vpunpckhdq	%xmm0, %xmm8, %xmm0
	vpunpckldq	%xmm1, %xmm9, %xmm7
	vpunpckhdq	%xmm1, %xmm9, %xmm1
	vpaddd		%xmm6, %xmm0, %xmm0
	vpaddd		%xmm7, %xmm1, %xmm1
	vpunpckldq	%xmm3, %xmm2, %xmm6
	vpunpckhdq	%xmm3, %xmm2, %xmm2
	vpunpckldq	%xmm5, %xmm4, %xmm7
	vpunpckhdq	%xmm5, %xmm4, %xmm3
	vpaddd		%xmm6, %xmm2, %xmm2
	vpaddd		%xmm7, %xmm3, %xmm3

	vpunpcklqdq	%xmm1, %xmm0, %xmm4
	vpunpckhqdq	%xmm1, %xmm0, %xmm0
	vpunpcklqdq	%xmm3, %xmm2, %xmm5
	vpunpckhqdq	%xmm3, %xmm2, %xmm1
	vpaddd		%xmm0, %xmm4, %xmm0
	vpaddd		%xmm1, %xmm5, %xmm1
	vpsrad		$13, %xmm0, %xmm0
	vpsrad		$13, %xmm1, %xmm1
	vpackssdw	%xmm1, %xmm0, %xmm2
	vpcmpeqd	%xmm3, %xmm3, %xmm3
	vpslld		$16, %xmm0, %xmm0
	vpslld		$16, %xmm1, %xmm1
	vpsrld		$16, %xmm0, %xmm0
	vpsrld		$16, %xmm1, %xmm1
	vpackusdw	%xmm1, %xmm0, %xmm0
	vpcmpeqw	%xmm2, %xmm0, %xmm0
	vpxor		%xmm3, %xmm0, %xmm0
	vpaddw		%xmm0, %xmm14, %xmm14

	movups		%xmm2, (SAMPLES)
	add		$16, SAMPLES
	dec		%ecx
	jnz		1b

	/* Return value: xmm14 holds -count per word lane; negate, then
	   horizontally sum the 8 words (qword fold, then two low-word
	   shuffles).  The mask keeps only the meaningful low bits of the
	   final lane (at most 64 samples can clip). */
	pxor		%xmm1, %xmm1
	psubw		%xmm14, %xmm1
	pshufd		$0x4e, %xmm1, %xmm0
	paddw		%xmm1, %xmm0
	pshuflw		$0x4e, %xmm0, %xmm1
	paddw		%xmm1, %xmm0
	pshuflw		$0x11, %xmm0, %xmm1
	paddw		%xmm1, %xmm0
	movd		%xmm0, %eax
	and		$0x7f, %eax

#ifdef IS_MSABI
	/* Win64 epilogue: restore the callee-saved xmm registers and frame. */
	movaps		(%rsp), %xmm6
	movaps		16(%rsp), %xmm7
	movaps		32(%rsp), %xmm8
	movaps		48(%rsp), %xmm9
	movaps		64(%rsp), %xmm10
	movaps		80(%rsp), %xmm11
	movaps		96(%rsp), %xmm12
	movaps		112(%rsp), %xmm13
	movaps		128(%rsp), %xmm14
	mov		%rbp, %rsp
	pop		%rbp
#endif
	ret

NONEXEC_STACK