/*
	synth_stereo_neon64_float: NEON optimized synth for AArch64 (stereo specific, float output version)

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	synth_1to1_real_s_neon64_asm  (AArch64, AAPCS64, leaf function)

	Arguments as the code below consumes them. The C-level parameter names are
	not visible in this file - TODO confirm against the caller's prototype.
	  x0  base of a float table. Reading starts at x0 + 64 - 4*x4 bytes and
	      takes 16 floats out of every 128-byte row (32 rows in total).
	  x1  first float input stream; feeds output lanes 0 and 2 of each store.
	  x2  second float input stream; feeds output lanes 1 and 3 of each store.
	  x3  output pointer; 64 floats (256 bytes) are written sequentially.
	  x4  element offset into the x0 table (scaled by 4 bytes).
	Returns 0 in w0.

	Clobbers: x0-x6, v0-v7, v16-v28, condition flags.
	v8-v15 are used as scratch too; their low 64 bits are spilled to the
	stack on entry and reloaded before returning (64 bytes of stack).

	NOTE(review): x4 is read as a full 64-bit register. If the C prototype
	declares this argument as a 32-bit int, AAPCS64 leaves bits 63:32
	unspecified - confirm, and extend from w4 if so.
*/

#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
/* Output gain: 939524096 = 0x38000000, the IEEE-754 single bit pattern of 2^-15 (1/32768). */
scale:
	.word	939524096
	.text
	ALIGN4
	.globl ASM_NAME(synth_1to1_real_s_neon64_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_real_s_neon64_asm), %function
#endif
ASM_NAME(synth_1to1_real_s_neon64_asm):
	/* x0 += 64 bytes - 4 * x4 bytes: position the table read pointer. */
	add		x0, x0, #64
	sub		x0, x0, x4, lsl #2
	/* Broadcast the gain constant into all four lanes of v28. */
	adrp	x5, AARCH64_PCREL_HI(scale)
	add		x5, x5, AARCH64_PCREL_LO(scale)
	ld1r	{v28.4s}, [x5]
	/* Spill the low halves of v8-v15; sp moves in 32-byte steps and stays 16-byte aligned. */
	sub		sp, sp, #32
	st1		{v8.2s,v9.2s,v10.2s,v11.2s}, [sp]
	sub		sp, sp, #32
	st1		{v12.2s,v13.2s,v14.2s,v15.2s}, [sp]

	/*
		First half: 4 iterations, each producing 8 output floats.
		x5 = 128 is the byte stride between consecutive x0 rows;
		x1 and x2 advance forwards by 64 bytes per load.
	*/
	mov		w4, #4
	mov		x5, #128
1:
	/* Two x0 rows (v0-v3, v4-v7) and the matching 16 floats from x1 and x2 for each. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], #64
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], #64

	/*
		Four 16-element dot products, kept as 4 partial sums per register:
		v24 = row0 . x1,  v25 = row0 . x2,  v26 = row1 . x1,  v27 = row1 . x2
	*/
	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	/* Two pairwise-add rounds collapse the partial sums: v0 = { sum(v24), sum(v25), sum(v26), sum(v27) }. */
	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	/* Apply the gain and emit 4 floats. */
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	/* Same computation for the next two x0 rows. */
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], #64
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], #64

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	subs	w4, w4, #1
	b.ne	1b

	/*
		Second half: same arithmetic, 4 more iterations. x0 keeps walking
		forwards by x5, while x1 and x2 now step backwards: x6 = -64 is
		applied as the post-index after every 64-byte load.
	*/
	mov		w4, #4
	mov		x6, #-64
2:
	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], x6
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], x6
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], x6

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	ld1		{v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
	ld1		{v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
	ld1		{v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
	ld1		{v20.4s,v21.4s,v22.4s,v23.4s}, [x2], x6
	ld1		{v8.4s,v9.4s,v10.4s,v11.4s}, [x1], x6
	ld1		{v12.4s,v13.4s,v14.4s,v15.4s}, [x2], x6

	fmul	v24.4s, v0.4s, v16.4s
	fmul	v25.4s, v0.4s, v20.4s
	fmul	v26.4s, v4.4s, v8.4s
	fmul	v27.4s, v4.4s, v12.4s
	fmla	v24.4s, v1.4s, v17.4s
	fmla	v25.4s, v1.4s, v21.4s
	fmla	v26.4s, v5.4s, v9.4s
	fmla	v27.4s, v5.4s, v13.4s
	fmla	v24.4s, v2.4s, v18.4s
	fmla	v25.4s, v2.4s, v22.4s
	fmla	v26.4s, v6.4s, v10.4s
	fmla	v27.4s, v6.4s, v14.4s
	fmla	v24.4s, v3.4s, v19.4s
	fmla	v25.4s, v3.4s, v23.4s
	fmla	v26.4s, v7.4s, v11.4s
	fmla	v27.4s, v7.4s, v15.4s

	faddp	v0.4s, v24.4s, v25.4s
	faddp	v1.4s, v26.4s, v27.4s
	faddp	v0.4s, v0.4s, v1.4s
	fmul	v0.4s, v0.4s, v28.4s
	st1		{v0.4s}, [x3], #16

	subs	w4, w4, #1
	b.ne	2b

	/* Return 0. */
	mov		w0, wzr
	/* Reload v12-v15, then v8-v11 (reverse of the spill order) and release the 64 bytes of stack. */
	ld1		{v12.2s,v13.2s,v14.2s,v15.2s}, [sp], #32
	ld1		{v8.2s,v9.2s,v10.2s,v11.2s}, [sp], #32

	ret

NONEXEC_STACK