/*
    synth_neon64_s32: NEON optimized synth for AArch64 (32-bit output version)

    copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
    see COPYING and AUTHORS files in distribution or http://mpg123.org
    initially written by Taihei Monma
*/

#include "mangle.h"

/*
    Clipping limits and output scale, stored as IEEE-754 single-precision
    bit patterns (hence .word rather than .float). They are loaded once and
    broadcast to v28/v29/v30 by the ld3r at function entry.
    Placed in .rodata, or in .data when __APPLE__ is defined.
*/
#ifndef __APPLE__
    .section    .rodata
#else
    .data
#endif
    ALIGN16
maxmin_s32:
    .word   1191182335      /* 0x46FFFFFF =  32767.998f, largest float below 32768 (upper clip limit) */
    .word   -956301312      /* 0xC7000000 = -32768.0f (lower clip limit) */
    .word   1199570944      /* 0x47800000 =  65536.0f (scale applied before int conversion) */
    .text
    ALIGN4
    .globl ASM_NAME(synth_1to1_s32_neon64_asm)
#ifdef __ELF__
    .type ASM_NAME(synth_1to1_s32_neon64_asm), %function
#endif
/*
    synth_1to1_s32_neon64_asm

    Computes 32 output samples. Each sample is a 16-term float dot product
    of one 64-byte row read through x0 with one 64-byte row read through x1.
    The sum is multiplied by 65536.0, converted to a signed 32-bit integer
    (fcvtns: round to nearest, ties to even) and stored through x2.

    In (register usage as seen in this file; the C prototype lives elsewhere,
    presumably (float *window, float *b0, int32_t *samples, int bo1) - TODO
    confirm against the caller):
      x0  float pointer. Adjusted on entry to x0 + 64 - 4*x3 bytes, then read
          64 bytes at a time with a 128-byte stride (every other 16-float row).
      x1  float pointer. The first loop reads 16 consecutive 64-byte rows going
          forward; the second loop starts at the row following those and reads
          16 rows going backward.
      x2  int32 output pointer. Only the even-indexed words are written
          (ld2/st2 with v5 left untouched), so the odd-indexed words, i.e. the
          other channel of an interleaved pair, are preserved. 256 bytes are
          covered in total.
      x3  offset in floats, scaled by 4 before use.
    Out:
      w0  number of samples whose unscaled value was above 32767.998 or
          below -32768.0.
    Clobbers: x0, x1, x2, x4, x5, x6, v0-v7, v16-v31, flags.
    Leaf function: no stack use, v8-v15 are not touched.

    NOTE(review): "sub x0, x0, x3, lsl #2" consumes all 64 bits of x3. If the
    C prototype declares that argument as a 32-bit int, the upper 32 bits are
    not guaranteed by the standard AArch64 procedure call rules, and
    "sub x0, x0, w3, sxtw #2" would be the robust form - confirm.
*/
ASM_NAME(synth_1to1_s32_neon64_asm):
    add     x0, x0, #64
    sub     x0, x0, x3, lsl #2                  /* x0 = x0 + 64 - 4*x3 bytes */
    eor     v31.16b, v31.16b, v31.16b           /* v31 = 0: per-lane clip accumulator */
    adrp    x5, AARCH64_PCREL_HI(maxmin_s32)
    add     x5, x5, AARCH64_PCREL_LO(maxmin_s32)
    ld3r    {v28.4s,v29.4s,v30.4s}, [x5]        /* v28 = max, v29 = min, v30 = scale, each in all 4 lanes */

    mov     w4, #4                              /* 4 iterations x 4 samples = first 16 samples */
    mov     x5, #128                            /* byte stride between rows read via x0 */
1:
    /* Rows 0 and 1 of this group: x0 steps by 128 bytes, x1 forward by 64. */
    ld1     {v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
    ld1     {v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
    ld1     {v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
    ld1     {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64

    /* v24 / v25: four partial sums (one per lane) for rows 0 / 1. */
    fmul    v24.4s, v0.4s, v16.4s
    fmul    v25.4s, v4.4s, v20.4s
    fmla    v24.4s, v1.4s, v17.4s
    fmla    v25.4s, v5.4s, v21.4s
    fmla    v24.4s, v2.4s, v18.4s
    fmla    v25.4s, v6.4s, v22.4s
    fmla    v24.4s, v3.4s, v19.4s
    fmla    v25.4s, v7.4s, v23.4s

    /* Rows 2 and 3 of this group. */
    ld1     {v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
    ld1     {v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
    ld1     {v16.4s,v17.4s,v18.4s,v19.4s}, [x1], #64
    ld1     {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64

    /* v26 / v27: four partial sums for rows 2 / 3. */
    fmul    v26.4s, v0.4s, v16.4s
    fmul    v27.4s, v4.4s, v20.4s
    fmla    v26.4s, v1.4s, v17.4s
    fmla    v27.4s, v5.4s, v21.4s
    fmla    v26.4s, v2.4s, v18.4s
    fmla    v27.4s, v6.4s, v22.4s
    fmla    v26.4s, v3.4s, v19.4s
    fmla    v27.4s, v7.4s, v23.4s

    /* Two rounds of pairwise adds fold the 4x4 partial sums into
       v0 = { sum(row0), sum(row1), sum(row2), sum(row3) }. */
    faddp   v0.4s, v24.4s, v25.4s
    faddp   v1.4s, v26.4s, v27.4s
    faddp   v0.4s, v0.4s, v1.4s
    fmul    v1.4s, v0.4s, v30.4s                /* scale by 65536.0 */
    ld2     {v4.4s,v5.4s}, [x2]                 /* de-interleave 8 words: v4 = even, v5 = odd */
    fcvtns  v4.4s, v1.4s                        /* replace the even words with the new samples */
    fcmgt   v2.4s, v0.4s, v28.4s                /* lane = all ones (-1) where unscaled sum > max */
    fcmgt   v3.4s, v29.4s, v0.4s                /* lane = all ones (-1) where unscaled sum < min */
    add     v2.4s, v2.4s, v3.4s                 /* the two conditions are exclusive: 0 or -1 per lane */
    add     v31.4s, v31.4s, v2.4s               /* accumulate: -(clips so far) per lane */
    st2     {v4.4s,v5.4s}, [x2], #32            /* write back interleaved; odd words unchanged */

    subs    w4, w4, #1
    b.ne    1b

    mov     w4, #4                              /* 4 iterations x 4 samples = last 16 samples */
    mov     x6, #-64                            /* x1 now walks backward one row per load */
2:
    /* Same computation as loop 1, except x1 post-decrements by 64 bytes. */
    ld1     {v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
    ld1     {v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
    ld1     {v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
    ld1     {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6

    fmul    v24.4s, v0.4s, v16.4s
    fmul    v25.4s, v4.4s, v20.4s
    fmla    v24.4s, v1.4s, v17.4s
    fmla    v25.4s, v5.4s, v21.4s
    fmla    v24.4s, v2.4s, v18.4s
    fmla    v25.4s, v6.4s, v22.4s
    fmla    v24.4s, v3.4s, v19.4s
    fmla    v25.4s, v7.4s, v23.4s

    ld1     {v0.4s,v1.4s,v2.4s,v3.4s}, [x0], x5
    ld1     {v4.4s,v5.4s,v6.4s,v7.4s}, [x0], x5
    ld1     {v16.4s,v17.4s,v18.4s,v19.4s}, [x1], x6
    ld1     {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], x6

    fmul    v26.4s, v0.4s, v16.4s
    fmul    v27.4s, v4.4s, v20.4s
    fmla    v26.4s, v1.4s, v17.4s
    fmla    v27.4s, v5.4s, v21.4s
    fmla    v26.4s, v2.4s, v18.4s
    fmla    v27.4s, v6.4s, v22.4s
    fmla    v26.4s, v3.4s, v19.4s
    fmla    v27.4s, v7.4s, v23.4s

    faddp   v0.4s, v24.4s, v25.4s
    faddp   v1.4s, v26.4s, v27.4s
    faddp   v0.4s, v0.4s, v1.4s
    fmul    v1.4s, v0.4s, v30.4s
    ld2     {v4.4s,v5.4s}, [x2]
    fcvtns  v4.4s, v1.4s
    fcmgt   v2.4s, v0.4s, v28.4s
    fcmgt   v3.4s, v29.4s, v0.4s
    add     v2.4s, v2.4s, v3.4s
    add     v31.4s, v31.4s, v2.4s
    st2     {v4.4s,v5.4s}, [x2], #32

    subs    w4, w4, #1
    b.ne    2b

    /* Horizontal sum of the four accumulator lanes. The AARCH64_DUP_* macros
       come from mangle.h; presumably they broadcast element 1 of the source
       (as 2x64-bit, then as 4x32-bit) so the two adds leave the total in
       lane 0 - TODO confirm against mangle.h. */
    AARCH64_DUP_2D(v0, v31, 1)
    add     v0.4s, v0.4s, v31.4s
    AARCH64_DUP_4S(v1, v0, 1)
    add     v0.4s, v0.4s, v1.4s
    umov    w0, v0.s[0]
    neg     w0, w0                              /* each clip contributed -1: negate to get the count */

    ret

/* Macro from mangle.h; presumably marks the stack non-executable on ELF targets - TODO confirm. */
NONEXEC_STACK