/*
	synth_neon64: NEON optimized synth for AArch64

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

#ifndef __APPLE__
	.section .rodata
#else
	.data
#endif
	ALIGN16
// Clipping thresholds for signed 16-bit output; ld2r below splats the pair
// into v28 (max) and v29 (min).
maxmin_s16:
	.word 32767
	.word -32768
	.text
	ALIGN4
	.globl ASM_NAME(synth_1to1_neon64_asm)
#ifdef __ELF__
	.type ASM_NAME(synth_1to1_neon64_asm), %function
#endif
//-----------------------------------------------------------------------------
// int synth_1to1_neon64_asm(window, b0, samples, bo1)
// NOTE(review): argument names presumed from the mpg123 C caller convention —
// confirm against the synth_1to1 wrapper in the C sources.
// In:   x0 = 16-bit window coefficient base
//       x1 = 16-bit dct64 output block, read forward then backward
//       x2 = interleaved s16 sample output (one channel of a stereo pair)
//       x3 = window offset in 16-bit units (subtracted from x0 below)
// Out:  w0 = number of values that fell outside [-32768, 32767] (clip count)
// Clobbers: x0-x6, v0-v31 and flags only — all caller-saved under AAPCS64,
//       so no stack frame or register spills are needed.
//-----------------------------------------------------------------------------
ASM_NAME(synth_1to1_neon64_asm):
	add x0, x0, #32                   // skip the first 16 window entries (32 bytes)
	sub x0, x0, x3, lsl #1            // back off by the offset, scaled to bytes
	eor v31.16b, v31.16b, v31.16b     // v31 = clip counter accumulator, zeroed
	adrp x5, AARCH64_PCREL_HI(maxmin_s16)
	add x5, x5, AARCH64_PCREL_LO(maxmin_s16)
	ld2r {v28.4s,v29.4s}, [x5]        // v28 = 32767 in all lanes, v29 = -32768

	mov w4, #4                        // first half: 4 iterations x 4 samples
	mov x5, #64                       // window row stride in bytes
1:
	// Four rows of 16 window coefficients (64-byte stride) against 64
	// sequential dct64 values (32 bytes per row, read forward).
	ld1 {v0.8h,v1.8h}, [x0], x5
	ld1 {v2.8h,v3.8h}, [x0], x5
	ld1 {v4.8h,v5.8h}, [x0], x5
	ld1 {v6.8h,v7.8h}, [x0], x5
	ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x1], #64
	ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x1], #64

	// 16-tap dot products: v24..v27 each hold four 32-bit partial sums
	// of one output sample (low halves, high halves, of both 8h pairs).
	smull v24.4s, v0.4h, v16.4h
	smull v25.4s, v2.4h, v18.4h
	smull v26.4s, v4.4h, v20.4h
	smull v27.4s, v6.4h, v22.4h
	smlal2 v24.4s, v0.8h, v16.8h
	smlal2 v25.4s, v2.8h, v18.8h
	smlal2 v26.4s, v4.8h, v20.8h
	smlal2 v27.4s, v6.8h, v22.8h
	smlal v24.4s, v1.4h, v17.4h
	smlal v25.4s, v3.4h, v19.4h
	smlal v26.4s, v5.4h, v21.4h
	smlal v27.4s, v7.4h, v23.4h
	smlal2 v24.4s, v1.8h, v17.8h
	smlal2 v25.4s, v3.8h, v19.8h
	smlal2 v26.4s, v5.8h, v21.8h
	smlal2 v27.4s, v7.8h, v23.8h

	// Pairwise-add tree: collapse each sample's four partials into one
	// lane, giving four finished 32-bit samples in v0.
	addp v0.4s, v24.4s, v25.4s
	addp v1.4s, v26.4s, v27.4s
	addp v0.4s, v0.4s, v1.4s
	ld2 {v4.4h,v5.4h}, [x2]           // deinterleave: v5 keeps the other channel's samples
	sqrshrn v4.4h, v0.4s, #13         // round, shift right 13, saturate to s16
	cmgt v2.4s, v0.4s, v28.4s         // -1 per lane that exceeded 32767
	cmgt v3.4s, v29.4s, v0.4s         // -1 per lane below -32768
	add v2.4s, v2.4s, v3.4s
	add v31.4s, v31.4s, v2.4s         // accumulate negated clip counts
	st2 {v4.4h,v5.4h}, [x2], #16      // re-interleave; other channel written back untouched

	subs w4, w4, #1
	b.ne 1b

	mov w4, #4                        // second half: 4 more iterations
	mov x6, #-32                      // dct64 block now walked backward
2:
	// Same as loop 1, but the coefficient rows come from descending
	// addresses (post-decrement by 32 bytes after each 32-byte load).
	ld1 {v0.8h,v1.8h}, [x0], x5
	ld1 {v2.8h,v3.8h}, [x0], x5
	ld1 {v4.8h,v5.8h}, [x0], x5
	ld1 {v6.8h,v7.8h}, [x0], x5
	ld1 {v16.8h,v17.8h}, [x1], x6
	ld1 {v18.8h,v19.8h}, [x1], x6
	ld1 {v20.8h,v21.8h}, [x1], x6
	ld1 {v22.8h,v23.8h}, [x1], x6

	smull v24.4s, v0.4h, v16.4h
	smull v25.4s, v2.4h, v18.4h
	smull v26.4s, v4.4h, v20.4h
	smull v27.4s, v6.4h, v22.4h
	smlal2 v24.4s, v0.8h, v16.8h
	smlal2 v25.4s, v2.8h, v18.8h
	smlal2 v26.4s, v4.8h, v20.8h
	smlal2 v27.4s, v6.8h, v22.8h
	smlal v24.4s, v1.4h, v17.4h
	smlal v25.4s, v3.4h, v19.4h
	smlal v26.4s, v5.4h, v21.4h
	smlal v27.4s, v7.4h, v23.4h
	smlal2 v24.4s, v1.8h, v17.8h
	smlal2 v25.4s, v3.8h, v19.8h
	smlal2 v26.4s, v5.8h, v21.8h
	smlal2 v27.4s, v7.8h, v23.8h

	addp v0.4s, v24.4s, v25.4s
	addp v1.4s, v26.4s, v27.4s
	addp v0.4s, v0.4s, v1.4s
	ld2 {v4.4h,v5.4h}, [x2]
	sqrshrn v4.4h, v0.4s, #13
	cmgt v2.4s, v0.4s, v28.4s
	cmgt v3.4s, v29.4s, v0.4s
	add v2.4s, v2.4s, v3.4s
	add v31.4s, v31.4s, v2.4s
	st2 {v4.4h,v5.4h}, [x2], #16

	subs w4, w4, #1
	b.ne 2b

	// Horizontal reduction of the four clip-counter lanes in v31.
	AARCH64_DUP_2D(v0, v31, 1)        // v0 = high 64 bits of v31 in both halves
	add v0.4s, v0.4s, v31.4s
	AARCH64_DUP_4S(v1, v0, 1)         // v1 = lane 1 broadcast
	add v0.4s, v0.4s, v1.4s
	umov w0, v0.s[0]
	neg w0, w0                        // lanes were -1 per clip; return positive count

	ret

NONEXEC_STACK