/*
	dct36_neon64: NEON optimized dct36 for AArch64

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

#include "mangle.h"

/*
	Constant table: 26 single-precision floats written as raw IEEE-754 bit
	patterns. Each row below corresponds to one vector register as loaded by
	the routine (rows 0-3 -> v16..v19, rows 4-5 -> v5/v6, last row -> v7.2s).
	The decimal values in the comments are rounded decodings of the bit
	patterns; the COS9_ index labels follow the register map comments inside
	the routine.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
dct36_aarch64_COS9:
	.word	0x3f5db3d7, 0x3f5db3d7, 0x3f000000, 0x3f000000	/* COS9_[3,3,6,6] ~0.8660, 0.5 */
	.word	0x3f7c1c5c, 0x3f7c1c5c, 0x3f708fb2, 0x3f708fb2	/* COS9_[1,1,2,2] ~0.9848, ~0.9397 */
	.word	0x3f248dbb, 0x3f248dbb, 0x3e31d0d4, 0x3e31d0d4	/* COS9_[5,5,8,8] ~0.6428, ~0.1736 */
	.word	0x3eaf1d44, 0x3eaf1d44, 0x3f441b7d, 0x3f441b7d	/* COS9_[7,7,4,4] ~0.3420, ~0.7660 */
	.word	0x3f007d2b, 0x3f0483ee, 0x3f0d3b7d, 0x3f1c4257	/* ~0.5019, ~0.5176, ~0.5517, ~0.6104 */
	.word	0x40b79454, 0x3ff746ea, 0x3f976fd9, 0x3f5f2944	/* ~5.7369, ~1.9319, ~1.1831, ~0.8717 */
	.word	0x3f800000, 0x3f3504f3				/* 1.0, ~0.7071 */

/*
	dct36_neon64

	Register use on entry, as far as this file shows:
	  x0  pointer to 18 input floats (72 bytes are read, nothing is written)
	  x1  pointer to 18 floats that are read and used as addends
	  x2  pointer to 18 floats that are written
	  x3  pointer to a float table that is read at byte offsets 0..143
	  x4  pointer to an output area written as 18 single floats, one every
	      128 bytes
	NOTE(review): this presumably matches a C prototype along the lines of
	  void dct36_neon64(real *in, real *out1, real *out2, real *w, real *ts)
	-- confirm against the declaration on the C side.

	Clobbers: x0, x4-x7, x9, v0-v7, v16-v29. No stack is used and v8-v15
	are not touched.

	The instruction order below is kept exactly as originally scheduled.
	"mov Vd.16b, Vn.16b" and "mov Vd.x[i], Vn.x[j]" are the architectural
	aliases of "orr Vd, Vn, Vn" and "ins"; they assemble to the same
	encodings.
*/
	.text
	ALIGN4
	.globl ASM_NAME(dct36_neon64)
#ifdef __ELF__
	.type ASM_NAME(dct36_neon64), %function
#endif
ASM_NAME(dct36_neon64):
	adrp	x5, AARCH64_PCREL_HI(dct36_aarch64_COS9)
	add	x5, x5, AARCH64_PCREL_LO(dct36_aarch64_COS9)	/* x5 = &dct36_aarch64_COS9 */
	cmeq	v28.16b, v28.16b, v28.16b	/* all bits set, whatever v28 held before */
	eor	v29.16b, v29.16b, v29.16b	/* v29 = 0 */
	shl	v28.2d, v28.2d, #32		/* v28 = mask keeping the odd 32-bit lanes */
	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64	/* in[0..15] */
	ld1	{v4.2s}, [x0]				/* in[16..17], upper half zeroed */

	/* Pass 1: every element gets its predecessor added (zero before in[0]). */
	ext	v16.16b, v29.16b, v0.16b, #12
	ext	v17.16b, v0.16b, v1.16b, #12
	ext	v18.16b, v1.16b, v2.16b, #12
	ext	v19.16b, v2.16b, v3.16b, #12
	ext	v20.16b, v3.16b, v4.16b, #12
	fadd	v0.4s, v0.4s, v16.4s
	fadd	v1.4s, v1.4s, v17.4s
	fadd	v2.4s, v2.4s, v18.4s
	fadd	v3.4s, v3.4s, v19.4s
	fadd	v4.2s, v4.2s, v20.2s

	/*
	   Pass 2: the vectors are re-based by two elements (v1 now starts at
	   in[2]) and every odd-indexed element gets the odd-indexed element
	   two positions below it added (selected with the v28 mask).
	*/
	ext	v16.16b, v0.16b, v1.16b, #8
	ext	v17.16b, v1.16b, v2.16b, #8
	ext	v18.16b, v2.16b, v3.16b, #8
	ext	v19.16b, v3.16b, v4.16b, #8
	and	v20.16b, v0.16b, v28.16b
	ext	v0.16b, v29.16b, v0.16b, #8
	and	v21.16b, v1.16b, v28.16b
	and	v22.16b, v2.16b, v28.16b
	and	v23.16b, v3.16b, v28.16b
	fadd	v1.4s, v20.4s, v16.4s
	fadd	v2.4s, v21.4s, v17.4s
	fadd	v3.4s, v22.4s, v18.4s
	fadd	v4.4s, v23.4s, v19.4s

/*
v0 in[-,-,0,1]
v1 in[2,3,4,5]
v2 in[6,7,8,9]
v3 in[10,11,12,13]
v4 in[14,15,16,17]
*/

	/* Rotate the upper 64-bit halves of v2/v3/v4; v5 keeps the old v2. */
	mov	v5.16b, v2.16b
	mov	v2.d[1], v3.d[1]
	mov	v3.d[1], v4.d[1]
	mov	v4.d[1], v5.d[1]

/*
v2 in[6,7,12,13]
v3 in[10,11,16,17]
v4 in[14,15,8,9]
*/

	ld1	{v16.4s, v17.4s, v18.4s, v19.4s}, [x5], #64	/* table rows 0-3 */
	mov	v20.16b, v0.16b
	fmla	v20.4s, v2.4s, v16.4s		/* v20 = v0 + v2 * v16 */

/*
v17 COS9_[1,1,2,2]
v18 COS9_[5,5,8,8]
v19 COS9_[7,7,4,4]
v16 COS9_[3,3,6,6]
v20 [ta33,tb33,ta66,tb66]
*/

	/* Multiply-accumulate network producing v24..v27. */
	mov	v21.16b, v20.16b
	mov	v23.16b, v20.16b
	zip2	v25.2d, v29.2d, v2.2d		/* v25 = [0, 0, upper half of v2] */
	fsub	v22.4s, v1.4s, v3.4s
	fmul	v24.4s, v1.4s, v17.4s
	fmul	v26.4s, v1.4s, v18.4s
	fmul	v27.4s, v1.4s, v19.4s
	fmla	v21.4s, v3.4s, v18.4s
	fmla	v23.4s, v3.4s, v19.4s
	fmla	v20.4s, v4.4s, v18.4s
	fsub	v25.4s, v0.4s, v25.4s
	fsub	v22.4s, v22.4s, v4.4s
	fmla	v24.4s, v4.4s, v19.4s
	fmla	v26.4s, v4.4s, v17.4s
	fmla	v27.4s, v3.4s, v17.4s
	fmla	v25.4s, v22.4s, v16.4s
	fadd	v24.4s, v24.4s, v21.4s
	fsub	v26.4s, v26.4s, v23.4s
	fsub	v27.4s, v27.4s, v20.4s

	/*
	   4x4 transpose of v24..v27 into v20..v23; the lane 2 and lane 3
	   contributions of v26/v27 are negated on the way (fneg on v19).
	*/
	zip1	v16.4s, v24.4s, v25.4s
	zip2	v17.4s, v24.4s, v25.4s
	zip1	v18.4s, v26.4s, v27.4s
	zip2	v19.4s, v26.4s, v27.4s
	fneg	v19.4s, v19.4s
	zip1	v20.2d, v16.2d, v18.2d
	zip1	v21.2d, v17.2d, v19.2d
	zip2	v22.2d, v16.2d, v18.2d
	zip2	v23.2d, v17.2d, v19.2d

	ld1	{v5.4s, v6.4s}, [x5], #32	/* table rows 4-5 */
	ld1	{v7.2s}, [x5]			/* last table row: 1.0, ~0.7071 */
	fsub	v0.4s, v0.4s, v1.4s
	fsub	v4.4s, v4.4s, v2.4s
	fadd	v17.4s, v22.4s, v23.4s
	fsub	v19.4s, v23.4s, v22.4s
	fadd	v0.4s, v0.4s, v3.4s
	fadd	v16.4s, v20.4s, v21.4s
	fsub	v18.4s, v21.4s, v20.4s
	fadd	v0.4s, v0.4s, v4.4s
	fmul	v17.4s, v17.4s, v5.4s
	fmul	v19.4s, v19.4s, v6.4s
	/* Macro from mangle.h; presumably copies 64-bit lane 1 of v0 into the
	   low half so the .2s multiply below sees it -- TODO confirm. */
	AARCH64_DUP_2D(v0, v0, 1)
	fmul	v0.2s, v0.2s, v7.2s

/*
v16 tmp[0,1,2,3]
v17 tmp[17,16,15,14]
v18 tmp[8,7,6,5]
v19 tmp[9,10,11,12]
v0 tmp[4,13]
*/

	/*
	   Output group 1 (from v16/v17):
	     x2 floats 5..12  <- sum  * x3 floats 23..30
	     x4 slots  5..12  <- x1 floats 5..12 + diff * x3 floats 5..12
	   The rev64 + ext #8 pair reverses the four lanes of a vector.
	*/
	add	x0, x4, #640			/* slot 5: 5 * 128 bytes */
	add	x5, x3, #20
	add	x6, x3, #92
	add	x7, x1, #20
	ld1	{v1.4s, v2.4s}, [x5]
	ld1	{v3.4s, v4.4s}, [x6]
	ld1	{v5.4s, v6.4s}, [x7]
	fadd	v20.4s, v16.4s, v17.4s
	fsub	v21.4s, v16.4s, v17.4s
	fmul	v4.4s, v20.4s, v4.4s
	fmla	v6.4s, v21.4s, v2.4s
	rev64	v20.4s, v20.4s
	rev64	v21.4s, v21.4s
	ext	v20.16b, v20.16b, v20.16b, #8
	ext	v21.16b, v21.16b, v21.16b, #8
	fmul	v3.4s, v20.4s, v3.4s
	fmla	v5.4s, v21.4s, v1.4s
	add	x5, x2, #20
	mov	x9, #128			/* byte stride between x4 output slots */
	st1	{v3.4s, v4.4s}, [x5]
	st1	{v5.s}[0], [x0], x9
	st1	{v5.s}[1], [x0], x9
	st1	{v5.s}[2], [x0], x9
	st1	{v5.s}[3], [x0], x9
	st1	{v6.s}[0], [x0], x9
	st1	{v6.s}[1], [x0], x9
	st1	{v6.s}[2], [x0], x9
	st1	{v6.s}[3], [x0], x9

	/*
	   Output group 2 (from v18/v19):
	     x2 floats 0..3 and 14..17  <- sum  * x3 floats 18..21 and 32..35
	     x4 slots  0..3 and 14..17  <- x1 floats + diff * x3 floats 0..3
	                                   and 14..17
	   The four post-indexed stores through x4 leave it advanced by
	   4 * 128 = 512 bytes (slot 4).
	*/
	add	x0, x4, #1792			/* slot 14: 14 * 128 bytes */
	add	x5, x3, #56
	add	x6, x3, #128
	add	x7, x1, #56
	ld1	{v1.4s}, [x3]
	ld1	{v2.4s, v3.4s}, [x5]
	ld1	{v4.4s}, [x6]
	ld1	{v5.4s}, [x1]
	ld1	{v6.4s}, [x7]
	fadd	v20.4s, v18.4s, v19.4s
	fsub	v21.4s, v18.4s, v19.4s
	fmul	v3.4s, v20.4s, v3.4s
	fmla	v5.4s, v21.4s, v1.4s
	rev64	v20.4s, v20.4s
	rev64	v21.4s, v21.4s
	ext	v20.16b, v20.16b, v20.16b, #8
	ext	v21.16b, v21.16b, v21.16b, #8
	fmul	v4.4s, v20.4s, v4.4s
	fmla	v6.4s, v21.4s, v2.4s
	add	x5, x2, #56
	st1	{v3.4s}, [x2]
	st1	{v4.4s}, [x5]
	st1	{v5.s}[0], [x4], x9
	st1	{v5.s}[1], [x4], x9
	st1	{v5.s}[2], [x4], x9
	st1	{v5.s}[3], [x4], x9
	st1	{v6.s}[0], [x0], x9
	st1	{v6.s}[1], [x0], x9
	st1	{v6.s}[2], [x0], x9
	st1	{v6.s}[3], [x0], x9

	/*
	   Remaining scalar pair (the two lanes of v0): x2 floats 4 and 13,
	   and x4 slots 4 and 13. x4 already points at slot 4 here, so slot 13
	   is 9 * 128 = 1152 bytes further on.
	*/
	mov	v1.s[0], v0.s[1]
	ldr	s2, [x3, #16]
	ldr	s3, [x3, #52]
	ldr	s4, [x3, #88]
	ldr	s5, [x3, #124]
	ldr	s6, [x1, #16]
	ldr	s7, [x1, #52]
	fadd	s16, s0, s1
	fsub	s17, s0, s1
	fmul	s4, s16, s4
	fmul	s5, s16, s5
	fmadd	s6, s17, s2, s6			/* s6 = s6 + s17 * s2 */
	fmadd	s7, s17, s3, s7			/* s7 = s7 + s17 * s3 */
	str	s4, [x2, #16]
	str	s5, [x2, #52]
	str	s6, [x4]
	str	s7, [x4, #1152]

	ret

NONEXEC_STACK