/*
	dct64_neon64: NEON optimized dct64 for AArch64

	copyright 1995-2014 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Taihei Monma
*/

/*
	Interface as visible from the code below (AArch64, GNU as syntax, preprocessed by cpp):

	In:
		x0 = first output pointer; 17 halfwords are stored through it, 32 bytes apart
		x1 = second output pointer; 16 halfwords are stored through it, 32 bytes apart
		x2 = input pointer; 32 single-precision values (128 bytes) are read from it
	Out:
		nothing is returned in a register; results are the halfword stores via x0 and x1
	Clobbers:
		x0, x1 (advanced by the post-indexed stores), x3, x4,
		v0-v7, v16-v29, v31
	Not touched:
		v8-v15, the stack, the flags

	NOTE(review): presumably called from C as
		dct64_neon64(short *out0, short *out1, float *samples)
	- confirm against the prototype used by the caller.

	The bs[...] annotations on the instructions name the element indices held in the
	four lanes of the destination register, lowest lane first. They presumably refer
	to the intermediate buffer of the scalar dct64 implementation - verify against
	the C reference.
*/

#include "mangle.h"

/*
	Coefficient table: 32 words that are loaded as .4s vectors and used only as fmul
	operands, i.e. they are IEEE-754 single-precision bit patterns written as decimal
	integers (for example 1060439283 = 0x3F3504F3, about 0.70710677).
	Consumption order in the routine: 16 words (first stage), 8 words (second stage),
	4 words (third stage), 4 words (fourth and fifth stage).
	NOTE(review): presumably the usual dct64 cosine table - confirm against the
	generator of the C table.
*/
#ifndef __APPLE__
	.section	.rodata
#else
	.data
#endif
	ALIGN16
costab_neon_aarch64:
	/* words 0-15: multipliers of the first stage */
	.word 1056974725
	.word 1057056395
	.word 1057223771
	.word 1057485416
	.word 1057855544
	.word 1058356026
	.word 1059019886
	.word 1059897405
	.word 1061067246
	.word 1062657950
	.word 1064892987
	.word 1066774581
	.word 1069414683
	.word 1073984175
	.word 1079645762
	.word 1092815430
	/* words 16-23: multipliers of the second stage */
	.word 1057005197
	.word 1057342072
	.word 1058087743
	.word 1059427869
	.word 1061799040
	.word 1065862217
	.word 1071413542
	.word 1084439708
	/* words 24-27: multipliers of the third stage */
	.word 1057128951
	.word 1058664893
	.word 1063675095
	.word 1076102863
	/* words 28-29: multipliers of the fourth stage */
	.word 1057655764
	.word 1067924853
	/* words 30-31: the same value twice; word 30 is the multiplier of the fifth stage */
	.word 1060439283
	.word 1060439283
	.text
	ALIGN4
	.globl ASM_NAME(dct64_neon64)
#ifdef __ELF__
	.type ASM_NAME(dct64_neon64), %function
#endif
ASM_NAME(dct64_neon64):
	/*
		Stage 1: load all 32 inputs and the first 16 coefficients.
		x3 = address of the upper 16 inputs, x4 = coefficient table cursor.
	*/
	add		x3, x2, #64
	adrp	x4, AARCH64_PCREL_HI(costab_neon_aarch64)
	add		x4, x4, AARCH64_PCREL_LO(costab_neon_aarch64)
	ld1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x2]
	ld1		{v16.4s, v17.4s, v18.4s, v19.4s}, [x3]
	ld1		{v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64

	/* rev64 + ext #8 together reverse the four lanes: v4-v7 = inputs 31..16 in descending order */
	rev64	v19.4s, v19.4s
	rev64	v18.4s, v18.4s
	rev64	v17.4s, v17.4s
	rev64	v16.4s, v16.4s
	ext		v4.16b, v19.16b, v19.16b, #8
	ext		v5.16b, v18.16b, v18.16b, #8
	ext		v6.16b, v17.16b, v17.16b, #8
	ext		v7.16b, v16.16b, v16.16b, #8

	/* butterflies: sums stay in v0-v3, scaled differences go to v16-v19 (lanes in descending index order) */
	fsub	v16.4s, v3.4s, v7.4s
	fsub	v17.4s, v2.4s, v6.4s
	fsub	v18.4s, v1.4s, v5.4s
	fsub	v19.4s, v0.4s, v4.4s
	fadd	v0.4s, v0.4s, v4.4s /* bs[0,1,2,3] */
	fadd	v1.4s, v1.4s, v5.4s /* bs[4,5,6,7] */
	fadd	v2.4s, v2.4s, v6.4s /* bs[8,9,10,11] */
	fadd	v3.4s, v3.4s, v7.4s /* bs[12,13,14,15] */
	fmul	v16.4s, v16.4s, v23.4s /* bs[19,18,17,16] */
	fmul	v17.4s, v17.4s, v22.4s /* bs[23,22,21,20] */
	fmul	v18.4s, v18.4s, v21.4s /* bs[27,26,25,24] */
	fmul	v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */

	/* Stage 2: next 8 coefficients; pair each group of 16 values end against end */
	ld1		{v20.4s, v21.4s}, [x4], #32
	rev64	v22.4s, v3.4s
	rev64	v23.4s, v2.4s
	rev64	v24.4s, v16.4s
	rev64	v25.4s, v17.4s
	ext		v4.16b, v22.16b, v22.16b, #8 /* bs[15,14,13,12] */
	ext		v5.16b, v23.16b, v23.16b, #8 /* bs[11,10,9,8] */
	ext		v6.16b, v24.16b, v24.16b, #8 /* bs[16,17,18,19] */
	ext		v7.16b, v25.16b, v25.16b, #8 /* bs[20,21,22,23] */

	/* the second pair of differences (v28, v29) is taken with the opposite operand order to the first pair */
	fsub	v26.4s, v1.4s, v5.4s
	fsub	v27.4s, v0.4s, v4.4s
	fsub	v28.4s, v18.4s, v7.4s
	fsub	v29.4s, v19.4s, v6.4s
	fadd	v4.4s, v0.4s, v4.4s /* bs[32,33,34,35] */
	fadd	v5.4s, v1.4s, v5.4s /* bs[36,37,38,39] */
	fadd	v6.4s, v6.4s, v19.4s /* bs[48,49,50,51] */
	fadd	v7.4s, v7.4s, v18.4s /* bs[52,53,54,55] */
	fmul	v26.4s, v26.4s, v21.4s /* bs[43,42,41,40] */
	fmul	v27.4s, v27.4s, v20.4s /* bs[47,46,45,44] */
	fmul	v28.4s, v28.4s, v21.4s /* bs[59,58,57,56] */
	fmul	v29.4s, v29.4s, v20.4s /* bs[63,62,61,60] */

	/* Stage 3: next 4 coefficients, shared by all four groups of 8 values */
	ld1		{v20.4s}, [x4], #16
	rev64	v16.4s, v5.4s
	rev64	v17.4s, v26.4s
	rev64	v18.4s, v7.4s
	rev64	v19.4s, v28.4s
	ext		v0.16b, v16.16b, v16.16b, #8 /* bs[39,38,37,36] */
	ext		v1.16b, v17.16b, v17.16b, #8 /* bs[40,41,42,43] */
	ext		v2.16b, v18.16b, v18.16b, #8 /* bs[55,54,53,52] */
	ext		v3.16b, v19.16b, v19.16b, #8 /* bs[56,57,58,59] */

	fsub	v16.4s, v4.4s, v0.4s
	fsub	v17.4s, v27.4s, v1.4s
	fsub	v18.4s, v6.4s, v2.4s
	fsub	v19.4s, v29.4s, v3.4s
	fadd	v0.4s, v4.4s, v0.4s /* bs[0,1,2,3] */
	fadd	v1.4s, v1.4s, v27.4s /* bs[8,9,10,11] */
	fadd	v2.4s, v6.4s, v2.4s /* bs[16,17,18,19] */
	fadd	v3.4s, v3.4s, v29.4s /* bs[24,25,26,27] */
	fmul	v16.4s, v16.4s, v20.4s /* bs[7,6,5,4] */
	fmul	v17.4s, v17.4s, v20.4s /* bs[15,14,13,12] */
	fmul	v18.4s, v18.4s, v20.4s /* bs[23,22,21,20] */
	fmul	v19.4s, v19.4s, v20.4s /* bs[31,30,29,28] */

	/*
		Stage 4: last 4 table words go to v28 (no post-increment, x4 is not used again).
		The 64-bit zips regroup the data so that each butterfly partner sits in the same
		lane of two registers.
	*/
	ld1		{v28.4s}, [x4]
	zip1	v4.2d, v0.2d, v16.2d /* bs[0,1,7,6] */
	zip2	v5.2d, v0.2d, v16.2d /* bs[2,3,5,4] */
	zip1	v6.2d, v1.2d, v17.2d /* bs[8,9,15,14] */
	zip2	v7.2d, v1.2d, v17.2d /* bs[10,11,13,12] */
	zip1	v20.2d, v2.2d, v18.2d /* bs[16,17,23,22] */
	zip2	v21.2d, v2.2d, v18.2d /* bs[18,19,21,20] */
	zip1	v22.2d, v3.2d, v19.2d /* bs[24,25,31,30] */
	zip2	v23.2d, v3.2d, v19.2d /* bs[26,27,29,28] */
	rev64	v5.4s, v5.4s /* bs[3,2,4,5] */
	rev64	v7.4s, v7.4s /* bs[11,10,12,13] */
	rev64	v21.4s, v21.4s /* bs[19,18,20,21] */
	rev64	v23.4s, v23.4s /* bs[27,26,28,29] */
	/*
		NOTE(review): macros from mangle.h; presumably v29 = 64-bit element 0 of v28
		(table words 28,29) duplicated, and v28 = 32-bit element 2 of v28 (table word 30)
		broadcast to all lanes - confirm against mangle.h.
	*/
	AARCH64_DUP_2D(v29, v28, 0)
	AARCH64_DUP_4S(v28, v28, 2)

	fsub	v16.4s, v4.4s, v5.4s
	fsub	v17.4s, v6.4s, v7.4s
	fsub	v18.4s, v20.4s, v21.4s
	fsub	v19.4s, v22.4s, v23.4s
	fadd	v0.4s, v4.4s, v5.4s /* bs[32,33,36,37] */
	fadd	v1.4s, v6.4s, v7.4s /* bs[40,41,44,45] */
	fadd	v2.4s, v20.4s, v21.4s /* bs[48,49,52,53] */
	fadd	v3.4s, v22.4s, v23.4s /* bs[56,57,60,61] */
	fmul	v16.4s, v16.4s, v29.4s /* bs[35,34,39,38] */
	fmul	v17.4s, v17.4s, v29.4s /* bs[43,42,47,46] */
	fmul	v18.4s, v18.4s, v29.4s /* bs[51,50,55,54] */
	fmul	v19.4s, v19.4s, v29.4s /* bs[59,58,63,62] */

	/* Stage 5: de-interleave so that adjacent pairs become same-lane operands, then butterfly with v28 */
	uzp1	v4.4s, v0.4s, v16.4s /* bs[32,36,35,39] */
	uzp2	v5.4s, v0.4s, v16.4s /* bs[33,37,34,38] */
	uzp1	v6.4s, v1.4s, v17.4s /* bs[40,44,43,47] */
	uzp2	v7.4s, v1.4s, v17.4s /* bs[41,45,42,46] */
	uzp1	v20.4s, v2.4s, v18.4s /* bs[48,52,51,55] */
	uzp2	v21.4s, v2.4s, v18.4s /* bs[49,53,50,54] */
	uzp1	v22.4s, v3.4s, v19.4s /* bs[56,60,59,63] */
	uzp2	v23.4s, v3.4s, v19.4s /* bs[57,61,58,62] */

	fsub	v16.4s, v4.4s, v5.4s
	fsub	v17.4s, v6.4s, v7.4s
	fsub	v18.4s, v20.4s, v21.4s
	fsub	v19.4s, v22.4s, v23.4s
	fadd	v0.4s, v4.4s, v5.4s /* bs[0,4,2,6] */
	fadd	v1.4s, v6.4s, v7.4s /* bs[8,12,10,14] */
	fadd	v2.4s, v20.4s, v21.4s /* bs[16,20,18,22] */
	fadd	v3.4s, v22.4s, v23.4s /* bs[24,28,26,30] */
	fmul	v16.4s, v16.4s, v28.4s /* bs[1,5,3,7] */
	fmul	v17.4s, v17.4s, v28.4s /* bs[9,13,11,15] */
	fmul	v18.4s, v18.4s, v28.4s /* bs[17,21,19,23] */
	fmul	v19.4s, v19.4s, v28.4s /* bs[25,29,27,31] */

	/*
		Post-addition pass 1: for every group of four, element 2 += element 3.
		The sums are formed in v4/v6 and written back into the upper halves of v0-v3.
	*/
	zip2	v4.2d, v0.2d, v1.2d /* bs[2,6,10,14] */
	zip2	v5.2d, v16.2d, v17.2d /* bs[3,7,11,15] */
	zip2	v6.2d, v2.2d, v3.2d /* bs[18,22,26,30] */
	zip2	v7.2d, v18.2d, v19.2d /* bs[19,23,27,31] */
	fadd	v4.4s, v4.4s, v5.4s /* bs[2,6,10,14] */
	fadd	v6.4s, v6.4s, v7.4s /* bs[18,22,26,30] */
	ins		v0.d[1], v4.d[0] /* bs[0,4,2,6] */
	ins		v1.d[1], v4.d[1] /* bs[8,12,10,14] */
	ins		v2.d[1], v6.d[0] /* bs[16,20,18,22] */
	ins		v3.d[1], v6.d[1] /* bs[24,28,26,30] */

	/*
		Restore natural element order (v0-v3 = elements 0..15, v4-v7 = elements 16..31)
		and build the shuffled addend vectors v16-v19 and v24-v26 for the remaining passes.
		v31 is cleared so that lane 3 of every addend can be forced to zero.
	*/
	eor		v31.16b, v31.16b, v31.16b
	zip1	v4.4s, v0.4s, v16.4s /* bs[0,1,4,5] */
	zip2	v5.4s, v0.4s, v16.4s /* bs[2,3,6,7] */
	zip1	v6.4s, v1.4s, v17.4s /* bs[8,9,12,13] */
	zip2	v7.4s, v1.4s, v17.4s /* bs[10,11,14,15] */
	zip1	v20.4s, v2.4s, v18.4s /* bs[16,17,20,21] */
	zip2	v21.4s, v2.4s, v18.4s /* bs[18,19,22,23] */
	zip1	v22.4s, v3.4s, v19.4s /* bs[24,25,28,29] */
	zip2	v23.4s, v3.4s, v19.4s /* bs[26,27,30,31] */
	zip1	v0.2d, v4.2d, v5.2d /* bs[0,1,2,3] */
	zip2	v1.2d, v4.2d, v5.2d /* bs[4,5,6,7] */
	zip1	v2.2d, v6.2d, v7.2d /* bs[8,9,10,11] */
	zip2	v3.2d, v6.2d, v7.2d /* bs[12,13,14,15] */
	rev64	v16.4s, v4.4s /* bs[1,0,5,4] */
	rev64	v17.4s, v6.4s /* bs[9,8,13,12] */
	zip1	v24.2d, v7.2d, v17.2d /* bs[10,11,9,8] */
	zip2	v16.2d, v5.2d, v16.2d /* bs[6,7,5,4] */
	zip2	v17.2d, v7.2d, v17.2d /* bs[14,15,13,12] */
	zip1	v4.2d, v20.2d, v21.2d /* bs[16,17,18,19] */
	zip2	v5.2d, v20.2d, v21.2d /* bs[20,21,22,23] */
	zip1	v6.2d, v22.2d, v23.2d /* bs[24,25,26,27] */
	zip2	v7.2d, v22.2d, v23.2d /* bs[28,29,30,31] */
	rev64	v18.4s, v20.4s /* bs[17,16,21,20] */
	rev64	v19.4s, v22.4s /* bs[25,24,29,28] */
	zip1	v25.2d, v23.2d, v19.2d /* bs[26,27,25,24] */
	zip1	v26.2d, v21.2d, v18.2d /* bs[18,19,17,16] */
	zip2	v18.2d, v21.2d, v18.2d /* bs[22,23,21,20] */
	zip2	v19.2d, v23.2d, v19.2d /* bs[30,31,29,28] */
	/* zero lane 3 of each addend: the last element of a group receives no addition */
	ins		v16.s[3], v31.s[0] /* bs[6,7,5,-] */
	ins		v17.s[3], v31.s[0] /* bs[14,15,13,-] */
	ins		v18.s[3], v31.s[0] /* bs[22,23,21,-] */
	ins		v19.s[3], v31.s[0] /* bs[30,31,29,-] */
	ins		v24.s[3], v31.s[0] /* bs[10,11,9,-] */
	ins		v25.s[3], v31.s[0] /* bs[26,27,25,-] */
	ins		v26.s[3], v31.s[0] /* bs[18,19,17,-] */

	/* Post-addition pass 2 (groups of 8): upper four elements += [e6, e7, e5, 0] of the same group */
	fadd	v1.4s, v1.4s, v16.4s
	fadd	v3.4s, v3.4s, v17.4s
	fadd	v5.4s, v5.4s, v18.4s
	fadd	v7.4s, v7.4s, v19.4s

	/*
		Post-addition pass 3 (groups of 16). Order matters: the lower quad must consume
		the upper quad (v3 / v7) before that upper quad receives its own addend.
	*/
	fadd	v2.4s, v2.4s, v3.4s
	fadd	v3.4s, v3.4s, v24.4s
	fadd	v6.4s, v6.4s, v7.4s
	fadd	v7.4s, v7.4s, v25.4s

	/*
		Final combination of elements 16..31 into the values stored from v4-v7.
		Order matters here as well: each instruction reads a register that a later
		instruction in this group overwrites.
	*/
	fadd	v4.4s, v4.4s, v6.4s
	fadd	v6.4s, v6.4s, v5.4s
	fadd	v5.4s, v5.4s, v7.4s
	fadd	v7.4s, v7.4s, v26.4s

	/* float -> signed 32-bit integer, rounding to nearest with ties to even */
	fcvtns	v0.4s, v0.4s
	fcvtns	v1.4s, v1.4s
	fcvtns	v2.4s, v2.4s
	fcvtns	v3.4s, v3.4s
	fcvtns	v4.4s, v4.4s
	fcvtns	v5.4s, v5.4s
	fcvtns	v6.4s, v6.4s
	fcvtns	v7.4s, v7.4s
	/* signed 32-bit -> signed 16-bit with saturation; results land in the low 64 bits of each register */
	sqxtn	v0.4h, v0.4s
	sqxtn	v1.4h, v1.4s
	sqxtn	v2.4h, v2.4s
	sqxtn	v3.4h, v3.4s
	sqxtn	v4.4h, v4.4s
	sqxtn	v5.4h, v5.4s
	sqxtn	v6.4h, v6.4s
	sqxtn	v7.4h, v7.4s

	/*
		Scatter the results. x3 = 32 is the post-index byte stride, i.e. 16 halfwords
		between consecutive stores.
		First output (x0): 17 stores; the last one does not advance x0.
	*/
	mov		x3, #32
	st1		{v0.h}[1], [x0], x3
	st1		{v7.h}[2], [x0], x3
	st1		{v3.h}[2], [x0], x3
	st1		{v5.h}[2], [x0], x3
	st1		{v1.h}[2], [x0], x3
	st1		{v6.h}[2], [x0], x3
	st1		{v2.h}[2], [x0], x3
	st1		{v4.h}[2], [x0], x3
	st1		{v0.h}[2], [x0], x3
	st1		{v7.h}[0], [x0], x3
	st1		{v3.h}[0], [x0], x3
	st1		{v5.h}[0], [x0], x3
	st1		{v1.h}[0], [x0], x3
	st1		{v6.h}[0], [x0], x3
	st1		{v2.h}[0], [x0], x3
	st1		{v4.h}[0], [x0], x3
	st1		{v0.h}[0], [x0]
	/* Second output (x1): 16 stores; its first slot gets the same lane (v0.h[1]) as the first slot of x0 */
	st1		{v0.h}[1], [x1], x3
	st1		{v4.h}[1], [x1], x3
	st1		{v2.h}[1], [x1], x3
	st1		{v6.h}[1], [x1], x3
	st1		{v1.h}[1], [x1], x3
	st1		{v5.h}[1], [x1], x3
	st1		{v3.h}[1], [x1], x3
	st1		{v7.h}[1], [x1], x3
	st1		{v0.h}[3], [x1], x3
	st1		{v4.h}[3], [x1], x3
	st1		{v2.h}[3], [x1], x3
	st1		{v6.h}[3], [x1], x3
	st1		{v1.h}[3], [x1], x3
	st1		{v5.h}[3], [x1], x3
	st1		{v3.h}[3], [x1], x3
	st1		{v7.h}[3], [x1]

	ret

/* NOTE(review): macro from mangle.h; presumably marks the object as not needing an executable stack - confirm */
NONEXEC_STACK