/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2015 Martin Storsjo
 * Copyright © 2015 Janne Grunau
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#ifndef DAV1D_SRC_ARM_64_UTIL_S
#define DAV1D_SRC_ARM_64_UTIL_S

#include "config.h"
#include "src/arm/asm.S"

// movrel rd, val, offset=0
//
// Materialize the address of symbol \val plus the constant \offset in the
// general purpose register \rd.
//
//   __APPLE__        adrp/add using @PAGE/@PAGEOFF
//   PIC && _WIN32    adrp/add using :lo12:
//   PIC              adrp/add using :lo12:
//   otherwise        literal pool load (ldr \rd, =...)
//
// On the Apple and PIC+Windows paths a negative \offset is not folded into
// the symbol expression; the plain symbol address is formed first and the
// offset is then applied with a separate sub.
// NOTE(review): presumably those object formats cannot express a negative
// addend in the relocation - confirm before changing.
//
// Clobbers: only \rd.
.macro movrel rd, val, offset=0
#if defined(__APPLE__)
  .if \offset < 0
        adrp            \rd, \val@PAGE
        add             \rd, \rd, \val@PAGEOFF
        sub             \rd, \rd, -(\offset)    // -(\offset) is positive here
  .else
        adrp            \rd, \val+(\offset)@PAGE
        add             \rd, \rd, \val+(\offset)@PAGEOFF
  .endif
#elif defined(PIC) && defined(_WIN32)
  .if \offset < 0
        adrp            \rd, \val
        add             \rd, \rd, :lo12:\val
        sub             \rd, \rd, -(\offset)    // -(\offset) is positive here
  .else
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
  .endif
#elif defined(PIC)
        adrp            \rd, \val+(\offset)
        add             \rd, \rd, :lo12:\val+(\offset)
#else
        ldr             \rd, =\val+\offset
#endif
.endm

// sub_sp space
//
// Lower the stack pointer by the assembly-time constant \space.
//
// _WIN32:
//   \space > 8192          rejected with .error (more than one intermediate
//                          page would have to be touched).
//   4096 < \space <= 8192  sp - 4096 is formed in x16 and a load from that
//                          address is issued before sp is moved the rest of
//                          the way, so the intermediate page is touched.
//   \space <= 4096         a single sub.
// Other targets:
//   The amount is split into its multiple-of-4096 part and the remainder,
//   each emitted as its own sub (and omitted when zero).
//   NOTE(review): the split matches the add/sub immediate encoding
//   (12 bits, optionally shifted left by 12) - confirm against the ISA docs.
//
// \space is parenthesized everywhere it is used so that expression
// arguments (e.g. "2048+2048") are evaluated as a unit.
//
// Clobbers: x16 on the Windows 4096 < \space <= 8192 path.
.macro sub_sp space
#ifdef _WIN32
.if (\space) > 8192
        // Here, we'd need to touch two (or more) pages while decrementing
        // the stack pointer.
        .error          "sub_sp doesn't support values over 8K at the moment"
.elseif (\space) > 4096
        sub             x16, sp,  #4096
        ldr             xzr, [x16]              // touch the page; value discarded
        sub             sp,  x16, #((\space) - 4096)
.else
        sub             sp,  sp,  #(\space)
.endif
#else
.if (\space) >= 4096
        sub             sp,  sp,  #(\space)/4096*4096
.endif
.if ((\space) % 4096) != 0
        sub             sp,  sp,  #(\space)%4096
.endif
#endif
.endm

// transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
//
// Transpose an 8x8 matrix of bytes and widen every element to 16 bits.
//
// In:   the low 8 bytes of \r0-\r7 hold rows a-h (a0..a7, b0..b7, ...).
// Out:  \rN\().8h holds column N (aN bN cN dN eN fN gN hN), widened.
// \xtl: widening mnemonic; both \xtl and \xtl\()2 are emitted, so it must
//       have a "2" (upper half) form, e.g. uxtl or sxtl.
//
// No temporaries beyond \r0-\r7 are used.
.macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl
        // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7
        zip1            \r0\().16b, \r0\().16b, \r1\().16b
        // c0 d0 c1 d1 c2 d2 c3 d3 c4 d4 c5 d5 c6 d6 c7 d7
        zip1            \r2\().16b, \r2\().16b, \r3\().16b
        // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7
        zip1            \r4\().16b, \r4\().16b, \r5\().16b
        // g0 h0 g1 h1 g2 h2 g3 h3 g4 h4 g5 h5 g6 h6 g7 h7
        zip1            \r6\().16b, \r6\().16b, \r7\().16b

        // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6
        trn1            \r1\().8h,  \r0\().8h,  \r2\().8h
        // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7
        trn2            \r3\().8h,  \r0\().8h,  \r2\().8h
        // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6
        trn1            \r5\().8h,  \r4\().8h,  \r6\().8h
        // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7
        trn2            \r7\().8h,  \r4\().8h,  \r6\().8h

        // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4
        trn1            \r0\().4s,  \r1\().4s,  \r5\().4s
        // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6
        trn2            \r2\().4s,  \r1\().4s,  \r5\().4s
        // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5
        trn1            \r1\().4s,  \r3\().4s,  \r7\().4s
        // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7
        trn2            \r3\().4s,  \r3\().4s,  \r7\().4s

        // Each of r0-r3 now holds column N in its low half and column N+4
        // in its high half. Widen the high half into r(N+4) first, since
        // widening the low half overwrites the source register.
        \xtl\()2        \r4\().8h,  \r0\().16b
        \xtl            \r0\().8h,  \r0\().8b
        \xtl\()2        \r6\().8h,  \r2\().16b
        \xtl            \r2\().8h,  \r2\().8b
        \xtl\()2        \r5\().8h,  \r1\().16b
        \xtl            \r1\().8h,  \r1\().8b
        \xtl\()2        \r7\().8h,  \r3\().16b
        \xtl            \r3\().8h,  \r3\().8b
.endm

// transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
//
// In-place transpose of an 8x8 matrix of 16-bit elements.
//
// In:   \r0-\r7 hold rows 0-7 (.8h).
// Out:  \rN holds column N of the input.
// Clobbers: \t8, \t9.
.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Stage 1: interleave 16-bit elements of adjacent row pairs.
        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h

        // Stage 2: interleave 32-bit pairs.
        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s

        // Stage 3: combine 64-bit halves into the final columns.
        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
        // r6 is written before r2 because both read the old r2.
        trn2            \r6\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r2\().2d,  \t8\().2d,  \r2\().2d
        trn1            \r3\().2d,  \t9\().2d,  \r7\().2d
        trn2            \r7\().2d,  \t9\().2d,  \r7\().2d
.endm

// transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
//
// Same shuffle network as transpose_8x8h, one element size smaller
// (.16b -> .8h -> .4s). No stage crosses a 64-bit boundary, so the low and
// the high 8 bytes of the eight registers are each transposed as an
// independent 8x8 byte matrix.
//
// In:   \r0-\r7 hold rows 0-7 (.16b).
// Out:  each 64-bit half of \rN holds column N of the corresponding half
//       of the input.
// Clobbers: \t8, \t9.
.macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
        // Stage 1: interleave bytes of adjacent row pairs.
        trn1            \t8\().16b, \r0\().16b, \r1\().16b
        trn2            \t9\().16b, \r0\().16b, \r1\().16b
        trn1            \r1\().16b, \r2\().16b, \r3\().16b
        trn2            \r3\().16b, \r2\().16b, \r3\().16b
        trn1            \r0\().16b, \r4\().16b, \r5\().16b
        trn2            \r5\().16b, \r4\().16b, \r5\().16b
        trn1            \r2\().16b, \r6\().16b, \r7\().16b
        trn2            \r7\().16b, \r6\().16b, \r7\().16b

        // Stage 2: interleave 16-bit pairs.
        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
        trn1            \r5\().8h,  \t9\().8h,  \r3\().8h
        trn2            \t9\().8h,  \t9\().8h,  \r3\().8h
        trn1            \r3\().8h,  \t8\().8h,  \r1\().8h
        trn2            \t8\().8h,  \t8\().8h,  \r1\().8h

        // Stage 3: interleave 32-bit groups into the final columns.
        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
        // r6 is written before r2 because both read the old r2.
        trn2            \r6\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r2\().4s,  \t8\().4s,  \r2\().4s
        trn1            \r3\().4s,  \t9\().4s,  \r7\().4s
        trn2            \r7\().4s,  \t9\().4s,  \r7\().4s
.endm

// transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose four rows of 16 bytes as four independent 4x4 byte matrices,
// one per 32-bit group.
//
// In:   \r0-\r3 hold rows 0-3 (.16b).
// Out:  within each 32-bit group, \rN holds column N of that group.
// Clobbers: \t4-\t7.
.macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().16b, \r0\().16b, \r1\().16b
        trn2            \t5\().16b, \r0\().16b, \r1\().16b
        trn1            \t6\().16b, \r2\().16b, \r3\().16b
        trn2            \t7\().16b, \r2\().16b, \r3\().16b

        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
.endm

// transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose a 4x4 matrix of 16-bit elements held in the 64-bit (.4h) form
// of the registers.
//
// In:   \r0-\r3 hold rows 0-3 (.4h).
// Out:  \rN holds column N.
// Clobbers: \t4-\t7.
.macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().4h,  \r0\().4h,  \r1\().4h
        trn2            \t5\().4h,  \r0\().4h,  \r1\().4h
        trn1            \t6\().4h,  \r2\().4h,  \r3\().4h
        trn2            \t7\().4h,  \r2\().4h,  \r3\().4h

        trn1            \r0\().2s,  \t4\().2s,  \t6\().2s
        trn2            \r2\().2s,  \t4\().2s,  \t6\().2s
        trn1            \r1\().2s,  \t5\().2s,  \t7\().2s
        trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
.endm

// transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose a 4x4 matrix of 32-bit elements.
//
// In:   \r0-\r3 hold rows 0-3 (.4s).
// Out:  \rN holds column N.
// Clobbers: \t4-\t7.
.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s
        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s

        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d
        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d
        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d
        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d
.endm

// transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
//
// Transpose four rows of eight 16-bit elements as two independent 4x4
// matrices, one per 64-bit half.
//
// In:   \r0-\r3 hold rows 0-3 (.8h).
// Out:  within each 64-bit half, \rN holds column N of that half.
// Clobbers: \t4-\t7.
.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
.endm

#endif /* DAV1D_SRC_ARM_64_UTIL_S */