/****************************************************************************
 * dct-a.S: aarch64 transform and zigzag
 *****************************************************************************
 * Copyright (C) 2009-2021 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

// NOTE(review): function/endfunc, const/endconst, movrel, X(), SUMSUB_AB,
// transpose, transpose4x4.h, transpose4x8.h, transpose8x8.h, FENC_STRIDE and
// FDEC_STRIDE are not defined in this file. They are presumably provided by
// asm.S and the build -- confirm. The comments below assume
// "SUMSUB_AB sum, diff, a, b" yields sum = a + b and diff = a - b.

// TBL byte indices used by zigzag_scan_4x4_frame_neon. Each pair (2k, 2k+1)
// picks both bytes of halfword k from a 32-byte (two register) table.
const scan4x4_frame, align=4
.byte    0,1,   8,9,   2,3,   4,5
.byte   10,11, 16,17, 24,25, 18,19
.byte   12,13,  6,7,  14,15, 20,21
.byte   26,27, 28,29, 22,23, 30,31
endconst

// TBL byte indices used by zigzag_scan_4x4_field_neon. Only 16 bytes: the
// field scan permutes the first eight halfwords only.
const scan4x4_field, align=4
.byte    0,1,   2,3,   8,9,   4,5
.byte    6,7,  10,11, 12,13, 14,15
endconst

// TBL byte indices used by the zigzag_sub_4x4 functions. They reorder the 16
// pixel bytes gathered from four 4-byte rows.
const sub4x4_frame, align=4
.byte    0,  1,  4,  8
.byte    5,  2,  3,  6
.byte    9, 12, 13, 10
.byte    7, 11, 14, 15
endconst

const sub4x4_field, align=4
.byte    0,  4,  1,  8
.byte   12,  5,  9, 13
.byte    2,  6, 10, 14
.byte    3,  7, 11, 15
endconst

// sum = a + (b>>shift)   sub = (a>>shift) - b
// t0 and t1 are scratch registers. They are written before sum and sub, so
// they must not alias a or b (sum and sub may).
.macro SUMSUB_SHR shift sum sub a b t0 t1
    sshr        \t0,  \b, #\shift
    sshr        \t1,  \a, #\shift
    add         \sum, \a, \t0
    sub         \sub, \t1, \b
.endm

// sum = (a>>shift) + b   sub = a - (b>>shift)
// Same scratch register rules as SUMSUB_SHR.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
    sshr        \t0,  \a, #\shift
    sshr        \t1,  \b, #\shift
    add         \sum, \t0, \b
    sub         \sub, \a, \t1
.endm

// a += 1.5*ma   b -= 1.5*mb
// 1.5*x is computed as x + (x>>1) in the scratch registers t0 and t1.
.macro SUMSUB_15 a b ma mb t0 t1
    sshr        \t0, \ma, #1
    sshr        \t1, \mb, #1
    add         \t0, \t0, \ma
    add         \t1, \t1, \mb
    add         \a,  \a,  \t0
    sub         \b,  \b,  \t1
.endm


// dct4x4dc_neon: in-place transform of the 16 halfwords at [x0].
// In:    x0 = pointer to 4x4 halfwords (read and overwritten)
// Clobb: v0-v7, v16, v17, v31
// Two butterfly passes separated by transposes. The last butterfly stage is
// fused with a rounding halving: sums use srhadd ((a+b+1)>>1), differences
// use shsub on (a+1) and b, i.e. (a+1-b)>>1.
function dct4x4dc_neon, export=1
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    movi        v31.4h, #1                      // rounding bias for the shsub path
    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
    add         v16.4h, v4.4h,  v31.4h
    add         v17.4h, v6.4h,  v31.4h
    srhadd      v0.4h,  v4.4h,  v5.4h
    shsub       v1.4h,  v16.4h, v5.4h
    shsub       v2.4h,  v17.4h, v7.4h
    srhadd      v3.4h,  v6.4h,  v7.4h
    st1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
endfunc

// idct4x4dc_neon: in-place transform of the 16 halfwords at [x0].
// In:    x0 = pointer to 4x4 halfwords (read and overwritten)
// Clobb: v0-v7
// Same butterfly network as dct4x4dc_neon but without the final halving.
function idct4x4dc_neon, export=1
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    SUMSUB_AB   v4.4h,  v5.4h,  v0.4h,  v1.4h
    SUMSUB_AB   v6.4h,  v7.4h,  v2.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v3.4h,  v1.4h,  v5.4h,  v7.4h
    transpose   v4.4h,  v6.4h,  v0.4h,  v2.4h
    transpose   v5.4h,  v7.4h,  v1.4h,  v3.4h
    SUMSUB_AB   v0.4h,  v2.4h,  v4.4h,  v6.4h
    SUMSUB_AB   v1.4h,  v3.4h,  v5.4h,  v7.4h
    transpose   v4.2s,  v5.2s,  v0.2s,  v1.2s
    transpose   v6.2s,  v7.2s,  v2.2s,  v3.2s
    SUMSUB_AB   v0.4h,  v1.4h,  v4.4h,  v5.4h
    SUMSUB_AB   v3.4h,  v2.4h,  v6.4h,  v7.4h
    st1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
    ret
endfunc

// One 1-D pass of the 4-point forward transform.
// Inputs are v4..v7 (all four are overwritten), outputs are v0..v3:
//   s03 = v4 + v7, d03 = v4 - v7, s12 = v5 + v6, d12 = v5 - v6
//   v0 = s03 + s12      v1 = 2*d03 + d12
//   v2 = s03 - s12      v3 = d03 - 2*d12
.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
    SUMSUB_AB   \v1, \v6, \v5, \v6
    SUMSUB_AB   \v3, \v7, \v4, \v7
    add         \v0, \v3, \v1
    add         \v4, \v7, \v7
    add         \v5, \v6, \v6
    sub         \v2, \v3, \v1
    add         \v1, \v4, \v6
    sub         \v3, \v7, \v5
.endm

// sub4x4_dct_neon: 4x4 forward transform of the difference of two 4x4 pixel
// blocks.
// In:    x0 = output, 16 halfwords
//        x1 = first pixel block, rows FENC_STRIDE bytes apart
//        x2 = second pixel block, rows FDEC_STRIDE bytes apart
// Clobb: x1-x4, v0-v7, v16-v19
// Each row load fills one 32-bit lane, so only the low four halfwords of the
// widened differences are meaningful. DCT_1D is applied on .4h accordingly.
function sub4x4_dct_neon, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    ld1         {v0.s}[0], [x1], x3
    ld1         {v1.s}[0], [x2], x4
    ld1         {v2.s}[0], [x1], x3
    usubl       v16.8h, v0.8b,  v1.8b
    ld1         {v3.s}[0], [x2], x4
    ld1         {v4.s}[0], [x1], x3
    usubl       v17.8h, v2.8b,  v3.8b
    ld1         {v5.s}[0], [x2], x4
    ld1         {v6.s}[0], [x1], x3
    usubl       v18.8h, v4.8b,  v5.8b
    ld1         {v7.s}[0], [x2], x4
    usubl       v19.8h, v6.8b,  v7.8b

    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    st1         {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
    ret
endfunc

// sub8x4_dct_neon: internal helper (not exported). Transforms an 8x4 pixel
// difference as two side-by-side 4x4 blocks.
// In:    x0 = output, 32 halfwords are written and x0 advances by 64 bytes
//        x1, x2 = pixel pointers, advanced by four rows on return
//        x3, x4 = row strides for x1 and x2, set by the caller
// Clobb: v0-v7, v16-v19, v21, v22
function sub8x4_dct_neon
    ld1         {v0.8b}, [x1], x3
    ld1         {v1.8b}, [x2], x4
    usubl       v16.8h, v0.8b,  v1.8b
    ld1         {v2.8b}, [x1], x3
    ld1         {v3.8b}, [x2], x4
    usubl       v17.8h, v2.8b,  v3.8b
    ld1         {v4.8b}, [x1], x3
    ld1         {v5.8b}, [x2], x4
    usubl       v18.8h, v4.8b,  v5.8b
    ld1         {v6.8b}, [x1], x3
    ld1         {v7.8b}, [x2], x4
    usubl       v19.8h, v6.8b,  v7.8b

    DCT_1D      v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7

    // Second pass, open-coded so the outputs land in the order
    // v0 = coef 0, v2 = coef 1, v1 = coef 2, v3 = coef 3.
    SUMSUB_AB   v16.8h, v19.8h, v0.8h,  v3.8h
    SUMSUB_AB   v17.8h, v18.8h, v1.8h,  v2.8h
    add         v22.8h, v19.8h, v19.8h
    add         v21.8h, v18.8h, v18.8h
    add         v0.8h,  v16.8h, v17.8h
    sub         v1.8h,  v16.8h, v17.8h

    add         v2.8h,  v22.8h, v18.8h
    sub         v3.8h,  v19.8h, v21.8h

    // Gather the low 64-bit halves into v4/v5 and the high halves into
    // v6/v7, so each 4x4 block is stored contiguously.
    zip1        v4.2d,  v0.2d,  v2.2d
    zip2        v6.2d,  v0.2d,  v2.2d
    zip1        v5.2d,  v1.2d,  v3.2d
    zip2        v7.2d,  v1.2d,  v3.2d

    st1         {v4.8h}, [x0], #16
    st1         {v5.8h}, [x0], #16
    st1         {v6.8h}, [x0], #16
    st1         {v7.8h}, [x0], #16
    ret
endfunc

// sub8x8_dct_neon: two stacked sub8x4_dct_neon calls (rows 0-3, then 4-7).
// In:    x0 = output (128 bytes written), x1 / x2 = pixel pointers
// x5 keeps the return address because bl overwrites x30. The second call is
// a tail call.
function sub8x8_dct_neon, export=1
    mov         x5,  x30
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    bl          sub8x4_dct_neon
    mov         x30, x5
    b           sub8x4_dct_neon
endfunc

// sub16x16_dct_neon: eight sub8x4_dct_neon calls covering a 16x16 area.
// In:    x0 = output (512 bytes written), x1 / x2 = pixel pointers
// 8x8 quadrant order: top-left, top-right, bottom-left, bottom-right.
// The pointer adjustments between call pairs step back up eight rows and
// right eight pixels, or left eight pixels.
function sub16x16_dct_neon, export=1
    mov         x5,  x30
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    sub         x1, x1, #8*FENC_STRIDE-8        // up 8 rows, right 8 pixels
    sub         x2, x2, #8*FDEC_STRIDE-8
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    sub         x1, x1, #8                      // left 8 pixels, rows 8-15
    sub         x2, x2, #8
    bl          sub8x4_dct_neon
    bl          sub8x4_dct_neon
    sub         x1, x1, #8*FENC_STRIDE-8        // up 8 rows, right 8 pixels
    sub         x2, x2, #8*FDEC_STRIDE-8
    bl          sub8x4_dct_neon
    mov         x30, x5
    b           sub8x4_dct_neon
endfunc


// One 1-D pass of the 8-point forward transform.
// In/out: v0-v7 (eight rows of eight halfwords), transformed in place.
// Clobb:  v16-v31
// The type argument is not referenced by the macro body.
.macro DCT8_1D type
    SUMSUB_AB   v18.8h, v17.8h, v3.8h,  v4.8h   // s34/d34
    SUMSUB_AB   v19.8h, v16.8h, v2.8h,  v5.8h   // s25/d25
    SUMSUB_AB   v22.8h, v21.8h, v1.8h,  v6.8h   // s16/d16
    SUMSUB_AB   v23.8h, v20.8h, v0.8h,  v7.8h   // s07/d07

    SUMSUB_AB   v24.8h, v26.8h, v23.8h, v18.8h  // a0/a2
    SUMSUB_AB   v25.8h, v27.8h, v22.8h, v19.8h  // a1/a3

    SUMSUB_AB   v30.8h, v29.8h, v20.8h, v17.8h  // a6/a5
    sshr        v23.8h, v21.8h, #1
    sshr        v18.8h, v16.8h, #1
    add         v23.8h, v23.8h, v21.8h
    add         v18.8h, v18.8h, v16.8h
    sub         v30.8h, v30.8h, v23.8h
    sub         v29.8h, v29.8h, v18.8h

    SUMSUB_AB   v28.8h, v31.8h, v21.8h, v16.8h  // a4/a7
    sshr        v22.8h, v20.8h, #1
    sshr        v19.8h, v17.8h, #1
    add         v22.8h, v22.8h, v20.8h
    add         v19.8h, v19.8h, v17.8h
    add         v22.8h, v28.8h, v22.8h
    add         v31.8h, v31.8h, v19.8h

    SUMSUB_AB   v0.8h,  v4.8h,  v24.8h, v25.8h
    SUMSUB_SHR  2, v1.8h,  v7.8h,  v22.8h, v31.8h, v16.8h, v17.8h
    SUMSUB_SHR  1, v2.8h,  v6.8h,  v26.8h, v27.8h, v18.8h, v19.8h
    SUMSUB_SHR2 2, v3.8h,  v5.8h,  v30.8h, v29.8h, v20.8h, v21.8h
.endm

// sub8x8_dct8_neon: 8x8 forward transform of the difference of two 8x8 pixel
// blocks.
// In:    x0 = output, 64 halfwords are written and x0 advances by 128 bytes
//        x1 = first pixel block (FENC_STRIDE), advanced by eight rows
//        x2 = second pixel block (FDEC_STRIDE), advanced by eight rows
// Clobb: x3, x4, v0-v7, v16-v31
function sub8x8_dct8_neon, export=1
    mov         x3, #FENC_STRIDE
    mov         x4, #FDEC_STRIDE
    ld1         {v16.8b}, [x1], x3
    ld1         {v17.8b}, [x2], x4
    ld1         {v18.8b}, [x1], x3
    ld1         {v19.8b}, [x2], x4
    usubl       v0.8h,  v16.8b, v17.8b
    ld1         {v20.8b}, [x1], x3
    ld1         {v21.8b}, [x2], x4
    usubl       v1.8h,  v18.8b, v19.8b
    ld1         {v22.8b}, [x1], x3
    ld1         {v23.8b}, [x2], x4
    usubl       v2.8h,  v20.8b, v21.8b
    ld1         {v24.8b}, [x1], x3
    ld1         {v25.8b}, [x2], x4
    usubl       v3.8h,  v22.8b, v23.8b
    ld1         {v26.8b}, [x1], x3
    ld1         {v27.8b}, [x2], x4
    usubl       v4.8h,  v24.8b, v25.8b
    ld1         {v28.8b}, [x1], x3
    ld1         {v29.8b}, [x2], x4
    usubl       v5.8h,  v26.8b, v27.8b
    ld1         {v30.8b}, [x1], x3
    ld1         {v31.8b}, [x2], x4
    usubl       v6.8h,  v28.8b, v29.8b
    usubl       v7.8h,  v30.8b, v31.8b

    DCT8_1D     row
    transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
    DCT8_1D     col

    st1         {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
    st1         {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
    ret
endfunc

// sub16x16_dct8_neon: four sub8x8_dct8_neon calls covering a 16x16 area in
// the order top-left, top-right, bottom-left, bottom-right.
// x7 keeps the return address across the bl calls. The last call is a tail
// call.
function sub16x16_dct8_neon, export=1
    mov         x7,  x30
    bl          X(sub8x8_dct8_neon)
    sub         x1,  x1,  #FENC_STRIDE*8 - 8    // up 8 rows, right 8 pixels
    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
    bl          X(sub8x8_dct8_neon)
    sub         x1,  x1,  #8                    // left 8 pixels, rows 8-15
    sub         x2,  x2,  #8
    bl          X(sub8x8_dct8_neon)
    mov         x30, x7
    sub         x1,  x1,  #FENC_STRIDE*8 - 8
    sub         x2,  x2,  #FDEC_STRIDE*8 - 8
    b           X(sub8x8_dct8_neon)
endfunc


// First part of IDCT (minus final SUMSUB_BA)
//   d4 = d0 + d2           d5 = d0 - d2
//   d6 = (d3>>1) + d1      d7 = (d1>>1) - d3
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
    SUMSUB_AB   \d4, \d5, \d0, \d2
    sshr        \d7, \d1, #1
    sshr        \d6, \d3, #1
    sub         \d7, \d7, \d3
    add         \d6, \d6, \d1
.endm

// add4x4_idct_neon: inverse 4x4 transform, rounding shift right by 6, added
// to the 4x4 pixel block at x0 with unsigned saturation.
// In:    x0 = pixels (rows FDEC_STRIDE apart, read and overwritten)
//        x1 = 16 halfword coefficients
// Clobb: x0, x2, v0-v7, v16-v19, v28-v31
// After the second pass, row 2 is in v3 and row 3 is in v2. That is why the
// third pixel row is loaded into v31, the fourth into v30, and the stores go
// v0, v1, v3, v2.
function add4x4_idct_neon, export=1
    mov         x2, #FDEC_STRIDE
    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]

    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
    ld1         {v28.s}[0], [x0], x2
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h

    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19

    IDCT_1D     v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
    ld1         {v29.s}[0], [x0], x2
    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h

    srshr       v0.4h, v0.4h, #6
    srshr       v1.4h, v1.4h, #6
    ld1         {v31.s}[0], [x0], x2
    srshr       v2.4h, v2.4h, #6
    srshr       v3.4h, v3.4h, #6
    ld1         {v30.s}[0], [x0], x2

    sub         x0, x0, x2, lsl #2              // rewind to the first row
    uaddw       v0.8h, v0.8h, v28.8b
    uaddw       v1.8h, v1.8h, v29.8b
    uaddw       v2.8h, v2.8h, v30.8b
    uaddw       v3.8h, v3.8h, v31.8b
    sqxtun      v0.8b, v0.8h
    sqxtun      v1.8b, v1.8h
    sqxtun      v2.8b, v2.8h
    sqxtun      v3.8b, v3.8h

    st1         {v0.s}[0], [x0], x2
    st1         {v1.s}[0], [x0], x2
    st1         {v3.s}[0], [x0], x2
    st1         {v2.s}[0], [x0], x2
    ret
endfunc

// add8x4_idct_neon: inverse transform of two 4x4 coefficient blocks, added
// to an 8x4 pixel area.
// In:    x0 = pixels, advanced by four rows on return
//        x1 = 32 halfword coefficients, advanced by 64 bytes
//        x2 = pixel row stride, set by the caller (not initialised here)
// Clobb: v0-v7, v16-v23, v28-v31
function add8x4_idct_neon, export=1
    ld1         {v0.8h,v1.8h}, [x1], #32
    ld1         {v2.8h,v3.8h}, [x1], #32
    transpose   v20.2d, v21.2d, v0.2d, v2.2d
    transpose   v22.2d, v23.2d, v1.2d, v3.2d
    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
    SUMSUB_AB   v0.8h, v3.8h, v16.8h, v18.8h
    SUMSUB_AB   v1.8h, v2.8h, v17.8h, v19.8h

    transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7

    IDCT_1D     v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
    SUMSUB_AB   v0.8h, v3.8h, v16.8h, v18.8h
    SUMSUB_AB   v1.8h, v2.8h, v17.8h, v19.8h

    srshr       v0.8h, v0.8h, #6
    ld1         {v28.8b}, [x0], x2
    srshr       v1.8h, v1.8h, #6
    ld1         {v29.8b}, [x0], x2
    srshr       v2.8h, v2.8h, #6
    ld1         {v30.8b}, [x0], x2
    srshr       v3.8h, v3.8h, #6
    ld1         {v31.8b}, [x0], x2

    sub         x0, x0, x2, lsl #2              // rewind to the first row
    uaddw       v0.8h, v0.8h, v28.8b
    uaddw       v1.8h, v1.8h, v29.8b
    uaddw       v2.8h, v2.8h, v30.8b
    uaddw       v3.8h, v3.8h, v31.8b

    sqxtun      v0.8b, v0.8h
    sqxtun      v1.8b, v1.8h
    st1         {v0.8b}, [x0], x2
    sqxtun      v2.8b, v2.8h
    st1         {v1.8b}, [x0], x2
    sqxtun      v3.8b, v3.8h
    st1         {v2.8b}, [x0], x2
    st1         {v3.8b}, [x0], x2
    ret
endfunc

// add8x8_idct_neon: two stacked add8x4_idct_neon calls.
// In:    x0 = pixels (FDEC_STRIDE), x1 = coefficients
// x5 keeps the return address. The second call is a tail call.
function add8x8_idct_neon, export=1
    mov         x2, #FDEC_STRIDE
    mov         x5,  x30
    bl          X(add8x4_idct_neon)
    mov         x30, x5
    b           X(add8x4_idct_neon)
endfunc

// add16x16_idct_neon: eight add8x4_idct_neon calls covering a 16x16 area in
// the 8x8 quadrant order top-left, top-right, bottom-left, bottom-right.
// x1 is consumed linearly by the helper. Only x0 needs repositioning.
function add16x16_idct_neon, export=1
    mov         x2, #FDEC_STRIDE
    mov         x5,  x30
    bl          X(add8x4_idct_neon)
    bl          X(add8x4_idct_neon)
    sub         x0, x0, #8*FDEC_STRIDE-8        // up 8 rows, right 8 pixels
    bl          X(add8x4_idct_neon)
    bl          X(add8x4_idct_neon)
    sub         x0, x0, #8                      // left 8 pixels, rows 8-15
    bl          X(add8x4_idct_neon)
    bl          X(add8x4_idct_neon)
    sub         x0, x0, #8*FDEC_STRIDE-8        // up 8 rows, right 8 pixels
    bl          X(add8x4_idct_neon)
    mov         x30, x5
    b           X(add8x4_idct_neon)
endfunc

// One 1-D pass of the 8-point inverse transform.
// In/out: v16-v23, transformed in place.
// Clobb:  v0-v3
// When type is "row", v22 and v23 (the last two input rows) are not yet
// loaded on entry. The macro fetches them from [x1], advancing x1 by 32.
.macro IDCT8_1D type
    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v20.8h                          // a0/a2
.ifc \type, row
    ld1         {v22.8h,v23.8h}, [x1], #32
.endif
    SUMSUB_SHR  1, v2.8h,  v3.8h,  v18.8h, v22.8h, v16.8h, v20.8h       // a6/a4
    SUMSUB_AB   v16.8h, v18.8h, v21.8h, v19.8h
    SUMSUB_15   v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h          // a7/a1
    SUMSUB_AB   v22.8h, v23.8h, v23.8h, v17.8h
    SUMSUB_15   v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h          // a5/a3

    SUMSUB_SHR  2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h       // b3/b5
    SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h       // b1/b7

    SUMSUB_AB   v18.8h, v2.8h,  v0.8h,  v2.8h                           // b0/b6
    SUMSUB_AB   v19.8h, v3.8h,  v1.8h,  v3.8h                           // b2/b4

    SUMSUB_AB   v16.8h, v23.8h, v18.8h, v23.8h
    SUMSUB_AB   v17.8h, v22.8h, v19.8h, v22.8h
    SUMSUB_AB   v18.8h, v21.8h, v3.8h,  v21.8h
    SUMSUB_AB   v19.8h, v20.8h, v2.8h,  v20.8h
.endm

// add8x8_idct8_neon: inverse 8x8 transform, rounding shift right by 6, added
// to the 8x8 pixel block at x0 with unsigned saturation.
// In:    x0 = pixels (FDEC_STRIDE), advanced by eight rows on return
//        x1 = 64 halfword coefficients, advanced by 128 bytes
// Clobb: x2, v0-v7, v16-v23, v30, v31
function add8x8_idct8_neon, export=1
    mov         x2, #FDEC_STRIDE
    ld1         {v16.8h,v17.8h}, [x1], #32
    ld1         {v18.8h,v19.8h}, [x1], #32
    ld1         {v20.8h,v21.8h}, [x1], #32

    IDCT8_1D    row

    transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31

    IDCT8_1D    col

    // Pixel loads are interleaved with the rounding shifts.
    ld1         {v0.8b}, [x0], x2
    srshr       v16.8h, v16.8h, #6
    ld1         {v1.8b}, [x0], x2
    srshr       v17.8h, v17.8h, #6
    ld1         {v2.8b}, [x0], x2
    srshr       v18.8h, v18.8h, #6
    ld1         {v3.8b}, [x0], x2
    srshr       v19.8h, v19.8h, #6
    ld1         {v4.8b}, [x0], x2
    srshr       v20.8h, v20.8h, #6
    ld1         {v5.8b}, [x0], x2
    srshr       v21.8h, v21.8h, #6
    ld1         {v6.8b}, [x0], x2
    srshr       v22.8h, v22.8h, #6
    ld1         {v7.8b}, [x0], x2
    srshr       v23.8h, v23.8h, #6
    sub         x0, x0, x2, lsl #3              // rewind to the first row

    uaddw       v16.8h, v16.8h, v0.8b
    uaddw       v17.8h, v17.8h, v1.8b
    uaddw       v18.8h, v18.8h, v2.8b
    sqxtun      v0.8b,  v16.8h
    sqxtun      v1.8b,  v17.8h
    sqxtun      v2.8b,  v18.8h
    uaddw       v19.8h, v19.8h, v3.8b
    st1         {v0.8b}, [x0], x2
    uaddw       v20.8h, v20.8h, v4.8b
    st1         {v1.8b}, [x0], x2
    uaddw       v21.8h, v21.8h, v5.8b
    st1         {v2.8b}, [x0], x2
    sqxtun      v3.8b,  v19.8h
    sqxtun      v4.8b,  v20.8h
    uaddw       v22.8h, v22.8h, v6.8b
    uaddw       v23.8h, v23.8h, v7.8b
    st1         {v3.8b}, [x0], x2
    sqxtun      v5.8b,  v21.8h
    st1         {v4.8b}, [x0], x2
    sqxtun      v6.8b,  v22.8h
    sqxtun      v7.8b,  v23.8h
    st1         {v5.8b}, [x0], x2
    st1         {v6.8b}, [x0], x2
    st1         {v7.8b}, [x0], x2
    ret
endfunc

// add16x16_idct8_neon: four add8x8_idct8_neon calls covering a 16x16 area in
// the order top-left, top-right, bottom-left, bottom-right.
// x7 keeps the return address. The last call is a tail call.
function add16x16_idct8_neon, export=1
    mov         x7,  x30
    bl          X(add8x8_idct8_neon)
    sub         x0,  x0,  #8*FDEC_STRIDE-8      // up 8 rows, right 8 pixels
    bl          X(add8x8_idct8_neon)
    sub         x0,  x0,  #8                    // left 8 pixels, rows 8-15
    bl          X(add8x8_idct8_neon)
    sub         x0,  x0,  #8*FDEC_STRIDE-8      // up 8 rows, right 8 pixels
    mov         x30, x7
    b           X(add8x8_idct8_neon)
endfunc

// add8x8_idct_dc_neon: add four DC values to an 8x8 pixel block.
// In:    x0 = pixels (FDEC_STRIDE, read and overwritten)
//        x1 = four halfword DC values
// Clobb: x0, x2, v0-v7, v16, v20-v23
// Each DC is rounded (>>6) and broadcast over four pixels: dc0 | dc1 for rows
// 0-3 and dc2 | dc3 for rows 4-7. A signed offset is applied with unsigned
// saturating byte ops by splitting it: sqxtun of +dc keeps the positive part,
// sqxtun of -dc keeps the magnitude of the negative part. The first is added
// with uqadd, the second removed with uqsub.
function add8x8_idct_dc_neon, export=1
    mov         x2, #FDEC_STRIDE
    ld1         {v16.4h}, [x1]
    ld1         {v0.8b}, [x0], x2
    srshr       v16.4h, v16.4h, #6
    ld1         {v1.8b}, [x0], x2
    dup         v20.8h, v16.h[0]
    dup         v21.8h, v16.h[1]
    ld1         {v2.8b}, [x0], x2
    dup         v22.8h, v16.h[2]
    dup         v23.8h, v16.h[3]
    ld1         {v3.8b}, [x0], x2
    trn1        v20.2d, v20.2d, v21.2d          // dc0 x4 | dc1 x4
    ld1         {v4.8b}, [x0], x2
    trn1        v21.2d, v22.2d, v23.2d          // dc2 x4 | dc3 x4
    ld1         {v5.8b}, [x0], x2
    neg         v22.8h, v20.8h
    ld1         {v6.8b}, [x0], x2
    neg         v23.8h, v21.8h
    ld1         {v7.8b}, [x0], x2

    sub         x0, x0, #8*FDEC_STRIDE          // rewind to the first row

    sqxtun      v20.8b, v20.8h
    sqxtun      v21.8b, v21.8h
    sqxtun      v22.8b, v22.8h
    sqxtun      v23.8b, v23.8h

    uqadd       v0.8b, v0.8b, v20.8b
    uqadd       v1.8b, v1.8b, v20.8b
    uqadd       v2.8b, v2.8b, v20.8b
    uqadd       v3.8b, v3.8b, v20.8b
    uqadd       v4.8b, v4.8b, v21.8b
    uqadd       v5.8b, v5.8b, v21.8b
    uqadd       v6.8b, v6.8b, v21.8b
    uqadd       v7.8b, v7.8b, v21.8b
    uqsub       v0.8b, v0.8b, v22.8b
    uqsub       v1.8b, v1.8b, v22.8b
    uqsub       v2.8b, v2.8b, v22.8b
    uqsub       v3.8b, v3.8b, v22.8b
    uqsub       v4.8b, v4.8b, v23.8b
    uqsub       v5.8b, v5.8b, v23.8b
    uqsub       v6.8b, v6.8b, v23.8b
    uqsub       v7.8b, v7.8b, v23.8b

    st1         {v0.8b}, [x0], x2
    st1         {v1.8b}, [x0], x2
    st1         {v2.8b}, [x0], x2
    st1         {v3.8b}, [x0], x2
    st1         {v4.8b}, [x0], x2
    st1         {v5.8b}, [x0], x2
    st1         {v6.8b}, [x0], x2
    st1         {v7.8b}, [x0], x2
    ret
endfunc

// Add four DC values to a 16x4 pixel strip, four pixels per DC value.
// dc is a register plus lane type such as v0.h, holding the already rounded
// DC values in lanes 0-3.
// Uses x0 as the read pointer, x2 as the write pointer and x3 as the row
// stride. Both pointers advance by four rows.
// Clobb: v4-v7, v20, v21, v24-v27
// Same positive/negative split as in add8x8_idct_dc_neon.
.macro ADD16x4_IDCT_DC dc
    ld1         {v4.16b}, [x0], x3
    dup         v24.8h, \dc[0]
    dup         v25.8h, \dc[1]
    ld1         {v5.16b}, [x0], x3
    dup         v26.8h, \dc[2]
    dup         v27.8h, \dc[3]
    ld1         {v6.16b}, [x0], x3
    trn1        v24.2d, v24.2d, v25.2d
    ld1         {v7.16b}, [x0], x3
    trn1        v25.2d, v26.2d, v27.2d
    neg         v26.8h, v24.8h
    neg         v27.8h, v25.8h

    sqxtun      v20.8b,  v24.8h
    sqxtun      v21.8b,  v26.8h
    sqxtun2     v20.16b, v25.8h
    sqxtun2     v21.16b, v27.8h

    uqadd       v4.16b, v4.16b, v20.16b
    uqadd       v5.16b, v5.16b, v20.16b
    uqadd       v6.16b, v6.16b, v20.16b
    uqadd       v7.16b, v7.16b, v20.16b

    uqsub       v4.16b, v4.16b, v21.16b
    uqsub       v5.16b, v5.16b, v21.16b
    uqsub       v6.16b, v6.16b, v21.16b
    st1         {v4.16b}, [x2], x3
    uqsub       v7.16b, v7.16b, v21.16b
    st1         {v5.16b}, [x2], x3
    st1         {v6.16b}, [x2], x3
    st1         {v7.16b}, [x2], x3
.endm

// add16x16_idct_dc_neon: add 16 DC values to a 16x16 pixel block, one
// ADD16x4_IDCT_DC strip per group of four DC values.
// In:    x0 = pixels (FDEC_STRIDE, read and overwritten)
//        x1 = 16 halfword DC values
// Clobb: x0, x2, x3, v0-v7, v20, v21, v24-v27
// The rounding shifts of later strips are interleaved with earlier strips.
function add16x16_idct_dc_neon, export=1
    mov         x2, x0                          // separate write pointer
    mov         x3, #FDEC_STRIDE

    ld1         {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
    srshr       v0.4h, v0.4h, #6
    srshr       v1.4h, v1.4h, #6

    ADD16x4_IDCT_DC v0.h
    srshr       v2.4h, v2.4h, #6
    ADD16x4_IDCT_DC v1.h
    srshr       v3.4h, v3.4h, #6
    ADD16x4_IDCT_DC v2.h
    ADD16x4_IDCT_DC v3.h
    ret
endfunc

// Sum the pixel differences of an 8x4 area column-wise.
// Loads four 8-byte rows from x1 (stride x3) and x2 (stride x4), widens the
// differences to halfwords and accumulates the four rows into dst. Each lane
// of dst then holds a column sum over four rows.
// dst and t0-t7 are bare register names (for example v0). The macro appends
// the arrangement suffixes itself. t0-t7 are clobbered, and x1 and x2 advance
// by four rows.
.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
    ld1         {\t0\().8b}, [x1], x3
    ld1         {\t1\().8b}, [x2], x4
    ld1         {\t2\().8b}, [x1], x3
    ld1         {\t3\().8b}, [x2], x4
    usubl       \t0\().8h,  \t0\().8b,  \t1\().8b
    ld1         {\t4\().8b}, [x1], x3
    ld1         {\t5\().8b}, [x2], x4
    usubl       \t1\().8h,  \t2\().8b,  \t3\().8b
    ld1         {\t6\().8b}, [x1], x3
    ld1         {\t7\().8b}, [x2], x4
    add         \dst\().8h, \t0\().8h,  \t1\().8h
    usubl       \t2\().8h,  \t4\().8b,  \t5\().8b
    usubl       \t3\().8h,  \t6\().8b,  \t7\().8b
    add         \dst\().8h, \dst\().8h, \t2\().8h
    add         \dst\().8h, \dst\().8h, \t3\().8h
.endm

// sub8x8_dct_dc_neon: writes four halfwords to [x0], computed from the
// pixel differences of an 8x8 area.
// In:    x0 = output (four halfwords)
//        x1 = first pixel block (FENC_STRIDE)
//        x2 = second pixel block (FDEC_STRIDE)
// Clobb: x1-x4, v0-v3, v16-v31
// v0 and v1 hold the column sums of rows 0-3 and rows 4-7. Two
// transpose + butterfly rounds combine them, and the addp pair then folds
// the remaining lanes down to the four outputs.
function sub8x8_dct_dc_neon, export=1
    mov         x3,  #FENC_STRIDE
    mov         x4,  #FDEC_STRIDE

    sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31

    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h
    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d

    addp        v0.8h,  v2.8h,  v3.8h
    addp        v0.8h,  v0.8h,  v0.8h

    st1         {v0.4h}, [x0]
    ret
endfunc

// sub8x16_dct_dc_neon: writes eight halfwords to [x0], computed from the
// pixel differences of an 8x16 area.
// In:    x0 = output (eight halfwords)
//        x1 = first pixel block (FENC_STRIDE)
//        x2 = second pixel block (FDEC_STRIDE)
// Clobb: x1-x4, v0-v5, v16-v31
// v0-v3 hold the column sums of the four 8x4 strips, top to bottom. The
// first addp pair reduces adjacent columns, and three transpose + butterfly
// rounds followed by a final addp produce the eight outputs.
function sub8x16_dct_dc_neon, export=1
    mov         x3,  #FENC_STRIDE
    mov         x4,  #FDEC_STRIDE
    sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
    sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
    sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31

    addp        v4.8h,  v0.8h,  v2.8h
    addp        v5.8h,  v1.8h,  v3.8h

    transpose   v2.4s,  v3.4s,  v4.4s,  v5.4s
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h

    transpose   v2.4s,  v3.4s,  v0.4s,  v1.4s
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h

    transpose   v2.2d,  v3.2d,  v0.2d,  v1.2d
    SUMSUB_AB   v0.8h,  v1.8h,  v2.8h,  v3.8h

    trn1        v2.2d,  v0.2d,  v1.2d
    trn2        v3.2d,  v1.2d,  v0.2d

    addp        v0.8h,  v2.8h,  v3.8h

    st1         {v0.8h}, [x0]
    ret
endfunc

// zigzag_interleave_8x8_cavlc_neon: de-interleave 64 halfwords into four
// groups of 16 and record which groups contain a non-zero value.
// In:    x0 = output, 64 halfwords. Group i receives src[i + 4*j] for
//             j = 0..15, because ld4 splits the input with a stride of four
//             elements.
//        x1 = input, 64 halfwords
//        x2 = byte flags. The flag of group 0 is stored at x2[0], group 1 at
//             x2[1], group 2 at x2[8] and group 3 at x2[9]. Each is 1 if any
//             halfword of the group is non-zero, else 0.
// Clobb: x0-x3, v0-v7, v16-v19, v31
function zigzag_interleave_8x8_cavlc_neon, export=1
    mov         x3,  #7                         // post-increment from x2+1 to x2+8
    movi        v31.4s, #1
    ld4         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64
    ld4         {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64
    umax        v16.8h, v0.8h,  v4.8h
    umax        v17.8h, v1.8h,  v5.8h
    umax        v18.8h, v2.8h,  v6.8h
    umax        v19.8h, v3.8h,  v7.8h
    st1         {v0.8h}, [x0],  #16
    st1         {v4.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v17.8h
    umaxp       v18.8h, v18.8h, v19.8h
    st1         {v1.8h}, [x0],  #16
    st1         {v5.8h}, [x0],  #16
    umaxp       v16.8h, v16.8h, v18.8h          // one 32-bit lane per group
    st1         {v2.8h}, [x0],  #16
    st1         {v6.8h}, [x0],  #16
    cmhs        v16.4s, v16.4s, v31.4s          // all ones where the lane is >= 1
    st1         {v3.8h}, [x0],  #16
    and         v16.16b, v16.16b, v31.16b       // reduce the mask to 0 or 1
    st1         {v7.8h}, [x0],  #16
    st1         {v16.b}[0],    [x2],  #1
    st1         {v16.b}[4],    [x2],  x3
    st1         {v16.b}[8],    [x2],  #1
    st1         {v16.b}[12],   [x2]
    ret
endfunc

// zigzag_scan_4x4_frame_neon: permute 16 halfwords from [x1] into [x0] using
// the scan4x4_frame byte table.
// In:    x0 = output (16 halfwords), x1 = input (16 halfwords)
// Clobb: x2, v0-v3, v16, v17
function zigzag_scan_4x4_frame_neon, export=1
    movrel      x2, scan4x4_frame
    ld1         {v0.16b,v1.16b}, [x1]
    ld1         {v16.16b,v17.16b}, [x2]
    tbl         v2.16b, {v0.16b,v1.16b}, v16.16b
    tbl         v3.16b, {v0.16b,v1.16b}, v17.16b
    st1         {v2.16b,v3.16b}, [x0]
    ret
endfunc

// Generates zigzag_sub_4x4<ac>_<f>_neon, where f selects the sub4x4_<f> table
// and ac is either empty or the literal "ac".
// In:    x0 = output, 16 halfwords: the differences of the two 4x4 pixel
//             blocks, reordered by the table
//        x1 = first pixel block (FENC_STRIDE)
//        x2 = second pixel block (FDEC_STRIDE). The four rows read from x1
//             are also copied into this buffer.
//        x3 = (ac variant only) address that receives the first reordered
//             difference as a halfword. That element is stored as zero in
//             the output.
// Out:   w0 = 1 if any halfword stored to x0 is non-zero, else 0
// Clobb: x1, x2, x4-x7, x9, v0-v7, v16, flags
// The unsigned max reduction doubles as a non-zero test: any non-zero
// halfword is greater than zero when viewed as unsigned.
.macro zigzag_sub_4x4 f ac
function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
    mov         x9,  #FENC_STRIDE
    mov         x4,  #FDEC_STRIDE
    movrel      x5,  sub4x4_\f
    mov         x6,  x2
    ld1         {v0.s}[0], [x1], x9
    ld1         {v0.s}[1], [x1], x9
    ld1         {v0.s}[2], [x1], x9
    ld1         {v0.s}[3], [x1], x9
    ld1         {v16.16b}, [x5]
    ld1         {v1.s}[0], [x2], x4
    ld1         {v1.s}[1], [x2], x4
    ld1         {v1.s}[2], [x2], x4
    ld1         {v1.s}[3], [x2], x4
    tbl         v2.16b, {v0.16b}, v16.16b
    tbl         v3.16b, {v1.16b}, v16.16b
    st1         {v0.s}[0], [x6], x4
    usubl       v4.8h,  v2.8b,  v3.8b
.ifc \ac, ac
    dup         h7, v4.h[0]
    ins         v4.h[0], wzr
    fmov        w5,  s7
    strh        w5,  [x3]
.endif
    usubl2      v5.8h,  v2.16b, v3.16b
    st1         {v0.s}[1], [x6], x4
    umax        v6.8h,  v4.8h,  v5.8h
    umaxv       h6,  v6.8h
    st1         {v0.s}[2], [x6], x4
    fmov        w7,  s6
    st1         {v0.s}[3], [x6], x4
    cmp         w7, #0
    st1         {v4.8h,v5.8h}, [x0]
    cset        w0, ne
    ret
endfunc
.endm
// Instantiate zigzag_sub_4x4_field_neon, zigzag_sub_4x4ac_field_neon,
// zigzag_sub_4x4_frame_neon and zigzag_sub_4x4ac_frame_neon.
zigzag_sub_4x4 field
zigzag_sub_4x4 field, ac
zigzag_sub_4x4 frame
zigzag_sub_4x4 frame, ac

// zigzag_scan_4x4_field_neon: copy 16 halfwords from [x1] to [x0], permuting
// the first eight with the scan4x4_field byte table. The last eight (v1)
// pass through unchanged.
// In:    x0 = output (16 halfwords), x1 = input (16 halfwords)
// Clobb: x2, v0, v1, v16
function zigzag_scan_4x4_field_neon, export=1
    movrel      x2, scan4x4_field
    ld1         {v0.8h,v1.8h},   [x1]
    ld1         {v16.16b},       [x2]
    tbl         v0.16b, {v0.16b}, v16.16b
    st1         {v0.8h,v1.8h},   [x0]
    ret
endfunc

// zigzag_scan_8x8_frame_neon: permute 64 halfwords from [x1] into [x0].
// In:    x0 = output (64 halfwords), x1 = input (64 halfwords)
// Clobb: x0-x2, v0-v7, v16-v31
// TBL can address at most four registers (four input rows) at a time. Each
// output vector therefore uses one four-row window. Elements outside that
// window come out of TBL as zero and are patched afterwards by the single
// lane moves below.
function zigzag_scan_8x8_frame_neon, export=1
    movrel      x2,  scan8x8_frame
    ld1         {v0.8h,v1.8h},   [x1], #32
    ld1         {v2.8h,v3.8h},   [x1], #32
    ld1         {v4.8h,v5.8h},   [x1], #32
    ld1         {v6.8h,v7.8h},   [x1]
    ld1         {v16.16b,v17.16b}, [x2], #32
    ld1         {v18.16b,v19.16b}, [x2], #32
    ld1         {v20.16b,v21.16b}, [x2], #32
    ld1         {v22.16b,v23.16b}, [x2], #32
    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
    tbl         v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
    tbl         v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
    mov         v25.h[6], v4.h[0]               // out-of-window fix-ups
    mov         v25.h[7], v5.h[0]
    mov         v26.h[0], v4.h[1]
    mov         v27.h[4], v7.h[0]
    mov         v28.h[7], v4.h[4]
    mov         v29.h[7], v3.h[6]
    mov         v30.h[0], v2.h[7]
    mov         v30.h[1], v3.h[7]
    st1         {v24.8h,v25.8h}, [x0], #32
    st1         {v26.8h,v27.8h}, [x0], #32
    st1         {v28.8h,v29.8h}, [x0], #32
    st1         {v30.8h,v31.8h}, [x0]
    ret
endfunc

// Table helpers, expanded by the C preprocessor. The first macro below turns
// a halfword index into its two byte indices. The second maps (row x,
// element y) of the 8x8 input to that pair, and is redefined before each
// group of rows so the indices are relative to the first register of the TBL
// window used for that group in zigzag_scan_8x8_frame_neon.
#define Z(z)   2*(z), 2*(z)+1
#define T(x,y) Z(x*8+y)
const scan8x8_frame, align=5
    .byte T(0,0), T(1,0), T(0,1), T(0,2)
    .byte T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3)
    .byte T(2,2), T(3,1), T(4,0), T(5,0)
    .byte T(4,1), T(3,2), T(2,3), T(1,4)
    .byte T(0,5), T(0,6), T(1,5), T(2,4)
#undef T
#define T(x,y) Z((x-3)*8+y)
    .byte T(3,3), T(4,2), T(5,1), T(6,0)
    .byte T(7,0), T(6,1), T(5,2), T(4,3)
#undef T
#define T(x,y) Z((x-0)*8+y)
    .byte T(3,4), T(2,5), T(1,6), T(0,7)
    .byte T(1,7), T(2,6), T(3,5), T(4,4)
#undef T
#define T(x,y) Z((x-4)*8+y)
    .byte T(5,3), T(6,2), T(7,1), T(7,2)
    .byte T(6,3), T(5,4), T(4,5), T(3,6)
    .byte T(2,7), T(3,7), T(4,6), T(5,5)
    .byte T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6)
    .byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst

// zigzag_scan_8x8_field_neon: permute 64 halfwords from [x1] into [x0].
// In:    x0 = output (64 halfwords), x1 = input (64 halfwords)
// Clobb: x0-x2, v0-v7, v16-v22, v24-v31
// Seven output vectors come from TBL with a sliding register window that
// matches the per-group offsets of the scan8x8_field table. The last output
// vector needs no table: the two ext instructions build it from the last two
// halfwords of v6 followed by halfwords 2-7 of v7.
function zigzag_scan_8x8_field_neon, export=1
    movrel      x2,  scan8x8_field
    ld1         {v0.8h,v1.8h},   [x1], #32
    ld1         {v2.8h,v3.8h},   [x1], #32
    ld1         {v4.8h,v5.8h},   [x1], #32
    ld1         {v6.8h,v7.8h},   [x1]
    ld1         {v16.16b,v17.16b}, [x2], #32
    ld1         {v18.16b,v19.16b}, [x2], #32
    ld1         {v20.16b,v21.16b}, [x2], #32
    ld1         {v22.16b},         [x2]
    ext         v31.16b, v7.16b, v7.16b, #4     // rotate v7 by two halfwords
    tbl         v24.16b, {v0.16b,v1.16b}, v16.16b
    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl         v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
    tbl         v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
    tbl         v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
    tbl         v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
    tbl         v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
    ext         v31.16b, v6.16b, v31.16b, #12
    st1         {v24.8h,v25.8h}, [x0], #32
    st1         {v26.8h,v27.8h}, [x0], #32
    st1         {v28.8h,v29.8h}, [x0], #32
    st1         {v30.8h,v31.8h}, [x0]
    ret
endfunc

// Generates zigzag_sub_8x8_<f>_neon, where f selects the sub8x8_<f> table.
// In:    x0 = output, 64 halfwords: the differences of the two 8x8 pixel
//             blocks, reordered by the table
//        x1 = first pixel block (FENC_STRIDE)
//        x2 = second pixel block (FDEC_STRIDE). The eight rows read from x1
//             are also copied into this buffer.
// Out:   w0 = 1 if any halfword stored to x0 is non-zero, else 0
// Clobb: x0-x2, x4-x7, x9, v0-v7, v16-v31, flags
// The unsigned max reduction doubles as a non-zero test: any non-zero
// halfword is greater than zero when viewed as unsigned.
.macro zigzag_sub8x8 f
function zigzag_sub_8x8_\f\()_neon, export=1
    movrel      x4,  sub8x8_\f
    mov         x5,  #FENC_STRIDE
    mov         x6,  #FDEC_STRIDE
    mov         x7,  x2
    ld1         {v0.d}[0], [x1], x5
    ld1         {v0.d}[1], [x1], x5
    ld1         {v1.d}[0], [x1], x5
    ld1         {v1.d}[1], [x1], x5
    ld1         {v2.d}[0], [x1], x5
    ld1         {v2.d}[1], [x1], x5
    ld1         {v3.d}[0], [x1], x5
    ld1         {v3.d}[1], [x1]
    ld1         {v4.d}[0], [x2], x6
    ld1         {v4.d}[1], [x2], x6
    ld1         {v5.d}[0], [x2], x6
    ld1         {v5.d}[1], [x2], x6
    ld1         {v6.d}[0], [x2], x6
    ld1         {v6.d}[1], [x2], x6
    ld1         {v7.d}[0], [x2], x6
    ld1         {v7.d}[1], [x2]
    ld1         {v16.16b,v17.16b}, [x4], #32
    ld1         {v18.16b,v19.16b}, [x4], #32
    tbl         v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
    tbl         v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
    tbl         v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
    tbl         v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
    tbl         v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
    tbl         v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
    tbl         v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
    tbl         v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
    usubl       v4.8h,  v24.8b,  v28.8b
    usubl2      v5.8h,  v24.16b, v28.16b
    usubl       v6.8h,  v25.8b,  v29.8b
    usubl2      v7.8h,  v25.16b, v29.16b
    usubl       v16.8h, v26.8b,  v30.8b
    usubl2      v17.8h, v26.16b, v30.16b
    usubl       v18.8h, v27.8b,  v31.8b
    usubl2      v19.8h, v27.16b, v31.16b
    umax        v20.8h, v4.8h,   v5.8h
    umax        v21.8h, v6.8h,   v7.8h
    umax        v22.8h, v16.8h,  v17.8h
    umax        v23.8h, v18.8h,  v19.8h
    umax        v20.8h, v20.8h,  v21.8h
    umax        v21.8h, v22.8h,  v23.8h
    umax        v20.8h, v20.8h,  v21.8h
    umaxv       h22,    v20.8h
    st1         {v0.d}[0], [x7], x6
    st1         {v0.d}[1], [x7], x6
    st1         {v1.d}[0], [x7], x6
    st1         {v1.d}[1], [x7], x6
    st1         {v2.d}[0], [x7], x6
    st1         {v2.d}[1], [x7], x6
    st1         {v3.d}[0], [x7], x6
    st1         {v3.d}[1], [x7]
    st1         {v4.8h,v5.8h},   [x0], #32
    st1         {v6.8h,v7.8h},   [x0], #32
    st1         {v16.8h,v17.8h}, [x0], #32
    st1         {v18.8h,v19.8h}, [x0]
    fmov        w9,  s22
    cmp         w9,  #0
    cset        w0,  ne
    ret
endfunc
.endm

// Instantiate zigzag_sub_8x8_field_neon and zigzag_sub_8x8_frame_neon.
zigzag_sub8x8 field
zigzag_sub8x8 frame

// Byte-pair indices for zigzag_scan_8x8_field_neon. The per-group offsets
// (x-1, x-2, ...) match the sliding TBL window used by that function.
// Seven rows of 16 bytes only: the last eight outputs are built with ext.
#undef T
#define T(x,y) Z(x*8+y)
const scan8x8_field, align=5
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
#undef T
#define T(x,y) Z((x-1)*8+y)
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
#undef T
#define T(x,y) Z((x-2)*8+y)
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
#undef T
#define T(x,y) Z((x-3)*8+y)
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
#undef T
#define T(x,y) Z((x-4)*8+y)
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
#undef T
#define T(x,y) Z((x-5)*8+y)
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
endconst


// Single-byte pixel indices for the zigzag_sub8x8 functions. Note that the
// argument order is swapped here: the first argument is the offset within a
// row and the second is the row, giving index = row*8 + offset into the 64
// pixel bytes held in four registers.
#undef T
#define T(y,x) x*8+y
const sub8x8_frame, align=5
    .byte T(0,0), T(1,0), T(0,1), T(0,2)
    .byte T(1,1), T(2,0), T(3,0), T(2,1)
    .byte T(1,2), T(0,3), T(0,4), T(1,3)
    .byte T(2,2), T(3,1), T(4,0), T(5,0)
    .byte T(4,1), T(3,2), T(2,3), T(1,4)
    .byte T(0,5), T(0,6), T(1,5), T(2,4)
    .byte T(3,3), T(4,2), T(5,1), T(6,0)
    .byte T(7,0), T(6,1), T(5,2), T(4,3)
    .byte T(3,4), T(2,5), T(1,6), T(0,7)
    .byte T(1,7), T(2,6), T(3,5), T(4,4)
    .byte T(5,3), T(6,2), T(7,1), T(7,2)
    .byte T(6,3), T(5,4), T(4,5), T(3,6)
    .byte T(2,7), T(3,7), T(4,6), T(5,5)
    .byte T(6,4), T(7,3), T(7,4), T(6,5)
    .byte T(5,6), T(4,7), T(5,7), T(6,6)
    .byte T(7,5), T(7,6), T(6,7), T(7,7)
endconst

const sub8x8_field, align=5
    .byte T(0,0), T(0,1), T(0,2), T(1,0)
    .byte T(1,1), T(0,3), T(0,4), T(1,2)
    .byte T(2,0), T(1,3), T(0,5), T(0,6)
    .byte T(0,7), T(1,4), T(2,1), T(3,0)
    .byte T(2,2), T(1,5), T(1,6), T(1,7)
    .byte T(2,3), T(3,1), T(4,0), T(3,2)
    .byte T(2,4), T(2,5), T(2,6), T(2,7)
    .byte T(3,3), T(4,1), T(5,0), T(4,2)
    .byte T(3,4), T(3,5), T(3,6), T(3,7)
    .byte T(4,3), T(5,1), T(6,0), T(5,2)
    .byte T(4,4), T(4,5), T(4,6), T(4,7)
    .byte T(5,3), T(6,1), T(6,2), T(5,4)
    .byte T(5,5), T(5,6), T(5,7), T(6,3)
    .byte T(7,0), T(7,1), T(6,4), T(6,5)
    .byte T(6,6), T(6,7), T(7,2), T(7,3)
    .byte T(7,4), T(7,5), T(7,6), T(7,7)
endconst