/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
30 * 31 */ 32 33#ifdef HAVE_NEON_AARCH64 34#include "arm_arch64_common_macro.S" 35 36.macro LOAD_LUMA_DATA 37 sub x7, x0, x1 38 ld1 {v0.16b}, [x7] //top 39 sub x7, x0, #1 40 ld1 {v1.b}[0], [x7], x1 41 ld1 {v1.b}[1], [x7], x1 42 ld1 {v1.b}[2], [x7], x1 43 ld1 {v1.b}[3], [x7], x1 44 ld1 {v1.b}[4], [x7], x1 45 ld1 {v1.b}[5], [x7], x1 46 ld1 {v1.b}[6], [x7], x1 47 ld1 {v1.b}[7], [x7], x1 48 ld1 {v1.b}[8], [x7], x1 49 ld1 {v1.b}[9], [x7], x1 50 ld1 {v1.b}[10], [x7], x1 51 ld1 {v1.b}[11], [x7], x1 52 ld1 {v1.b}[12], [x7], x1 53 ld1 {v1.b}[13], [x7], x1 54 ld1 {v1.b}[14], [x7], x1 55 ld1 {v1.b}[15], [x7] //left 56.endm 57 58.macro LOAD_16X4_DATA 59 //Load the p_enc data and save to "v22 ~ v25"--- 16X4 bytes 60 ld1 {v0.16b}, [x2], x3 61 ld1 {v1.16b}, [x2], x3 62 ld1 {v20.16b}, [x2], x3 63 ld1 {v21.16b}, [x2], x3 64 trn1 v22.4s, v0.4s, v1.4s 65 trn2 v23.4s, v0.4s, v1.4s 66 trn1 v24.4s, v20.4s, v21.4s 67 trn2 v25.4s, v20.4s, v21.4s 68.endm 69 70.macro GET_16X16_V_SATD 71 trn1 v6.4s, v4.4s, v5.4s 72 trn2 v7.4s, v4.4s, v5.4s 73 add v4.8h, v6.8h, v7.8h 74 sub v5.8h, v6.8h, v7.8h 75 trn1 v6.8h, v4.8h, v5.8h 76 trn2 v7.8h, v4.8h, v5.8h 77 add v4.8h, v6.8h, v7.8h 78 sub v5.8h, v6.8h, v7.8h 79 trn1 v6.4s, v4.4s, v5.4s 80 trn2 v7.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7 81.endm 82 83.macro GET_16X16_H_SATD 84 trn1 v16.4s, v4.4s, v5.4s 85 trn2 v17.4s, v4.4s, v5.4s 86 add v4.8h, v16.8h, v17.8h 87 sub v5.8h, v16.8h, v17.8h 88 trn1 v16.8h, v4.8h, v5.8h 89 trn2 v17.8h, v4.8h, v5.8h 90 add v4.8h, v16.8h, v17.8h 91 sub v5.8h, v16.8h, v17.8h 92 trn1 v16.4s, v4.4s, v5.4s 93 trn2 v17.4s, v4.4s, v5.4s //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17 94.endm 95 96.macro SELECT_BEST_COST arg0, arg1, arg2 97 cmp w1, \arg0 98 csel \arg0, \arg0, w1, \arg2 99 cset w7, \arg1 100 cmp w2, \arg0 101 mov w6, #2 102 csel \arg0, \arg0, w2, \arg2 103 csel w7, w7, w6, \arg2 104.endm 105 106.macro SELECT_BEST_COST_PREFER_HIGHER arg0 107 SELECT_BEST_COST \arg0, ls, hi 108.endm 
109 110.macro SELECT_BEST_COST_PREFER_LOWER arg0 111 SELECT_BEST_COST \arg0, lo, hs 112.endm 113 114.macro LOAD_CHROMA_DATA arg0, arg1, arg2 115 sub x9, \arg0, x1 116 ld1 {\arg1}, [x9] //top_cb 117 sub x9, \arg0, #1 118 ld1 {\arg2}[8], [x9], x1 119 ld1 {\arg2}[9], [x9], x1 120 ld1 {\arg2}[10], [x9], x1 121 ld1 {\arg2}[11], [x9], x1 122 ld1 {\arg2}[12], [x9], x1 123 ld1 {\arg2}[13], [x9], x1 124 ld1 {\arg2}[14], [x9], x1 125 ld1 {\arg2}[15], [x9], x1 //left_cb 126.endm 127 128.macro LOAD_8X4_DATA arg0 129 //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes 130 ld1 {v0.8b}, [\arg0], x3 131 ld1 {v1.8b}, [\arg0], x3 132 ld1 {v0.d}[1], [\arg0], x3 133 ld1 {v1.d}[1], [\arg0], x3 134 trn1 v2.4s, v0.4s, v1.4s 135 trn2 v1.4s, v0.4s, v1.4s 136 trn1 v20.2d, v2.2d, v1.2d 137 trn2 v21.2d, v2.2d, v1.2d 138.endm 139 140.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9 141 //Do the vertical transform 142 uadd\arg9\() v0.8h, \arg0, \arg1 143 usub\arg9\() v1.8h, \arg0, \arg1 144 trn1 v3.2d, v0.2d, v1.2d 145 trn2 v1.2d, v0.2d, v1.2d 146 add v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7} 147 sub v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11} 148 149 //Do the horizontal transform 150 trn1 v0.4s, v4.4s, v5.4s 151 trn2 v1.4s, v4.4s, v5.4s 152 add v4.8h, v0.8h, v1.8h 153 sub v5.8h, v0.8h, v1.8h 154 trn1 v0.8h, v4.8h, v5.8h 155 trn2 v1.8h, v4.8h, v5.8h 156 add v4.8h, v0.8h, v1.8h 157 sub v5.8h, v0.8h, v1.8h 158 159 //16x16_v 160 trn1 v0.2s, v4.2s, v5.2s 161 trn2 v1.2s, v4.2s, v5.2s 162 sabal \arg5, v0.4h, \arg2 163 sabal \arg5, v1.4h, \arg8\().4h 164 sabal2 \arg5, v4.8h, \arg8\().8h 165 sabal2 \arg5, v5.8h, \arg8\().8h 166 167 //16x16_h 168 ins v3.d[0], v4.d[1] 169 trn1 v0.4h, v4.4h, v3.4h 170 trn2 v1.4h, v4.4h, v3.4h 171 sabal \arg6, v0.4h, \arg3 172 sabdl v4.4s, v1.4h, \arg8\().4h 173 sabal v4.4s, v5.4h, \arg8\().4h 174 sabal2 v4.4s, v5.8h, \arg8\().8h 175 add \arg6, \arg6, v4.4s 176 177 //16x16_dc_both 178 sabal \arg7, v0.4h, \arg4 179 add \arg7, 
\arg7, v4.4s 180.endm 181 182//int32_t WelsIntra8x8Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,uint8_t*); 183WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon 184 ldr x11, [sp, #0] 185 SIGN_EXTENSION x1,w1 186 SIGN_EXTENSION x3,w3 187 SIGN_EXTENSION x5,w5 188 LOAD_CHROMA_DATA x0, v0.8b, v0.b 189 190 uaddlp v1.8h, v0.16b 191 uaddlp v2.4s, v1.8h 192 ins v3.d[0], v2.d[1] 193 add v3.2s, v2.2s, v3.2s 194 urshr v2.4s, v2.4s, #2 195 urshr v3.2s, v3.2s, #3 196 197 dup v20.8b, v3.b[0] 198 dup v21.8b, v2.b[4] 199 dup v22.8b, v2.b[12] 200 dup v23.8b, v3.b[4] 201 ins v20.s[1], v21.s[0] 202 ins v22.s[1], v23.s[0] 203 204 LOAD_CHROMA_DATA x7, v4.8b, v4.b 205 206 uaddlp v5.8h, v4.16b 207 uaddlp v6.4s, v5.8h 208 ins v7.d[0], v6.d[1] 209 add v7.2s, v6.2s, v7.2s 210 urshr v6.4s, v6.4s, #2 211 urshr v7.2s, v7.2s, #3 212 213 dup v24.8b, v7.b[0] 214 dup v25.8b, v6.b[4] 215 dup v26.8b, v6.b[12] 216 dup v27.8b, v7.b[4] 217 ins v24.s[1], v25.s[0] 218 ins v26.s[1], v27.s[0] 219 220 sub x9, x0, #1 221 sub x10, x7, #1 222 223 ld1 {v3.8b}, [x2], x3 224 ld1 {v5.8b}, [x11], x3 225 226 ld1r {v6.8b}, [x9], x1 227 ld1r {v7.8b}, [x10], x1 228 229 uabdl v29.8h, v0.8b, v3.8b 230 uabal v29.8h, v4.8b, v5.8b //top 231 232 uabdl v30.8h, v6.8b, v3.8b 233 uabal v30.8h, v7.8b, v5.8b //left 234 235 uabdl v31.8h, v20.8b, v3.8b 236 uabal v31.8h, v24.8b, v5.8b //Dc 237.rept 3 238 ld1 {v3.8b}, [x2], x3 239 ld1 {v5.8b}, [x11], x3 240 241 ld1r {v6.8b}, [x9], x1 242 ld1r {v7.8b}, [x10], x1 243 244 uabal v29.8h, v0.8b, v3.8b 245 uabal v29.8h, v4.8b, v5.8b //top 246 247 uabal v30.8h, v6.8b, v3.8b 248 uabal v30.8h, v7.8b, v5.8b //left 249 250 uabal v31.8h, v20.8b, v3.8b 251 uabal v31.8h, v24.8b, v5.8b //Dc 252.endr 253 254.rept 4 255 ld1 {v3.8b}, [x2], x3 256 ld1 {v5.8b}, [x11], x3 257 258 ld1r {v6.8b}, [x9], x1 259 ld1r {v7.8b}, [x10], x1 260 261 uabal v29.8h, v0.8b, v3.8b 262 uabal v29.8h, v4.8b, v5.8b //top 263 264 uabal v30.8h, v6.8b, v3.8b 
265 uabal v30.8h, v7.8b, v5.8b //left 266 267 uabal v31.8h, v22.8b, v3.8b 268 uabal v31.8h, v26.8b, v5.8b //Dc 269.endr 270 271 saddlv s29, v29.8h 272 fmov w2, s29 273 add w2, w2, w5, lsl #1 274 saddlv s30, v30.8h 275 fmov w1, s30 276 add w1, w1, w5, lsl #1 277 saddlv s31, v31.8h 278 fmov w0, s31 279 280 SELECT_BEST_COST_PREFER_HIGHER w0 281 282 str w7, [x4] 283WELS_ASM_AARCH64_FUNC_END 284 285//int32_t WelsIntra16x16Combined3Sad_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); 286WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon 287 SIGN_EXTENSION x1,w1 288 SIGN_EXTENSION x3,w3 289 SIGN_EXTENSION x5,w5 290 LOAD_LUMA_DATA 291 292 uaddlv h2, v0.16b 293 uaddlv h3, v1.16b 294 add v2.8h, v2.8h, v3.8h 295 uqrshrn b2, h2, #5 296 dup v2.16b, v2.b[0] //Dc 297 298 sub x7, x0, #1 299 ld1 {v3.16b}, [x2], x3 300 ld1r {v4.16b}, [x7], x1 301 302 uabdl v29.8h, v0.8b, v3.8b 303 uabal2 v29.8h, v0.16b,v3.16b //top 304 305 uabdl v30.8h, v4.8b, v3.8b 306 uabal2 v30.8h, v4.16b,v3.16b //left 307 308 uabdl v31.8h, v2.8b, v3.8b 309 uabal2 v31.8h, v2.16b,v3.16b //Dc 310 mov x6, #15 311sad_intra_16x16_x3_opt_loop0: 312 ld1 {v3.16b}, [x2], x3 313 ld1r {v4.16b}, [x7], x1 314 315 uabal v29.8h, v0.8b, v3.8b 316 uabal2 v29.8h, v0.16b,v3.16b //top 317 318 uabal v30.8h, v4.8b, v3.8b 319 uabal2 v30.8h, v4.16b,v3.16b //left 320 321 uabal v31.8h, v2.8b, v3.8b 322 uabal2 v31.8h, v2.16b,v3.16b //Dc 323 sub x6, x6, #1 324 cbnz x6, sad_intra_16x16_x3_opt_loop0 325 326 saddlv s29, v29.8h 327 fmov w0, s29 328 saddlv s30, v30.8h 329 fmov w1, s30 330 add w1, w1, w5, lsl #1 331 saddlv s31, v31.8h 332 fmov w2, s31 333 add w2, w2, w5, lsl #1 334 335 SELECT_BEST_COST_PREFER_LOWER w0 336 337 str w7, [x4] 338WELS_ASM_AARCH64_FUNC_END 339 340//int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,int32_t); 341WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon 342 SIGN_EXTENSION 
x1,w1 343 SIGN_EXTENSION x3,w3 344 SIGN_EXTENSION x6,w6 345 SIGN_EXTENSION x7,w7 346 347 sub x9, x0, x1 348 ld1 {v16.s}[0], [x9] //top 349 sub x9, x0, #1 350 ld1 {v16.b}[4], [x9], x1 351 ld1 {v16.b}[5], [x9], x1 352 ld1 {v16.b}[6], [x9], x1 353 ld1 {v16.b}[7], [x9], x1 354 355 356 uaddlv h2, v16.8b 357 uqrshrn b17, h2, #3 358 urshr v2.4h, v2.4h, #3 359 shl v2.4h, v2.4h, #4 360 361 //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7" 362 ushll v4.8h, v16.8b, #2 363 ins v5.d[0], v4.d[1] 364 trn1 v6.2s, v4.2s, v5.2s 365 trn2 v7.2s, v4.2s, v5.2s 366 367 add v4.4h, v6.4h, v7.4h 368 sub v5.4h, v6.4h, v7.4h 369 trn1 v6.4h, v4.4h, v5.4h 370 trn2 v7.4h, v4.4h, v5.4h 371 add v4.4h, v6.4h, v7.4h 372 sub v5.4h, v6.4h, v7.4h 373 trn1 v6.2s, v4.2s, v5.2s 374 trn2 v7.2s, v4.2s, v5.2s //{0,1,3,2,top} v6 {0,1,3,2,left} v7 375 376 eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH 377 eor v30.16b, v30.16b, v30.16b //Save the SATD of H 378 eor v29.16b, v29.16b, v29.16b //Save the SATD of V 379 eor v28.16b, v28.16b, v28.16b //For zero register 380 381 //Load the p_enc data and save to "v22 ~ v23"--- 16X4 bytes 382 ld1 {v22.s}[0], [x2], x3 383 ld1 {v22.s}[1], [x2], x3 384 ld1 {v23.s}[0], [x2], x3 385 ld1 {v23.s}[1], [x2], x3 386 387 HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 388 389 ldr x11, [sp, #0] 390 urshr v29.4s, v29.4s, #1 391 addv s29, v29.4s 392 fmov w0, s29 393 add w0, w0, w11 394 395 urshr v30.4s, v30.4s, #1 396 addv s30, v30.4s 397 fmov w1, s30 398 add w1, w1, w7 399 400 urshr v31.4s, v31.4s, #1 401 addv s31, v31.4s 402 fmov w2, s31 403 add w2, w2, w6 404 405 mov w10, w0 406 SELECT_BEST_COST_PREFER_HIGHER w10 407 408 str w7, [x5] 409 410 sub w9, w10, w2 411 cbnz w9, satd_intra_4x4_x3_opt_jump0 412 dup v0.16b, v17.b[0] 413 st1 {v0.16b}, [x4] 414 b satd_intra_4x4_x3_opt_end 415 416satd_intra_4x4_x3_opt_jump0: 417 sub w8, w10, w1 418 cbnz w8, satd_intra_4x4_x3_opt_jump1 419 dup v0.16b, v16.b[4] 420 dup v1.16b, v16.b[5] 
421 dup v2.16b, v16.b[6] 422 dup v3.16b, v16.b[7] 423 st4 {v0.s,v1.s,v2.s,v3.s}[0], [x4] 424 b satd_intra_4x4_x3_opt_end 425 426satd_intra_4x4_x3_opt_jump1: 427 st1 {v16.S}[0], [x4], #4 428 st1 {v16.S}[0], [x4], #4 429 st1 {v16.S}[0], [x4], #4 430 st1 {v16.S}[0], [x4] 431satd_intra_4x4_x3_opt_end: 432 mov w0, w10 433 434WELS_ASM_AARCH64_FUNC_END 435 436//int32_t WelsIntra8x8Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,uint8_t*); 437WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon 438 ldr x11, [sp, #0] 439 440 SIGN_EXTENSION x1,w1 441 SIGN_EXTENSION x3,w3 442 SIGN_EXTENSION x5,w5 443 LOAD_CHROMA_DATA x0, v0.8b, v0.b 444 445 LOAD_CHROMA_DATA x7, v1.8b, v1.b 446 447 //Calculate the 16x16_v mode SATD and save to "v6, v7" 448 ushll v4.8h, v0.8b, #2 449 ushll v5.8h, v1.8b, #2 450 GET_16X16_V_SATD 451 452 //Calculate the 16x16_h mode SATD and save to "v16, v17" 453 ushll2 v4.8h, v0.16b, #2 454 ushll2 v5.8h, v1.16b, #2 455 GET_16X16_H_SATD 456 457 uaddlp v0.8h, v0.16b 458 uaddlp v2.4s, v0.8h 459 ins v3.d[0], v2.d[1] 460 add v3.2s, v2.2s, v3.2s 461 462 uaddlp v1.8h, v1.16b 463 uaddlp v4.4s, v1.8h 464 ins v5.d[0], v4.d[1] 465 add v5.2s, v4.2s, v5.2s 466 467 trn2 v0.4s, v2.4s, v4.4s 468 urshr v0.4s, v0.4s, #2 469 urshr v3.2s, v3.2s, #3 470 urshr v5.2s, v5.2s, #3 471 472 ushll v22.2d, v0.2s, #4 //{1cb, 1cr} 473 ushll2 v23.2d, v0.4s, #4 //{2cb, 2cr} 474 ushll v24.2d, v3.2s, #4 //{0cb, 3cb} 475 ushll v25.2d, v5.2s, #4 //{0cr, 3cr} 476 477 eor v31.16b, v31.16b, v31.16b //Save the SATD of DC_BOTH 478 eor v30.16b, v30.16b, v30.16b //Save the SATD of H 479 eor v29.16b, v29.16b, v29.16b //Save the SATD of V 480 eor v28.16b, v28.16b, v28.16b //For zero register 481 482 ins v18.d[0], v6.d[1] 483 ins v19.d[0], v7.d[1] 484 ins v26.d[0], v16.d[1] 485 ins v27.d[0], v17.d[1] 486 487 LOAD_8X4_DATA x2 488 489 HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l 490 
HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2 491 492 LOAD_8X4_DATA x11 493 494 ins v22.d[0], v22.d[1] 495 HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l 496 HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2 497 498 LOAD_8X4_DATA x2 499 500 ins v24.d[0], v24.d[1] 501 HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l 502 HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2 503 504 LOAD_8X4_DATA x11 505 506 ins v23.d[0], v23.d[1] 507 ins v25.d[0], v25.d[1] 508 HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l 509 HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2 510 511 urshr v29.4s, v29.4s, #1 512 addv s29, v29.4s 513 fmov w2, s29 514 add w2, w2, w5, lsl #1 515 516 urshr v30.4s, v30.4s, #1 517 addv s30, v30.4s 518 fmov w1, s30 519 add w1, w1, w5, lsl #1 520 521 urshr v31.4s, v31.4s, #1 522 addv s31, v31.4s 523 fmov w0, s31 524 525 SELECT_BEST_COST_PREFER_HIGHER w0 526 527 str w7, [x4] 528WELS_ASM_AARCH64_FUNC_END 529 530//int32_t WelsIntra16x16Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*); 531WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon 532 SIGN_EXTENSION x1,w1 533 SIGN_EXTENSION x3,w3 534 SIGN_EXTENSION x5,w5 535 LOAD_LUMA_DATA 536 537 uaddlv h2, v0.16b 538 uaddlv h3, v1.16b 539 add v2.8h, v2.8h, v3.8h 540 urshr v2.4h, v2.4h, #5 541 shl v2.4h, v2.4h, #4 542 543 //Calculate the 16x16_v mode SATD and save to "v6, v7" 544 ushll v4.8h, v0.8b, #2 545 ushll2 v5.8h, v0.16b, #2 546 GET_16X16_V_SATD 547 548 //Calculate the 16x16_h mode SATD and save to "v16, v17" 549 ushll v4.8h, v1.8b, #2 550 ushll2 v5.8h, v1.16b, #2 551 GET_16X16_H_SATD 552 553 eor v31.16b, v31.16b, v31.16b //Save the SATD of 
DC_BOTH 554 eor v30.16b, v30.16b, v30.16b //Save the SATD of H 555 eor v29.16b, v29.16b, v29.16b //Save the SATD of V 556 eor v28.16b, v28.16b, v28.16b //For zero register 557 558 ins v18.d[0], v6.d[1] 559 ins v19.d[0], v7.d[1] 560 ins v26.d[0], v16.d[1] 561 ins v27.d[0], v17.d[1] 562 563 LOAD_16X4_DATA 564 565 HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 566 HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 567 HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 568 HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 569 570 LOAD_16X4_DATA 571 572 HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 573 HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 574 HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 575 HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 576 577 LOAD_16X4_DATA 578 579 HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 580 HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 581 HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 582 HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 583 584 LOAD_16X4_DATA 585 586 HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 587 HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 588 HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l 589 HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2 590 591 urshr v29.4s, v29.4s, #1 592 addv s29, v29.4s 593 fmov w0, s29 
594 595 urshr v30.4s, v30.4s, #1 596 addv s30, v30.4s 597 fmov w1, s30 598 add w1, w1, w5, lsl #1 599 600 urshr v31.4s, v31.4s, #1 601 addv s31, v31.4s 602 fmov w2, s31 603 add w2, w2, w5, lsl #1 604 605 SELECT_BEST_COST_PREFER_LOWER w0 606 607 str w7, [x4] 608 609WELS_ASM_AARCH64_FUNC_END 610 611#endif 612