; Copyright (c) 2019, The rav1e contributors. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32
maddubsw_hsub: times 16 db 1, -1

SECTION .text

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

; Perform a 4x4 Hadamard transform on input packed 2 rows per register.
; Rows 0 and 2 are in m0 and rows 1 and 3 are in m1.
; A second set of packed input can also be taken in m2 and m3.
; Ends with sums in every other entry (i.e. already reduced horizontally).
%macro HADAMARD_4x4_PACKED 1
%if %1 == 1
    %define tmp m2
    ; 2->0, 1->2, 0->1
    %define ROTATE SWAP 2, 1, 0
%elif %1 == 2
    %define tmp m4
    ; 4->0, 3->4, 2->3, 1->2, 0->1
    %define ROTATE SWAP 4, 3, 2, 1, 0
%endif
    ; m0  d2 c2 b2 a2  d0 c0 b0 a0
    ; m1  d3 c3 b3 a3  d1 c1 b1 a1

    ; Stage 1
    ; m0  d2+d3 c2+c3 b2+b3 a2+a3  d0+d1 c0+c1 b0+b1 a0+a1
    ; m1  d2-d3 c2-c3 b2-b3 a2-a3  d0-d1 c0-c1 b0-b1 a0-a1
    paddw              tmp, m0, m1
    psubw              m0, m1
%if %1 == 2
    paddw              m1, m2, m3
    psubw              m2, m3
%endif
    ROTATE

    ; Stage 2
    ; m0  d0-d1 d0+d1 c0-c1 c0+c1  b0-b1 b0+b1 a0-a1 a0+a1
    ; m1  d2-d3 d2+d3 c2-c3 c2+c3  b2-b3 b2+b3 a2-a3 a2+a3
    punpcklwd          tmp, m0, m1
    punpckhwd          m0, m1
%if %1 == 2
    punpcklwd          m1, m2, m3
    punpckhwd          m2, m3
%endif
    ROTATE

    ; m0  d0-d1+d2-d3 d0+d1+d2+d3 c0-c1+c2-c3 c0+c1+c2+c3
    ;     b0-b1+b2-b3 b0+b1+b2+b3 a0-a1+a2-a3 a0+a1+a2+a3
    ; m1  d0-d1-d2+d3 d0+d1-d2-d3 c0-c1-c2+c3 c0+c1-c2-c3
    ;     b0-b1-b2+b3 b0+b1-b2-b3 a0-a1-a2+a3 a0+a1-a2-a3
    paddw              tmp, m0, m1
    psubw              m0, m1
%if %1 == 2
    paddw              m1, m2, m3
    psubw              m2, m3
%endif
    ROTATE

    ; m0  s2 s0 r2 r0  q2 q0 p2 p0
    ; m1  s3 s1 r3 r1  q3 q1 p3 p1

    ; Stage 1
    ; m0  q3 q1 q2 q0  p3 p1 p2 p0
    ; m1  s3 s1 s2 s0  r3 r1 r2 r0
    punpckldq          tmp, m0, m1
    punpckhdq          m0, m1
%if %1 == 2
    punpckldq          m1, m2, m3
    punpckhdq          m2, m3
%endif
    ROTATE

    ; m0  q3+s3 q1+s1 q2+s2 q0+s0  p3+r3 p1+r1 p2+r2 p0+r0
    ; m1  q3-s3 q1-s1 q2-s2 q0-s0  p3-r3 p1-r1 p2-r2 p0-r0
    paddw              tmp, m0, m1
    psubw              m0, m1
%if %1 == 2
    paddw              m1, m2, m3
    psubw              m2, m3
%endif
    ROTATE

    ; Stage 2
    ; m0  p3-r3 p1-r1 p2-r2 p0-r0  p3+r3 p1+r1 p2+r2 p0+r0
    ; m1  q3-s3 q1-s1 q2-s2 q0-s0  q3+s3 q1+s1 q2+s2 q0+s0
    punpcklqdq         tmp, m0, m1
    punpckhqdq         m0, m1
%if %1 == 2
    punpcklqdq         m1, m2, m3
    punpckhqdq         m2, m3
%endif
    ROTATE

    ; Use the fact that
    ;   (abs(a+b) + abs(a-b))/2 = max(abs(a), abs(b))
    ; to merge the final butterfly with the abs and the first stage of
    ; accumulation.
    ; Avoid pabsw by computing max(a, b) - min(a+b+0x7FFF, 0x7FFF) instead,
    ; where the saturating add (paddsw) supplies the min against 0x7FFF.
    ; This actually calculates (abs(a+b) + abs(a-b))/2 - 0x7FFF.
    ; The final sum must be offset to compensate for subtracting 0x7FFF.
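    ; For example, with a = 3 and b = -5:
    ;   max(a, b) - min(a+b+0x7FFF, 0x7FFF) = 3 - 0x7FFD = 5 - 0x7FFF,
    ; which equals (abs(a+b) + abs(a-b))/2 - 0x7FFF = (2 + 8)/2 - 0x7FFF,
    ; i.e. max(abs(a), abs(b)) - 0x7FFF.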
    paddw              tmp, m0, m1
    pmaxsw             m0, m1
    ; m1 is free
    ; 0x7FFF
    pcmpeqb            m1, m1
    psrlw              m1, 1

    paddsw             tmp, m1
    psubw              m0, tmp
%if %1 == 2
    paddw              tmp, m2, m3
    pmaxsw             m2, m3
    paddsw             tmp, m1
    psubw              m2, tmp

    paddw              m0, m2
%endif
%endmacro

; Load diffs of 4 entries for 2 rows
%macro LOAD_PACK_DIFF_Dx2 7
    movd               m%1, %2
    movd               m%6, %4
    punpckldq          m%1, m%6
    pmovzxbw           m%1, m%1
    movd               m%6, %3
    movd               m%7, %5
    punpckldq          m%6, m%7
    pmovzxbw           m%6, m%6
    psubw              m%1, m%6
%endmacro

; Can only use 128-bit vectors
%macro SATD_4x4_FN 0
cglobal satd_4x4, 4, 6, 4, src, src_stride, dst, dst_stride, \
                           src_stride3, dst_stride3
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]

    ; Load rows 0 and 2 to m0 and rows 1 and 3 to m1
    LOAD_PACK_DIFF_Dx2 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          2, 3
    LOAD_PACK_DIFF_Dx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          2, 3

    HADAMARD_4x4_PACKED 1

    ; Reduce horizontally
    pshufd             m1, m0, q3232
    paddw              m0, m1
    pshuflw            m1, m0, q3232
    paddw              m0, m1
    pshuflw            m1, m0, q1111

    ; Perform normalization during the final stage of accumulation
    pavgw              m0, m1
    movd               eax, m0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ; accumulation were done. Since this offset is an even number, it can
    ; safely be applied after normalization using pavgw.
    sub                ax, 4
    RET
%endmacro

INIT_XMM sse4
SATD_4x4_FN

INIT_XMM avx2
SATD_4x4_FN

; Load diffs of 8 entries for 2 rows
; Each set of 4 columns shares a 128-bit lane
%macro LOAD_PACK_DIFF_Qx2 7
    movq               xm%1, %2
    movq               xm%6, %4
    punpckldq          xm%1, xm%6
    pmovzxbw           m%1, xm%1
    movq               xm%6, %3
    movq               xm%7, %5
    punpckldq          xm%6, xm%7
    pmovzxbw           m%6, xm%6
    psubw              m%1, m%6
%endmacro

INIT_YMM avx2
cglobal satd_8x4, 4, 6, 4, src, src_stride, dst, dst_stride, \
                           src_stride3, dst_stride3
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    ; Load rows 0 and 2 to m0 and rows 1 and 3 to m1
    ; Each set of 4 columns shares a 128-bit lane
    LOAD_PACK_DIFF_Qx2 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          2, 3
    LOAD_PACK_DIFF_Qx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          2, 3

    HADAMARD_4x4_PACKED 1

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ; accumulation were done. Since this offset is an even number, it can
    ; safely be applied after normalization using pavgw.
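    ; Each of the 16 words carries a -0x7FFF bias, and
    ; 16 * -0x7FFF == +16 (mod 2^16), which pavgw halves to 8.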
    sub                ax, 8
    RET

; Load diffs of 4 entries for 4 rows
; Each pair of rows shares a 128-bit lane
%macro LOAD_PACK_DIFF_Dx4 12
    movd               xm%1, %2
    movd               xm%10, %4
    punpckldq          xm%1, xm%10
    movd               xm%10, %6
    movd               xm%11, %8
    punpckldq          xm%10, xm%11
    punpcklqdq         xm%1, xm%10
    pmovzxbw           m%1, xm%1
    movd               xm%10, %3
    movd               xm%11, %5
    punpckldq          xm%10, xm%11
    movd               xm%11, %7
    movd               xm%12, %9
    punpckldq          xm%11, xm%12
    punpcklqdq         xm%10, xm%11
    pmovzxbw           m%10, xm%10
    psubw              m%1, m%10
%endmacro

INIT_YMM avx2
cglobal satd_4x8, 4, 8, 5, src, src_stride, dst, dst_stride, \
                           src4, dst4, src_stride3, dst_stride3
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    lea                src4q, [srcq+src_strideq*4]
    lea                dst4q, [dstq+dst_strideq*4]
    ; Load rows 0, 2, 4 and 6 to m0 and rows 1, 3, 5 and 7 to m1.
    ; Lanes split the low and high rows of m0 and m1.
    LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src4q], [dst4q], \
                          [src4q+src_strideq*2], [dst4q+dst_strideq*2], \
                          2, 3, 4
    LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src4q+src_strideq*1], [dst4q+dst_strideq*1], \
                          [src4q+src_stride3q], [dst4q+dst_stride3q], \
                          2, 3, 4

    HADAMARD_4x4_PACKED 1

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation.
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax
    sub                ax, 8
    RET

; Rudimentary fast Hadamard transform
; Two Hadamard transforms share a 128-bit lane.
%macro HADAMARD_4x4 0
    ; 4->0, 3->4, 2->3, 1->2, 0->1
    %define ROTATE SWAP 4, 3, 2, 1, 0

    ; Stage 1
    paddw              m0, m1, m2
    psubw              m1, m2
    paddw              m2, m3, m4
    psubw              m3, m4
    ROTATE

    ; Stage 2
    paddw              m0, m1, m3
    psubw              m1, m3
    paddw              m3, m2, m4
    psubw              m2, m4
    SWAP               3, 2, 1
    ROTATE

    ; Transpose
    ; Since two transforms share a 128-bit lane, unpacking results in a
    ; single transform's values on each register. This has to be resolved
    ; later.
    ; A and B indicate different 4x4 transforms.

    ; Start
    ; m1  B (a3 a2 a1 a0)  A (a3 a2 a1 a0)
    ; m2  B (b3 b2 b1 b0)  A (b3 b2 b1 b0)
    ; m3  B (c3 c2 c1 c0)  A (c3 c2 c1 c0)
    ; m4  B (d3 d2 d1 d0)  A (d3 d2 d1 d0)

    ; Stage 1
    ; m1  A (b3 a3 b2 a2 b1 a1 b0 a0)
    ; m2  B (b3 a3 b2 a2 b1 a1 b0 a0)
    ; m3  A (d3 c3 d2 c2 d1 c1 d0 c0)
    ; m4  B (d3 c3 d2 c2 d1 c1 d0 c0)
    punpcklwd          m0, m1, m2
    punpckhwd          m1, m2
    punpcklwd          m2, m3, m4
    punpckhwd          m3, m4
    ROTATE

    ; m1  A (d1 c1 b1 a1 d0 c0 b0 a0)
    ; m2  B (d1 c1 b1 a1 d0 c0 b0 a0)
    ; m3  A (d3 c3 b3 a3 d2 c2 b2 a2)
    ; m4  B (d3 c3 b3 a3 d2 c2 b2 a2)
    punpckldq          m0, m1, m3
    punpckhdq          m1, m3
    punpckldq          m3, m2, m4
    punpckhdq          m2, m4
    SWAP               3, 2, 1
    ROTATE

    ; Make the transforms share 128-bit lanes again.
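    ; punpcklqdq/punpckhqdq interleave the 64-bit halves of each lane, so
    ; the same row of A and B ends up in one register: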
    ; m1  B (d0 c0 b0 a0)  A (d0 c0 b0 a0)
    ; m2  B (d1 c1 b1 a1)  A (d1 c1 b1 a1)
    ; m3  B (d2 c2 b2 a2)  A (d2 c2 b2 a2)
    ; m4  B (d3 c3 b3 a3)  A (d3 c3 b3 a3)
    punpcklqdq         m0, m1, m2
    punpckhqdq         m1, m2
    punpcklqdq         m2, m3, m4
    punpckhqdq         m3, m4
    ROTATE

    ; Stage 1
    paddw              m0, m1, m2
    psubw              m1, m2
    paddw              m2, m3, m4
    psubw              m3, m4
    ROTATE

    ; Use the fact that
    ;   (abs(a+b) + abs(a-b))/2 = max(abs(a), abs(b))
    ; to merge the final butterfly with the abs and the first stage of
    ; accumulation.
    ; Avoid pabsw by computing max(a, b) - min(a+b+0x7FFF, 0x7FFF) instead,
    ; where the saturating add (paddsw) supplies the min against 0x7FFF.
    ; This actually calculates (abs(a+b) + abs(a-b))/2 - 0x7FFF.
    ; The final sum must be offset to compensate for subtracting 0x7FFF.
    paddw              m0, m1, m3
    pmaxsw             m1, m3
    ; m3 is free
    ; 0x7FFF
    pcmpeqb            m3, m3
    psrlw              m3, 1

    paddsw             m0, m3
    psubw              m1, m0

    paddw              m0, m2, m4
    pmaxsw             m2, m4
    paddsw             m0, m3
    psubw              m2, m0

    paddw              m1, m2
    SWAP               1, 0
%endmacro

; Load diffs of 16 entries for 1 row
%macro LOAD_DIFF_DQ 4
    movu               xm%1, %2
    movu               xm%4, %3
    vpmovzxbw          m%1, xm%1
    vpmovzxbw          m%4, xm%4
    psubw              m%1, m%4
%endmacro

INIT_YMM avx2
cglobal satd_16x4, 4, 6, 5, src, src_stride, dst, dst_stride, \
                            src_stride3, dst_stride3
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_DQ       1, [srcq], [dstq], 0
    LOAD_DIFF_DQ       2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ       3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ       4, [srcq+src_stride3q], [dstq+dst_stride3q], 0

    HADAMARD_4x4

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation,
    ; which avoids overflow in this case
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ; accumulation were done. Since this offset is an even number, it can
    ; safely be applied after normalization using pavgw.
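    ; Here every word is the sum of two -0x7FFF-biased values, and
    ; 32 * -0x7FFF == +32 (mod 2^16), which pavgw halves to 16.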
    sub                ax, 16
    RET

INIT_YMM avx2
cglobal satd_4x16, 4, 8, 7, src, src_stride, dst, dst_stride, \
                            src4, dst4, src_stride3, dst_stride3
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    lea                src4q, [srcq+src_strideq*4]
    lea                dst4q, [dstq+dst_strideq*4]
    LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src4q], [dst4q], \
                          [src4q+src_strideq*2], [dst4q+dst_strideq*2], \
                          4, 5, 6
    LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src4q+src_strideq*1], [dst4q+dst_strideq*1], \
                          [src4q+src_stride3q], [dst4q+dst_stride3q], \
                          4, 5, 6
    lea                srcq, [srcq+src_strideq*8]
    lea                dstq, [dstq+dst_strideq*8]
    lea                src4q, [src4q+src_strideq*8]
    lea                dst4q, [dst4q+dst_strideq*8]
    LOAD_PACK_DIFF_Dx4 2, [srcq], [dstq], \
                          [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src4q], [dst4q], \
                          [src4q+src_strideq*2], [dst4q+dst_strideq*2], \
                          4, 5, 6
    LOAD_PACK_DIFF_Dx4 3, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src4q+src_strideq*1], [dst4q+dst_strideq*1], \
                          [src4q+src_stride3q], [dst4q+dst_stride3q], \
                          4, 5, 6
    HADAMARD_4x4_PACKED 2

    ; Reduce horizontally
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddw              xm0, xm1
    pshuflw            xm1, xm0, q1111

    ; Perform normalization during the final stage of accumulation
    pavgw              xm0, xm1
    movd               eax, xm0
    movzx              eax, ax

    ; Add an offset for how the final butterfly stage and the first stage of
    ; accumulation were done. Since this offset is an even number, it can
    ; safely be applied after normalization using pavgw.
    sub                ax, 16
    RET

; On x86-64 we can transpose in-place without spilling registers.
; By clever choices of the order to apply the butterflies and the order of
; their outputs, we can take the rows in order and output the columns in
; order without any extra operations and using just one temporary register.
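; Each unpack pass (word, dword, qword) frees exactly one register, which
; then serves as the destination of the next unpack; e.g. the first
; punpckhwd/punpcklwd pair consumes m%6, which immediately receives the
; high halves of m%1 and m%2.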
%macro TRANSPOSE8x8 9
    punpckhwd          m%9, m%5, m%6
    punpcklwd          m%5, m%6
    ; m%6 is free
    punpckhwd          m%6, m%1, m%2
    punpcklwd          m%1, m%2
    ; m%2 is free
    punpckhwd          m%2, m%7, m%8
    punpcklwd          m%7, m%8
    ; m%8 is free
    punpckhwd          m%8, m%3, m%4
    punpcklwd          m%3, m%4
    ; m%4 is free
    punpckhdq          m%4, m%1, m%3
    punpckldq          m%1, m%3
    ; m%3 is free
    punpckldq          m%3, m%5, m%7
    punpckhdq          m%5, m%7
    ; m%7 is free
    punpckhdq          m%7, m%6, m%8
    punpckldq          m%6, m%8
    ; m%8 is free
    punpckldq          m%8, m%9, m%2
    punpckhdq          m%9, m%2
    ; m%2 is free
    punpckhqdq         m%2, m%1, m%3
    punpcklqdq         m%1, m%3
    ; m%3 is free
    punpcklqdq         m%3, m%4, m%5
    punpckhqdq         m%4, m%5
    ; m%5 is free
    punpcklqdq         m%5, m%6, m%8
    punpckhqdq         m%6, m%8
    ; m%8 is free
    punpckhqdq         m%8, m%7, m%9
    punpcklqdq         m%7, m%9
%endmacro

; Load diffs of 8 entries for 1 row
%macro LOAD_DIFF_Q 4
    movq               %1, %2
    movq               %4, %3
    punpcklbw          %1, %4
    pmaddubsw          %1, hsub
%endmacro

%macro HADAMARD_8_STAGE_1 9
    paddw              m%9, m%1, m%2
    psubw              m%1, m%2
    paddw              m%2, m%3, m%4
    psubw              m%3, m%4
    paddw              m%4, m%5, m%6
    psubw              m%5, m%6
    paddw              m%6, m%7, m%8
    psubw              m%7, m%8
    ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
    SWAP               %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro

%macro HADAMARD_8_STAGE_2 9
    paddw              m%9, m%1, m%3 ; 0
    psubw              m%1, m%3      ; 2
    paddw              m%3, m%2, m%4 ; 1
    psubw              m%2, m%4      ; 3
    SWAP               %3, %2, %1
    paddw              m%4, m%5, m%7 ; 4
    psubw              m%5, m%7      ; 6
    paddw              m%7, m%6, m%8 ; 5
    psubw              m%6, m%8      ; 7
    SWAP               %7, %6, %5
    ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
    SWAP               %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro

%macro HADAMARD_8_STAGE_3 9
    paddw              m%9, m%1, m%5 ; 0
    psubw              m%1, m%5      ; 4
    paddw              m%5, m%2, m%6 ; 1
    psubw              m%2, m%6      ; 5
    paddw              m%6, m%3, m%7 ; 2
    psubw              m%3, m%7      ; 6
    paddw              m%7, m%4, m%8 ; 3
    psubw              m%4, m%8      ; 7
    SWAP               %5, %2, %6, %3, %7, %4, %1
    ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
    SWAP               %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro

; Rudimentary fast Hadamard transform
%macro HADAMARD_8x8 0
    HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0
    HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0
    HADAMARD_8_STAGE_3 1, 2, 3, 4, 5, 6, 7, 8, 0

    TRANSPOSE8x8       1, 2, 3, 4, 5, 6, 7, 8, 0

    HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0
    HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0

    ; Stage 3
    ; Use the fact that
    ;   (abs(a+b) + abs(a-b))/2 = max(abs(a), abs(b))
    ; to merge the final butterfly with the abs and the first stage of
    ; accumulation.
    ; Avoid pabsw by computing max(a, b) - min(a+b+0x7FFF, 0x7FFF) instead,
    ; where the saturating add (paddsw) supplies the min against 0x7FFF.
    ; This actually calculates (abs(a+b) + abs(a-b))/2 - 0x7FFF.
    ; The final sum must be offset to compensate for subtracting 0x7FFF.
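    ; m1-m8 hold rows 0-7 of the stage 2 output, so the final butterflies
    ; pair row k with row k+4: m1/m5, m2/m6, m3/m7 and m4/m8.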
    paddw              m0, m1, m5
    pmaxsw             m1, m5
    ; m5 is free
    ; 0x7FFF
    pcmpeqb            m5, m5
    psrlw              m5, 1

    paddsw             m0, m5
    psubw              m1, m0

    paddw              m0, m2, m6
    pmaxsw             m2, m6
    paddsw             m0, m5
    psubw              m2, m0

    paddw              m0, m3, m7
    pmaxsw             m3, m7
    paddsw             m0, m5
    psubw              m3, m0

    paddw              m0, m4, m8
    pmaxsw             m4, m8
    paddsw             m0, m5
    psubw              m4, m0

    paddw              m1, m2
    paddw              m3, m4

    paddw              m1, m3
    SWAP               1, 0
%endmacro

; Only works with 128-bit vectors
%macro SATD_8x8_FN 0
cglobal satd_8x8, 4, 6, 10, src, src_stride, dst, dst_stride, \
                            src_stride3, dst_stride3
    %define hsub m0
    mova               hsub, [maddubsw_hsub]
    ; Load rows into m1-m8
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_Q        m1, [srcq], [dstq], m2
    LOAD_DIFF_Q        m2, [srcq+src_strideq*1], [dstq+dst_strideq*1], m3
    LOAD_DIFF_Q        m3, [srcq+src_strideq*2], [dstq+dst_strideq*2], m4
    LOAD_DIFF_Q        m4, [srcq+src_stride3q], [dstq+dst_stride3q], m5
    lea                srcq, [srcq+src_strideq*4]
    lea                dstq, [dstq+dst_strideq*4]
    LOAD_DIFF_Q        m5, [srcq], [dstq], m6
    LOAD_DIFF_Q        m6, [srcq+src_strideq*1], [dstq+dst_strideq*1], m7
    LOAD_DIFF_Q        m7, [srcq+src_strideq*2], [dstq+dst_strideq*2], m8
    LOAD_DIFF_Q        m8, [srcq+src_stride3q], [dstq+dst_stride3q], m9

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pxor               m2, m2
    punpcklwd          m1, m0, m2
    punpckhwd          m0, m2
    paddd              m0, m1

    pshufd             m1, m0, q3232
    paddd              m0, m1
    pshuflw            m1, m0, q3232
    paddd              m0, m1
    movd               eax, m0

    ; Normalize
    ; Add a rounding offset and an offset for how the final butterfly stage
    ; and the first stage of accumulation were done.
    sub                eax, 32-2
    shr                eax, 2
    RET
%endmacro

INIT_XMM ssse3
SATD_8x8_FN

INIT_XMM avx2
SATD_8x8_FN

INIT_YMM avx2
cglobal satd_16x8, 4, 6, 9, src, src_stride, dst, dst_stride, \
                            src_stride3, dst_stride3
    ; Load rows into m1-m8
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_DQ       1, [srcq], [dstq], 0
    LOAD_DIFF_DQ       2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ       3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ       4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
    lea                srcq, [srcq+src_strideq*4]
    lea                dstq, [dstq+dst_strideq*4]
    LOAD_DIFF_DQ       5, [srcq], [dstq], 0
    LOAD_DIFF_DQ       6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ       7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ       8, [srcq+src_stride3q], [dstq+dst_stride3q], 0

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pxor               m2, m2
    punpcklwd          m1, m0, m2
    punpckhwd          m0, m2
    paddd              m0, m1

    vextracti128       xm1, m0, 1
    paddd              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add a rounding offset and an offset for how the final butterfly stage
    ; and the first stage of accumulation were done.
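    ; Each of the 16 words sums four -0x7FFF-biased values, making it the
    ; true value + 4 (mod 2^16), i.e. +64 in total; the -2 applies the
    ; rounding offset for the shift below.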
    sub                eax, 64-2
    shr                eax, 2
    RET

%macro LOAD_DIFF_Qx2 7
    movq               xm%1, %2
    movq               xm%6, %3
    punpcklbw          xm%1, xm%6
    movq               xm%6, %4
    movq               xm%7, %5
    punpcklbw          xm%6, xm%7
    vinserti128        m%1, xm%6, 1
    pmaddubsw          m%1, hsub
%endmacro

INIT_YMM avx2
cglobal satd_8x16, 4, 8, 11, src, src_stride, dst, dst_stride, \
                             src8, dst8, src_stride3, dst_stride3
    %define hsub m0
    mova               hsub, [maddubsw_hsub]
    ; Load rows into m1-m8
    lea                src8q, [srcq+src_strideq*8]
    lea                dst8q, [dstq+dst_strideq*8]
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    LOAD_DIFF_Qx2      1, [srcq], [dstq], \
                          [src8q], [dst8q], \
                          9, 10
    LOAD_DIFF_Qx2      2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                          9, 10
    LOAD_DIFF_Qx2      3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                          9, 10
    LOAD_DIFF_Qx2      4, [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src8q+src_stride3q], [dst8q+dst_stride3q], \
                          9, 10
    lea                srcq, [srcq+src_strideq*4]
    lea                dstq, [dstq+dst_strideq*4]
    lea                src8q, [src8q+src_strideq*4]
    lea                dst8q, [dst8q+dst_strideq*4]
    LOAD_DIFF_Qx2      5, [srcq], [dstq], \
                          [src8q], [dst8q], \
                          9, 10
    LOAD_DIFF_Qx2      6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                          9, 10
    LOAD_DIFF_Qx2      7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                          9, 10
    LOAD_DIFF_Qx2      8, [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src8q+src_stride3q], [dst8q+dst_stride3q], \
                          9, 10

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pxor               m2, m2
    punpcklwd          m1, m0, m2
    punpckhwd          m0, m2
    paddd              m0, m1

    vextracti128       xm1, m0, 1
    paddd              xm0, xm1
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add a rounding offset and an offset for how the final butterfly stage
    ; and the first stage of accumulation were done.
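    ; Same offset as in satd_16x8: 16 words biased by +4 each (mod 2^16).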
    sub                eax, 64-2
    shr                eax, 2
    RET

; Less optimized, boilerplate implementations

INIT_YMM avx2
cglobal satd_8x32, 4, 9, 13, src, src_stride, dst, dst_stride, \
                             src8, dst8, src_stride3, dst_stride3, cnt
    ; ones for converting to 32-bit with pmaddwd
    pcmpeqw            m11, m11
    pabsw              m11, m11
    ; sum
    pxor               m12, m12
    mov                cntd, 1
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
    lea                src8q, [srcq+src_strideq*8]
    lea                dst8q, [dstq+dst_strideq*8]
.loop:
    %define hsub m0
    mova               hsub, [maddubsw_hsub]
    ; Load rows into m1-m8
    LOAD_DIFF_Qx2      1, [srcq], [dstq], \
                          [src8q], [dst8q], \
                          9, 10
    LOAD_DIFF_Qx2      2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                          9, 10
    LOAD_DIFF_Qx2      3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                          9, 10
    LOAD_DIFF_Qx2      4, [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src8q+src_stride3q], [dst8q+dst_stride3q], \
                          9, 10
    lea                srcq, [srcq+src_strideq*4]
    lea                dstq, [dstq+dst_strideq*4]
    lea                src8q, [src8q+src_strideq*4]
    lea                dst8q, [dst8q+dst_strideq*4]
    LOAD_DIFF_Qx2      5, [srcq], [dstq], \
                          [src8q], [dst8q], \
                          9, 10
    LOAD_DIFF_Qx2      6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
                          [src8q+src_strideq*1], [dst8q+dst_strideq*1], \
                          9, 10
    LOAD_DIFF_Qx2      7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
                          [src8q+src_strideq*2], [dst8q+dst_strideq*2], \
                          9, 10
    LOAD_DIFF_Qx2      8, [srcq+src_stride3q], [dstq+dst_stride3q], \
                          [src8q+src_stride3q], [dst8q+dst_stride3q], \
                          9, 10

    HADAMARD_8x8

    ; Reduce horizontally and convert to 32 bits
    pmaddwd            m0, m11
    paddd              m12, m0

    lea                srcq, [srcq+src_stride3q*4]
    lea                dstq, [dstq+dst_stride3q*4]
    lea                src8q, [src8q+src_stride3q*4]
    lea                dst8q, [dst8q+dst_stride3q*4]
    dec                cntd
    jge .loop

    vextracti128       xm0, m12, 1
    paddd              xm0, xm12
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add a rounding offset and an offset for how the final butterfly stage
    ; and the first stage of accumulation were done.
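    ; Both loop iterations contribute +64 (16 words biased by +4 each),
    ; giving 128.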
    sub                eax, 128-2
    shr                eax, 2
    RET

INIT_YMM avx2
cglobal satd_16x8_internal, 0, 0, 0, \
                            dummy1, src_stride, dummy2, dst_stride, \
                            src_stride3, dst_stride3, src, dst
    %define hadd m9
    %define sum m10
    ; Load rows into m1-m8
    LOAD_DIFF_DQ       1, [srcq], [dstq], 0
    LOAD_DIFF_DQ       2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ       3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ       4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
    lea                srcq, [srcq+src_strideq*4]
    lea                dstq, [dstq+dst_strideq*4]
    LOAD_DIFF_DQ       5, [srcq], [dstq], 0
    LOAD_DIFF_DQ       6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
    LOAD_DIFF_DQ       7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
    LOAD_DIFF_DQ       8, [srcq+src_stride3q], [dstq+dst_stride3q], 0

    HADAMARD_8x8

    pmaddwd            m0, hadd
    paddd              sum, m0
    ret

%macro SATD_NXM 2
%if %1 > 16
%if %2 > 8
cglobal satd_%1x%2, 4, 10, 11, src, src_stride, dst, dst_stride, \
                               src_stride3, dst_stride3, call_src, call_dst, \
                               w, h
%else
cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \
                              src_stride3, dst_stride3, call_src, call_dst, \
                              w
%endif
%else ; %2 > 8
cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \
                              src_stride3, dst_stride3, call_src, call_dst, \
                              h
%endif
    ; ones for converting to 32-bit with pmaddwd
    pcmpeqw            m9, m9
    pabsw              m9, m9
    ; sum
    pxor               m10, m10
    lea                src_stride3q, [src_strideq*3]
    lea                dst_stride3q, [dst_strideq*3]
%if %2 > 8
    mov                hd, %2/8 - 1
.looph:
%endif
%if %1 > 16
    mov                wd, %1/16 - 1
.loopv:
%endif
    mov                call_srcq, srcq
    mov                call_dstq, dstq
    call m(satd_16x8_internal)
%if %1 > 16
    add                srcq, 16
    add                dstq, 16
    dec                wd
    jge .loopv
    sub                srcq, %1
    sub                dstq, %1
%endif
%if %2 > 8
    lea                srcq, [srcq+src_strideq*8]
    lea                dstq, [dstq+dst_strideq*8]
    dec                hd
    jge .looph
%endif

    ; Reduce horizontally
    vextracti128       xm0, m10, 1
    paddd              xm0, xm10
    pshufd             xm1, xm0, q3232
    paddd              xm0, xm1
    pshuflw            xm1, xm0, q3232
    paddd              xm0, xm1
    movd               eax, xm0

    ; Normalize
    ; Add a rounding offset and an offset for how the final butterfly stage
    ; and the first stage of accumulation were done: each satd_16x8_internal
    ; call contributes +64, and there are %1*%2/128 calls.
    sub                eax, %1*%2/2 - 2
    shr                eax, 2
    RET
%endmacro

INIT_YMM avx2
SATD_NXM 16, 16
SATD_NXM 32, 32
SATD_NXM 64, 64
SATD_NXM 128, 128

SATD_NXM 16, 32
SATD_NXM 32, 16
SATD_NXM 32, 64
SATD_NXM 64, 32
SATD_NXM 64, 128
SATD_NXM 128, 64

SATD_NXM 32, 8
SATD_NXM 16, 64
SATD_NXM 64, 16

%endif ; ARCH_X86_64