;*****************************************************************************
;* dct-64.asm: x86_64 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2021 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

cextern pd_32
cextern pw_pixel_max
cextern pw_2
cextern pw_m2
cextern pw_32
cextern hsub_mul

; in: size, m0..m7, temp, temp
; out: m0..m7
%macro DCT8_1D 11
    SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
    SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
    SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
    SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07

    SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
    SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2

    psra%1 m%10, m%2, 1
    padd%1 m%10, m%2
    padd%1 m%10, m%3
    padd%1 m%10, m%4 ; %10=a4

    psra%1 m%11, m%5, 1
    padd%1 m%11, m%5
    padd%1 m%11, m%3
    psub%1 m%11, m%4 ; %11=a7

    SUMSUB_BA %1, %5, %2
    psub%1 m%2, m%4
    psub%1 m%5, m%3
    psra%1 m%4, 1
    psra%1 m%3, 1
    psub%1 m%2, m%4 ; %2=a5
    psub%1 m%5, m%3 ; %5=a6

    psra%1 m%3, m%11, 2
    padd%1 m%3, m%10 ; %3=b1
    psra%1 m%10, 2
    psub%1 m%10, m%11 ; %10=b7

    SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4

    psra%1 m%4, m%8, 1
    padd%1 m%4, m%9 ; %4=b2
    psra%1 m%9, 1
    psub%1 m%9, m%8 ; %9=b6

    psra%1 m%8, m%5, 2
    padd%1 m%8, m%2 ; %8=b3
    psra%1 m%2, 2
    psub%1 m%5, m%2 ; %5=b5

    SWAP %2, %7, %5, %8, %9, %10
%endmacro

; in: size, m0..m7, temp, temp
; out: m0..m7
%macro IDCT8_1D 11
    SUMSUB_BA %1, %6, %2, %10 ; %6=a0, %2=a2

    psra%1 m%10, m%3, 1
    padd%1 m%10, m%3
    padd%1 m%10, m%5
    padd%1 m%10, m%7 ; %10=a7

    psra%1 m%11, m%4, 1
    psub%1 m%11, m%8 ; %11=a4
    psra%1 m%8, 1
    padd%1 m%8, m%4 ; %8=a6

    psra%1 m%4, m%7, 1
    padd%1 m%4, m%7
    padd%1 m%4, m%9
    psub%1 m%4, m%3 ; %4=a5

    psub%1 m%3, m%5
    psub%1 m%7, m%5
    padd%1 m%3, m%9
    psub%1 m%7, m%9
    psra%1 m%5, 1
    psra%1 m%9, 1
    psub%1 m%3, m%5 ; %3=a3
    psub%1 m%7, m%9 ; %7=a1

    psra%1 m%5, m%10, 2
    padd%1 m%5, m%7 ; %5=b1
    psra%1 m%7, 2
    psub%1 m%10, m%7 ; %10=b7

    SUMSUB_BA %1, %8, %6, %7 ; %8=b0, %6=b6
    SUMSUB_BA %1, %11, %2, %7 ; %11=b2, %2=b4

    psra%1 m%9, m%4, 2
    padd%1 m%9, m%3 ; %9=b3
    psra%1 m%3, 2
    psub%1 m%3, m%4 ; %3=b5

    SUMSUB_BA %1, %10, %8, %7 ; %10=c0, %8=c7
    SUMSUB_BA %1, %3, %11, %7 ; %3=c1, %11=c6
    SUMSUB_BA %1, %9, %2, %7 ; %9=c2, %2=c5
    SUMSUB_BA %1, %5, %6, %7 ; %5=c3, %6=c4

    SWAP %11, %4
    SWAP %2, %10, %7
    SWAP %4, %9, %8
%endmacro

%if HIGH_BIT_DEPTH

%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,14
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
    LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2

    DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9

    TRANSPOSE4x4W 0,1,2,3,8
    WIDEN_SXWD 0,8
    WIDEN_SXWD 1,9
    WIDEN_SXWD 2,10
    WIDEN_SXWD 3,11
    DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
    mova [r0+0x00], m0
    mova [r0+0x20], m8
    mova [r0+0x40], m1
    mova [r0+0x60], m9
    mova [r0+0x80], m2
    mova [r0+0xA0], m10
    mova [r0+0xC0], m3
    mova [r0+0xE0], m11

    TRANSPOSE4x4W 4,5,6,7,0
    WIDEN_SXWD 4,0
    WIDEN_SXWD 5,1
    WIDEN_SXWD 6,2
    WIDEN_SXWD 7,3
    DCT8_1D d, 4,0,5,1,6,2,7,3, 8,9
    mova [r0+0x10], m4
    mova [r0+0x30], m0
    mova [r0+0x50], m5
    mova [r0+0x70], m1
    mova [r0+0x90], m6
    mova [r0+0xB0], m2
    mova [r0+0xD0], m7
    mova [r0+0xF0], m3
    ret
%endmacro ; SUB8x8_DCT8

INIT_XMM sse2
SUB8x8_DCT8
INIT_XMM sse4
SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8

%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,16
    add r1, 128
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    mova m0, [r1-128]
    mova m1, [r1-96]
    mova m2, [r1-64]
    mova m3, [r1-32]
    mova m4, [r1+ 0]
    mova m5, [r1+32]
    mova m6, [r1+64]
    mova m7, [r1+96]
    IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
    TRANSPOSE4x4D 0,1,2,3,8
    TRANSPOSE4x4D 4,5,6,7,8
    paddd m0, [pd_32]
    paddd m4, [pd_32]
    mova [r1+64], m6
    mova [r1+96], m7
    mova m8, [r1-112]
    mova m9, [r1-80]
    mova m10, [r1-48]
    mova m11, [r1-16]
    mova m12, [r1+16]
    mova m13, [r1+48]
    mova m14, [r1+80]
    mova m15, [r1+112]
    IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
    TRANSPOSE4x4D 8,9,10,11,6
    TRANSPOSE4x4D 12,13,14,15,6
    IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
    mova [r1-112], m8
    mova [r1-80], m9
    mova m6, [r1+64]
    mova m7, [r1+96]
    IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
    pxor m8, m8
    mova m9, [pw_pixel_max]
    STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
    STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
    STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
    STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
    mova m0, [r1-112]
    mova m1, [r1-80]
    STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
    STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
    STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
    STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
    ret
%endmacro ; ADD8x8_IDCT8

INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8

%else ; !HIGH_BIT_DEPTH

%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3,10
    add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
    mova m7, [hsub_mul]
%endif
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    SWAP 7, 9
    LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
    DCT4_1D 0, 1, 2, 3, 8
    TRANSPOSE2x4x4W 0, 1, 2, 3, 8
    DCT4_1D 4, 5, 6, 7, 8
    TRANSPOSE2x4x4W 4, 5, 6, 7, 8
    DCT4_1D 0, 1, 2, 3, 8
    STORE_DCT 0, 1, 2, 3, r0, 0
    DCT4_1D 4, 5, 6, 7, 8
    STORE_DCT 4, 5, 6, 7, r0, 64
    ret

;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8, 3,3,11
    add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
    mova m7, [hsub_mul]
%endif
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    SWAP 7, 10
    LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
    LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
    DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
    movdqa [r0+0x00], m0
    movdqa [r0+0x10], m1
    movdqa [r0+0x20], m2
    movdqa [r0+0x30], m3
    movdqa [r0+0x40], m4
    movdqa [r0+0x50], m5
    movdqa [r0+0x60], m6
    movdqa [r0+0x70], m7
    ret
%endmacro

INIT_XMM sse2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8
%undef movdqa
%undef punpcklqdq
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
INIT_XMM xop
DCT_SUB8

INIT_YMM avx2
cglobal sub16x16_dct8, 3,3,10
    add r0, 128
    add r2, 4*FDEC_STRIDE
    call .sub16x8_dct8
    add r0, 256
    add r1, FENC_STRIDE*8
    add r2, FDEC_STRIDE*8
    call .sub16x8_dct8
    RET
.sub16x8_dct8:
    LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
    LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
    LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
    DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
    ; store the two 8x8 blocks: low 128-bit lanes to [r0-0x80], high lanes to [r0]
    mova [r0-0x80+0x00], xm0
    vextracti128 [r0+0x00], m0, 1
    mova [r0-0x80+0x10], xm1
    vextracti128 [r0+0x10], m1, 1
    mova [r0-0x80+0x20], xm2
    vextracti128 [r0+0x20], m2, 1
    mova [r0-0x80+0x30], xm3
    vextracti128 [r0+0x30], m3, 1
    mova [r0-0x80+0x40], xm4
    vextracti128 [r0+0x40], m4, 1
    mova [r0-0x80+0x50], xm5
    vextracti128 [r0+0x50], m5, 1
    mova [r0-0x80+0x60], xm6
    vextracti128 [r0+0x60], m6, 1
    mova [r0-0x80+0x70], xm7
    vextracti128 [r0+0x70], m7, 1
    ret

;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,11
    add r0, 4*FDEC_STRIDE
    pxor m7, m7
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    SWAP 7, 9
    movdqa m0, [r1+0x00]
    movdqa m1, [r1+0x10]
    movdqa m2, [r1+0x20]
    movdqa m3, [r1+0x30]
    movdqa m4, [r1+0x40]
    movdqa m5, [r1+0x50]
    movdqa m6, [r1+0x60]
    movdqa m7, [r1+0x70]
    IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
    TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
    paddw m0, [pw_32] ; rounding for the >>6 at the end
    IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
    ret
%endmacro ; ADD8x8_IDCT8

INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8

;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD8x8 0
cglobal add8x8_idct, 2,2,11
    add r0, 4*FDEC_STRIDE
    pxor m7, m7
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    SWAP 7, 9
    mova m0, [r1+ 0]
    mova m2, [r1+16]
    mova m1, [r1+32]
    mova m3, [r1+48]
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    mova m4, [r1+64]
    mova m6, [r1+80]
    mova m5, [r1+96]
    mova m7, [r1+112]
    SBUTTERFLY qdq, 4, 5, 8
    SBUTTERFLY qdq, 6, 7, 8
    IDCT4_1D w,0,1,2,3,8,10
    TRANSPOSE2x4x4W 0,1,2,3,8
    IDCT4_1D w,4,5,6,7,8,10
    TRANSPOSE2x4x4W 4,5,6,7,8
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,8,10
    paddw m4, [pw_32]
    IDCT4_1D w,4,5,6,7,8,10
    DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
    DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
    DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
    DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
    STORE_IDCT m1, m3, m5, m7
    ret
%endmacro ; ADD8x8

INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8

%endif ; !HIGH_BIT_DEPTH