1;***************************************************************************** 2;* cabac-a.asm: x86 cabac 3;***************************************************************************** 4;* Copyright (C) 2008-2014 x264 project 5;* 6;* Authors: Loren Merritt <lorenm@u.washington.edu> 7;* Fiona Glaser <fiona@x264.com> 8;* Holger Lubitz <holger@lubitz.org> 9;* 10;* This program is free software; you can redistribute it and/or modify 11;* it under the terms of the GNU General Public License as published by 12;* the Free Software Foundation; either version 2 of the License, or 13;* (at your option) any later version. 14;* 15;* This program is distributed in the hope that it will be useful, 16;* but WITHOUT ANY WARRANTY; without even the implied warranty of 17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18;* GNU General Public License for more details. 19;* 20;* You should have received a copy of the GNU General Public License 21;* along with this program; if not, write to the Free Software 22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 23;* 24;* This program is also available under a commercial proprietary license. 25;* For more information, contact us at licensing@x264.com. 
26;***************************************************************************** 27 28%include "x86inc.asm" 29%include "x86util.asm" 30 31SECTION_RODATA 32 33coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0 34coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9 35coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 36 db 4, 4, 4, 4, 5, 6, 7, 7 37 38%if ARCH_X86_64 39%macro COEFF_LAST_TABLE 17 40 %define funccpu1 %1 41 %define funccpu2 %2 42 %define funccpu3 %3 43 %rep 14 44 %ifidn %4, 4 45 dq mangle(x264_coeff_last%4_ %+ funccpu1) 46 %elifidn %4, 64 47 dq mangle(x264_coeff_last%4_ %+ funccpu2) 48 %else 49 dq mangle(x264_coeff_last%4_ %+ funccpu3) 50 %endif 51 %rotate 1 52 %endrep 53%endmacro 54 55cextern coeff_last4_mmx2 56cextern coeff_last4_mmx2_lzcnt 57cextern coeff_last15_sse2 58cextern coeff_last15_sse2_lzcnt 59cextern coeff_last16_sse2 60cextern coeff_last16_sse2_lzcnt 61cextern coeff_last64_sse2 62cextern coeff_last64_sse2_lzcnt 63cextern coeff_last64_avx2_lzcnt 64 65%ifdef PIC 66SECTION .data 67%endif 68coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 69coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 70coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 71%endif 72 73SECTION .text 74 75cextern cabac_range_lps 76cextern cabac_transition 77cextern cabac_renorm_shift 78cextern cabac_entropy 79cextern cabac_size_unary 80cextern cabac_transition_unary 81cextern significant_coeff_flag_offset 82cextern significant_coeff_flag_offset_8x8 83cextern last_coeff_flag_offset 84cextern last_coeff_flag_offset_8x8 85cextern coeff_abs_level_m1_offset 86cextern count_cat_m1 87cextern cabac_encode_ue_bypass 88 89%if ARCH_X86_64 90 %define pointer resq 91%else 92 %define pointer resd 93%endif 94 95struc cb 96 .low: resd 1 97 .range: resd 1 98 .queue: resd 1 99 
.bytes_outstanding: resd 1 100 .start: pointer 1 101 .p: pointer 1 102 .end: pointer 1 103 align 16, resb 1 104 .bits_encoded: resd 1 105 .state: resb 1024 106endstruc 107 108%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp 109%ifdef PIC 110 %ifidn %4, 0 111 movzx %1, byte [%2+%3+r7-$$] 112 %else 113 lea %5, [r7+%4] 114 movzx %1, byte [%2+%3+%5-$$] 115 %endif 116%else 117 movzx %1, byte [%2+%3+%4] 118%endif 119%endmacro 120 121%macro CABAC 1 122; t3 must be ecx, since it's used for shift. 123%if WIN64 124 DECLARE_REG_TMP 3,1,2,0,5,6,4,4 125%elif ARCH_X86_64 126 DECLARE_REG_TMP 0,1,2,3,4,5,6,6 127%else 128 DECLARE_REG_TMP 0,4,2,1,3,5,6,2 129%endif 130 131cglobal cabac_encode_decision_%1, 1,7 132 movifnidn t1d, r1m 133 mov t5d, [r0+cb.range] 134 movzx t6d, byte [r0+cb.state+t1] 135 movifnidn t0, r0 ; WIN64 136 mov t4d, ~1 137 mov t3d, t5d 138 and t4d, t6d 139 shr t5d, 6 140 movifnidn t2d, r2m 141%if WIN64 142 PUSH r7 143%endif 144%ifdef PIC 145 lea r7, [$$] 146%endif 147 LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 148 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 149 and t6d, 1 150 sub t3d, t5d 151 cmp t6d, t2d 152 mov t6d, [t0+cb.low] 153 lea t2, [t6+t3] 154 cmovne t3d, t5d 155 cmovne t6d, t2d 156 mov [t0+cb.state+t1], t4b 157;cabac_encode_renorm 158 mov t4d, t3d 159%ifidn %1, bmi2 160 lzcnt t3d, t3d 161 sub t3d, 23 162 shlx t4d, t4d, t3d 163 shlx t6d, t6d, t3d 164%else 165 shr t3d, 3 166 LOAD_GLOBAL t3d, cabac_renorm_shift, t3 167 shl t4d, t3b 168 shl t6d, t3b 169%endif 170%if WIN64 171 POP r7 172%endif 173 mov [t0+cb.range], t4d 174 add t3d, [t0+cb.queue] 175 jge cabac_putbyte_%1 176.update_queue_low: 177 mov [t0+cb.low], t6d 178 mov [t0+cb.queue], t3d 179 RET 180 181cglobal cabac_encode_bypass_%1, 2,3 182 mov t7d, [r0+cb.low] 183 and r1d, [r0+cb.range] 184 lea t7d, [t7*2+r1] 185 movifnidn t0, r0 ; WIN64 186 mov t3d, [r0+cb.queue] 187 inc t3d 188%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp 189 jge cabac_putbyte_%1 190%else 191 jge .putbyte 
192%endif 193 mov [t0+cb.low], t7d 194 mov [t0+cb.queue], t3d 195 RET 196%if ARCH_X86_64 == 0 197.putbyte: 198 PROLOGUE 0,7 199 movifnidn t6d, t7d 200 jmp cabac_putbyte_%1 201%endif 202 203%ifnidn %1,bmi2 204cglobal cabac_encode_terminal_%1, 1,3 205 sub dword [r0+cb.range], 2 206; shortcut: the renormalization shift in terminal 207; can only be 0 or 1 and is zero over 99% of the time. 208 test dword [r0+cb.range], 0x100 209 je .renorm 210 RET 211.renorm: 212 shl dword [r0+cb.low], 1 213 shl dword [r0+cb.range], 1 214 inc dword [r0+cb.queue] 215 jge .putbyte 216 RET 217.putbyte: 218 PROLOGUE 0,7 219 movifnidn t0, r0 ; WIN64 220 mov t3d, [r0+cb.queue] 221 mov t6d, [t0+cb.low] 222%endif 223 224cabac_putbyte_%1: 225 ; alive: t0=cb t3=queue t6=low 226%if WIN64 227 DECLARE_REG_TMP 3,6,1,0,2,5,4 228%endif 229%ifidn %1, bmi2 230 add t3d, 10 231 shrx t2d, t6d, t3d 232 bzhi t6d, t6d, t3d 233 sub t3d, 18 234%else 235 mov t1d, -1 236 add t3d, 10 237 mov t2d, t6d 238 shl t1d, t3b 239 shr t2d, t3b ; out 240 not t1d 241 sub t3d, 18 242 and t6d, t1d 243%endif 244 mov t5d, [t0+cb.bytes_outstanding] 245 cmp t2b, 0xff ; FIXME is a 32bit op faster? 246 jz .postpone 247 mov t1, [t0+cb.p] 248 add [t1-1], t2h 249 dec t2h 250.loop_outstanding: 251 mov [t1], t2h 252 inc t1 253 dec t5d 254 jge .loop_outstanding 255 mov [t1-1], t2b 256 mov [t0+cb.p], t1 257.postpone: 258 inc t5d 259 mov [t0+cb.bytes_outstanding], t5d 260 jmp mangle(x264_cabac_encode_decision_%1.update_queue_low) 261%endmacro 262 263CABAC asm 264CABAC bmi2 265 266; %1 = label name 267; %2 = node_ctx init? 
; Accumulate into r0d the bit cost of coding one coefficient's absolute
; level (r1d), given node_ctx in r2 and the ctx_level state pointer in r8.
; r9-r11 are scratch.  %2 selects the known-initial-node_ctx fast path.
%macro COEFF_ABS_LEVEL_GT1 2
%if %2
    %define ctx 1
%else
    movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
    %define ctx r11
%endif
    movzx r9d, byte [r8+ctx]
; if( coeff_abs > 1 )
    cmp r1d, 1
    jg  .%1_gt1
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
    movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
    lea   r0d, [r0+r9+256]        ; +256 = one bit (<<8) for the sign
    mov   [r8+ctx], r10b
%if %2
    mov r2d, 1
%else
    movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
%endif
    jmp .%1_end

.%1_gt1:
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
    movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
    xor   r9d, 1
    movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov   [r8+ctx], r10b
    add   r0d, r9d
%if %2
    %define ctx 5
%else
    movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
    %define ctx r11
%endif
; if( coeff_abs < 15 )
    cmp r1d, 15
    jge .%1_escape
    shl r1d, 7                    ; row stride of the unary tables is 128
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
    movzx r9d, byte [r8+ctx]
    add   r9d, r1d
    movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
    movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
    mov   [r8+ctx], r10b
    add   r0d, r9d
    jmp   .%1_gt1_end

.%1_escape:
; coeff_abs >= 15: truncated unary prefix of 14, then Exp-Golomb bypass suffix
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
    movzx r9d, byte [r8+ctx]
    movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
    movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
    add   r0d, r9d
    mov   [r8+ctx], r10b
    sub   r1d, 14
%if cpuflag(lzcnt)
    lzcnt r9d, r1d
    xor   r9d, 0x1f               ; ilog2 via lzcnt (no bsr zero-input issue)
%else
    bsr r9d, r1d
%endif
; bs_size_ue_big(coeff_abs-15)<<8
    shl r9d, 9
; (ilog2(coeff_abs-14)+1) << 8
    lea r0d, [r0+r9+256]
.%1_gt1_end:
%if %2
    mov r2d, 4
%else
    movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
%endif
.%1_end:
%endmacro

; Load abs coefficient r6 from the dct copy into %1
; (dword at high bit depth, zero-extended word otherwise).
%macro LOAD_DCTCOEF 1
%if HIGH_BIT_DEPTH
    mov %1, [dct+r6*4]
%else
    movzx %1, word [dct+r6*2]
%endif
%endmacro

; Copy %2 coefficients from %1 to the stack, replacing each with its
; absolute value.  Clobbers m0-m5.
%macro ABS_DCTCOEFS 2
%assign i 0
%rep %2/16
%if HIGH_BIT_DEPTH
    ABSD m0, [%1+ 0+i*64], m4
    ABSD m1, [%1+16+i*64], m5
    ABSD m2, [%1+32+i*64], m4
    ABSD m3, [%1+48+i*64], m5
    mova [rsp+ 0+i*64], m0
    mova [rsp+16+i*64], m1
    mova [rsp+32+i*64], m2
    mova [rsp+48+i*64], m3
%else
    ABSW m0, [%1+ 0+i*32], m2
    ABSW m1, [%1+16+i*32], m3
    mova [rsp+ 0+i*32], m0
    mova [rsp+16+i*32], m1
%endif
%assign i i+1
%endrep
%endmacro

; 8x8 mode only: per-position significance ctx offset -> r11 (r4 = sig table).
%macro SIG_OFFSET 1
%if %1
    movzx r11d, byte [r4+r6]
%endif
%endmacro

; 8x8 mode only: per-position last-coeff ctx offset -> r11.
%macro LAST_OFFSET 1
%if %1
    movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
%endif
%endmacro

;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
;                                                   int ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------

; RD cost-estimation variant: does not drive the arithmetic coder, only sums
; entropy-table costs into cb.bits_encoded while updating context states.
;%1 = 8x8 mode
%macro CABAC_RESIDUAL_RD 2
%if %1
    %define func cabac_block_residual_8x8_rd_internal
    %define maxcoeffs 64
    %define dct rsp
%else
    %define func cabac_block_residual_rd_internal
    %define maxcoeffs 16
    %define dct r4
%endif

%ifdef PIC
    cglobal func, 4,13
    lea r12, [$$]
    %define GLOBAL +r12-$$
%else
    cglobal func, 4,12
    %define GLOBAL
%endif

%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
    SUB rsp, pad
    shl r1d, 4 ; MB_INTERLACED*16
%if %1
    lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r4 = sig offset 8x8
%endif
    add   r1d, r2d
    movzx r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig
    movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last
    movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level

; abs() all the coefficients; copy them to the stack to avoid
; changing the originals.
; overreading is okay; it's all valid aligned data anyways.
%if %1
    ABS_DCTCOEFS r0, 64
%else
    mov r4, r0 ; r4 = dct
    mov r6, ~SIZEOF_DCTCOEF
    and r6, r4 ; handle AC coefficient case
    ABS_DCTCOEFS r6, 16
    sub r4, r6 ; calculate our new dct pointer
    add r4, rsp ; restore AC coefficient offset
%endif
    mov r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
    call r1 ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
    mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded
; pre-add some values to simplify addressing
    add r3, cb.state
    add r5, r3
    add r7, r3
    add r8, r3 ; precalculate cabac state pointers

; if( last != count_cat_m1[ctx_block_cat] )
%if %1
    cmp r6b, 63
%else
    cmp r6b, [count_cat_m1+r2 GLOBAL]
%endif
    je .skip_last_sigmap

; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
; so we'll use r11 for this.
%if %1
    %define siglast_ctx r11
%else
    %define siglast_ctx r6
%endif

; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
    SIG_OFFSET %1
    movzx r1d, byte [r5+siglast_ctx]
    movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
    xor   r1d, 1
    movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
    mov   [r5+siglast_ctx], r9b
    add   r0d, r1d

    LAST_OFFSET %1
    movzx r1d, byte [r7+siglast_ctx]
    movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
    xor   r1d, 1
    movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
    mov   [r7+siglast_ctx], r9b
    add   r0d, r1d
.skip_last_sigmap:
    LOAD_DCTCOEF r1d
    COEFF_ABS_LEVEL_GT1 last, 1
; for( int i = last-1 ; i >= 0; i-- )
    dec r6d
    jl  .end
.coeff_loop:
    LOAD_DCTCOEF r1d
; if( l[i] )
    SIG_OFFSET %1
    movzx r9d, byte [r5+siglast_ctx]
    test  r1d, r1d
    jnz   .coeff_nonzero
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
    movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov   [r5+siglast_ctx], r10b
    add   r0d, r9d
    dec   r6d
    jge   .coeff_loop
    jmp   .end
.coeff_nonzero:
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
    movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
    xor   r9d, 1
    movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov   [r5+siglast_ctx], r10b
    add   r0d, r9d
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
    LAST_OFFSET %1
    movzx r9d, byte [r7+siglast_ctx]
    movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov   [r7+siglast_ctx], r10b
    add   r0d, r9d
    COEFF_ABS_LEVEL_GT1 coeff, 0
    dec r6d
    jge .coeff_loop
.end:
    mov [r3+cb.bits_encoded-cb.state], r0d
    ADD rsp, pad
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
%endif

;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
;                                                int ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------

; Call the cpu-appropriate encode_decision; on WIN64 the callee clobbers r0,
; so restore the cabac pointer afterwards.
%macro CALL_CABAC 0
%if cpuflag(bmi2)
    call cabac_encode_decision_bmi2
%else
    call cabac_encode_decision_asm
%endif
%if WIN64 ; move cabac back
    mov r0, r3
%endif
%endmacro

; Significance-map pass: for each position up to the last coefficient, code
; the sig/last flags and stash nonzero levels into the coeffs stack array.
; %1 = 8x8 mode
; %2 = dct register
; %3 = countcat
; %4 = name
%macro SIGMAP_LOOP 3-4
.sigmap_%4loop:
%if HIGH_BIT_DEPTH
    mov %2, [dct+r10*4]
%else
    movsx %2, word [dct+r10*2]
%endif
%if %1
    movzx r1d, byte [sigoff_8x8 + r10]
    add   r1d, sigoffd
%else
    lea r1d, [sigoffd + r10d]
%endif
    test %2, %2
    jz   .sigmap_%4zero ; if( l[i] )
    inc  coeffidxd
    mov  [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i];
    mov  r2d, 1
    CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
%if %1
    movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
    add   r1d, lastoffd
%else
    lea r1d, [lastoffd + r10d]
%endif
    cmp r10d, lastm ; if( i == last )
    je  .sigmap_%4last
    xor r2d, r2d
    CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
    jmp .sigmap_%4loop_endcheck
.sigmap_%4zero:
    xor r2d, r2d
    CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
.sigmap_%4loop_endcheck:
    inc r10d
    cmp r10d, %3
    jne .sigmap_%4loop ; if( ++i == count_m1 )
; final position is implicitly significant: store it without coding sig/last
%if HIGH_BIT_DEPTH
    mov %2, [dct+r10*4]
%else
    movsx %2, word [dct+r10*2]
%endif
    inc coeffidxd
    mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]
    jmp .sigmap_%4end
.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
    mov r2d, 1
    CALL_CABAC
.sigmap_%4end:
%if %1==0
    jmp .level_loop_start
%endif
%endmacro

; Full encoding variant: runs the significance map pass, then codes the
; stored levels (and signs, via bypass) in reverse scan order.
; %1 = coeff_last function-pointer table
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
    lea r7, [$$]
    %define lastm [rsp+4*1]
    %define GLOBAL +r7-$$
%else
    %define lastm r7d
    %define GLOBAL
%endif
%assign pad gprsize+4*2+4*64-(stack_offset&15)
    SUB rsp, pad
    shl r1d, 4

    %define sigoffq r8
    %define sigoffd r8d
    %define lastoffq r9
    %define lastoffd r9d
    %define leveloffq r10
    %define leveloffd r10d
    %define leveloffm [rsp+4*0]
    %define countcatd r11d
    %define sigoff_8x8 r12
    %define coeffidxq r13
    %define coeffidxd r13d
    %define dct r14
    %define coeffs rsp+4*2

    lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
    add r1d, r2d
    movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
    movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
    movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
    movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
    mov coeffidxd, -1
    mov dct, r0
    mov leveloffm, leveloffd

    mov r1, [%1+gprsize*r2 GLOBAL]
    call r1
    mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
    mov r0, r3

    xor r10d, r10d
    cmp countcatd, 63
    je  .sigmap_8x8
    SIGMAP_LOOP 0, r12d, countcatd,
.sigmap_8x8:
    SIGMAP_LOOP 1, r11d, 63, _8x8
.level_loop_start:
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
    %define nodectxq r8
    %define nodectxd r8d
    mov leveloffd, leveloffm
    xor nodectxd, nodectxd
.level_loop:
    mov r9d, [coeffs+coeffidxq*4]
    mov r11d, r9d
    sar r11d, 31                  ; r11 = sign mask
    add r9d, r11d
    movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
    xor r9d, r11d                 ; r9 = abs(level)
    add r1d, leveloffd
    cmp r9d, 1
    jg  .level_gt1
    xor r2d, r2d
    CALL_CABAC
    movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
    jmp .level_sign
.level_gt1:
    mov r2d, 1
    CALL_CABAC
    movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
    add   r14d, leveloffd
    cmp   r9d, 15
    mov   r12d, 15
    cmovl r12d, r9d               ; r12 = min(abs, 15) - 2 unary "1" bins remain
    sub   r12d, 2
    jz    .level_eq2
.level_gt1_loop:
    mov r1d, r14d
    mov r2d, 1
    CALL_CABAC
    dec r12d
    jg  .level_gt1_loop
    cmp r9d, 15
    jge .level_bypass
.level_eq2:
    mov r1d, r14d
    xor r2d, r2d
    CALL_CABAC
    jmp .level_gt1_end
.level_bypass:
    lea  r2d, [r9d-15]
    xor  r1d, r1d
    push r0
; we could avoid this if we implemented it in asm, but I don't feel like that
; right now.
%if UNIX64
    push r7
    push r8
%else
    sub rsp, 32 ; shadow space
%endif
    call cabac_encode_ue_bypass
%if UNIX64
    pop r8
    pop r7
%else
    add rsp, 32
%endif
    pop r0
.level_gt1_end:
    movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
    mov r1d, r11d                 ; sign bit as bypass bin
%if cpuflag(bmi2)
    call cabac_encode_bypass_bmi2
%else
    call cabac_encode_bypass_asm
%endif
%if WIN64
    mov r0, r3
%endif
    dec coeffidxd
    jge .level_loop
    ADD rsp, pad
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL coeff_last_sse2_lzcnt
INIT_XMM avx2,bmi2
CABAC_RESIDUAL coeff_last_avx2_lzcnt
%endif