1;*****************************************************************************
2;* cabac-a.asm: x86 cabac
3;*****************************************************************************
4;* Copyright (C) 2008-2014 x264 project
5;*
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;*          Fiona Glaser <fiona@x264.com>
8;*          Holger Lubitz <holger@lubitz.org>
9;*
10;* This program is free software; you can redistribute it and/or modify
11;* it under the terms of the GNU General Public License as published by
12;* the Free Software Foundation; either version 2 of the License, or
13;* (at your option) any later version.
14;*
15;* This program is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18;* GNU General Public License for more details.
19;*
20;* You should have received a copy of the GNU General Public License
21;* along with this program; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23;*
24;* This program is also available under a commercial proprietary license.
25;* For more information, contact us at licensing@x264.com.
26;*****************************************************************************
27
28%include "x86inc.asm"
29%include "x86util.asm"
30
SECTION_RODATA

; Context-selection tables for CABAC coefficient-level coding.
; coeff_abs_level1_ctx:       node_ctx -> ctx offset for the "abs_level > 1" decision
; coeff_abs_levelgt1_ctx:     node_ctx -> ctx offset for the level-minus-1 unary prefix
; coeff_abs_level_transition: next node_ctx after coding a level; row 0 is used when
;                             abs_level == 1, row 1 (offset +8) when abs_level > 1.
coeff_abs_level1_ctx:       db 1, 2, 3, 4, 0, 0, 0, 0
coeff_abs_levelgt1_ctx:     db 5, 5, 5, 5, 6, 7, 8, 9
coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
                            db 4, 4, 4, 4, 5, 6, 7, 7
37
%if ARCH_X86_64
; Build a 14-entry table of coeff_last function pointers, one per ctx_block_cat.
; %1/%2/%3 = cpu suffix for the 4-coefficient, 64-coefficient, and all remaining
; (15/16-coefficient) variants respectively; %4..%17 are the per-category
; coefficient counts that select which variant each entry points to.
%macro COEFF_LAST_TABLE 17
    %define funccpu1 %1 ; suffix for coeff_last4
    %define funccpu2 %2 ; suffix for coeff_last64
    %define funccpu3 %3 ; suffix for coeff_last15/coeff_last16
    %rep 14
        %ifidn %4, 4
            dq mangle(x264_coeff_last%4_ %+ funccpu1)
        %elifidn %4, 64
            dq mangle(x264_coeff_last%4_ %+ funccpu2)
        %else
            dq mangle(x264_coeff_last%4_ %+ funccpu3)
        %endif
        %rotate 1
    %endrep
%endmacro

cextern coeff_last4_mmx2
cextern coeff_last4_mmx2_lzcnt
cextern coeff_last15_sse2
cextern coeff_last15_sse2_lzcnt
cextern coeff_last16_sse2
cextern coeff_last16_sse2_lzcnt
cextern coeff_last64_sse2
cextern coeff_last64_sse2_lzcnt
cextern coeff_last64_avx2_lzcnt

%ifdef PIC
; these tables hold absolute function addresses, which need load-time
; relocation under PIC, so they can't live in a read-only section
SECTION .data
%endif
coeff_last_sse2:       COEFF_LAST_TABLE       mmx2,       sse2,       sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%endif
72
73SECTION .text
74
75cextern cabac_range_lps
76cextern cabac_transition
77cextern cabac_renorm_shift
78cextern cabac_entropy
79cextern cabac_size_unary
80cextern cabac_transition_unary
81cextern significant_coeff_flag_offset
82cextern significant_coeff_flag_offset_8x8
83cextern last_coeff_flag_offset
84cextern last_coeff_flag_offset_8x8
85cextern coeff_abs_level_m1_offset
86cextern count_cat_m1
87cextern cabac_encode_ue_bypass
88
%if ARCH_X86_64
    %define pointer resq    ; native pointer: 8 bytes
%else
    %define pointer resd    ; 4 bytes on x86-32
%endif

; Offsets into x264_cabac_t. NOTE(review): this layout must stay in sync with
; the C-side struct definition -- verify against common/cabac.h when changing.
struc cb
    .low: resd 1                ; range coder low value
    .range: resd 1              ; current range
    .queue: resd 1              ; bits accumulated before the next byte is flushed
    .bytes_outstanding: resd 1  ; pending 0xff bytes awaiting carry resolution
    .start: pointer 1           ; output buffer start
    .p: pointer 1               ; current output position
    .end: pointer 1             ; output buffer end
    align 16, resb 1            ; pad so .state is 16-byte aligned
    .bits_encoded: resd 1       ; RD bit counter in 1/256-bit units (code below adds 256 per bit)
    .state: resb 1024           ; CABAC context states (one byte each)
endstruc
107
; LOAD_GLOBAL dst, base, off1[, off2, tmp]
; Zero-extending byte load from a global table, PIC-safe.
; In PIC builds the caller must have set r7 = $$ beforehand; the address is
; then formed relative to that base. A scaled second offset (%4 nonzero)
; can't be folded into the RIP-relative form, so it needs scratch reg %5.
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
%ifdef PIC
    %ifidn %4, 0
        movzx %1, byte [%2+%3+r7-$$]
    %else
        lea   %5, [r7+%4]
        movzx %1, byte [%2+%3+%5-$$]
    %endif
%else
    movzx %1, byte [%2+%3+%4]
%endif
%endmacro
120
; Instantiate the three bit-level cabac entry points:
;   cabac_encode_decision_%1( x264_cabac_t *cb, int ctx, int b )
;   cabac_encode_bypass_%1  ( x264_cabac_t *cb, int b )
;   cabac_encode_terminal_%1( x264_cabac_t *cb )          (asm variant only)
; plus the shared byte-flush tail cabac_putbyte_%1.
; %1 = suffix: "asm" (baseline) or "bmi2" (uses shlx/shrx/bzhi/lzcnt).
%macro CABAC 1
; t3 must be ecx, since it's used for shift.
%if WIN64
    DECLARE_REG_TMP 3,1,2,0,5,6,4,4
%elif ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,6,6
%else
    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%endif

cglobal cabac_encode_decision_%1, 1,7
    movifnidn t1d, r1m                 ; t1 = ctx index
    mov   t5d, [r0+cb.range]
    movzx t6d, byte [r0+cb.state+t1]   ; t6 = context state (bit 0 = MPS value)
    movifnidn t0,  r0 ; WIN64
    mov   t4d, ~1
    mov   t3d, t5d
    and   t4d, t6d                     ; t4 = state & ~1 (probability index * 2)
    shr   t5d, 6                       ; t5 = range>>6, selects the range_lps column
    movifnidn t2d, r2m                 ; t2 = bit to encode
%if WIN64
    PUSH r7
%endif
%ifdef PIC
    lea    r7, [$$]                    ; PIC base for LOAD_GLOBAL
%endif
    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4  ; t5 = range_lps
    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4   ; t4 = next context state
    and   t6d, 1                       ; t6 = MPS value
    sub   t3d, t5d                     ; t3 = range - range_lps (MPS range)
    cmp   t6d, t2d                     ; did we encode the MPS?
    mov   t6d, [t0+cb.low]
    lea    t2, [t6+t3]                 ; low + MPS range (used on the LPS path)
    cmovne t3d, t5d                    ; LPS: range = range_lps
    cmovne t6d, t2d                    ; LPS: low  += MPS range
    mov   [t0+cb.state+t1], t4b        ; store updated context state
;cabac_encode_renorm
    mov   t4d, t3d
%ifidn %1, bmi2
    lzcnt t3d, t3d
    sub   t3d, 23                      ; shift count that renormalizes range -- TODO confirm range bound
    shlx  t4d, t4d, t3d
    shlx  t6d, t6d, t3d
%else
    shr   t3d, 3
    LOAD_GLOBAL t3d, cabac_renorm_shift, t3  ; table-driven renorm shift count
    shl   t4d, t3b
    shl   t6d, t3b
%endif
%if WIN64
    POP r7
%endif
    mov   [t0+cb.range], t4d
    add   t3d, [t0+cb.queue]
    jge cabac_putbyte_%1               ; queue overflowed: flush a byte
.update_queue_low:
    mov   [t0+cb.low], t6d
    mov   [t0+cb.queue], t3d
    RET

cglobal cabac_encode_bypass_%1, 2,3
    mov       t7d, [r0+cb.low]
    and       r1d, [r0+cb.range]       ; r1 = b ? range : 0 (b may be 0/nonzero mask)
    lea       t7d, [t7*2+r1]           ; low = low*2 + (b ? range : 0)
    movifnidn  t0, r0 ; WIN64
    mov       t3d, [r0+cb.queue]
    inc       t3d
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
    jge cabac_putbyte_%1
%else
    jge .putbyte
%endif
    mov   [t0+cb.low], t7d
    mov   [t0+cb.queue], t3d
    RET
%if ARCH_X86_64 == 0
.putbyte:
    PROLOGUE 0,7
    movifnidn t6d, t7d
    jmp cabac_putbyte_%1
%endif

%ifnidn %1,bmi2 ; terminal contains nothing bmi2 could speed up, so only one variant is built
cglobal cabac_encode_terminal_%1, 1,3
    sub  dword [r0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
    test dword [r0+cb.range], 0x100
    je .renorm
    RET
.renorm:
    shl  dword [r0+cb.low], 1
    shl  dword [r0+cb.range], 1
    inc  dword [r0+cb.queue]
    jge .putbyte
    RET
.putbyte:
    PROLOGUE 0,7
    movifnidn t0, r0 ; WIN64
    mov t3d, [r0+cb.queue]
    mov t6d, [t0+cb.low]
%endif

; Shared tail: extract the top byte of low and write it out, resolving
; any carry into previously written 0xff bytes.
cabac_putbyte_%1:
    ; alive: t0=cb t3=queue t6=low
%if WIN64
    DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
%ifidn %1, bmi2
    add   t3d, 10
    shrx  t2d, t6d, t3d                ; t2 = outgoing byte (carry in bit 8)
    bzhi  t6d, t6d, t3d                ; low &= (1<<t3)-1
    sub   t3d, 18
%else
    mov   t1d, -1
    add   t3d, 10
    mov   t2d, t6d
    shl   t1d, t3b
    shr   t2d, t3b ; out
    not   t1d
    sub   t3d, 18
    and   t6d, t1d                     ; low &= (1<<(queue+10))-1
%endif
    mov   t5d, [t0+cb.bytes_outstanding]
    cmp   t2b, 0xff ; FIXME is a 32bit op faster?
    jz    .postpone                    ; 0xff could still be changed by a later carry
    mov    t1, [t0+cb.p]
    add   [t1-1], t2h                  ; propagate carry (t2h = bit 8) into the last byte
    dec   t2h                          ; t2h = carry ? 0x00 : 0xff -> value for pending bytes
.loop_outstanding:
    mov   [t1], t2h
    inc   t1
    dec   t5d
    jge .loop_outstanding
    mov   [t1-1], t2b                  ; finally write the new byte itself
    mov   [t0+cb.p], t1
.postpone:
    inc   t5d
    mov   [t0+cb.bytes_outstanding], t5d
    jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
%endmacro

CABAC asm
CABAC bmi2
265
; Count the cost (in 1/256-bit units, accumulated into r0d) of coding one
; coefficient level, updating the cabac context states in-place.
; %1 = label name
; %2 = node_ctx init? (1 = first coded coefficient, so node_ctx is known to
;      be 0 and the ctx offsets become compile-time constants)
; In:       r1d = abs(coeff), r2 = node_ctx, r8 = &cb.state[ctx_level]
; Updates:  r0d (bit count), r2d (next node_ctx)
; Clobbers: r9, r10, r11
%macro COEFF_ABS_LEVEL_GT1 2
%if %2
    %define ctx 1
%else
    movzx  r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
    %define ctx r11
%endif
    movzx   r9d, byte [r8+ctx]
; if( coeff_abs > 1 )
    cmp     r1d, 1
    jg .%1_gt1
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    lea     r0d, [r0+r9+256]    ; entropy + 256 (one extra whole bit -- presumably the sign; TODO confirm vs C)
    mov [r8+ctx], r10b
%if %2
    mov     r2d, 1
%else
    movzx   r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
%endif
    jmp .%1_end

.%1_gt1:
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
    xor     r9d, 1
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov [r8+ctx], r10b
    add     r0d, r9d
%if %2
    %define ctx 5
%else
    movzx  r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
    %define ctx r11
%endif
; if( coeff_abs < 15 )
    cmp     r1d, 15
    jge .%1_escape
    shl     r1d, 7              ; index = (coeff_abs-1)*128 + state (tables are 128 entries/row)
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
    movzx   r9d, byte [r8+ctx]
    add     r9d, r1d
    movzx  r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
    movzx   r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
    mov [r8+ctx], r10b
    add     r0d, r9d
    jmp .%1_gt1_end

.%1_escape:
; escape: a full 14-bin prefix plus an Exp-Golomb suffix
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
    movzx   r9d, byte [r8+ctx]
    movzx  r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
    movzx   r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
    add     r0d, r9d
    mov [r8+ctx], r10b
    sub     r1d, 14
%if cpuflag(lzcnt)
    lzcnt   r9d, r1d
    xor     r9d, 0x1f           ; lzcnt -> bsr equivalent: r9 = ilog2(coeff_abs-14)
%else
    bsr     r9d, r1d
%endif
; bs_size_ue_big(coeff_abs-15)<<8
    shl     r9d, 9
; (ilog2(coeff_abs-14)+1) << 8
    lea     r0d, [r0+r9+256]
.%1_gt1_end:
%if %2
    mov     r2d, 4
%else
    movzx   r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
%endif
.%1_end:
%endmacro
345
; Load coefficient #r6 from `dct` into %1.
; Used after ABS_DCTCOEFS, so values are non-negative and movzx is safe
; in the 16-bit (non-high-bit-depth) case.
%macro LOAD_DCTCOEF 1
%if HIGH_BIT_DEPTH
    mov     %1, [dct+r6*4]
%else
    movzx   %1, word [dct+r6*2]
%endif
%endmacro
353
; Copy %2 coefficients from %1 onto the stack, replacing each with its
; absolute value (SIMD, 16 coefficients per iteration; %2 must be a
; multiple of 16). The originals are left untouched.
%macro ABS_DCTCOEFS 2
%assign i 0
%rep %2/16
%if HIGH_BIT_DEPTH
    ABSD   m0, [%1+ 0+i*64], m4
    ABSD   m1, [%1+16+i*64], m5
    ABSD   m2, [%1+32+i*64], m4
    ABSD   m3, [%1+48+i*64], m5
    mova [rsp+ 0+i*64], m0
    mova [rsp+16+i*64], m1
    mova [rsp+32+i*64], m2
    mova [rsp+48+i*64], m3
%else
    ABSW   m0, [%1+ 0+i*32], m2
    ABSW   m1, [%1+16+i*32], m3
    mova [rsp+ 0+i*32], m0
    mova [rsp+16+i*32], m1
%endif
%assign i i+1
%endrep
%endmacro
375
; In 8x8 mode (%1 nonzero) translate scan position r6 into the 8x8
; significance-map context index via the table pointed to by r4;
; no-op otherwise (the position itself is the context offset).
%macro SIG_OFFSET 1
%if %1
    movzx  r11d, byte [r4+r6]
%endif
%endmacro
381
; In 8x8 mode (%1 nonzero) translate scan position r6 into the 8x8
; last-coefficient-flag context index; no-op otherwise.
%macro LAST_OFFSET 1
%if %1
    movzx  r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
%endif
%endmacro
387
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
;                                                   int ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------

; RD variant: outputs no bits; it only accumulates the cost of coding the
; block (in 1/256-bit units) into cb.bits_encoded, updating context states.
; %1 = 8x8 mode
; %2 = coeff_last function-pointer table, indexed by ctx_block_cat
%macro CABAC_RESIDUAL_RD 2
%if %1
    %define func cabac_block_residual_8x8_rd_internal
    %define maxcoeffs 64
    %define dct rsp
%else
    %define func cabac_block_residual_rd_internal
    %define maxcoeffs 16
    %define dct r4
%endif

%ifdef PIC
    cglobal func, 4,13
    lea     r12, [$$]                                         ; r12 = PIC base
    %define GLOBAL +r12-$$
%else
    cglobal func, 4,12
    %define GLOBAL
%endif

; stack space for the abs'd coefficient copy, padded to keep alignment
%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
    SUB     rsp, pad
    shl     r1d, 4                                            ; MB_INTERLACED*16
%if %1
    lea      r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]     ; r4 = sig offset 8x8
%endif
    add     r1d, r2d
    movzx   r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL]    ; r5 = ctx_sig
    movzx   r7d, word [last_coeff_flag_offset+r1*2 GLOBAL]           ; r7 = ctx_last
    movzx   r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]        ; r8 = ctx_level

; abs() all the coefficients; copy them to the stack to avoid
; changing the originals.
; overreading is okay; it's all valid aligned data anyways.
%if %1
    ABS_DCTCOEFS r0, 64
%else
    mov      r4, r0                                           ; r4 = dct
    mov      r6, ~SIZEOF_DCTCOEF
    and      r6, r4                                           ; handle AC coefficient case
    ABS_DCTCOEFS r6, 16
    sub      r4, r6                                           ; calculate our new dct pointer
    add      r4, rsp                                          ; restore AC coefficient offset
%endif
    mov      r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
    call     r1                                               ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
    mov     r0d, [r3+cb.bits_encoded]                         ; r0 = cabac.f8_bits_encoded
; pre-add some values to simplify addressing
    add      r3, cb.state
    add      r5, r3
    add      r7, r3
    add      r8, r3                                           ; precalculate cabac state pointers

; if( last != count_cat_m1[ctx_block_cat] )
%if %1
    cmp     r6b, 63
%else
    cmp     r6b, [count_cat_m1+r2 GLOBAL]
%endif
    je .skip_last_sigmap

; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
; so we'll use r11 for this.
%if %1
    %define siglast_ctx r11
%else
    %define siglast_ctx r6
%endif

; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
    SIG_OFFSET %1
    movzx   r1d, byte [r5+siglast_ctx]
    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
    xor     r1d, 1
    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
    mov [r5+siglast_ctx], r9b
    add     r0d, r1d

    LAST_OFFSET %1
    movzx   r1d, byte [r7+siglast_ctx]
    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
    xor     r1d, 1
    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
    mov [r7+siglast_ctx], r9b
    add     r0d, r1d
.skip_last_sigmap:
    LOAD_DCTCOEF r1d
    COEFF_ABS_LEVEL_GT1 last, 1
; for( int i = last-1 ; i >= 0; i-- )
    dec     r6d
    jl .end
.coeff_loop:
    LOAD_DCTCOEF r1d
; if( l[i] )
    SIG_OFFSET %1
    movzx   r9d, byte [r5+siglast_ctx]
    test    r1d, r1d
    jnz .coeff_nonzero
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov [r5+siglast_ctx], r10b
    add     r0d, r9d
    dec     r6d
    jge .coeff_loop
    jmp .end
.coeff_nonzero:
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
    xor     r9d, 1
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov [r5+siglast_ctx], r10b
    add     r0d, r9d
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
    LAST_OFFSET %1
    movzx   r9d, byte [r7+siglast_ctx]
    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov [r7+siglast_ctx], r10b
    add     r0d, r9d
    COEFF_ABS_LEVEL_GT1 coeff, 0
    dec     r6d
    jge .coeff_loop
.end:
    mov [r3+cb.bits_encoded-cb.state], r0d      ; r3 was advanced by cb.state above
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
%endif
542
543;-----------------------------------------------------------------------------
544; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
545;                                                int ctx_block_cat, x264_cabac_t *cb );
546;-----------------------------------------------------------------------------
547
; Call the cpu-appropriate cabac_encode_decision variant.
; In: r0 = cb, r1d = ctx, r2d = bit.
; On WIN64 the call's register shuffling loses r0, so restore the cabac
; pointer from r3 afterwards.
%macro CALL_CABAC 0
%if cpuflag(bmi2)
    call cabac_encode_decision_bmi2
%else
    call cabac_encode_decision_asm
%endif
%if WIN64 ; move cabac back
    mov r0, r3
%endif
%endmacro
558
; Emit the significance map and last-coefficient flags for one block,
; stashing each nonzero level into the `coeffs` scratch array for the
; level loop that follows. Expands inside CABAC_RESIDUAL and relies on
; its register %defines (sigoffd, lastoffd, coeffidx*, coeffs, dct, lastm).
; %1 = 8x8 mode
; %2 = dct register
; %3 = countcat
; %4 = name
%macro SIGMAP_LOOP 3-4
.sigmap_%4loop:
%if HIGH_BIT_DEPTH
    mov      %2, [dct+r10*4]
%else
    movsx    %2, word [dct+r10*2]   ; levels keep their sign here (unlike the RD path)
%endif
%if %1
    movzx   r1d, byte [sigoff_8x8 + r10]
    add     r1d, sigoffd
%else
    lea     r1d, [sigoffd + r10d]
%endif
    test     %2, %2
    jz .sigmap_%4zero               ; if( l[i] )
    inc coeffidxd
    mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i];
    mov     r2d, 1
    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
%if %1
    movzx   r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
    add     r1d, lastoffd
%else
    lea     r1d, [lastoffd + r10d]
%endif
    cmp    r10d, lastm              ; if( i == last )
    je .sigmap_%4last
    xor     r2d, r2d
    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
    jmp .sigmap_%4loop_endcheck
.sigmap_%4zero:
    xor     r2d, r2d
    CALL_CABAC                      ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
.sigmap_%4loop_endcheck:
    inc    r10d
    cmp    r10d, %3
    jne .sigmap_%4loop              ; if( ++i == count_m1 )
; final coefficient: implicitly significant, no sig/last flags are coded
%if HIGH_BIT_DEPTH
    mov      %2, [dct+r10*4]
%else
    movsx    %2, word [dct+r10*2]
%endif
    inc coeffidxd
    mov [coeffs+coeffidxq*4], %2    ; coeffs[++coeff_idx] = l[i]
    jmp .sigmap_%4end
.sigmap_%4last:                     ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
    mov     r2d, 1
    CALL_CABAC
.sigmap_%4end:
%if %1==0
    jmp .level_loop_start
%endif
%endmacro
616
; Full encode variant of block-residual coding: actually emits bits by
; calling the cabac_encode_* functions above. Codes the significance map
; (via SIGMAP_LOOP), then levels and signs in reverse scan order.
; %1 = coeff_last function-pointer table, indexed by ctx_block_cat
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
    lea     r7, [$$]
    %define lastm [rsp+4*1]
    %define GLOBAL +r7-$$
%else
    %define lastm r7d
    %define GLOBAL
%endif
; locals: 2 dwords (lastm/leveloffm) + 64 coefficient dwords, alignment-padded
%assign pad gprsize+4*2+4*64-(stack_offset&15)
    SUB     rsp, pad
    shl     r1d, 4                  ; MB_INTERLACED*16

    %define sigoffq r8
    %define sigoffd r8d
    %define lastoffq r9
    %define lastoffd r9d
    %define leveloffq r10
    %define leveloffd r10d
    %define leveloffm [rsp+4*0]
    %define countcatd r11d
    %define sigoff_8x8 r12
    %define coeffidxq r13
    %define coeffidxd r13d
    %define dct r14
    %define coeffs rsp+4*2

    lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
    add     r1d, r2d
    movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
    movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
    movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
    movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
    mov coeffidxd, -1               ; coeff_idx starts before the first slot
    mov     dct, r0
    mov leveloffm, leveloffd        ; spill: leveloffd's reg is reused as loop counter

    mov      r1, [%1+gprsize*r2 GLOBAL]
    call     r1                     ; coeff_last[ctx_block_cat]( dct )
    mov   lastm, eax
; put cabac in r0; needed for cabac_encode_decision
    mov      r0, r3

    xor    r10d, r10d               ; i = 0
    cmp countcatd, 63
    je .sigmap_8x8
    SIGMAP_LOOP 0, r12d, countcatd,
.sigmap_8x8:
    SIGMAP_LOOP 1, r11d, 63, _8x8
.level_loop_start:
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
    %define nodectxq r8
    %define nodectxd r8d
    mov leveloffd, leveloffm
    xor nodectxd, nodectxd
; levels were stored in scan order; code them back-to-front
.level_loop:
    mov     r9d, [coeffs+coeffidxq*4]
    mov    r11d, r9d
    sar    r11d, 31                 ; r11d = sign mask (-1 if negative, else 0)
    add     r9d, r11d
    movzx   r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
    xor     r9d, r11d               ; r9d = abs(level)
    add     r1d, leveloffd
    cmp     r9d, 1
    jg .level_gt1
    xor     r2d, r2d
    CALL_CABAC                      ; "abs_level > 1" = 0
    movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
    jmp .level_sign
.level_gt1:
    mov     r2d, 1
    CALL_CABAC                      ; "abs_level > 1" = 1
    movzx  r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
    add    r14d, leveloffd          ; r14d = ctx for the unary prefix bins
    cmp     r9d, 15
    mov    r12d, 15
    cmovl  r12d, r9d                ; r12d = min(abs, 15)
    sub    r12d, 2                  ; remaining '1' bins of the unary prefix
    jz .level_eq2
.level_gt1_loop:
    mov     r1d, r14d
    mov     r2d, 1
    CALL_CABAC
    dec    r12d
    jg .level_gt1_loop
    cmp     r9d, 15
    jge .level_bypass               ; prefix saturated: code the escape suffix
.level_eq2:
    mov     r1d, r14d
    xor     r2d, r2d
    CALL_CABAC                      ; terminating '0' bin of the unary prefix
    jmp .level_gt1_end
.level_bypass:
    lea     r2d, [r9d-15]           ; suffix value = abs - 15, Exp-Golomb bypass-coded
    xor     r1d, r1d
    push     r0
; we could avoid this if we implemented it in asm, but I don't feel like that
; right now.
%if UNIX64
    push     r7
    push     r8                     ; preserve caller-saved-by-C regs we still need
%else
    sub      rsp, 32 ; shadow space
%endif
    call cabac_encode_ue_bypass
%if UNIX64
    pop      r8
    pop      r7
%else
    add      rsp, 32
%endif
    pop      r0
.level_gt1_end:
    movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
    mov     r1d, r11d               ; sign mask; bypass ANDs it with range, so -1 acts as b=1
%if cpuflag(bmi2)
    call cabac_encode_bypass_bmi2
%else
    call cabac_encode_bypass_asm
%endif
%if WIN64
    mov      r0, r3                 ; restore cabac pointer clobbered by the call
%endif
    dec coeffidxd
    jge .level_loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL coeff_last_sse2_lzcnt
INIT_XMM avx2,bmi2
CABAC_RESIDUAL coeff_last_avx2_lzcnt
%endif
757