;
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
;
; Copyright 2009-2011, 2014-2016 D. R. Commander.
; Copyright 2015 Matthieu Darbois
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based directly on jchuff.c; see jchuff.c for more
; details.
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_huff_encode_one_block) PRIVATE

EXTN(jconst_huff_encode_one_block):

%include "jpeg_nbits_table.inc"

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

; These macros perform the same task as the emit_bits() function in the
; original libjpeg code.  In addition to reducing overhead by explicitly
; inlining the code, additional performance is achieved by taking into
; account the size of the bit buffer and waiting until it is almost full
; before emptying it.  This mostly benefits 64-bit platforms, since 6
; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
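;
; A rough C sketch of what EMIT_BITS (= PUT_BITS + CHECKBUF15) computes in
; this 32-bit build, which keeps at most 15 bits resident in put_buffer:
;
;   put_bits += size;                       /* PUT_BITS */
;   put_buffer = (put_buffer << size) | code;
;   if (put_bits > 15) {                    /* CHECKBUF15 */
;     /* EMIT_BYTE, executed twice: */
;     put_bits -= 8;
;     c = (JOCTET)(put_buffer >> put_bits);
;     *buffer++ = c;
;     if (c == 0xFF) *buffer++ = 0;         /* stuff a zero byte after 0xFF */
;     ...
;   }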

%macro EMIT_BYTE 0
        sub put_bits, 8  ; put_bits -= 8;
        mov edx, put_buffer
        mov ecx, put_bits
        shr edx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
        mov byte [eax], dl  ; *buffer++ = c;
        add eax, 1
        cmp dl, 0xFF  ; need to stuff a zero byte?
        jne %%.EMIT_BYTE_END
        mov byte [eax], 0  ; *buffer++ = 0;
        add eax, 1
%%.EMIT_BYTE_END:
%endmacro

%macro PUT_BITS 1
        add put_bits, ecx  ; put_bits += size;
        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
        or  put_buffer, %1
%endmacro

%macro CHECKBUF15 0
        cmp put_bits, 16  ; if (put_bits > 15) {
        jl %%.CHECKBUF15_END
        mov eax, POINTER [esp+buffer]
        EMIT_BYTE
        EMIT_BYTE
        mov POINTER [esp+buffer], eax
%%.CHECKBUF15_END:
%endmacro

%macro EMIT_BITS 1
        PUT_BITS %1
        CHECKBUF15
%endmacro

%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
    pxor xmm4, xmm4  ; __m128i neg = _mm_setzero_si128();
    pxor xmm5, xmm5  ; __m128i neg = _mm_setzero_si128();
    pxor xmm6, xmm6  ; __m128i neg = _mm_setzero_si128();
    pxor xmm7, xmm7  ; __m128i neg = _mm_setzero_si128();
    pinsrw %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
    pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
    pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
    pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
    pinsrw %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
    pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
    pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
    pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
    pinsrw %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
    pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
    pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
    pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
    pinsrw %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
    pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
    pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
    pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
    pinsrw %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
    pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
    pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
    pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
    pinsrw %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
    pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
    pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
    pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
    pinsrw %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
    pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
    pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
    pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
    pinsrw %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
    pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
    pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
%if %1 != 32
    pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
%else
    pinsrw %37, ecx, 7  ; xmm_shadow[31] = 0;  (ecx is zero here; pads the unused last slot)
%endif
    pcmpgtw xmm4, %34  ; neg = _mm_cmpgt_epi16(neg, x1);
    pcmpgtw xmm5, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
    pcmpgtw xmm6, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
    pcmpgtw xmm7, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
    paddw %34, xmm4   ; x1 = _mm_add_epi16(x1, neg);
    paddw %35, xmm5   ; x1 = _mm_add_epi16(x1, neg);
    paddw %36, xmm6   ; x1 = _mm_add_epi16(x1, neg);
    paddw %37, xmm7   ; x1 = _mm_add_epi16(x1, neg);
    pxor %34, xmm4    ; x1 = _mm_xor_si128(x1, neg);
    pxor %35, xmm5    ; x1 = _mm_xor_si128(x1, neg);
    pxor %36, xmm6    ; x1 = _mm_xor_si128(x1, neg);
    pxor %37, xmm7    ; x1 = _mm_xor_si128(x1, neg);
    pxor xmm4, %34    ; neg = _mm_xor_si128(neg, x1);
    pxor xmm5, %35    ; neg = _mm_xor_si128(neg, x1);
    pxor xmm6, %36    ; neg = _mm_xor_si128(neg, x1);
    pxor xmm7, %37    ; neg = _mm_xor_si128(neg, x1);
    movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
    movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
    movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
    movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
    movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
    movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
    movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
    movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
%endmacro
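
; For reference, each 8-coefficient group above is roughly this C intrinsics
; sketch (t1 receives |x|; t2 receives x for x >= 0, else ~|x|, i.e. the
; value whose low nbits are emitted for a negative coefficient):
;
;   __m128i neg = _mm_setzero_si128();
;   __m128i x1 = /* eight block[jno*] words gathered via pinsrw */;
;   neg = _mm_cmpgt_epi16(neg, x1);  /* neg = (x1 < 0) ? -1 : 0      */
;   x1 = _mm_add_epi16(x1, neg);     /* x1 - 1 where negative        */
;   x1 = _mm_xor_si128(x1, neg);     /* x1 = abs(x1)                 */
;   neg = _mm_xor_si128(neg, x1);    /* neg = x1 >= 0 ? x1 : ~abs(x1) */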

;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET*)
; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
;                                   JCOEFPTR block, int last_dc_val,
;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
;

; eax + 8 = working_state *state
; eax + 12 = JOCTET *buffer
; eax + 16 = JCOEFPTR block
; eax + 20 = int last_dc_val
; eax + 24 = c_derived_tbl *dctbl
; eax + 28 = c_derived_tbl *actbl

%define pad             6*SIZEOF_DWORD  ; Align to 16 bytes
%define t1              pad
%define t2              t1+(DCTSIZE2*SIZEOF_WORD)
%define block           t2+(DCTSIZE2*SIZEOF_WORD)
%define actbl           block+SIZEOF_DWORD
%define buffer          actbl+SIZEOF_DWORD
%define temp            buffer+SIZEOF_DWORD
%define temp2           temp+SIZEOF_DWORD
%define temp3           temp2+SIZEOF_DWORD
%define temp4           temp3+SIZEOF_DWORD
%define temp5           temp4+SIZEOF_DWORD
%define gotptr          temp5+SIZEOF_DWORD  ; void *gotptr
%define put_buffer      ebx
%define put_bits        edi

        align   16
        global  EXTN(jsimd_huff_encode_one_block_sse2) PRIVATE

EXTN(jsimd_huff_encode_one_block_sse2):
        push    ebp
        mov     eax, esp                        ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp], eax
        mov     ebp, esp                        ; ebp = aligned ebp
        sub     esp, temp5+9*SIZEOF_DWORD-pad
        push    ebx
        push    ecx
;       push    edx             ; need not be preserved
        push    esi
        push    edi
        push    ebp

        mov esi, POINTER [eax+8]         ; (working_state *state)
        mov put_buffer, DWORD [esi+8]    ; put_buffer = state->cur.put_buffer;
        mov put_bits, DWORD [esi+12]     ; put_bits = state->cur.put_bits;
        push esi  ; esi is now scratch

        get_GOT edx                       ; get GOT address
        movpic POINTER [esp+gotptr], edx  ; save GOT address

        mov ecx, POINTER [eax+28]
        mov edx, POINTER [eax+16]
        mov esi, POINTER [eax+12]
        mov POINTER [esp+actbl],  ecx
        mov POINTER [esp+block],  edx
        mov POINTER [esp+buffer], esi

        ; Encode the DC coefficient difference per section F.1.2.1
        mov esi, POINTER [esp+block]  ; block
        movsx ecx, word [esi]  ; temp = temp2 = block[0] - last_dc_val;
        sub   ecx, DWORD [eax+20]
        mov   esi, ecx

        ; This is a well-known technique for obtaining the absolute value
        ; without a branch.  It is derived from an assembly language technique
        ; presented in "How to Optimize for the Pentium Processors",
        ; Copyright (c) 1996, 1997 by Agner Fog.
        mov edx, ecx
        sar edx, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
        xor ecx, edx  ; temp ^= temp3;
        sub ecx, edx  ; temp -= temp3;
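        ; (e.g., for temp = -5: temp3 = -1, (-5 ^ -1) = 4, 4 - (-1) = 5;
        ; for temp = 5: temp3 = 0, so temp is unchanged.  Below, the same
        ; example gives temp2 = -5 + (-1) = -6 = ~5.)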

        ; For a negative input, we want temp2 = the bitwise complement of
        ; abs(input).  This code assumes we are on a two's complement machine.
        add esi, edx  ; temp2 += temp3;
        mov DWORD [esp+temp], esi  ; backup temp2 in temp

        ; Find the number of bits needed for the magnitude of the coefficient
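        ; (jpeg_nbits_table, included above, maps each value 0..65535 to the
        ; number of bits needed to represent it)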
        movpic ebp, POINTER [esp+gotptr]  ; load GOT address (ebp)
        movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
        mov DWORD [esp+temp2], edx  ; backup nbits in temp2

        ; Emit the Huffman-coded symbol for the number of bits
        mov    ebp, POINTER [eax+24]  ; After this point, the arguments are no longer accessible
        mov    eax, INT [ebp + edx * 4]  ; code = dctbl->ehufco[nbits];
        movzx  ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
        EMIT_BITS eax  ; EMIT_BITS(code, size)

        mov ecx, DWORD [esp+temp2]  ; restore nbits

        ; Mask off any extra bits in code
        mov eax, 1
        shl eax, cl
        dec eax
        and eax, DWORD [esp+temp]  ; temp2 &= (((JLONG) 1)<<nbits) - 1;

        ; Emit that number of bits of the value, if positive,
        ; or the complement of its magnitude, if negative.
        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)

        ; Prepare data
        xor ecx, ecx
        mov esi, POINTER [esp+block]
        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
                       xmm0, xmm1, xmm2, xmm3
        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
                       xmm0, xmm1, xmm2, xmm3
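
        ; The jno* lists above are the zigzag scan order, offset by one
        ; because the DC coefficient (zigzag position 0) was handled
        ; separately, so t1[k]/t2[k] correspond to zigzag position k+1.
        ; The final slot is filled with zero (ecx) rather than a
        ; coefficient, padding t1/t2 out to 64 entries.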

        pxor xmm7, xmm7
        movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
        movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
        movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
        movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
        pmovmskb ecx, xmm2  ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
        shl ecx, 16
        or  edx, ecx
        not edx  ; index = ~index;
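        ; index now has one set bit for each nonzero coefficient in
        ; t1[0..31], so bsf below finds the length r of the next run of
        ; zeros in a single step.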

        lea esi, [esp+t1]
        mov ebp, POINTER [esp+actbl]  ; ebp = actbl

.BLOOP:
        bsf ecx, edx  ; r = __builtin_ctzl(index);
        jz .ELOOP
        lea esi, [esi+ecx*2]  ; k += r;
        shr edx, cl  ; index >>= r;
        mov DWORD [esp+temp3], edx
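        ; Emit a ZRL (0xF0) symbol for each full run of 16 zero coefficients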
.BRLOOP:
        cmp ecx, 16  ; while (r > 15) {
        jl .ERLOOP
        sub ecx, 16  ; r -= 16;
        mov DWORD [esp+temp], ecx
        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
        mov ecx, DWORD [esp+temp]
        jmp .BRLOOP
.ERLOOP:
        movsx eax, word [esi]  ; temp = t1[k];
        movpic edx, POINTER [esp+gotptr]  ; load GOT address (edx)
        movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
        mov DWORD [esp+temp2], eax
        ; Emit Huffman symbol for run length / number of bits
        shl ecx, 4  ; temp3 = (r << 4) + nbits;
        add ecx, eax
        mov   eax, INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
        EMIT_BITS eax

        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
        ; Mask off any extra bits in code
        mov ecx, DWORD [esp+temp2]
        mov eax, 1
        shl eax, cl
        dec eax
        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
        mov edx, DWORD [esp+temp3]
        add esi, 2  ; ++k;
        shr edx, 1  ; index >>= 1;

        jmp .BLOOP
.ELOOP:
        movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 32));
        movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 40));
        movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 48));
        movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 56));
        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
        pmovmskb ecx, xmm2  ; index |= ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
        shl ecx, 16
        or  edx, ecx
        not edx  ; index = ~index;

        lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
        sub eax, esi
        shr eax, 1
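        ; eax = 32 - k, the number of remaining lower-half positions (all
        ; zero, since the first bitmask was exhausted); adding it to r below
        ; extends the zero run across the half-block boundary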
        bsf ecx, edx  ; r = __builtin_ctzl(index);
        jz .ELOOP2
        shr edx, cl  ; index >>= r;
        add ecx, eax
        lea esi, [esi+ecx*2]  ; k += r;
        mov DWORD [esp+temp3], edx
        jmp .BRLOOP2
.BLOOP2:
        bsf ecx, edx  ; r = __builtin_ctzl(index);
        jz .ELOOP2
        lea esi, [esi+ecx*2]  ; k += r;
        shr edx, cl  ; index >>= r;
        mov DWORD [esp+temp3], edx
.BRLOOP2:
        cmp ecx, 16  ; while (r > 15) {
        jl .ERLOOP2
        sub ecx, 16  ; r -= 16;
        mov DWORD [esp+temp], ecx
        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
        mov ecx, DWORD [esp+temp]
        jmp .BRLOOP2
.ERLOOP2:
        movsx eax, word [esi]  ; temp = t1[k];
        bsr eax, eax  ; nbits = 32 - __builtin_clz(temp);
        inc eax
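        ; (bsr is safe here: the bitmask guarantees t1[k] != 0, and
        ; nbits = index of the highest set bit + 1)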
        mov DWORD [esp+temp2], eax
        ; Emit Huffman symbol for run length / number of bits
        shl ecx, 4  ; temp3 = (r << 4) + nbits;
        add ecx, eax
        mov   eax, INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
        EMIT_BITS eax

        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
        ; Mask off any extra bits in code
        mov ecx, DWORD [esp+temp2]
        mov eax, 1
        shl eax, cl
        dec eax
        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
        mov edx, DWORD [esp+temp3]
        add esi, 2  ; ++k;
        shr edx, 1  ; index >>= 1;

        jmp .BLOOP2
.ELOOP2:
        ; If the last coef(s) were zero, emit an end-of-block code
        lea edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
        cmp edx, esi  ; if (r > 0) {
        je .EFN
        mov   eax, INT [ebp]  ; code = actbl->ehufco[0];
        movzx ecx, byte [ebp + 1024]  ; size = actbl->ehufsi[0];
        EMIT_BITS eax
.EFN:
        mov eax, [esp+buffer]
        pop esi
        ; Save put_buffer & put_bits
        mov DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
        mov DWORD [esi+12], put_bits   ; state->cur.put_bits = put_bits;

        pop     ebp
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
        pop     ecx
        pop     ebx
        mov     esp, ebp        ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16