;  vim:filetype=nasm ts=8

;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; NOTE(review): data_section, code_section, cextern, cglobal and cident are
; not NASM built-ins; presumably they are macros supplied by nasm.h -- confirm.
%include "nasm.h"

	data_section

; Symbols defined elsewhere that the decoder below references.
cextern FLAC__crc16_table		; unsigned FLAC__crc16_table[256];
cextern bitreader_read_from_client_	; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);

; The single routine exported by this file.
cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	code_section


; **********************************************************************
;
; FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
;
; Reads 'nvals' Rice-coded signed values from the bit reader into vals[].
; Each value is a unary part (a run of 0 bits terminated by a 1 stop bit)
; followed by a 'parameter'-bit binary part; the combined unsigned 'uval'
; is stored as (uval >> 1) ^ -(int)(uval & 1).
;
; ABI:     32-bit x86, all arguments on the stack, caller pops them.
; In:      [esp + 4]  FLAC__BitReader *br
;          [esp + 8]  int vals[]
;          [esp + 12] unsigned nvals
;          [esp + 16] unsigned parameter (must be < 32)
; Out:     eax = 1 on success (including nvals == 0),
;          eax = 0 if bitreader_read_from_client_() returned 0
; Saved:   ebp, ebx, esi, edi (pushed/popped here)
; Clobber: eax, ecx, edx, flags
; Updates: br->consumed_words, br->consumed_bits, br->read_crc,
;          br->crc16_align; vals[0 .. nvals-1]
;
; Every time a whole buffer word has been consumed its four bytes are
; folded into br->read_crc through FLAC__crc16_table (the inline
; crc16_update_word_ sequences below).
;
; Some details like assertions and other checking are performed by the caller.
	ALIGN 16
cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	;ASSERT(0 != br);
	;ASSERT(0 != br->buffer);
	; WATCHOUT: code only works if sizeof(brword)==32; we can make things much faster with this assertion
	;ASSERT(FLAC__BITS_PER_WORD == 32);
	;ASSERT(parameter < 32);
	; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it

	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
	;; [esp + 16]	unsigned parameter
	;; [esp + 12]	unsigned nvals
	;; [esp + 8]	int vals[]
	;; [esp + 4]	FLAC__BitReader *br
	mov	eax, [esp + 12]		; if(nvals == 0)
	test	eax, eax
	jnz	.nvals_gt_0
	mov	eax, 1			;   return true;
	ret

.nvals_gt_0:
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, 4			; one dword of local storage for 'ucbits'
	;; [esp + 36]	unsigned parameter
	;; [esp + 32]	unsigned nvals
	;; [esp + 28]	int vals[]
	;; [esp + 24]	FLAC__BitReader *br
	;; [esp]	ucbits
	mov	ebp, [esp + 24]		; ebp <- br  ([ebp] is br->buffer, see the field map below)
	mov	esi, [ebp + 16]		; esi <- br->consumed_words (aka 'cwords' in the C version)
	mov	ecx, [ebp + 20]		; ecx <- br->consumed_bits  (aka 'cbits'  in the C version)
	xor	edi, edi		; edi <- 0  'uval'
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [ebp]	br->buffer
	;; [ebp + 8]	br->words
	;; [ebp + 12]	br->bytes
	;; [ebp + 16]	br->consumed_words
	;; [ebp + 20]	br->consumed_bits
	;; [ebp + 24]	br->read_crc
	;; [ebp + 28]	br->crc16_align

					; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	eax, [ebp + 8]		;   eax <- br->words
	sub	eax, esi		;   eax <- br->words-cwords
	shl	eax, 2			;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	eax, [ebp + 12]		;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	eax, 3			;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	eax, ecx		;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], eax		;   ucbits <- eax

	ALIGN 16
.val_loop:				; while(1) {

	;
	; read unary part
	;
.unary_loop:				;   while(1) {
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	cmp	esi, [ebp + 8]		;     while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jae	near .c1_next1
.c1_loop:				;     {
	mov	ebx, [ebp]
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	mov	edx, eax		;       edx = br->buffer[cwords] (saved for later use)
	shl	eax, cl 		;       b = br->buffer[cwords] << cbits
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	near .c1_next2		;       if(b) {
	bsr	ebx, eax		;         ebx <- index of the highest set bit (the stop bit)
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax), i.e. 31 - bsr
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	test	ecx, ~31		;         cbits < 32: word not exhausted yet, unary part is done
	jz	near .break1 		;         if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
					;           crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c0b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c0b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c0b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16			;		bring the two remaining bytes down into dl/dh
.c0b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c0b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the low 16 bits of the running CRC
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi

	add	esi, byte 1		;           cwords++;
	xor	ecx, ecx		;           cbits = 0;
					;         }
	jmp	near .break1		;         goto break1;
	;; this section relocated out of the way for performance
	;; (br->crc16_align != 0: skip the bytes of this word that must not be CRC'd)
.c0b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c0b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c0b2
	jmp	.c0b3

	;; this section relocated out of the way for performance
	;; (same alignment dispatch, for the CRC sequence in .c1_next2)
.c1b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c1b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c1b2
	jmp	.c1b3

.c1_next2:				;       } else {
	;; ecx		cbits
	;; edx		current brword 'b'
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	add	edi, 32
	sub	edi, ecx		;         uval += FLAC__BITS_PER_WORD - cbits;
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c1b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c1b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c1b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16			;		bring the two remaining bytes down into dl/dh
.c1b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c1b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the low 16 bits of the running CRC
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi

	add	esi, byte 1		;         cwords++;
	xor	ecx, ecx		;         cbits = 0;
					;         /* didn't find stop bit yet, have to keep going... */
					;       }

	cmp	esi, [ebp + 8]		;     } while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jb	near .c1_loop

.c1_next1:
	; at this point we've eaten up all the whole words; have to try
	; reading through any tail bytes before calling the read callback.
	; this is a repeat of the above logic adjusted for the fact we
	; don't have a whole word.  note though if the client is feeding
	; us data a byte at a time (unlikely), br->consumed_bits may not
	; be zero.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	edx, [ebp + 12]		;     edx <- br->bytes
	test	edx, edx
	jz	.read1			;     if(br->bytes) {  [NOTE: this case is rare so it doesn't have to be all that fast ]
	mov	ebx, [ebp]
	shl	edx, 3			;       edx <- const unsigned end = br->bytes * 8;
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	xchg	edx, ecx		;       [edx <- cbits , ecx <- end]  (variable shifts need the count in cl)
	mov	ebx, 0xffffffff		;       ebx <- FLAC__WORD_ALL_ONES
	shr	ebx, cl			;       ebx <- FLAC__WORD_ALL_ONES >> end
	not	ebx			;       ebx <- ~(FLAC__WORD_ALL_ONES >> end)
	xchg	edx, ecx		;       [edx <- end , ecx <- cbits]
	and	eax, ebx		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
	shl	eax, cl 		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	.c1_next3		;       if(b) {
	bsr	ebx, eax
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	jmp	short .break1 		;         goto break1;
.c1_next3:				;       } else {
	sub	edi, ecx
	add	edi, edx		;         uval += end - cbits;
	mov	ecx, edx		;         cbits = end;
					;         /* bits cbits..end-1 were just counted into uval, so the
					;            consumed position is now exactly 'end'; adding 'end'
					;            instead would overshoot whenever cbits was nonzero */
					;         /* didn't find stop bit yet, have to keep going... */
					;       }
					;     }
.read1:
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	[ebp + 16], esi		;     br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;     br->consumed_bits = cbits;
	push	ecx			;     /* save */
	push	ebp			;     /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;     /* discard, unused */
	pop	ecx			;     /* restore */
	mov	esi, [ebp + 16]		;     cwords = br->consumed_words;
					;     ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	ebx, [ebp + 8]		;       ebx <- br->words
	sub	ebx, esi		;       ebx <- br->words-cwords
	shl	ebx, 2			;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	ebx, [ebp + 12]		;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	ebx, 3			;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	ebx, ecx		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	add	ebx, edi		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
					;           + uval to offset our count by the # of unary bits already
					;           consumed before the read, because we will add these back
					;           in all at once at break1
	mov	[esp], ebx		;       ucbits <- ebx
	test	eax, eax		;     if(!bitreader_read_from_client_(br))
	jnz	near .unary_loop
	jmp	.end			;       return false; /* eax (the return value) is already 0 */
					;   } /* end while(1) unary part */

	ALIGN 16
.break1:
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	sub	[esp], edi		;   ucbits -= uval;
	sub	dword [esp], byte 1	;   ucbits--; /* account for stop bit */

	;
	; read binary part
	;
	mov	ebx, [esp + 36]		;   ebx <- parameter
	test	ebx, ebx		;   if(parameter) {
	jz	near .break2
.read2:
	cmp	[esp], ebx		;     while(ucbits < parameter) {
	jae	.c2_next1
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	mov	[ebp + 16], esi		;       br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;       br->consumed_bits = cbits;
	push	ecx			;       /* save */
	push	ebp			;       /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;       /* discard, unused */
	pop	ecx			;       /* restore */
	mov	esi, [ebp + 16]		;       cwords = br->consumed_words;
					;       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	edx, [ebp + 8]		;         edx <- br->words
	sub	edx, esi		;         edx <- br->words-cwords
	shl	edx, 2			;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	edx, [ebp + 12]		;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	edx, 3			;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	edx, ecx		;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], edx		;         ucbits <- edx
	test	eax, eax		;       if(!bitreader_read_from_client_(br))
	jnz	.read2
	jmp	.end			;         return false; /* eax (the return value) is already 0 */
					;     }
.c2_next1:
	;; ebx		parameter
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	cmp	esi, [ebp + 8]		;     if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
	jae	near .c2_next2
	test	ecx, ecx		;       if(cbits) {
	jz	near .c2_next3		;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
	mov	eax, 32
	mov	edx, [ebp]
	sub	eax, ecx		;         const unsigned n = FLAC__BITS_PER_WORD - cbits;
	mov	edx, [edx + 4*esi]	;         const brword word = br->buffer[cwords];
	cmp	ebx, eax		;         if(parameter < n) {
	jae	.c2_next4
					;           uval <<= parameter;
					;           uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
	shl	edx, cl			;           drop the already-consumed bits off the top of 'word'
	xchg	ebx, ecx		;           [ebx <- cbits, ecx <- parameter]
	shld	edi, edx, cl		;           shift the next 'parameter' bits of 'word' into uval
	add	ebx, ecx		;           cbits += parameter;
	xchg	ebx, ecx		;           ebx <- parameter, ecx <- cbits
	jmp	.break2			;           goto break2;
					;         }
.c2_next4:
					;         uval <<= n;
					;         uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
%if 1
	rol	edx, cl			;            @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
					;            @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
%else
	shl	edx, cl
%endif
	xchg	eax, ecx		;         [eax <- cbits, ecx <- n]
	shld	edi, edx, cl		;         shift the remaining n bits of 'word' into uval
	xchg	eax, ecx		;         [eax <- n, ecx <- cbits]
%if 1
	ror	edx, cl			;            restored.
%else
	mov	edx, [ebp]
	mov	edx, [edx + 4*esi]
%endif
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	push	ebx			;		[need more registers]
	push	eax			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c2b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c2b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c2b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16			;		bring the two remaining bytes down into dl/dh
.c2b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c2b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the low 16 bits of the running CRC
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	eax			;		eax <- n
	pop	ebx			;		ebx <- parameter
	pop	edi			;		edi <- uval
	add	esi, byte 1		;         cwords++;
	mov	ecx, ebx
	sub	ecx, eax		;         cbits = parameter - n;
	jz	.break2			;         if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
					;           uval <<= cbits;
					;           uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
					;         }
	jmp	.break2			;         goto break2;

	;; this section relocated out of the way for performance
	;; (br->crc16_align != 0: skip the bytes of this word that must not be CRC'd)
.c2b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c2b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c2b2
	jmp	.c2b3

.c2_next3:				;       } else {
	mov	ecx, ebx		;         cbits = parameter;
					;         uval <<= cbits;
					;         uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
	jmp	.break2			;         goto break2;
					;       }
.c2_next2:				;     } else {
	; in this case we're starting our read at a partial tail word;
	; the reader has guaranteed that we have at least 'parameter'
	; bits available to read, which makes this case simpler.
					;       uval <<= parameter;
					;       if(cbits) {
					;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
					;         uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
					;         cbits += parameter;
					;         goto break2;
					;       } else {
					;         cbits = parameter;
					;         uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
					;         goto break2;
					;       }
					;       the above is much shorter in assembly:
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]	;       eax <- br->buffer[cwords]
	shl	eax, cl			;       eax <- br->buffer[cwords] << cbits
	add	ecx, ebx		;       cbits += parameter
	xchg	ebx, ecx		;       ebx <- cbits, ecx <- parameter
	shld	edi, eax, cl		;       uval <- (uval << parameter) | top 'parameter' bits of eax
	xchg	ebx, ecx		;       ebx <- parameter, ecx <- cbits
					;     }
					;   }
.break2:
	sub	[esp], ebx		;   ucbits -= parameter;

	;
	; compose the value
	;
	mov	ebx, [esp + 28]		;   ebx <- vals
	mov	edx, edi		;   edx <- uval
	and	edi, 1			;   edi <- uval & 1
	shr	edx, 1			;   edx <- uval >> 1
	neg	edi			;   edi <- -(int)(uval & 1)
	xor	edx, edi		;   edx <- (uval >> 1 ^ -(int)(uval & 1))
	mov	[ebx], edx		;   *vals <- edx
	sub	dword [esp + 32], byte 1	;   --nvals;
	jz	.finished		;   if(nvals == 0) /* jump to finish */
	xor	edi, edi		;   uval = 0;
	add	dword [esp + 28], 4	;   ++vals
	jmp	.val_loop		; }

.finished:
	mov	[ebp + 16], esi		; br->consumed_words = cwords;
	mov	[ebp + 20], ecx		; br->consumed_bits = cbits;
	mov	eax, 1			; return true;
.end:
	add	esp, 4			; release the 'ucbits' local
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; NOTE(review): 'end' is not a NASM directive; presumably it is a macro from
; nasm.h or an intentional bare end-of-code label -- confirm before removing.
end

; On ELF targets emit an empty .note.GNU-stack section (conventionally used
; to tell the linker this object does not need an executable stack).
%ifdef OBJ_FORMAT_elf
	section .note.GNU-stack noalloc
%endif
569