1;  vim:filetype=nasm ts=8
2
3;  libFLAC - Free Lossless Audio Codec library
4;  Copyright (C) 2001-2009  Josh Coalson
5;  Copyright (C) 2011-2013  Xiph.Org Foundation
6;
7;  Redistribution and use in source and binary forms, with or without
8;  modification, are permitted provided that the following conditions
9;  are met:
10;
11;  - Redistributions of source code must retain the above copyright
12;  notice, this list of conditions and the following disclaimer.
13;
14;  - Redistributions in binary form must reproduce the above copyright
15;  notice, this list of conditions and the following disclaimer in the
16;  documentation and/or other materials provided with the distribution.
17;
18;  - Neither the name of the Xiph.org Foundation nor the names of its
19;  contributors may be used to endorse or promote products derived from
20;  this software without specific prior written permission.
21;
22;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
26;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34%include "nasm.h"
35
36	data_section
37
38cextern FLAC__crc16_table		; unsigned FLAC__crc16_table[256];
39cextern bitreader_read_from_client_	; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);
40
41cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
42
43	code_section
44
45
46; **********************************************************************
47;
48; void FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
49;
50; Some details like assertions and other checking is performed by the caller.
51	ALIGN 16
52cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap
53
54	;ASSERT(0 != br);
55	;ASSERT(0 != br->buffer);
56	; WATCHOUT: code only works if sizeof(brword)==32; we can make things much faster with this assertion
57	;ASSERT(FLAC__BITS_PER_WORD == 32);
58	;ASSERT(parameter < 32);
59	; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it
60
61	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
62	;; [esp + 16]	unsigned parameter
63	;; [esp + 12]	unsigned nvals
64	;; [esp + 8]	int vals[]
65	;; [esp + 4]	FLAC__BitReader *br
66	mov	eax, [esp + 12]		; if(nvals == 0)
67	test	eax, eax
68	ja	.nvals_gt_0
69	mov	eax, 1			;   return true;
70	ret
71
72.nvals_gt_0:
73	push	ebp
74	push	ebx
75	push	esi
76	push	edi
77	sub	esp, 4
78	;; [esp + 36]	unsigned parameter
79	;; [esp + 32]	unsigned nvals
80	;; [esp + 28]	int vals[]
81	;; [esp + 24]	FLAC__BitReader *br
82	;; [esp]	ucbits
83	mov	ebp, [esp + 24]		; ebp <- br == br->buffer
84	mov	esi, [ebp + 16]		; esi <- br->consumed_words (aka 'cwords' in the C version)
85	mov	ecx, [ebp + 20]		; ecx <- br->consumed_bits  (aka 'cbits'  in the C version)
86	xor	edi, edi		; edi <- 0  'uval'
87	;; ecx		cbits
88	;; esi		cwords
89	;; edi		uval
90	;; ebp		br
91	;; [ebp]	br->buffer
92	;; [ebp + 8]	br->words
93	;; [ebp + 12]	br->bytes
94	;; [ebp + 16]	br->consumed_words
95	;; [ebp + 20]	br->consumed_bits
96	;; [ebp + 24]	br->read_crc
97	;; [ebp + 28]	br->crc16_align
98
99					; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
100	mov	eax, [ebp + 8]		;   eax <- br->words
101	sub	eax, esi		;   eax <- br->words-cwords
102	shl	eax, 2			;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
103	add	eax, [ebp + 12]		;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
104	shl	eax, 3			;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
105	sub	eax, ecx		;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
106	mov	[esp], eax		;   ucbits <- eax
107
108	ALIGN 16
109.val_loop:				; while(1) {
110
111	;
112	; read unary part
113	;
114.unary_loop:				;   while(1) {
115	;; ecx		cbits
116	;; esi		cwords
117	;; edi		uval
118	;; ebp		br
119	cmp	esi, [ebp + 8]		;     while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
120	jae	near .c1_next1
121.c1_loop:				;     {
122	mov	ebx, [ebp]
123	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
124	mov	edx, eax		;       edx = br->buffer[cwords] (saved for later use)
125	shl	eax, cl 		;       b = br->buffer[cwords] << cbits
126	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
127	jz	near .c1_next2		;       if(b) {
128	bsr	ebx, eax
129	not	ebx
130	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
131	add	ecx, ebx		;         cbits += i;
132	add	edi, ebx		;         uval += i;
133	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
134	test	ecx, ~31
135	jz	near .break1 		;         if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
136					;           crc16_update_word_(br, br->buffer[cwords]);
137	push	edi			;		[need more registers]
138	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
139	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
140	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
141%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
142	mov	edi, _FLAC__crc16_table
143%else
144%ifdef OBJ_FORMAT_elf
145	mov	edi, [esp + 16]		;		saved ebx (GOT base)
146	lea	edi, [edi + FLAC__crc16_table wrt ..gotoff]
147%else
148	mov	edi, FLAC__crc16_table
149%endif
150%endif
151	;; eax (ax)	crc a.k.a. br->read_crc
152	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
153	;; ecx		br->crc16_align
154	;; edx		byteswapped brword to CRC
155	;; esi		cwords
156	;; edi		unsigned FLAC__crc16_table[]
157	;; ebp		br
158	test	ecx, ecx		;		switch(br->crc16_align) ...
159	jnz	.c0b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
160.c0b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
161	movzx	ebx, dl
162	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
163	shl	eax, 8			;		ax <- (crc<<8)
164	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
165.c0b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
166	movzx	ebx, dh
167	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
168	shl	eax, 8			;		ax <- (crc<<8)
169	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
170	shr	edx, 16
171.c0b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
172	movzx	ebx, dl
173	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
174	shl	eax, 8			;		ax <- (crc<<8)
175	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
176.c0b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
177	movzx	ebx, dh
178	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
179	shl	eax, 8			;		ax <- (crc<<8)
180	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
181	movzx	eax, ax
182	mov	[ebp + 24], eax		;		br->read_crc <- crc
183	pop	edi
184
185	add	esi, byte 1		;           cwords++;
186	xor	ecx, ecx		;           cbits = 0;
187					;         }
188	jmp	near .break1		;         goto break1;
189	;; this section relocated out of the way for performance
190.c0b4:
191	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
192	cmp	ecx, 8
193	je	.c0b1
194	shr	edx, 16
195	cmp	ecx, 16
196	je	.c0b2
197	jmp	.c0b3
198
199	;; this section relocated out of the way for performance
200.c1b4:
201	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
202	cmp	ecx, 8
203	je	.c1b1
204	shr	edx, 16
205	cmp	ecx, 16
206	je	.c1b2
207	jmp	.c1b3
208
209.c1_next2:				;       } else {
210	;; ecx		cbits
211	;; edx		current brword 'b'
212	;; esi		cwords
213	;; edi		uval
214	;; ebp		br
215	add	edi, 32
216	sub	edi, ecx		;         uval += FLAC__BITS_PER_WORD - cbits;
217					;         crc16_update_word_(br, br->buffer[cwords]);
218	push	edi			;		[need more registers]
219	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
220	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
221	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
222%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
223	mov	edi, _FLAC__crc16_table
224%else
225%ifdef OBJ_FORMAT_elf
226	mov	edi, [esp + 16]		;		saved ebx (GOT base)
227	lea	edi, [edi + FLAC__crc16_table wrt ..gotoff]
228%else
229	mov	edi, FLAC__crc16_table
230%endif
231%endif
232	;; eax (ax)	crc a.k.a. br->read_crc
233	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
234	;; ecx		br->crc16_align
235	;; edx		byteswapped brword to CRC
236	;; esi		cwords
237	;; edi		unsigned FLAC__crc16_table[]
238	;; ebp		br
239	test	ecx, ecx		;		switch(br->crc16_align) ...
240	jnz	.c1b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
241.c1b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
242	movzx	ebx, dl
243	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
244	shl	eax, 8			;		ax <- (crc<<8)
245	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
246.c1b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
247	movzx	ebx, dh
248	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
249	shl	eax, 8			;		ax <- (crc<<8)
250	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
251	shr	edx, 16
252.c1b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
253	movzx	ebx, dl
254	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
255	shl	eax, 8			;		ax <- (crc<<8)
256	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
257.c1b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
258	movzx	ebx, dh
259	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
260	shl	eax, 8			;		ax <- (crc<<8)
261	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
262	movzx	eax, ax
263	mov	[ebp + 24], eax		;		br->read_crc <- crc
264	pop	edi
265
266	add	esi, byte 1		;         cwords++;
267	xor	ecx, ecx		;         cbits = 0;
268					;         /* didn't find stop bit yet, have to keep going... */
269					;       }
270
271	cmp	esi, [ebp + 8]		;     } while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
272	jb	near .c1_loop
273
274.c1_next1:
275	; at this point we've eaten up all the whole words; have to try
276	; reading through any tail bytes before calling the read callback.
277	; this is a repeat of the above logic adjusted for the fact we
278	; don't have a whole word.  note though if the client is feeding
279	; us data a byte at a time (unlikely), br->consumed_bits may not
280	; be zero.
281	;; ecx		cbits
282	;; esi		cwords
283	;; edi		uval
284	;; ebp		br
285	mov	edx, [ebp + 12]		;     edx <- br->bytes
286	shl	edx, 3			;     edx <- br->bytes*8
287	cmp	edx, ecx
288	jbe	.read1			;     if(br->bytes*8 > cbits) {  [NOTE: this case is rare so it doesn't have to be all that fast ]
289	mov	ebx, [ebp]
290					;       edx <- const unsigned end = br->bytes * 8;
291	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
292	xchg	edx, ecx		;       [edx <- cbits , ecx <- end]
293	mov	ebx, 0xffffffff		;       ebx <- FLAC__WORD_ALL_ONES
294	shr	ebx, cl			;       ebx <- FLAC__WORD_ALL_ONES >> end
295	not	ebx			;       ebx <- ~(FLAC__WORD_ALL_ONES >> end)
296	xchg	edx, ecx		;       [edx <- end , ecx <- cbits]
297	and	eax, ebx		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
298	shl	eax, cl 		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
299	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
300	jz	.c1_next3		;       if(b) {
301	bsr	ebx, eax
302	not	ebx
303	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
304	add	ecx, ebx		;         cbits += i;
305	add	edi, ebx		;         uval += i;
306	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
307	jmp	short .break1 		;         goto break1;
308.c1_next3:				;       } else {
309	sub	edi, ecx
310	add	edi, edx		;         uval += end - cbits;
311	mov	ecx, edx		;         cbits = end
312					;         /* didn't find stop bit yet, have to keep going... */
313					;       }
314					;     }
315.read1:
316	; flush registers and read; bitreader_read_from_client_() does
317	; not touch br->consumed_bits at all but we still need to set
318	; it in case it fails and we have to return false.
319	;; ecx		cbits
320	;; esi		cwords
321	;; edi		uval
322	;; ebp		br
323	mov	[ebp + 16], esi		;     br->consumed_words = cwords;
324	mov	[ebp + 20], ecx		;     br->consumed_bits = cbits;
325	push	ecx			;     /* save */
326	push	ebp			;     /* push br argument */
327%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
328	call	_bitreader_read_from_client_
329%else
330%ifdef OBJ_FORMAT_elf
331	mov	ebx, [esp + 20]		;		saved ebx (GOT base)
332	call	bitreader_read_from_client_ wrt ..plt
333%else
334	call	bitreader_read_from_client_
335%endif
336%endif
337	pop	edx			;     /* discard, unused */
338	pop	ecx			;     /* restore */
339	mov	esi, [ebp + 16]		;     cwords = br->consumed_words;
340					;     ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
341	mov	ebx, [ebp + 8]		;       ebx <- br->words
342	sub	ebx, esi		;       ebx <- br->words-cwords
343	shl	ebx, 2			;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
344	add	ebx, [ebp + 12]		;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
345	shl	ebx, 3			;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
346	sub	ebx, ecx		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
347	add	ebx, edi		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
348					;           + uval to offset our count by the # of unary bits already
349					;           consumed before the read, because we will add these back
350					;           in all at once at break1
351	mov	[esp], ebx		;       ucbits <- ebx
352	test	eax, eax		;     if(!bitreader_read_from_client_(br))
353	jnz	near .unary_loop
354	jmp	.end			;       return false; /* eax (the return value) is already 0 */
355					;   } /* end while(1) unary part */
356
357	ALIGN 16
358.break1:
359	;; ecx		cbits
360	;; esi		cwords
361	;; edi		uval
362	;; ebp		br
363	;; [esp]	ucbits
364	sub	[esp], edi		;   ucbits -= uval;
365	sub	dword [esp], byte 1	;   ucbits--; /* account for stop bit */
366
367	;
368	; read binary part
369	;
370	mov	ebx, [esp + 36]		;   ebx <- parameter
371	test	ebx, ebx		;   if(parameter) {
372	jz	near .break2
373.read2:
374	cmp	[esp], ebx		;     while(ucbits < parameter) {
375	jae	.c2_next1
376	; flush registers and read; bitreader_read_from_client_() does
377	; not touch br->consumed_bits at all but we still need to set
378	; it in case it fails and we have to return false.
379	mov	[ebp + 16], esi		;       br->consumed_words = cwords;
380	mov	[ebp + 20], ecx		;       br->consumed_bits = cbits;
381	push	ecx			;       /* save */
382	push	ebx			;       /* save */
383	push	ebp			;       /* push br argument */
384%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
385	call	_bitreader_read_from_client_
386%else
387%ifdef OBJ_FORMAT_elf
388	mov	ebx, [esp + 24]		;		saved ebx (GOT base)
389	call	bitreader_read_from_client_ wrt ..plt
390%else
391	call	bitreader_read_from_client_
392%endif
393%endif
394	pop	edx			;       /* discard, unused */
395	pop	ebx			;       /* restore */
396	pop	ecx			;       /* restore */
397	mov	esi, [ebp + 16]		;       cwords = br->consumed_words;
398					;       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
399	mov	edx, [ebp + 8]		;         edx <- br->words
400	sub	edx, esi		;         edx <- br->words-cwords
401	shl	edx, 2			;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
402	add	edx, [ebp + 12]		;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
403	shl	edx, 3			;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
404	sub	edx, ecx		;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
405	mov	[esp], edx		;         ucbits <- edx
406	test	eax, eax		;       if(!bitreader_read_from_client_(br))
407	jnz	.read2
408	jmp	.end			;         return false; /* eax (the return value) is already 0 */
409					;     }
410.c2_next1:
411	;; ebx		parameter
412	;; ecx		cbits
413	;; esi		cwords
414	;; edi		uval
415	;; ebp		br
416	;; [esp]	ucbits
417	cmp	esi, [ebp + 8]		;     if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
418	jae	near .c2_next2
419	test	ecx, ecx		;       if(cbits) {
420	jz	near .c2_next3		;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
421	mov	eax, 32
422	mov	edx, [ebp]
423	sub	eax, ecx		;         const unsigned n = FLAC__BITS_PER_WORD - cbits;
424	mov	edx, [edx + 4*esi]	;         const brword word = br->buffer[cwords];
425	cmp	ebx, eax		;         if(parameter < n) {
426	jae	.c2_next4
427					;           uval <<= parameter;
428					;           uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
429	shl	edx, cl
430	xchg	ebx, ecx
431	shld	edi, edx, cl
432	add	ebx, ecx		;           cbits += parameter;
433	xchg	ebx, ecx		;           ebx <- parameter, ecx <- cbits
434	jmp	.break2			;           goto break2;
435					;         }
436.c2_next4:
437					;         uval <<= n;
438					;         uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
439%if 1
440	rol	edx, cl			;            @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
441					;            @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
442%else
443	shl	edx, cl
444%endif
445	xchg	eax, ecx
446	shld	edi, edx, cl
447	xchg	eax, ecx
448%if 1
449	ror	edx, cl			;            restored.
450%else
451	mov	edx, [ebp]
452	mov	edx, [edx + 4*esi]
453%endif
454					;         crc16_update_word_(br, br->buffer[cwords]);
455	push	edi			;		[need more registers]
456	push	ebx			;		[need more registers]
457	push	eax			;		[need more registers]
458	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
459	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
460	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
461%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
462	mov	edi, _FLAC__crc16_table
463%else
464%ifdef OBJ_FORMAT_elf
465	mov	edi, [esp + 24]		;		saved ebx (GOT base)
466	lea	edi, [edi + FLAC__crc16_table wrt ..gotoff]
467%else
468	mov	edi, FLAC__crc16_table
469%endif
470%endif
471	;; eax (ax)	crc a.k.a. br->read_crc
472	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
473	;; ecx		br->crc16_align
474	;; edx		byteswapped brword to CRC
475	;; esi		cwords
476	;; edi		unsigned FLAC__crc16_table[]
477	;; ebp		br
478	test	ecx, ecx		;		switch(br->crc16_align) ...
479	jnz	.c2b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
480.c2b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
481	movzx	ebx, dl
482	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
483	shl	eax, 8			;		ax <- (crc<<8)
484	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
485.c2b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
486	movzx	ebx, dh
487	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
488	shl	eax, 8			;		ax <- (crc<<8)
489	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
490	shr	edx, 16
491.c2b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
492	movzx	ebx, dl
493	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
494	shl	eax, 8			;		ax <- (crc<<8)
495	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
496.c2b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
497	movzx	ebx, dh
498	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
499	shl	eax, 8			;		ax <- (crc<<8)
500	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
501	movzx	eax, ax
502	mov	[ebp + 24], eax		;		br->read_crc <- crc
503	pop	eax
504	pop	ebx
505	pop	edi
506	add	esi, byte 1		;         cwords++;
507	mov	ecx, ebx
508	sub	ecx, eax		;         cbits = parameter - n;
509	jz	.break2			;         if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
510					;           uval <<= cbits;
511					;           uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
512	mov	eax, [ebp]
513	mov	eax, [eax + 4*esi]
514	shld	edi, eax, cl
515					;         }
516	jmp	.break2			;         goto break2;
517
518	;; this section relocated out of the way for performance
519.c2b4:
520	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
521	cmp	ecx, 8
522	je	.c2b1
523	shr	edx, 16
524	cmp	ecx, 16
525	je	.c2b2
526	jmp	.c2b3
527
528.c2_next3:				;       } else {
529	mov	ecx, ebx		;         cbits = parameter;
530					;         uval <<= cbits;
531					;         uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
532	mov	eax, [ebp]
533	mov	eax, [eax + 4*esi]
534	shld	edi, eax, cl
535	jmp	.break2			;         goto break2;
536					;       }
537.c2_next2:				;     } else {
538	; in this case we're starting our read at a partial tail word;
539	; the reader has guaranteed that we have at least 'parameter'
540	; bits available to read, which makes this case simpler.
541					;       uval <<= parameter;
542					;       if(cbits) {
543					;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
544					;         uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
545					;         cbits += parameter;
546					;         goto break2;
547					;       } else {
548					;         cbits = parameter;
549					;         uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
550					;         goto break2;
551					;       }
552					;       the above is much shorter in assembly:
553	mov	eax, [ebp]
554	mov	eax, [eax + 4*esi]	;       eax <- br->buffer[cwords]
555	shl	eax, cl			;       eax <- br->buffer[cwords] << cbits
556	add	ecx, ebx		;       cbits += parameter
557	xchg	ebx, ecx		;       ebx <- cbits, ecx <- parameter
558	shld	edi, eax, cl		;       uval <<= parameter <<< 'parameter' bits of tail word
559	xchg	ebx, ecx		;       ebx <- parameter, ecx <- cbits
560					;     }
561					;   }
562.break2:
563	sub	[esp], ebx		;   ucbits -= parameter;
564
565	;
566	; compose the value
567	;
568	mov	ebx, [esp + 28]		;   ebx <- vals
569	mov	edx, edi		;   edx <- uval
570	and	edi, 1			;   edi <- uval & 1
571	shr	edx, 1			;   edx <- uval >> 1
572	neg	edi			;   edi <- -(int)(uval & 1)
573	xor	edx, edi		;   edx <- (uval >> 1 ^ -(int)(uval & 1))
574	mov	[ebx], edx		;   *vals <- edx
575	sub	dword [esp + 32], byte 1	;   --nvals;
576	jz	.finished		;   if(nvals == 0) /* jump to finish */
577	xor	edi, edi		;   uval = 0;
578	add	dword [esp + 28], 4	;   ++vals
579	jmp	.val_loop		; }
580
581.finished:
582	mov	[ebp + 16], esi		; br->consumed_words = cwords;
583	mov	[ebp + 20], ecx		; br->consumed_bits = cbits;
584	mov	eax, 1
585.end:
586	add	esp, 4
587	pop	edi
588	pop	esi
589	pop	ebx
590	pop	ebp
591	ret
592
593; end
594