/*
 *  encode_x86.S
 *
 *     Copyright (C) Peter Schlaile - February 2001
 *
 *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 *  codec.
 *
 *  libdv is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser Public License as published by
 *  the Free Software Foundation; either version 2.1, or (at your
 *  option) any later version.
 *
 *  libdv is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser Public License
 *  along with libdv; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  The libdv homepage is http://libdv.sourceforge.net/.
 */

.data
ALLONE:		.word 1,1,1,1
VLCADDMASK:	.byte 255,0,0,0,255,0,0,0
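/* ALLONE is used with pmaddwd to horizontally sum four 16-bit words.
   VLCADDMASK keeps only the low byte of each 32-bit VLC table entry,
   which appears to hold the code length in bits. */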


.section .note.GNU-stack, "", @progbits

.text

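/* _dv_vlc_encode_block_mmx
 *
 * Arguments (cdecl, inferred from the stack offsets below):
 *   4(%esp):  pointer to 64 16-bit coefficients; the DC word at offset 0
 *             is skipped.
 *   8(%esp):  pointer to the output pointer; one 8-byte lookup entry is
 *             stored per non-zero coefficient (a run of zeros is folded
 *             into the entry of the coefficient that follows it), and the
 *             advanced pointer is written back on exit.
 *
 * Entries come from vlc_encode_lookup, indexed by (run << 9) | (amp + 255).
 * The value returned in %eax is the accumulated length byte of every entry
 * written, i.e. presumably the total number of bits produced.
 */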
.global _dv_vlc_encode_block_mmx
.hidden _dv_vlc_encode_block_mmx
.type   _dv_vlc_encode_block_mmx,@function
_dv_vlc_encode_block_mmx:
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	xorl	%eax, %eax
	xorl	%edx, %edx
	movl	4+4*4(%esp), %edi                # src
	movl	4+4*4+4(%esp), %edx              # &dst
	movl	(%edx), %edx
	addl	$2, %edi

	movl	$63, %ecx

	movl	vlc_encode_lookup, %esi

	pxor	%mm0, %mm0
	pxor	%mm2, %mm2
	movq	VLCADDMASK, %mm1
	xorl	%ebp, %ebp
	subl	$8, %edx
vlc_encode_block_mmx_loop:
	pand	%mm1, %mm0
	movw	(%edi), %ax
	addl	$8, %edx
	paddd	%mm0, %mm2
	cmpw	$0, %ax
	jz	vlc_encode_block_amp_zero
	addw	$255, %ax
	addl	$2, %edi
	movq	(%esi, %eax, 8), %mm0
	movq	%mm0, (%edx)
	decl	%ecx
	jnz	vlc_encode_block_mmx_loop
	pand	%mm1, %mm0
	paddd	%mm0, %mm2
	jmp     vlc_encode_block_out

vlc_encode_block_amp_zero:
        movl    %ecx, %ebp
        incl    %ecx
        repz    scasw
        jecxz   vlc_encode_block_out
        movw    -2(%edi), %ax
        subl    %ecx, %ebp
	addw	$255, %ax
        shll    $9, %ebp
	orl	%ebp, %eax

	movq	(%esi, %eax, 8), %mm0
	movq	%mm0, (%edx)

	decl	%ecx
	jnz	vlc_encode_block_mmx_loop

	pand	%mm1, %mm0
	paddd	%mm0, %mm2

vlc_encode_block_out:
	movq	%mm2, %mm0
	psrlq	$32, %mm0
	paddd	%mm0, %mm2

	movl	4+4*4+4(%esp), %ebx
	movl	%edx, (%ebx)

	movd	%mm2, %eax

	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

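/* _dv_vlc_num_bits_block_x86
 *
 * Argument (cdecl, inferred from the stack offset below):
 *   4(%esp):  pointer to 64 16-bit coefficients; the DC word at offset 0
 *             is skipped.
 *
 * Same traversal as _dv_vlc_encode_block_mmx, but only sums the length
 * bytes from vlc_num_bits_lookup.  Returns in %eax the number of bits the
 * block would need when VLC encoded.
 */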
.global _dv_vlc_num_bits_block_x86
.hidden _dv_vlc_num_bits_block_x86
.type   _dv_vlc_num_bits_block_x86,@function
_dv_vlc_num_bits_block_x86:
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	xorl	%eax, %eax
	xorl	%edx, %edx
	xorl	%ebx, %ebx
	xorl	%ebp, %ebp

	movl	4+4*4(%esp), %edi                # src
	addl	$2, %edi

	movl	$63, %ecx
	movl	vlc_num_bits_lookup, %esi

vlc_num_bits_block_x86_loop:
	movw	(%edi), %ax
	addl	%ebx, %edx
	cmpw	$0, %ax
	jz	vlc_num_bits_block_amp_zero
	addw	$255, %ax
	addl	$2, %edi
	movb	(%esi, %eax), %bl

	decl	%ecx
	jnz	vlc_num_bits_block_x86_loop
	addl	%ebx, %edx
	jmp	vlc_num_bits_block_out

vlc_num_bits_block_amp_zero:
        movl    %ecx, %ebp
        incl    %ecx
        repz    scasw
        jecxz   vlc_num_bits_block_out

        subl    %ecx, %ebp
        movw    -2(%edi), %ax
        shll    $9, %ebp
	addw	$255, %ax
	orl	%ebp, %eax
	movb	(%esi, %eax), %bl

	decl	%ecx
	jnz	vlc_num_bits_block_x86_loop
	addl	%ebx, %edx

vlc_num_bits_block_out:
	movl	%edx, %eax

	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

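/* _dv_vlc_encode_block_pass_1_x86
 *
 * Arguments (cdecl, matching the inline comments below):
 *   4(%esp):  start      -- pointer to the current VLC entry pointer
 *   8(%esp):  end        -- end of the VLC entry list
 *  12(%esp):  bit_budget -- pointer to the remaining bit budget
 *  16(%esp):  bit_offset -- pointer to the current bit offset in vsbuffer
 *  20(%esp):  vsbuffer   -- byte buffer the codes are packed into
 *
 * Packs 32-bit VLC entries (low byte = code length, upper 24 bits = code
 * value) into vsbuffer, most significant bit first, until either the list
 * is exhausted or the bit budget runs out.  *start, *bit_budget and
 * *bit_offset are updated on exit.
 */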
.global _dv_vlc_encode_block_pass_1_x86
.hidden _dv_vlc_encode_block_pass_1_x86
.type   _dv_vlc_encode_block_pass_1_x86,@function
_dv_vlc_encode_block_pass_1_x86:
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	subl	$4, %esp

	movl	1*4+5*4(%esp), %esi                # start
	movl	(%esi), %esi
	movl	2*4+5*4(%esp), %edi		   # end
	movl	3*4+5*4(%esp), %eax                # bit_budget
	movl	(%eax), %eax
	movl	%eax, (%esp)
	movl	4*4+5*4(%esp), %ebp		   # bit_offset
	movl	(%ebp), %ebp
	/*      5*4+5*4(%esp)                      # vsbuffer */
	xorl	%ecx, %ecx
	xorl	%edx, %edx

vlc_encode_block_pass_1_x86_loop:
	lodsl
	movb	%al, %cl

	subl	%ecx, (%esp)             # bit_budget -= len
	jl	vlc_encode_block_pass1_x86_out

	movl	%ebp, %ebx               # bit_offset
	negl	%ecx                     # -len

	andl	$7, %ebx                 # bit_offset & 7
	addl	$32, %ecx                # 32-len

	movb	%al, %dl                 # len
	subl	%ebx, %ecx               # 32-len-(bit_offset & 7)

	shrl	$8, %eax                 # value
	movl	%ebp, %ebx               # bit_offset

	shll	%cl, %eax                # value <<= 32-len-(bit_offset & 7)
	shrl	$3, %ebx                 # bit_offset >> 3

	bswap	%eax
	addl	5*4+5*4(%esp), %ebx      # vsbuffer + bit_offset >> 3

	addl	%edx, %ebp               # bit_offset += len
	orl	%eax, (%ebx)             # store value

	cmpl	%esi, %edi
	jnz	vlc_encode_block_pass_1_x86_loop

	xorl	%ecx, %ecx
	addl	$4, %esi

vlc_encode_block_pass1_x86_out:
	subl	$4, %esi
	addl	(%esp), %ecx            # bit_budget

	movl	1*4+5*4(%esp), %eax     # start
	movl	%esi, (%eax)

	movl	3*4+5*4(%esp), %eax     # bit_budget
	movl	%ecx, (%eax)

	movl	4*4+5*4(%esp), %eax     # bit_offset
	movl	%ebp, (%eax)

	addl	$4, %esp

	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

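/* _dv_classify_mmx
 *
 * Arguments (cdecl, via the frame pointer below):
 *   8(%ebp):  pointer to 64 16-bit coefficients
 *  12(%ebp):  pointer to four words of amp_ofs (added to each coefficient)
 *  16(%ebp):  pointer to four words of amp_cmp (the compare threshold)
 *
 * Performs a word-wise signed compare of (coefficient + amp_ofs) against
 * amp_cmp over the whole block and returns a nonzero value in %eax if any
 * coefficient exceeds the threshold; the caller presumably uses this for
 * class number selection.
 */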
.global _dv_classify_mmx
.hidden _dv_classify_mmx
.type   _dv_classify_mmx,@function
_dv_classify_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi

	movl	12(%ebp), %esi
	movq	(%esi), %mm7            # amp_ofs
	movl	16(%ebp), %esi
	movq	(%esi), %mm6            # amp_cmp

	movl    8(%ebp), %esi          # source

	movq	%mm7, %mm5
	movq	%mm6, %mm4

	pxor	%mm3, %mm3
	pxor	%mm2, %mm2

	movq	0*8(%esi), %mm0
	movq	1*8(%esi), %mm1

	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	2*8(%esi), %mm0
	movq	3*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	4*8(%esi), %mm0
	movq	5*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	6*8(%esi), %mm0
	movq	7*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	8*8(%esi), %mm0
	movq	9*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	10*8(%esi), %mm0
	movq	11*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	12*8(%esi), %mm0
	movq	13*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	movq	14*8(%esi), %mm0
	movq	15*8(%esi), %mm1
	paddw	%mm7, %mm0
	paddw	%mm5, %mm1
	pcmpgtw	%mm6, %mm0
	pcmpgtw %mm4, %mm1
	paddw	%mm0, %mm3
	paddw	%mm1, %mm2

	paddw	%mm2, %mm3
	packsswb %mm3, %mm3

	movd	%mm3, %eax

	pop	%esi
	pop	%ebp
	ret

/* FIXME: _dv_reorder_block_mmx isn't really that much faster than the
   C version... don't know why... */

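/* _dv_reorder_block_mmx
 *
 * Arguments (cdecl, via the frame pointer below):
 *   8(%ebp):  pointer to 64 16-bit coefficients, permuted in place
 *  12(%ebp):  reorder table of 64 16-bit destination byte offsets
 *
 * Scatters the coefficients into a 128-byte scratch area on the stack
 * according to the reorder table (presumably the zigzag ordering), then
 * copies the scratch area back over the source block with MMX moves.
 */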
.global _dv_reorder_block_mmx
.hidden _dv_reorder_block_mmx
.type   _dv_reorder_block_mmx,@function
_dv_reorder_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi
	pushl	%ebx
	pushl	%ecx
	pushl	%edx

	movl	8(%ebp), %esi      # source
	movl	12(%ebp), %edi     # reorder_table

	xorl	%ebp, %ebp
	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl	%ecx, %ecx
	xorl	%edx, %edx

	subl	$128, %esp

reorder_loop:

	movw	 (%esi, %ebp), %ax
	movw	2(%esi, %ebp), %bx

	movw	 (%edi, %ebp), %cx
	movw	2(%edi, %ebp), %dx

	movw	 %ax, (%esp,%ecx)
	movw	 %bx, (%esp,%edx)

	movw	4(%esi, %ebp), %ax
	movw	6(%esi, %ebp), %bx

	movw	4(%edi, %ebp), %cx
	movw	6(%edi, %ebp), %dx

	movw	 %ax, (%esp,%ecx)
	movw	 %bx, (%esp,%edx)

	movw	 8(%esi, %ebp), %ax
	movw	10(%esi, %ebp), %bx

	movw	8(%edi, %ebp), %cx
	movw	10(%edi, %ebp), %dx

	movw	 %ax, (%esp,%ecx)
	movw	 %bx, (%esp,%edx)

	movw	12(%esi, %ebp), %ax
	movw	14(%esi, %ebp), %bx

	movw	12(%edi, %ebp), %cx
	movw	14(%edi, %ebp), %dx

	movw	 %ax, (%esp,%ecx)
	movw	 %bx, (%esp,%edx)

	addl	$16, %ebp

	cmpl	$128, %ebp
	jne	reorder_loop

	movq	(%esp)  , %mm0
	movq	8(%esp) , %mm1
	movq	16(%esp), %mm2
	movq	24(%esp), %mm3

	movq	%mm0, (%esi)
	movq	%mm1, 8(%esi)
	movq	%mm2, 16(%esi)
	movq	%mm3, 24(%esi)

	movq	32(%esp)   , %mm0
	movq	32+8(%esp) , %mm1
	movq	32+16(%esp), %mm2
	movq	32+24(%esp), %mm3

	movq	%mm0, 32(%esi)
	movq	%mm1, 32+8(%esi)
	movq	%mm2, 32+16(%esi)
	movq	%mm3, 32+24(%esi)

	movq	64(%esp)   , %mm0
	movq	64+8(%esp) , %mm1
	movq	64+16(%esp), %mm2
	movq	64+24(%esp), %mm3

	movq	%mm0, 64(%esi)
	movq	%mm1, 64+8(%esi)
	movq	%mm2, 64+16(%esi)
	movq	%mm3, 64+24(%esi)

	movq	96(%esp)   , %mm0
	movq	96+8(%esp) , %mm1
	movq	96+16(%esp), %mm2
	movq	96+24(%esp), %mm3

	addl	$128, %esp

	movq	%mm0, 96(%esi)
	movq	%mm1, 96+8(%esi)
	movq	%mm2, 96+16(%esi)
	movq	%mm3, 96+24(%esi)

	popl	%edx
	popl	%ecx
	popl	%ebx
	popl	%edi
	popl	%esi
	popl	%ebp
	ret

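/* _dv_need_dct_248_mmx_rows
 *
 * Argument (cdecl, via the frame pointer below):
 *   8(%ebp):  pointer to an 8x8 block of 16-bit coefficients
 *
 * Returns in %eax the sum of |row[i] - row[i+1]| over all adjacent row
 * pairs, a measure of vertical activity that the caller presumably uses
 * to decide between the 8-8 and 2-4-8 DCT modes.
 */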
.global _dv_need_dct_248_mmx_rows
.hidden _dv_need_dct_248_mmx_rows
.type   _dv_need_dct_248_mmx_rows,@function
_dv_need_dct_248_mmx_rows:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi

	movl	8(%ebp), %esi      # source

	movq	(0*8+0)*2(%esi), %mm0
	movq	(0*8+4)*2(%esi), %mm1
	psubw	(1*8+0)*2(%esi), %mm0
	psubw	(1*8+4)*2(%esi), %mm1
	movq	%mm0, %mm2
	movq	%mm1, %mm3
	psraw	$15, %mm2
	psraw	$15, %mm3
	pxor	%mm2, %mm0
	pxor	%mm3, %mm1
	psubw	%mm2, %mm0
	psubw	%mm3, %mm1

	movq	(1*8+0)*2(%esi), %mm4
	movq	(1*8+4)*2(%esi), %mm5
	psubw	(2*8+0)*2(%esi), %mm4
	psubw	(2*8+4)*2(%esi), %mm5
	movq	%mm4, %mm6
	movq	%mm5, %mm7
	psraw	$15, %mm6
	psraw	$15, %mm7
	pxor	%mm6, %mm4
	pxor	%mm7, %mm5
	psubw	%mm6, %mm4
	psubw	%mm7, %mm5

	paddw	%mm4, %mm0
	paddw	%mm5, %mm1

	movq	(2*8+0)*2(%esi), %mm4
	movq	(2*8+4)*2(%esi), %mm5
	psubw	(3*8+0)*2(%esi), %mm4
	psubw	(3*8+4)*2(%esi), %mm5
	movq	%mm4, %mm6
	movq	%mm5, %mm7
	psraw	$15, %mm6
	psraw	$15, %mm7
	pxor	%mm6, %mm4
	pxor	%mm7, %mm5
	psubw	%mm6, %mm4
	psubw	%mm7, %mm5

	paddw	%mm4, %mm0
	paddw	%mm5, %mm1

	movq	(3*8+0)*2(%esi), %mm4
	movq	(3*8+4)*2(%esi), %mm5
	psubw	(4*8+0)*2(%esi), %mm4
	psubw	(4*8+4)*2(%esi), %mm5
	movq	%mm4, %mm6
	movq	%mm5, %mm7
	psraw	$15, %mm6
	psraw	$15, %mm7
	pxor	%mm6, %mm4
	pxor	%mm7, %mm5
	psubw	%mm6, %mm4
	psubw	%mm7, %mm5

	paddw	%mm4, %mm0
	paddw	%mm5, %mm1

	movq	(4*8+0)*2(%esi), %mm4
	movq	(4*8+4)*2(%esi), %mm5
	psubw	(5*8+0)*2(%esi), %mm4
	psubw	(5*8+4)*2(%esi), %mm5
	movq	%mm4, %mm6
	movq	%mm5, %mm7
	psraw	$15, %mm6
	psraw	$15, %mm7
	pxor	%mm6, %mm4
	pxor	%mm7, %mm5
	psubw	%mm6, %mm4
	psubw	%mm7, %mm5

	paddw	%mm4, %mm0
	paddw	%mm5, %mm1

	movq	(5*8+0)*2(%esi), %mm4
	movq	(5*8+4)*2(%esi), %mm5
	psubw	(6*8+0)*2(%esi), %mm4
	psubw	(6*8+4)*2(%esi), %mm5
	movq	%mm4, %mm6
	movq	%mm5, %mm7
	psraw	$15, %mm6
	psraw	$15, %mm7
	pxor	%mm6, %mm4
	pxor	%mm7, %mm5
	psubw	%mm6, %mm4
	psubw	%mm7, %mm5

	paddw	%mm4, %mm0
	paddw	%mm5, %mm1

	movq	(6*8+0)*2(%esi), %mm4
	movq	(6*8+4)*2(%esi), %mm5
	psubw	(7*8+0)*2(%esi), %mm4
	psubw	(7*8+4)*2(%esi), %mm5
	movq	%mm4, %mm6
	movq	%mm5, %mm7
	psraw	$15, %mm6
	psraw	$15, %mm7
	pxor	%mm6, %mm4
	pxor	%mm7, %mm5
	psubw	%mm6, %mm4
	psubw	%mm7, %mm5

	paddw	%mm4, %mm0
	paddw	%mm5, %mm1

	paddw	%mm1, %mm0

	pmaddwd	ALLONE, %mm0
	movq	%mm0, %mm1
	psrlq	$32, %mm1
	paddd	%mm1, %mm0

	movd	%mm0, %eax

	popl	%edi
	popl	%esi
	popl	%ebp

	ret