/*
 *  rgbtoyuv.S
 *
 *     Copyright (C) Peter Schlaile - February 2001
 *
 *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 *  codec.
 *
 *  libdv is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser Public License as published by
 *  the Free Software Foundation; either version 2.1, or (at your
 *  option) any later version.
 *
 *  libdv is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser Public License
 *  along with libdv; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  The libdv homepage is http://libdv.sourceforge.net/.
 */


# The loop processes interleaved RGB values for 8 pixels.
# The notation in the comments that describe the data places
# the first byte on the right. For example, in a register containing
# G2R2B1G1R1B0G0R0, R0 is in the position of the least significant
# byte and G2 is in the position of the most significant byte.
# The output goes to separate Y, U, and V buffers. Input samples are
# bytes, output samples are words.
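#
# For orientation, a rough scalar sketch of the per-pixel arithmetic
# (hypothetical C, not part of libdv; the coefficients and shifts come
# from the tables and macros defined below):
#
#   y = ((8414*r + 16519*g + 3208*b) >> (CONSTSHIFT - PRECISION))
#       + ((16 - 128) << PRECISION);
#
# U and V are formed the same way from the UR0GR/UBG0B and VR0GR/VBG0B
# tables, and each pair of horizontally adjacent chroma results is then
# averaged before it is stored.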

#define CONSTSHIFT 15
#define PRECISION  1
#define FIXPSHIFT  CONSTSHIFT-PRECISION
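
# The coefficient tables below are stored as value * 2^CONSTSHIFT, so
# shifting the pmaddwd sums right by FIXPSHIFT = CONSTSHIFT - PRECISION
# leaves PRECISION extra bit(s) of precision in the 16-bit results.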

#define	DV_WIDTH_SHORT      720*2
#define	DV_WIDTH_BYTE       720
#define DV_WIDTH_SHORT_HALF 720
#define DV_WIDTH_BYTE_HALF  360
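
# Line strides for a 720-pixel-wide frame: DV_WIDTH_SHORT is the byte
# stride of a row of 16-bit samples, DV_WIDTH_BYTE of a row of bytes;
# the *_HALF constants are half those strides.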

.global _dv_rgbtoycb_mmx
# .global yuvtoycb_mmx

.data

.align 8
ZEROSX: .word   0,0,0,0
ZEROS:  .long   0,0

ALLONE:	.word	1,1,1,1

OFFSETDX:       .word   0,64,0,64       #offset used before shift
OFFSETD:        .long   0,0
OFFSETWX:       .word   128,0,128,0     #offset used before pack 32
OFFSETW:        .long   0,0
OFFSETBX:       .word   128,128,128,128
OFFSETB:        .long   0,0
OFFSETY:	.word	(16-128) << PRECISION
		.word   (16-128) << PRECISION
		.word   (16-128) << PRECISION
		.word   (16-128) << PRECISION

TEMP0:          .long   0,0
TEMPY:  .long   0,0
TEMPU:          .long   0,0
TEMPV:  .long   0,0

#if 0 /* Original YUV */
YR0GRX: .word   9798,19235,0,9798
YBG0BX: .word   3736,0,19235,3736
YR0GR:  .long   0,0
YBG0B:  .long   0,0
UR0GRX: .word   -4784,-9437,0,-4784
UBG0BX: .word   14221,0,-9437,14221
UR0GR:  .long   0,0
UBG0B:  .long   0,0
VR0GRX: .word   20218,-16941,0,20218
VBG0BX: .word   -3277,0,-16941,-3277
VR0GR:  .long   0,0
VBG0B:  .long   0,0

YR0GRX: .word   8420,16529,0,8420
YBG0BX: .word   3203,0,16529,3203
YR0GR:  .long   0,0
YBG0B:  .long   0,0
UR0GRX: .word   14391,-12055,0,14391
UBG0BX: .word   -2336,0,-12055,-2336
UR0GR:  .long   0,0
UBG0B:  .long   0,0
VR0GRX: .word   -4857,-9534,0,-4857
VBG0BX: .word   14391,0,-9534,14391
VR0GR:  .long   0,0
VBG0B:  .long   0,0

#else
YR0GRX: .word   8414,16519,0,8414
YBG0BX: .word   3208,0,16519,3208
YR0GR:  .long   0,0
YBG0B:  .long   0,0
UR0GRX: .word   14392,-12061,0,14392
UBG0BX: .word   -2332,0,-12061,-2332
UR0GR:  .long   0,0
UBG0B:  .long   0,0
VR0GRX: .word   -4864,-9528,0,-4864
VBG0BX: .word   14392,0,-9528,14392
VR0GR:  .long   0,0
VBG0B:  .long   0,0

#endif
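
# The active set corresponds to ITU-R BT.601-style weights scaled by
# 2^CONSTSHIFT; for example 16519 is approximately 0.504 * 32768, the
# green contribution to studio-range luma.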

.section .note.GNU-stack, "", @progbits

.text

#define _inPtr     8
#define _rows      12
#define _columns   16
#define _outyPtr   20
#define _outuPtr   24
#define _outvPtr   28
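
# The offsets above describe the cdecl stack frame; an illustrative (not
# authoritative) prototype would be:
#   void _dv_rgbtoycb_mmx(unsigned char *inPtr, int rows, int columns,
#                         short *outyPtr, short *outuPtr, short *outvPtr);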

.global _dv_rgbtoycb_mmx
.hidden _dv_rgbtoycb_mmx
.type   _dv_rgbtoycb_mmx,@function
_dv_rgbtoycb_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %eax
	pushl   %ebx
	pushl   %ecx
	pushl   %edx
	pushl   %esi
	pushl   %edi

	leal    ZEROSX, %eax    #This section gets around a bug
	movq    (%eax), %mm0    #unlikely to persist
	movq    %mm0, ZEROS
	leal    OFFSETDX, %eax
	movq    (%eax), %mm0
	movq    %mm0, OFFSETD
	leal    OFFSETWX, %eax
	movq    (%eax), %mm0
	movq    %mm0, OFFSETW
	leal    OFFSETBX, %eax
	movq    (%eax), %mm0
	movq    %mm0, OFFSETB
	leal    YR0GRX, %eax
	movq    (%eax), %mm0
	movq    %mm0, YR0GR
	leal    YBG0BX, %eax
	movq    (%eax), %mm0
	movq    %mm0, YBG0B
	leal    UR0GRX, %eax
	movq    (%eax), %mm0
	movq    %mm0, UR0GR
	leal    UBG0BX, %eax
	movq    (%eax), %mm0
	movq    %mm0, UBG0B
	leal    VR0GRX, %eax
	movq    (%eax), %mm0
	movq    %mm0, VR0GR
	leal    VBG0BX, %eax
	movq    (%eax), %mm0
	movq    %mm0, VBG0B

	movl    _rows(%ebp), %eax
	movl    _columns(%ebp), %ebx
	mull    %ebx            #number pixels
	shrl    $3, %eax        #number of loops
	movl    %eax, %edi      #loop counter in edi
	movl    _inPtr(%ebp), %eax
	movl    _outyPtr(%ebp), %ebx
	movl    _outuPtr(%ebp), %ecx
	movl    _outvPtr(%ebp), %edx
rgbtoycb_mmx_loop:
	movq    (%eax), %mm1    #load G2R2B1G1R1B0G0R0
	pxor    %mm6, %mm6      #0 -> mm6

	movq    %mm1, %mm0      #G2R2B1G1R1B0G0R0 -> mm0
	psrlq   $16, %mm1       #00G2R2B1G1R1B0-> mm1

	punpcklbw %mm6, %mm0     #R1B0G0R0 -> mm0
	movq    %mm1, %mm7      #00G2R2B1G1R1B0-> mm7

	punpcklbw %mm6, %mm1     #B1G1R1B0 -> mm1
	movq    %mm0, %mm2      #R1B0G0R0 -> mm2

	pmaddwd YR0GR, %mm0     #yrR1,ygG0+yrR0 -> mm0
	movq    %mm1, %mm3      #B1G1R1B0 -> mm3

	pmaddwd YBG0B, %mm1     #ybB1+ygG1,ybB0 -> mm1
	movq    %mm2, %mm4      #R1B0G0R0 -> mm4

	pmaddwd UR0GR, %mm2     #urR1,ugG0+urR0 -> mm2
	movq    %mm3, %mm5      #B1G1R1B0 -> mm5

	pmaddwd UBG0B, %mm3     #ubB1+ugG1,ubB0 -> mm3
	punpckhbw       %mm6, %mm7 #    00G2R2 -> mm7

	pmaddwd VR0GR, %mm4     #vrR1,vgG0+vrR0 -> mm4
	paddd   %mm1, %mm0      #Y1Y0 -> mm0

	pmaddwd VBG0B, %mm5     #vbB1+vgG1,vbB0 -> mm5

	movq    8(%eax), %mm1   #R5B4G4R4B3G3R3B2 -> mm1
	paddd   %mm3, %mm2      #U1U0 -> mm2

	movq    %mm1, %mm6      #R5B4G4R4B3G3R3B2 -> mm6

	punpcklbw       ZEROS, %mm1     #B3G3R3B2 -> mm1
	paddd   %mm5, %mm4      #V1V0 -> mm4

	movq    %mm1, %mm5      #B3G3R3B2 -> mm5
	psllq   $32, %mm1       #R3B200 -> mm1

	paddd   %mm7, %mm1      #R3B200+00G2R2=R3B2G2R2->mm1

	punpckhbw       ZEROS, %mm6     #R5B4G4R4 -> mm6
	movq    %mm1, %mm3      #R3B2G2R2 -> mm3

	pmaddwd YR0GR, %mm1     #yrR3,ygG2+yrR2 -> mm1
	movq    %mm5, %mm7      #B3G3R3B2 -> mm7

	pmaddwd YBG0B, %mm5     #ybB3+ygG3,ybB2 -> mm5
	psrad   $FIXPSHIFT, %mm0       #32-bit scaled Y1Y0 -> mm0

	movq    %mm6, TEMP0     #R5B4G4R4 -> TEMP0
	movq    %mm3, %mm6      #R3B2G2R2 -> mm6
	pmaddwd UR0GR, %mm6     #urR3,ugG2+urR2 -> mm6
	psrad   $FIXPSHIFT, %mm2       #32-bit scaled U1U0 -> mm2

	paddd   %mm5, %mm1      #Y3Y2 -> mm1
	movq    %mm7, %mm5      #B3G3R3B2 -> mm5
	pmaddwd UBG0B, %mm7     #ubB3+ugG3,ubB2
	psrad   $FIXPSHIFT, %mm1 #32-bit scaled Y3Y2 -> mm1

	pmaddwd VR0GR, %mm3     #vrR3,vgG2+vrR2
	packssdw        %mm1, %mm0      #Y3Y2Y1Y0 -> mm0

	pmaddwd VBG0B, %mm5     #vbB3+vgG3,vbB2 -> mm5
	psrad   $FIXPSHIFT, %mm4       #32-bit scaled V1V0 -> mm4

	movq    16(%eax), %mm1  #B7G7R7B6G6R6B5G5 -> mm1
	paddd   %mm7, %mm6      #U3U2 -> mm6

	movq    %mm1, %mm7      #B7G7R7B6G6R6B5G5 -> mm7
	psrad   $FIXPSHIFT, %mm6       #32-bit scaled U3U2 -> mm6

	paddd   %mm5, %mm3      #V3V2 -> mm3
	psllq   $16, %mm7       #R7B6G6R6B5G500 -> mm7

	movq    %mm7, %mm5      #R7B6G6R6B5G500 -> mm5
	psrad   $FIXPSHIFT, %mm3       #32-bit scaled V3V2 -> mm3

	paddw	OFFSETY, %mm0
	movq    %mm0, (%ebx)     #store Y3Y2Y1Y0
	packssdw %mm6, %mm2      #32-bit scaled U3U2U1U0 -> mm2

	movq    TEMP0, %mm0     #R5B4G4R4 -> mm0
	addl	$8, %ebx

	punpcklbw       ZEROS, %mm7     #B5G500 -> mm7
	movq    %mm0, %mm6      #R5B4G4R4 -> mm6

	movq    %mm2, TEMPU     #32-bit scaled U3U2U1U0 -> TEMPU
	psrlq   $32, %mm0       #00R5B4 -> mm0

	paddw   %mm0, %mm7      #B5G5R5B4 -> mm7
	movq    %mm6, %mm2      #R5B4G4R4 -> mm2

	pmaddwd YR0GR, %mm2     #yrR5,ygG4+yrR4 -> mm2
	movq    %mm7, %mm0      #B5G5R5B4 -> mm0

	pmaddwd YBG0B, %mm7     #ybB5+ygG5,ybB4 -> mm7
	packssdw        %mm3, %mm4      #32-bit scaled V3V2V1V0 -> mm4

	addl    $24, %eax       #increment RGB count

	movq    %mm4, TEMPV     #32-bit scaled V3V2V1V0 -> TEMPV
	movq    %mm6, %mm4      #R5B4G4R4 -> mm4

	pmaddwd UR0GR, %mm6     #urR5,ugG4+urR4
	movq    %mm0, %mm3      #B5G5R5B4 -> mm3

	pmaddwd UBG0B, %mm0     #ubB5+ugG5,ubB4
	paddd   %mm7, %mm2      #Y5Y4 -> mm2

	pmaddwd         VR0GR, %mm4     #vrR5,vgG4+vrR4 -> mm4
	pxor    %mm7, %mm7      #0 -> mm7

	pmaddwd VBG0B, %mm3     #vbB5+vgG5,vbB4 -> mm3
	punpckhbw       %mm7, %mm1      #B7G7R7B6 -> mm1

	paddd   %mm6, %mm0      #U5U4 -> mm0
	movq    %mm1, %mm6      #B7G7R7B6 -> mm6

	pmaddwd YBG0B, %mm6     #ybB7+ygG7,ybB6 -> mm6
	punpckhbw       %mm7, %mm5      #R7B6G6R6 -> mm5

	movq    %mm5, %mm7      #R7B6G6R6 -> mm7
	paddd   %mm4, %mm3      #V5V4 -> mm3

	pmaddwd YR0GR, %mm5     #yrR7,ygG6+yrR6 -> mm5
	movq    %mm1, %mm4      #B7G7R7B6 -> mm4

	pmaddwd UBG0B, %mm4     #ubB7+ugG7,ubB6 -> mm4
	psrad   $FIXPSHIFT, %mm0       #32-bit scaled U5U4 -> mm0

	psrad   $FIXPSHIFT, %mm2       #32-bit scaled Y5Y4 -> mm2

	paddd   %mm5, %mm6      #Y7Y6 -> mm6
	movq    %mm7, %mm5      #R7B6G6R6 -> mm5

	pmaddwd UR0GR, %mm7     #urR7,ugG6+urR6 -> mm7
	psrad   $FIXPSHIFT, %mm3       #32-bit scaled V5V4 -> mm3

	pmaddwd VBG0B, %mm1     #vbB7+vgG7,vbB6 -> mm1
	psrad   $FIXPSHIFT, %mm6 #32-bit scaled Y7Y6 -> mm6

	packssdw %mm6, %mm2     #Y7Y6Y5Y4 -> mm2

	pmaddwd VR0GR, %mm5     #vrR7,vgG6+vrR6 -> mm5
	paddd   %mm4, %mm7      #U7U6 -> mm7

	psrad   $FIXPSHIFT, %mm7       #32-bit scaled U7U6 -> mm7
	paddw	OFFSETY, %mm2
	movq	%mm2, (%ebx)    #store Y7Y6Y5Y4

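	# ALLONE is the word vector 1,1,1,1: pmaddwd against it sums each
	# pair of adjacent 16-bit chroma samples, and the psraw $1 below
	# halves the sums, so horizontally adjacent U (and V) samples are
	# averaged before being stored.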
	movq	ALLONE, %mm6
	packssdw %mm7, %mm0     #32-bit scaled U7U6U5U4 -> mm0

	movq    TEMPU, %mm4     #32-bit scaled U3U2U1U0 -> mm4
	pmaddwd	%mm6, %mm0      #U7U6U5U4 averaged -> (U7U6)(U5U4)=UU3 UU2->mm0

	pmaddwd	%mm6, %mm4      #U3U2U1U0 averaged -> (U3U2)(U1U0)=UU1 UU0->mm4

	paddd   %mm5, %mm1      #V7V6 -> mm1
	packssdw %mm0, %mm4     #UU3UU2UU1UU0 -> mm4

	psrad   $FIXPSHIFT, %mm1       #32-bit scaled V7V6 -> mm1
	psraw	$1, %mm4 	#divide UU3 UU2 UU1 UU0 by 2 -> mm4

	movq    TEMPV, %mm5     #32-bit scaled V3V2V1V0 -> mm5

	movq	%mm4, (%ecx)    # store U

	pmaddwd %mm6, %mm5	#V3V2V1V0 averaged -> VV1 VV0 -> mm5
	packssdw %mm1, %mm3     #V7V6V5V4 -> mm3

	pmaddwd %mm6, %mm3	#V7V6V5V4 averaged -> VV3 VV2 -> mm3

	packssdw %mm3, %mm5     # VV3 VV2 VV1 VV0 -> mm5
	psraw	$1, %mm5

	addl    $8, %ebx        #increment Y count
	addl    $8, %ecx        #increment U count

	movq    %mm5, (%edx)    #store V

	addl    $8, %edx        #increment V count

	decl    %edi            #decrement loop counter
	jnz     rgbtoycb_mmx_loop  #do 24 more bytes if not 0

	popl    %edi
	popl    %esi
	popl    %edx
	popl    %ecx
	popl    %ebx
	popl    %eax
	popl    %ebp

	ret

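# _dv_ppm_copy_y_block_mmx copies an 8x8 block of 16-bit samples from a
# source buffer with a row stride of DV_WIDTH_SHORT bytes into 64
# contiguous words at dest (arguments: dest, src).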
.global _dv_ppm_copy_y_block_mmx
.hidden _dv_ppm_copy_y_block_mmx
.type   _dv_ppm_copy_y_block_mmx,@function
_dv_ppm_copy_y_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	(%esi), %mm0
	movq	8(%esi), %mm1
	movq	%mm0, 0*8(%edi)
	movq	%mm1, 1*8(%edi)
	movq	DV_WIDTH_SHORT(%esi), %mm2
	movq	DV_WIDTH_SHORT+8(%esi), %mm3
	movq	%mm2, 2*8(%edi)
	movq	%mm3, 3*8(%edi)
	movq	DV_WIDTH_SHORT*2(%esi), %mm4
	movq	DV_WIDTH_SHORT*2+8(%esi), %mm5
	movq	%mm4, 4*8(%edi)
	movq	%mm5, 5*8(%edi)
	movq	DV_WIDTH_SHORT*3(%esi), %mm6
	movq	DV_WIDTH_SHORT*3+8(%esi), %mm7
	movq	%mm6, 6*8(%edi)
	movq	%mm7, 7*8(%edi)

	movq	DV_WIDTH_SHORT*4(%esi), %mm0
	movq	DV_WIDTH_SHORT*4+8(%esi), %mm1
	movq	%mm0, 8*8(%edi)
	movq	%mm1, 9*8(%edi)
	movq	DV_WIDTH_SHORT*5(%esi), %mm2
	movq	DV_WIDTH_SHORT*5+8(%esi), %mm3
	movq	%mm2, 10*8(%edi)
	movq	%mm3, 11*8(%edi)
	movq	DV_WIDTH_SHORT*6(%esi), %mm4
	movq	DV_WIDTH_SHORT*6+8(%esi), %mm5
	movq	%mm4, 12*8(%edi)
	movq	%mm5, 13*8(%edi)
	movq	DV_WIDTH_SHORT*7(%esi), %mm6
	movq	DV_WIDTH_SHORT*7+8(%esi), %mm7
	movq	%mm6, 14*8(%edi)
	movq	%mm7, 15*8(%edi)

	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_pgm_copy_y_block_mmx
.hidden _dv_pgm_copy_y_block_mmx
.type   _dv_pgm_copy_y_block_mmx,@function
_dv_pgm_copy_y_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	OFFSETY, %mm7
	pxor	%mm6, %mm6

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif
	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif
	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_video_copy_y_block_mmx
.hidden _dv_video_copy_y_block_mmx
.type   _dv_video_copy_y_block_mmx,@function
_dv_video_copy_y_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	OFFSETBX, %mm7
	pxor	%mm6, %mm6

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	pop	%edi
	pop	%esi
	pop	%ebp
	ret


.global _dv_ppm_copy_pal_c_block_mmx
.hidden _dv_ppm_copy_pal_c_block_mmx
.type   _dv_ppm_copy_pal_c_block_mmx,@function
_dv_ppm_copy_pal_c_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi
	pushl	%ebx

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_SHORT, %esi
	addl	$16, %edi

	movq	          (%esi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%esi), %mm1
	movq               8(%esi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%esi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	pop	%ebx
	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_pgm_copy_pal_c_block_mmx
.hidden _dv_pgm_copy_pal_c_block_mmx
.type   _dv_pgm_copy_pal_c_block_mmx,@function
_dv_pgm_copy_pal_c_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi
	pushl	%ebx

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src


	movq	OFFSETBX, %mm7
	pxor	%mm6, %mm6


	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	addl	$2*DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq	DV_WIDTH_BYTE(%esi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%edi)
	movq	%mm2, 8(%edi)
	movq	%mm1, 16(%edi)
	movq	%mm3, 24(%edi)

	pop	%ebx
	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_video_copy_pal_c_block_mmx
.hidden _dv_video_copy_pal_c_block_mmx
.type   _dv_video_copy_pal_c_block_mmx,@function
_dv_video_copy_pal_c_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi
	pushl	%ebx

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	OFFSETBX, %mm7
	paddw	%mm7, %mm7
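	# OFFSETBX holds 128 in each word; it is doubled here because two
	# samples are summed below before the offset is subtracted.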
	pxor	%mm6, %mm6

	movl	$4, %ebx

video_copy_pal_c_block_mmx_loop:

	movq	(%esi), %mm0
	movq    DV_WIDTH_BYTE_HALF(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm2
	paddw	%mm1, %mm3

	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm2
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm2
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm2, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_BYTE, %esi
	addl	$16, %edi

	movq	(%esi), %mm0
	movq    DV_WIDTH_BYTE_HALF(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm2
	paddw	%mm1, %mm3

	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm2
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm2
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm2, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_BYTE, %esi
	addl	$16, %edi

	decl	%ebx
	jnz	video_copy_pal_c_block_mmx_loop

	pop	%ebx
	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_ppm_copy_ntsc_c_block_mmx
.hidden _dv_ppm_copy_ntsc_c_block_mmx
.type   _dv_ppm_copy_ntsc_c_block_mmx,@function
_dv_ppm_copy_ntsc_c_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi
	pushl	%ebx

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movl	$4, %ebx

	movq	ALLONE, %mm6

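# As in the conversion loop above, pmaddwd against ALLONE sums each pair
# of adjacent 16-bit samples and the psraw $1 halves the sums, so chroma
# is averaged pairwise while the block is copied.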
ppm_copy_ntsc_c_block_mmx_loop:

	movq	(%esi), %mm0
	movq    8(%esi), %mm1
	movq	16(%esi), %mm2
	movq	24(%esi), %mm3

	pmaddwd %mm6, %mm0
	pmaddwd %mm6, %mm1

	pmaddwd %mm6, %mm2
	pmaddwd %mm6, %mm3

	packssdw %mm1, %mm0
	packssdw %mm3, %mm2

	psraw	$1, %mm0
	psraw	$1, %mm2

	movq	%mm0, 0*8(%edi)
	movq	%mm2, 1*8(%edi)

	addl	$DV_WIDTH_SHORT_HALF, %esi
	addl	$16, %edi

	movq	(%esi), %mm0
	movq    8(%esi), %mm1
	movq	16(%esi), %mm2
	movq	24(%esi), %mm3

	pmaddwd %mm6, %mm0
	pmaddwd %mm6, %mm1

	pmaddwd %mm6, %mm2
	pmaddwd %mm6, %mm3

	packssdw %mm1, %mm0
	packssdw %mm3, %mm2

	psraw	$1, %mm0
	psraw	$1, %mm2

	movq	%mm0, 0*8(%edi)
	movq	%mm2, 1*8(%edi)

	addl	$DV_WIDTH_SHORT_HALF, %esi
	addl	$16, %edi

	decl	%ebx
	jnz	ppm_copy_ntsc_c_block_mmx_loop

	pop	%ebx
	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_pgm_copy_ntsc_c_block_mmx
.hidden _dv_pgm_copy_ntsc_c_block_mmx
.type   _dv_pgm_copy_ntsc_c_block_mmx,@function
_dv_pgm_copy_ntsc_c_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	OFFSETBX, %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	movq	(%esi), %mm0
	movq    8(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)
	movq	%mm1, 2*8(%edi)
	movq	%mm3, 3*8(%edi)

	addl	$DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq    8(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)
	movq	%mm1, 2*8(%edi)
	movq	%mm3, 3*8(%edi)

	addl	$DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq    8(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)
	movq	%mm1, 2*8(%edi)
	movq	%mm3, 3*8(%edi)

	addl	$DV_WIDTH_BYTE, %esi
	addl	$32, %edi

	movq	(%esi), %mm0
	movq    8(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)
	movq	%mm1, 2*8(%edi)
	movq	%mm3, 3*8(%edi)

	pop	%edi
	pop	%esi
	pop	%ebp
	ret

.global _dv_video_copy_ntsc_c_block_mmx
.hidden _dv_video_copy_ntsc_c_block_mmx
.type   _dv_video_copy_ntsc_c_block_mmx,@function
_dv_video_copy_ntsc_c_block_mmx:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %esi
	pushl	%edi
	pushl	%ebx

	movl    8(%ebp), %edi          # dest
	movl    12(%ebp), %esi         # src

	movq	OFFSETBX, %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	movl	$4, %ebx

video_copy_ntsc_c_block_mmx_loop:

	movq	(%esi), %mm0
	movq    8(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_BYTE_HALF, %esi
	addl	$16, %edi

	movq	(%esi), %mm0
	movq    8(%esi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%edi)
	movq	%mm3, 1*8(%edi)

	addl	$DV_WIDTH_BYTE_HALF, %esi
	addl	$16, %edi

	decl	%ebx
	jnz	video_copy_ntsc_c_block_mmx_loop


	pop	%ebx
	pop	%edi
	pop	%esi
	pop	%ebp
	ret