/*
 *  rgbtoyuv.S
 *
 *     Copyright (C) Peter Schlaile - February 2001
 *
 *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
 *  codec.
 *
 *  libdv is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser Public License as published by
 *  the Free Software Foundation; either version 2.1, or (at your
 *  option) any later version.
 *
 *  libdv is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser Public License
 *  along with libdv; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *  The libdv homepage is http://libdv.sourceforge.net/.
 */


# The loop processes interleaved RGB values for 8 pixels per iteration.
# The notation in the comments that describe the data places the first
# byte on the right. For example, in a register containing
# G2R2B1G1R1B0G0R0, R0 occupies the least significant byte and G2 the
# most significant byte.
# The output goes to separate Y, U, and V buffers. Input samples are
# bytes; output samples are words.

#define CONSTSHIFT 15
#define PRECISION  1
#define FIXPSHIFT  CONSTSHIFT-PRECISION
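
/* Reading aid, inferred from the code below rather than taken from the
 * original comments: the coefficient tables are fixed-point values scaled
 * by 2^CONSTSHIFT, and the pmaddwd/psrad pipeline keeps PRECISION
 * fractional bits in the result.  For luma the per-pixel math is roughly
 *
 *     Y = ((yr*R + yg*G + yb*B) >> FIXPSHIFT) + ((16 - 128) << PRECISION);
 *
 * where yr, yg, yb are illustrative names for the words stored in
 * YR0GRX/YBG0BX, and the final term is the OFFSETY constant.
 */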

#define	DV_WIDTH_SHORT      720*2
#define	DV_WIDTH_BYTE       720
#define DV_WIDTH_SHORT_HALF 720
#define DV_WIDTH_BYTE_HALF  360

.global _dv_rgbtoycb_mmx_x86_64
# .global yuvtoycb_mmx_x86_64

.data

.align 8
ZEROSX: .word   0,0,0,0
ZEROS:  .long   0,0

ALLONE:	.word	1,1,1,1

OFFSETDX:       .word   0,64,0,64       #offset used before shift
OFFSETD:        .long   0,0
OFFSETWX:       .word   128,0,128,0     #offset used before pack 32
OFFSETW:        .long   0,0
OFFSETBX:       .word   128,128,128,128
OFFSETB:        .long   0,0
OFFSETY:	.word	(16-128) << PRECISION
		.word   (16-128) << PRECISION
		.word   (16-128) << PRECISION
		.word   (16-128) << PRECISION

TEMP0:          .long   0,0
TEMPY:  .long   0,0
TEMPU:          .long   0,0
TEMPV:  .long   0,0

#if 0 /* Original YUV */
YR0GRX: .word   9798,19235,0,9798
YBG0BX: .word   3736,0,19235,3736
YR0GR:  .long   0,0
YBG0B:  .long   0,0
UR0GRX: .word   -4784,-9437,0,-4784
UBG0BX: .word   14221,0,-9437,14221
UR0GR:  .long   0,0
UBG0B:  .long   0,0
VR0GRX: .word   20218,-16941,0,20218
VBG0BX: .word   -3277,0,-16941,-3277
VR0GR:  .long   0,0
VBG0B:  .long   0,0

YR0GRX: .word   8420,16529,0,8420
YBG0BX: .word   3203,0,16529,3203
YR0GR:  .long   0,0
YBG0B:  .long   0,0
UR0GRX: .word   14391,-12055,0,14391
UBG0BX: .word   -2336,0,-12055,-2336
UR0GR:  .long   0,0
UBG0B:  .long   0,0
VR0GRX: .word   -4857,-9534,0,-4857
VBG0BX: .word   14391,0,-9534,14391
VR0GR:  .long   0,0
VBG0B:  .long   0,0

#else
YR0GRX: .word   8414,16519,0,8414
YBG0BX: .word   3208,0,16519,3208
YR0GR:  .long   0,0
YBG0B:  .long   0,0
UR0GRX: .word   14392,-12061,0,14392
UBG0BX: .word   -2332,0,-12061,-2332
UR0GR:  .long   0,0
UBG0B:  .long   0,0
VR0GRX: .word   -4864,-9528,0,-4864
VBG0BX: .word   14392,0,-9528,14392
VR0GR:  .long   0,0
VBG0B:  .long   0,0

#endif
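
/* Observation about the active luma coefficients (not a statement from the
 * original author): 8414/32768 ~= 0.257, 16519/32768 ~= 0.504 and
 * 3208/32768 ~= 0.098, which look like ITU-R BT.601 weights pre-scaled for
 * the 16..235 studio range, i.e. approximately
 *
 *     Y ~= 0.257*R + 0.504*G + 0.098*B + 16
 *
 * The chroma tables appear to be scaled the same way.
 */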

.section .note.GNU-stack, "", @progbits

.text

#define _inPtr     8
#define _rows      12
#define _columns   16
#define _outyPtr   20
#define _outuPtr   24
#define _outvPtr   28

.global _dv_rgbtoycb_mmx_x86_64
.hidden _dv_rgbtoycb_mmx_x86_64
.type   _dv_rgbtoycb_mmx_x86_64,@function
_dv_rgbtoycb_mmx_x86_64:

        /* extern void _dv_rgbtoycb_mmx_x86_64(unsigned char* inPtr,    rdi
	                                       int            rows,     rsi
	                                       int            columns,  rdx
			                       short*         outyPtr,  rcx
	                                       short*         outuPtr,  r8
	                                       short*         outvPtr); r9
	*/
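
	/* Illustrative only (an assumption about the caller, not from the
	 * original source): each loop iteration below consumes 8 packed RGB
	 * pixels (24 bytes) and emits 8 Y, 4 U and 4 V words, so a
	 * hypothetical C caller might look like
	 *
	 *     short y[ROWS * COLS], u[ROWS * COLS / 2], v[ROWS * COLS / 2];
	 *     _dv_rgbtoycb_mmx_x86_64(rgb, ROWS, COLS, y, u, v);
	 *
	 * where ROWS * COLS is assumed to be a multiple of 8.
	 */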

	push   %rax
	push   %rbx
	push   %r12
	push   %r13

	lea     ZEROSX(%rip), %rax    #This section gets around a bug
	movq    (%rax), %mm0          #unlikely to persist
	movq    %mm0, ZEROS(%rip)
	lea     OFFSETDX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, OFFSETD(%rip)
	lea     OFFSETWX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, OFFSETW(%rip)
	lea     OFFSETBX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, OFFSETB(%rip)
	lea     YR0GRX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, YR0GR(%rip)
	lea     YBG0BX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, YBG0B(%rip)
	lea     UR0GRX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, UR0GR(%rip)
	lea     UBG0BX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, UBG0B(%rip)
	lea     VR0GRX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, VR0GR(%rip)
	lea     VBG0BX(%rip), %rax
	movq    (%rax), %mm0
	movq    %mm0, VBG0B(%rip)

	mov     %rsi, %rax      #rows
	mov     %rdx, %rbx      #columns
	mul     %rbx            #number pixels
	shr     $3, %rax        #number of loops
	mov     %rax,%r11       #loop counter in r11

	mov     %rdi,%rax       #inPtr
	mov     %rcx,%rbx       #outyPtr
	mov     %r8,%r12        #outuPtr
	mov     %r9,%r13        #outvPtr
rgbtoycb_mmx_loop:
	movq    (%rax), %mm1    #load G2R2B1G1R1B0G0R0
	pxor    %mm6, %mm6      #0 -> mm6

	movq    %mm1, %mm0      #G2R2B1G1R1B0G0R0 -> mm0
	psrlq   $16, %mm1       #00G2R2B1G1R1B0-> mm1

	punpcklbw %mm6, %mm0     #R1B0G0R0 -> mm0
	movq    %mm1, %mm7      #00G2R2B1G1R1B0-> mm7

	punpcklbw %mm6, %mm1     #B1G1R1B0 -> mm1
	movq    %mm0, %mm2      #R1B0G0R0 -> mm2

	pmaddwd YR0GR(%rip), %mm0     #yrR1,ygG0+yrR0 -> mm0
	movq    %mm1, %mm3      #B1G1R1B0 -> mm3

	pmaddwd YBG0B(%rip), %mm1     #ybB1+ygG1,ybB0 -> mm1
	movq    %mm2, %mm4      #R1B0G0R0 -> mm4

	pmaddwd UR0GR(%rip), %mm2     #urR1,ugG0+urR0 -> mm2
	movq    %mm3, %mm5      #B1G1R1B0 -> mm5

	pmaddwd UBG0B(%rip), %mm3     #ubB1+ugG1,ubB0 -> mm3
	punpckhbw       %mm6, %mm7 #    00G2R2 -> mm7

	pmaddwd VR0GR(%rip), %mm4     #vrR1,vgG0+vrR0 -> mm4
	paddd   %mm1, %mm0      #Y1Y0 -> mm0

	pmaddwd VBG0B(%rip), %mm5     #vbB1+vgG1,vbB0 -> mm5

	movq    8(%rax), %mm1   #R5B4G4R4B3G3R3B2 -> mm1
	paddd   %mm3, %mm2      #U1U0 -> mm2

	movq    %mm1, %mm6      #R5B4G4R4B3G3R3B2 -> mm6

	punpcklbw       ZEROS(%rip), %mm1     #B3G3R3B2 -> mm1
	paddd   %mm5, %mm4      #V1V0 -> mm4

	movq    %mm1, %mm5      #B3G3R3B2 -> mm5
	psllq   $32, %mm1       #R3B200 -> mm1

	paddd   %mm7, %mm1      #R3B200+00G2R2=R3B2G2R2->mm1

	punpckhbw       ZEROS(%rip), %mm6     #R5B4G4R4 -> mm6
	movq    %mm1, %mm3      #R3B2G2R2 -> mm3

	pmaddwd YR0GR(%rip), %mm1     #yrR3,ygG2+yrR2 -> mm1
	movq    %mm5, %mm7      #B3G3R3B2 -> mm7

	pmaddwd YBG0B(%rip), %mm5     #ybB3+ygG3,ybB2 -> mm5
	psrad   $FIXPSHIFT, %mm0       #32-bit scaled Y1Y0 -> mm0

	movq    %mm6, TEMP0(%rip)     #R5B4G4R4 -> TEMP0
	movq    %mm3, %mm6      #R3B2G2R2 -> mm6
	pmaddwd UR0GR(%rip), %mm6     #urR3,ugG2+urR2 -> mm6
	psrad   $FIXPSHIFT, %mm2       #32-bit scaled U1U0 -> mm2

	paddd   %mm5, %mm1      #Y3Y2 -> mm1
	movq    %mm7, %mm5      #B3G3R3B2 -> mm5
	pmaddwd UBG0B(%rip), %mm7     #ubB3+ugG3,ubB2
	psrad   $FIXPSHIFT, %mm1 #32-bit scaled Y3Y2 -> mm1

	pmaddwd VR0GR(%rip), %mm3     #vrR3,vgG2+vrR2
	packssdw        %mm1, %mm0      #Y3Y2Y1Y0 -> mm0

	pmaddwd VBG0B(%rip), %mm5     #vbB3+vgG3,vbB2 -> mm5
	psrad   $FIXPSHIFT, %mm4       #32-bit scaled V1V0 -> mm4

	movq    16(%rax), %mm1  #B7G7R7B6G6R6B5G5 -> mm1
	paddd   %mm7, %mm6      #U3U2 -> mm6

	movq    %mm1, %mm7      #B7G7R7B6G6R6B5G5 -> mm7
	psrad   $FIXPSHIFT, %mm6       #32-bit scaled U3U2 -> mm6

	paddd   %mm5, %mm3      #V3V2 -> mm3
	psllq   $16, %mm7       #R7B6G6R6B5G500 -> mm7

	movq    %mm7, %mm5      #R7B6G6R6B5G500 -> mm5
	psrad   $FIXPSHIFT, %mm3       #32-bit scaled V3V2 -> mm3

	paddw	OFFSETY(%rip), %mm0
	movq    %mm0, (%rbx)     #store Y3Y2Y1Y0
	packssdw %mm6, %mm2      #32-bit scaled U3U2U1U0 -> mm2

	movq    TEMP0(%rip), %mm0     #R5B4G4R4 -> mm0
	add	$8, %rbx

	punpcklbw       ZEROS(%rip), %mm7     #B5G500 -> mm7
	movq    %mm0, %mm6      #R5B4G4R4 -> mm6

	movq    %mm2, TEMPU(%rip)     #32-bit scaled U3U2U1U0 -> TEMPU
	psrlq   $32, %mm0       #00R5B4 -> mm0

	paddw   %mm0, %mm7      #B5G5R5B4 -> mm7
	movq    %mm6, %mm2      #R5B4G4R4 -> mm2

	pmaddwd YR0GR(%rip), %mm2     #yrR5,ygG4+yrR4 -> mm2
	movq    %mm7, %mm0      #B5G5R5B4 -> mm0

	pmaddwd YBG0B(%rip), %mm7     #ybB5+ygG5,ybB4 -> mm7
	packssdw        %mm3, %mm4      #32-bit scaled V3V2V1V0 -> mm4

	add     $24, %rax       #increment RGB count

	movq    %mm4, TEMPV(%rip)     #32-bit scaled V3V2V1V0 -> TEMPV
	movq    %mm6, %mm4      #R5B4G4R4 -> mm4

	pmaddwd UR0GR(%rip), %mm6     #urR5,ugG4+urR4
	movq    %mm0, %mm3      #B5G5R5B4 -> mm3

	pmaddwd UBG0B(%rip), %mm0     #ubB5+ugG5,ubB4
	paddd   %mm7, %mm2      #Y5Y4 -> mm2

	pmaddwd         VR0GR(%rip), %mm4     #vrR5,vgG4+vrR4 -> mm4
	pxor    %mm7, %mm7      #0 -> mm7

	pmaddwd VBG0B(%rip), %mm3     #vbB5+vgG5,vbB4 -> mm3
	punpckhbw       %mm7, %mm1      #B7G7R7B6 -> mm1

	paddd   %mm6, %mm0      #U5U4 -> mm0
	movq    %mm1, %mm6      #B7G7R7B6 -> mm6

	pmaddwd YBG0B(%rip), %mm6     #ybB7+ygG7,ybB6 -> mm6
	punpckhbw       %mm7, %mm5      #R7B6G6R6 -> mm5

	movq    %mm5, %mm7      #R7B6G6R6 -> mm7
	paddd   %mm4, %mm3      #V5V4 -> mm3

	pmaddwd YR0GR(%rip), %mm5     #yrR7,ygG6+yrR6 -> mm5
	movq    %mm1, %mm4      #B7G7R7B6 -> mm4

	pmaddwd UBG0B(%rip), %mm4     #ubB7+ugG7,ubB6 -> mm4
	psrad   $FIXPSHIFT, %mm0       #32-bit scaled U5U4 -> mm0

	psrad   $FIXPSHIFT, %mm2       #32-bit scaled Y5Y4 -> mm2

	paddd   %mm5, %mm6      #Y7Y6 -> mm6
	movq    %mm7, %mm5      #R7B6G6R6 -> mm5

	pmaddwd UR0GR(%rip), %mm7     #urR7,ugG6+urR6 -> mm7
	psrad   $FIXPSHIFT, %mm3       #32-bit scaled V5V4 -> mm3

	pmaddwd VBG0B(%rip), %mm1     #vbB7+vgG7,vbB6 -> mm1
	psrad   $FIXPSHIFT, %mm6 #32-bit scaled Y7Y6 -> mm6

	packssdw %mm6, %mm2     #Y7Y6Y5Y4 -> mm2

	pmaddwd VR0GR(%rip), %mm5     #vrR7,vgG6+vrR6 -> mm5
	paddd   %mm4, %mm7      #U7U6 -> mm7

	psrad   $FIXPSHIFT, %mm7       #32-bit scaled U7U6 -> mm7
	paddw	OFFSETY(%rip), %mm2
	movq	%mm2, (%rbx)    #store Y7Y6Y5Y4

	movq	ALLONE(%rip), %mm6
	packssdw %mm7, %mm0     #32-bit scaled U7U6U5U4 -> mm0

	movq    TEMPU(%rip), %mm4     #32-bit scaled U3U2U1U0 -> mm4
	pmaddwd	%mm6, %mm0      #U7U6U5U4 averaged -> (U7U6)(U5U4)=UU3 UU2->mm0

	pmaddwd	%mm6, %mm4      #U3U2U1U0 averaged -> (U3U2)(U1U0)=UU1 UU0->mm4

	paddd   %mm5, %mm1      #V7V6 -> mm1
	packssdw %mm0, %mm4     #UU3UU2UU1UU0 -> mm4

	psrad   $FIXPSHIFT, %mm1       #32-bit scaled V7V6 -> mm1
	psraw	$1, %mm4 	#divide UU3 UU2 UU1 UU0 by 2 -> mm4

	movq    TEMPV(%rip), %mm5     #32-bit scaled V3V2V1V0 -> mm5

	movq	%mm4, (%r12)    # store U

	pmaddwd %mm6, %mm5	#V3V2V1V0 averaged -> VV1 VV0 -> mm5
	packssdw %mm1, %mm3     #V7V6V5V4 -> mm3

	pmaddwd %mm6, %mm3	#V7V6V5V4 averaged -> VV3 VV2 -> mm3

	packssdw %mm3, %mm5     # VV3 VV2 VV1 VV0 -> mm5
	psraw	$1, %mm5

	add    $8, %rbx        #increment Y count
	add    $8, %r12        #increment U count

	movq    %mm5, (%r13)    #store V

	add     $8, %r13        #increment V count

	dec     %r11            #decrement loop counter
	jnz     rgbtoycb_mmx_loop  #do 24 more bytes if not 0

	pop     %r13
	pop     %r12
	pop     %rbx
	pop     %rax

	ret

.global _dv_ppm_copy_y_block_mmx_x86_64
.hidden _dv_ppm_copy_y_block_mmx_x86_64
.type   _dv_ppm_copy_y_block_mmx_x86_64,@function
_dv_ppm_copy_y_block_mmx_x86_64:

/* extern void _dv_ppm_copy_y_block_mmx_x86_64(short * dst, short * src); */

	/* arguments are dst=rdi, src=rsi */
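
	/* Roughly equivalent C, given only as a reading aid (the stride of
	 * DV_WIDTH_SHORT / 2 shorts per source line is inferred from the
	 * byte offsets used below, not stated in the original source):
	 *
	 *     for (int row = 0; row < 8; row++)
	 *         memcpy(dst + 8 * row,
	 *                src + row * (DV_WIDTH_SHORT / 2),
	 *                8 * sizeof(short));
	 */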

	movq	(%rsi), %mm0
	movq	8(%rsi), %mm1
	movq	%mm0, 0*8(%rdi)
	movq	%mm1, 1*8(%rdi)
	movq	DV_WIDTH_SHORT(%rsi), %mm2
	movq	DV_WIDTH_SHORT+8(%rsi), %mm3
	movq	%mm2, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)
	movq	DV_WIDTH_SHORT*2(%rsi), %mm4
	movq	DV_WIDTH_SHORT*2+8(%rsi), %mm5
	movq	%mm4, 4*8(%rdi)
	movq	%mm5, 5*8(%rdi)
	movq	DV_WIDTH_SHORT*3(%rsi), %mm6
	movq	DV_WIDTH_SHORT*3+8(%rsi), %mm7
	movq	%mm6, 6*8(%rdi)
	movq	%mm7, 7*8(%rdi)

	movq	DV_WIDTH_SHORT*4(%rsi), %mm0
	movq	DV_WIDTH_SHORT*4+8(%rsi), %mm1
	movq	%mm0, 8*8(%rdi)
	movq	%mm1, 9*8(%rdi)
	movq	DV_WIDTH_SHORT*5(%rsi), %mm2
	movq	DV_WIDTH_SHORT*5+8(%rsi), %mm3
	movq	%mm2, 10*8(%rdi)
	movq	%mm3, 11*8(%rdi)
	movq	DV_WIDTH_SHORT*6(%rsi), %mm4
	movq	DV_WIDTH_SHORT*6+8(%rsi), %mm5
	movq	%mm4, 12*8(%rdi)
	movq	%mm5, 13*8(%rdi)
	movq	DV_WIDTH_SHORT*7(%rsi), %mm6
	movq	DV_WIDTH_SHORT*7+8(%rsi), %mm7
	movq	%mm6, 14*8(%rdi)
	movq	%mm7, 15*8(%rdi)

	ret

.global _dv_pgm_copy_y_block_mmx_x86_64
.hidden _dv_pgm_copy_y_block_mmx_x86_64
.type   _dv_pgm_copy_y_block_mmx_x86_64,@function
_dv_pgm_copy_y_block_mmx_x86_64:

/* extern void _dv_pgm_copy_y_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */
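
	/* Sketch of what the code below computes per source byte (inferred
	 * from the instructions; "p" is a hypothetical name for one byte of
	 * the 8x8 block, read at a line pitch of DV_WIDTH_BYTE):
	 *
	 *     dst[i] = ((short) p << PRECISION) + ((16 - 128) << PRECISION);
	 *
	 * i.e. bytes are widened to words, scaled to PRECISION fractional
	 * bits, and re-centred with OFFSETY.
	 */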

	movq	OFFSETY(%rip), %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif
	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif
	paddw	%mm7, %mm0
	paddw	%mm7, %mm1
	paddw	%mm7, %mm2
	paddw	%mm7, %mm3

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	ret

.global _dv_video_copy_y_block_mmx_x86_64
.hidden _dv_video_copy_y_block_mmx_x86_64
.type   _dv_video_copy_y_block_mmx_x86_64,@function
_dv_video_copy_y_block_mmx_x86_64:

/* extern void _dv_video_copy_y_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */
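
	/* Reading aid (my summary, not from the original source): unlike the
	 * pgm variant above, each source byte p is re-centred around 128
	 * (OFFSETBX) before scaling, roughly
	 *
	 *     dst[i] = ((short) p - 128) << PRECISION;
	 *
	 * for an 8x8 block read at a line pitch of DV_WIDTH_BYTE.
	 */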

	movq	OFFSETBX(%rip), %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	ret


.global _dv_ppm_copy_pal_c_block_mmx_x86_64
.hidden _dv_ppm_copy_pal_c_block_mmx_x86_64
.type   _dv_ppm_copy_pal_c_block_mmx_x86_64,@function
_dv_ppm_copy_pal_c_block_mmx_x86_64:

/* extern void _dv_ppm_copy_pal_c_block_mmx_x86_64(short * dst, short * src); */

	/* arguments are dst=rdi, src=rsi */
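
	/* Reading aid only (my interpretation of the offsets below): each of
	 * the eight passes averages two vertically adjacent rows of 8 chroma
	 * shorts from a plane whose line pitch is DV_WIDTH_SHORT_HALF bytes,
	 * roughly
	 *
	 *     for (int i = 0; i < 8; i++)
	 *         dst[i] = (src[i] + src[i + DV_WIDTH_SHORT_HALF / 2]) >> 1;
	 *
	 * with src advancing two source rows and dst advancing 8 shorts
	 * between passes.
	 */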

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_SHORT, %rsi
	add	$16, %rdi

	movq	          (%rsi), %mm0
	movq	DV_WIDTH_SHORT_HALF(%rsi), %mm1
	movq               8(%rsi), %mm2
	movq	DV_WIDTH_SHORT_HALF+8(%rsi), %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3
	psraw	$1, %mm1
	psraw	$1, %mm3

	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	ret

.global _dv_pgm_copy_pal_c_block_mmx_x86_64
.hidden _dv_pgm_copy_pal_c_block_mmx_x86_64
.type   _dv_pgm_copy_pal_c_block_mmx_x86_64,@function
_dv_pgm_copy_pal_c_block_mmx_x86_64:

/* extern void _dv_pgm_copy_pal_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	OFFSETBX(%rip), %mm7
	pxor	%mm6, %mm6


	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	add	$2*DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq	DV_WIDTH_BYTE(%rsi), %mm1

	movq	%mm0, %mm2
	movq	%mm1, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm1

	punpckhbw %mm6, %mm2
	punpckhbw %mm6, %mm3

	psubw	%mm7, %mm0
	psubw	%mm7, %mm1
	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION > 0
	psllw	$PRECISION, %mm0
	psllw	$PRECISION, %mm1
	psllw	$PRECISION, %mm2
	psllw	$PRECISION, %mm3
#endif

	movq	%mm0, (%rdi)
	movq	%mm2, 8(%rdi)
	movq	%mm1, 16(%rdi)
	movq	%mm3, 24(%rdi)

	ret

.global _dv_video_copy_pal_c_block_mmx_x86_64
.hidden _dv_video_copy_pal_c_block_mmx_x86_64
.type   _dv_video_copy_pal_c_block_mmx_x86_64,@function
_dv_video_copy_pal_c_block_mmx_x86_64:

	/* extern void _dv_video_copy_pal_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	push	%rbx

	movq	OFFSETBX(%rip), %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	mov 	$4, %rbx

video_copy_pal_c_block_mmx_loop:

	movq	(%rsi), %mm0
	movq    DV_WIDTH_BYTE_HALF(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm2
	paddw	%mm1, %mm3

	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm2
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm2
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm2, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq    DV_WIDTH_BYTE_HALF(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm2
	paddw	%mm1, %mm3

	psubw	%mm7, %mm2
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm2
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm2
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm2, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$16, %rdi

	dec	%rbx
	jnz	video_copy_pal_c_block_mmx_loop

	pop	%rbx

	ret

.global _dv_ppm_copy_ntsc_c_block_mmx_x86_64
.hidden _dv_ppm_copy_ntsc_c_block_mmx_x86_64
.type   _dv_ppm_copy_ntsc_c_block_mmx_x86_64,@function
_dv_ppm_copy_ntsc_c_block_mmx_x86_64:

	/* extern void _dv_ppm_copy_ntsc_c_block_mmx_x86_64(short * dst, short * src); */

	/* arguments are dst=rdi, src=rsi */
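
	/* Reading aid (my interpretation, not from the original comments):
	 * pmaddwd against ALLONE sums each pair of adjacent shorts, so every
	 * row pass effectively performs a horizontal 2:1 average, roughly
	 *
	 *     for (int i = 0; i < 8; i++)
	 *         dst[i] = (src[2 * i] + src[2 * i + 1]) >> 1;
	 *
	 * with src advancing one source row (DV_WIDTH_SHORT_HALF bytes) and
	 * dst advancing 8 shorts per pass, for 8 rows in total.
	 */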

	push	%rbx

	mov	$4, %rbx	   # loop counter

	movq	ALLONE(%rip), %mm6

ppm_copy_ntsc_c_block_mmx_loop:

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm1
	movq	16(%rsi), %mm2
	movq	24(%rsi), %mm3

	pmaddwd %mm6, %mm0
	pmaddwd %mm6, %mm1

	pmaddwd %mm6, %mm2
	pmaddwd %mm6, %mm3

	packssdw %mm1, %mm0
	packssdw %mm3, %mm2

	psraw	$1, %mm0
	psraw	$1, %mm2

	movq	%mm0, 0*8(%rdi)
	movq	%mm2, 1*8(%rdi)

	add	$DV_WIDTH_SHORT_HALF, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm1
	movq	16(%rsi), %mm2
	movq	24(%rsi), %mm3

	pmaddwd %mm6, %mm0
	pmaddwd %mm6, %mm1

	pmaddwd %mm6, %mm2
	pmaddwd %mm6, %mm3

	packssdw %mm1, %mm0
	packssdw %mm3, %mm2

	psraw	$1, %mm0
	psraw	$1, %mm2

	movq	%mm0, 0*8(%rdi)
	movq	%mm2, 1*8(%rdi)

	add	$DV_WIDTH_SHORT_HALF, %rsi
	add	$16, %rdi

	dec	%rbx
	jnz	ppm_copy_ntsc_c_block_mmx_loop

	pop	%rbx

	ret

.global _dv_pgm_copy_ntsc_c_block_mmx_x86_64
.hidden _dv_pgm_copy_ntsc_c_block_mmx_x86_64
.type   _dv_pgm_copy_ntsc_c_block_mmx_x86_64,@function
_dv_pgm_copy_ntsc_c_block_mmx_x86_64:

	/* extern void _dv_pgm_copy_ntsc_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	movq	OFFSETBX(%rip), %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	add	$DV_WIDTH_BYTE, %rsi
	add	$32, %rdi

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)
	movq	%mm1, 2*8(%rdi)
	movq	%mm3, 3*8(%rdi)

	ret

.global _dv_video_copy_ntsc_c_block_mmx_x86_64
.hidden _dv_video_copy_ntsc_c_block_mmx_x86_64
.type   _dv_video_copy_ntsc_c_block_mmx_x86_64,@function
_dv_video_copy_ntsc_c_block_mmx_x86_64:

	/* extern void _dv_video_copy_ntsc_c_block_mmx_x86_64(short * dst, unsigned char * src); */

	/* arguments are dst=rdi, src=rsi */

	push	%rbx

	movq	OFFSETBX(%rip), %mm7
	paddw	%mm7, %mm7
	pxor	%mm6, %mm6

	mov	$4, %rbx           # loop counter

video_copy_ntsc_c_block_mmx_loop:

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE_HALF, %rsi
	add	$16, %rdi

	movq	(%rsi), %mm0
	movq    8(%rsi), %mm2

	movq	%mm0, %mm1
	movq	%mm2, %mm3

	punpcklbw %mm6, %mm0
	punpcklbw %mm6, %mm2

	punpckhbw %mm6, %mm1
	punpckhbw %mm6, %mm3

	paddw	%mm0, %mm1
	paddw	%mm2, %mm3

	psubw	%mm7, %mm1
	psubw	%mm7, %mm3

#if PRECISION == 0
	psraw	$1, %mm1
	psraw	$1, %mm3
#else
#if PRECISION > 1
	psllw	$PRECISION-1, %mm1
	psllw	$PRECISION-1, %mm3
#endif
#endif
	movq	%mm1, 0*8(%rdi)
	movq	%mm3, 1*8(%rdi)

	add	$DV_WIDTH_BYTE_HALF, %rsi
	add	$16, %rdi

	dec	%rbx
	jnz	video_copy_ntsc_c_block_mmx_loop


	pop	%rbx

	ret