;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2001 Peter Ross <pross@xvid.org>
; *                2001-2008 Michael Militzer <michael@xvid.org>
; *                2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: mem_transfer_mmx.asm,v 1.22 2009-09-16 17:07:58 Isibaar Exp $
; *
; ***************************************************************************/

%include "nasm.inc"

;=============================================================================
; Read only data
;=============================================================================

DATA

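; rounding constant for the (ref1+ref2+1)/2 averaging in transfer_8to16sub2_mmx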
ALIGN SECTION_ALIGN
mmx_one:
	dw 1, 1, 1, 1

;=============================================================================
; Code
;=============================================================================

TEXT

cglobal transfer_8to16copy_mmx
cglobal transfer_16to8copy_mmx
cglobal transfer_8to16sub_mmx
cglobal transfer_8to16subro_mmx
cglobal transfer_8to16sub2_mmx
cglobal transfer_8to16sub2_xmm
cglobal transfer_8to16sub2ro_xmm
cglobal transfer_16to8add_mmx
cglobal transfer8x8_copy_mmx
cglobal transfer8x4_copy_mmx

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_mmx(int16_t * const dst,
;							const uint8_t * const src,
;							uint32_t stride);
;
;-----------------------------------------------------------------------------
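; Behaviour, as a plain C sketch (comment only, not built; the 8x8 block size
; and the row-major 16-bit destination layout are read off the macro below):
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++)
;       dst[j*8 + i] = (int16_t) src[j*stride + i];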

%macro COPY_8_TO_16 1
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq mm2, mm0
  movq mm3, mm1
  punpcklbw mm0, mm7
  movq [TMP0+%1*32], mm0
  punpcklbw mm1, mm7
  movq [TMP0+%1*32+16], mm1
  punpckhbw mm2, mm7
  punpckhbw mm3, mm7
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0+%1*32+8], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16copy_mmx:

  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  pxor mm7, mm7

  COPY_8_TO_16 0
  COPY_8_TO_16 1
  COPY_8_TO_16 2
  COPY_8_TO_16 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_mmx(uint8_t * const dst,
;							const int16_t * const src,
;							uint32_t stride);
;
;-----------------------------------------------------------------------------
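; Plain C sketch (comment only, not built); packuswb clamps each 16-bit
; coefficient to the 0..255 range:
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++) {
;       int16_t v = src[j*8 + i];
;       dst[j*stride + i] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
;     }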

%macro COPY_16_TO_8 1
  movq mm0, [_EAX+%1*32]
  movq mm1, [_EAX+%1*32+8]
  packuswb mm0, mm1
  movq [TMP0], mm0
  movq mm2, [_EAX+%1*32+16]
  movq mm3, [_EAX+%1*32+24]
  packuswb mm2, mm3
  movq [TMP0+TMP1], mm2
%endmacro

ALIGN SECTION_ALIGN
transfer_16to8copy_mmx:

  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_16_TO_8 0
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8 1
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8 2
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_mmx(int16_t * const dct,
;				uint8_t * const cur,
;				const uint8_t * const ref,
;				const uint32_t stride);
;
;-----------------------------------------------------------------------------
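; Plain C sketch (comment only, not built). This entry point also copies ref
; into cur (the macro's second argument is 1 below); the *ro variant further
; down skips that write-back:
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++) {
;       dct[j*8 + i] = (int16_t) cur[j*stride + i] - (int16_t) ref[j*stride + i];
;       cur[j*stride + i] = ref[j*stride + i];
;     }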

; when the second macro argument == 1, the reference block (_EBX) is also
; copied into the current block (_EAX)
%macro COPY_8_TO_16_SUB 2
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX]      ; ref
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1]  ; ref

  movq mm6, mm4
%if %2 == 1
  movq [_EAX], mm4
  movq [_EAX+TMP1], mm5
%endif
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX,[_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref
%endif
  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 1
  COPY_8_TO_16_SUB 1, 1
  COPY_8_TO_16_SUB 2, 1
  COPY_8_TO_16_SUB 3, 1

  pop _EBX
  ret
ENDFUNC


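;-----------------------------------------------------------------------------
; transfer_8to16subro_mmx: same as transfer_8to16sub_mmx above, but read-only;
; cur is left untouched (the macro is invoked with its second argument == 0).
;-----------------------------------------------------------------------------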
ALIGN SECTION_ALIGN
transfer_8to16subro_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Cur
  mov TMP1, prm4 ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref
%endif
  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 0
  COPY_8_TO_16_SUB 1, 0
  COPY_8_TO_16_SUB 2, 0
  COPY_8_TO_16_SUB 3, 0

  pop _EBX
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_mmx(int16_t * const dct,
;				uint8_t * const cur,
;				const uint8_t * ref1,
;				const uint8_t * ref2,
;				const uint32_t stride)
;
;-----------------------------------------------------------------------------
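; Plain C sketch (comment only, not built). The rounded average of the two
; references is written back into cur and subtracted from the original cur:
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++) {
;       int avg = (ref1[j*stride + i] + ref2[j*stride + i] + 1) >> 1;
;       dct[j*8 + i] = (int16_t) (cur[j*stride + i] - avg);
;       cur[j*stride + i] = (uint8_t) avg;
;     }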

%macro COPY_8_TO_16_SUB2_MMX 1
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]

  ; mm4 <- (ref1+ref2+1) / 2
  movq mm4, [_EBX]      ; ref1
  movq mm1, [_ESI]      ; ref2
  movq mm6, mm4
  movq mm3, mm1
  punpcklbw mm4, mm7
  punpcklbw mm1, mm7
  punpckhbw mm6, mm7
  punpckhbw mm3, mm7
  paddusw mm4, mm1
  paddusw mm6, mm3
  paddusw mm4, [mmx_one]
  paddusw mm6, [mmx_one]
  psrlw mm4, 1
  psrlw mm6, 1
  packuswb mm4, mm6
  movq [_EAX], mm4

  ; mm5 <- (ref1+ref2+1) / 2
  movq mm5, [_EBX+TMP1]  ; ref1
  movq mm1, [_ESI+TMP1]  ; ref2
  movq mm6, mm5
  movq mm3, mm1
  punpcklbw mm5, mm7
  punpcklbw mm1, mm7
  punpckhbw mm6, mm7
  punpckhbw mm3, mm7
  paddusw mm5, mm1
  paddusw mm6, mm3
  paddusw mm5, [mmx_one]
  paddusw mm6, [mmx_one]
  lea _ESI, [_ESI+2*TMP1]
  psrlw mm5, 1
  psrlw mm6, 1
  packuswb mm5, mm6
  movq [_EAX+TMP1], mm5

  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_mmx:
  mov TMP0, prm1   ; Dst
  mov TMP1d, prm5d ; Stride
  mov _EAX, prm2   ; Cur

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_MMX 0
  COPY_8_TO_16_SUB2_MMX 1
  COPY_8_TO_16_SUB2_MMX 2
  COPY_8_TO_16_SUB2_MMX 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_xmm(int16_t * const dct,
;				uint8_t * const cur,
;				const uint8_t * ref1,
;				const uint8_t * ref2,
;				const uint32_t stride)
;
;-----------------------------------------------------------------------------
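; Same behaviour as transfer_8to16sub2_mmx above; pavgb yields the rounded
; average (ref1+ref2+1)>>1 directly, so no unpack/add/shift sequence is needed.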

%macro COPY_8_TO_16_SUB2_SSE 1
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX]     ; ref1
  pavgb mm4, [_ESI]     ; ref2
  movq [_EAX], mm4
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1] ; ref1
  pavgb mm5, [_ESI+TMP1] ; ref2
  movq [_EAX+TMP1], mm5

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea _ESI, [_ESI+2*TMP1]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2_xmm:
  mov TMP0, prm1   ; Dst
  mov _EAX, prm2   ; Cur
  mov TMP1d, prm5d ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3 ; Ref1
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4 ; Ref2
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_SSE 0
  COPY_8_TO_16_SUB2_SSE 1
  COPY_8_TO_16_SUB2_SSE 2
  COPY_8_TO_16_SUB2_SSE 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2ro_xmm(int16_t * const dct,
;				const uint8_t * const cur,
;				const uint8_t * ref1,
;				const uint8_t * ref2,
;				const uint32_t stride)
;
;-----------------------------------------------------------------------------
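; Same as transfer_8to16sub2_xmm above, except that the rounded average is
; used only for the subtraction and is not written back into cur.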

%macro COPY_8_TO_16_SUB2RO_SSE 1
  movq mm0, [_EAX]      ; cur
  movq mm2, [_EAX+TMP1]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [_EBX]     ; ref1
  pavgb mm4, [_ESI]     ; ref2
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [_EBX+TMP1] ; ref1
  pavgb mm5, [_ESI+TMP1] ; ref2

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea _ESI, [_ESI+2*TMP1]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea _EAX, [_EAX+2*TMP1]
  psubsw mm3, mm6
  lea _EBX, [_EBX+2*TMP1]

  movq [TMP0+%1*32+ 0], mm0 ; dst
  movq [TMP0+%1*32+ 8], mm1
  movq [TMP0+%1*32+16], mm2
  movq [TMP0+%1*32+24], mm3
%endmacro

ALIGN SECTION_ALIGN
transfer_8to16sub2ro_xmm:
  pxor mm7, mm7
  mov TMP0, prm1   ; Dst
  mov _EAX, prm2   ; Cur
  mov TMP1d, prm5d ; Stride

  push _EBX
%ifdef ARCH_IS_X86_64
  mov _EBX, prm3
%else
  mov _EBX, [_ESP+4+12] ; Ref1
%endif

  push _ESI
%ifdef ARCH_IS_X86_64
  mov _ESI, prm4
%else
  mov _ESI, [_ESP+8+16] ; Ref2
%endif

  COPY_8_TO_16_SUB2RO_SSE 0
  COPY_8_TO_16_SUB2RO_SSE 1
  COPY_8_TO_16_SUB2RO_SSE 2
  COPY_8_TO_16_SUB2RO_SSE 3

  pop _ESI
  pop _EBX
  ret
ENDFUNC


;-----------------------------------------------------------------------------
;
; void transfer_16to8add_mmx(uint8_t * const dst,
;						const int16_t * const src,
;						uint32_t stride);
;
;-----------------------------------------------------------------------------
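; Plain C sketch (comment only, not built). paddsw/packuswb add with
; saturation and clamp the result to 0..255; the intermediate 16-bit
; saturation of paddsw is ignored here:
;
;   for (j = 0; j < 8; j++)
;     for (i = 0; i < 8; i++) {
;       int v = dst[j*stride + i] + src[j*8 + i];
;       dst[j*stride + i] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
;     }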

%macro COPY_16_TO_8_ADD 1
  movq mm0, [TMP0]
  movq mm2, [TMP0+TMP1]
  movq mm1, mm0
  movq mm3, mm2
  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  paddsw mm0, [_EAX+%1*32+ 0]
  paddsw mm1, [_EAX+%1*32+ 8]
  paddsw mm2, [_EAX+%1*32+16]
  paddsw mm3, [_EAX+%1*32+24]
  packuswb mm0, mm1
  movq [TMP0], mm0
  packuswb mm2, mm3
  movq [TMP0+TMP1], mm2
%endmacro


ALIGN SECTION_ALIGN
transfer_16to8add_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride
  pxor mm7, mm7

  COPY_16_TO_8_ADD 0
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8_ADD 1
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8_ADD 2
  lea TMP0,[TMP0+2*TMP1]
  COPY_16_TO_8_ADD 3
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_mmx(uint8_t * const dst,
;					const uint8_t * const src,
;					const uint32_t stride);
;
;-----------------------------------------------------------------------------
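; Plain C sketch (comment only, not built):
;
;   for (j = 0; j < 8; j++)
;     memcpy(dst + j*stride, src + j*stride, 8);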

%macro COPY_8_TO_8 0
  movq mm0, [_EAX]
  movq mm1, [_EAX+TMP1]
  movq [TMP0], mm0
  lea _EAX, [_EAX+2*TMP1]
  movq [TMP0+TMP1], mm1
%endmacro

ALIGN SECTION_ALIGN
transfer8x8_copy_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_mmx(uint8_t * const dst,
;					const uint8_t * const src,
;					const uint32_t stride);
;
;-----------------------------------------------------------------------------
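; Same as transfer8x8_copy_mmx above, but only four rows are copied:
;
;   for (j = 0; j < 4; j++)
;     memcpy(dst + j*stride, src + j*stride, 8);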

ALIGN SECTION_ALIGN
transfer8x4_copy_mmx:
  mov TMP0, prm1 ; Dst
  mov _EAX, prm2 ; Src
  mov TMP1, prm3 ; Stride

  COPY_8_TO_8
  lea TMP0,[TMP0+2*TMP1]
  COPY_8_TO_8
  ret
ENDFUNC

NON_EXEC_STACK
