;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - Quarter-pixel interpolation -
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This file is part of Xvid, a free MPEG-4 video encoder/decoder
; *
; *  Xvid is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: qpel_mmx.asm,v 1.13 2010-11-28 15:18:21 Isibaar Exp $
; *
; *************************************************************************/

;/**************************************************************************
; *
; *	History:
; *
; * 22.10.2002  initial coding. unoptimized 'proof of concept',
; *             just to test the qpel filtering. - Skal -
; *
; *************************************************************************/


%define USE_TABLES      ; in order to use xvid_FIR_x_x_x_x tables
                        ; instead of xvid_Expand_mmx...
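
; With USE_TABLES, each 2KB table xvid_FIR_a_b_c_d maps a pixel value p
; to four pre-multiplied words {a*p, b*p, c*p, d*p} (with the signs of
; the actual taps, e.g. 14,-3,2,-1 for xvid_FIR_14_3_2_1), so one
; accumulation step is just a table load plus paddw. Without it,
; xvid_Expand_mmx maps p to {p, p, p, p} and the taps are applied with
; pmullw against the FIR_Rx coefficient rows defined below.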


%include "nasm.inc"

;//////////////////////////////////////////////////////////////////////
;// Declarations
;//   all signatures are:
;// void XXX(uint8_t *dst, const uint8_t *src,
;//          int32_t length, int32_t stride, int32_t rounding)
;//////////////////////////////////////////////////////////////////////
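
;// Illustrative call from the C side (a sketch only; the real call
;// sites live in the codec's C code). For the horizontal passes the
;// 'length' argument counts rows; for the vertical passes it counts
;// columns and is consumed 4 at a time, so it should be a multiple
;// of 4:
;//
;//   xvid_H_Pass_16_mmx(dst, src, 16 /* rows */, stride, rounding);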

cglobal xvid_H_Pass_16_mmx
cglobal xvid_H_Pass_Avrg_16_mmx
cglobal xvid_H_Pass_Avrg_Up_16_mmx
cglobal xvid_V_Pass_16_mmx
cglobal xvid_V_Pass_Avrg_16_mmx
cglobal xvid_V_Pass_Avrg_Up_16_mmx
cglobal xvid_H_Pass_8_mmx
cglobal xvid_H_Pass_Avrg_8_mmx
cglobal xvid_H_Pass_Avrg_Up_8_mmx
cglobal xvid_V_Pass_8_mmx
cglobal xvid_V_Pass_Avrg_8_mmx
cglobal xvid_V_Pass_Avrg_Up_8_mmx

cglobal xvid_H_Pass_Add_16_mmx
cglobal xvid_H_Pass_Avrg_Add_16_mmx
cglobal xvid_H_Pass_Avrg_Up_Add_16_mmx
cglobal xvid_V_Pass_Add_16_mmx
cglobal xvid_V_Pass_Avrg_Add_16_mmx
cglobal xvid_V_Pass_Avrg_Up_Add_16_mmx
cglobal xvid_H_Pass_8_Add_mmx
cglobal xvid_H_Pass_Avrg_8_Add_mmx
cglobal xvid_H_Pass_Avrg_Up_8_Add_mmx
cglobal xvid_V_Pass_8_Add_mmx
cglobal xvid_V_Pass_Avrg_8_Add_mmx
cglobal xvid_V_Pass_Avrg_Up_8_Add_mmx

cglobal xvid_Expand_mmx

cglobal xvid_FIR_1_0_0_0
cglobal xvid_FIR_3_1_0_0
cglobal xvid_FIR_6_3_1_0
cglobal xvid_FIR_14_3_2_1
cglobal xvid_FIR_20_6_3_1
cglobal xvid_FIR_20_20_6_3
cglobal xvid_FIR_23_19_6_3
cglobal xvid_FIR_7_20_20_6
cglobal xvid_FIR_6_20_20_6
cglobal xvid_FIR_6_20_20_7
cglobal xvid_FIR_3_6_20_20
cglobal xvid_FIR_3_6_19_23
cglobal xvid_FIR_1_3_6_20
cglobal xvid_FIR_1_2_3_14
cglobal xvid_FIR_0_1_3_6
cglobal xvid_FIR_0_0_1_3
cglobal xvid_FIR_0_0_0_1

SECTION .data align=SECTION_ALIGN
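
; Note: the tables below are assembled as zeros; they are expected to
; be filled at runtime by the qpel initialization code on the C side
; of the codec before any of the filter functions here are called.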

align SECTION_ALIGN
xvid_Expand_mmx:
times 256*4 dw 0        ; uint16_t xvid_Expand_mmx[256][4]
ENDFUNC

xvid_FIR_1_0_0_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_1_0_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_3_1_0:
times 256*4 dw 0
ENDFUNC

xvid_FIR_14_3_2_1:
times 256*4 dw 0
ENDFUNC

xvid_FIR_20_6_3_1:
times 256*4 dw 0
ENDFUNC

xvid_FIR_20_20_6_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_23_19_6_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_7_20_20_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_20_20_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_6_20_20_7:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_6_20_20:
times 256*4 dw 0
ENDFUNC

xvid_FIR_3_6_19_23:
times 256*4 dw 0
ENDFUNC

xvid_FIR_1_3_6_20:
times 256*4 dw 0
ENDFUNC

xvid_FIR_1_2_3_14:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_1_3_6:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_0_1_3:
times 256*4 dw 0
ENDFUNC

xvid_FIR_0_0_0_1:
times 256*4 dw 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////

DATA

align SECTION_ALIGN
Rounder1_MMX:
times 4 dw 1
Rounder0_MMX:
times 4 dw 0

align SECTION_ALIGN
Rounder_QP_MMX:
times 4 dw 16
times 4 dw 15

%ifndef USE_TABLES

align SECTION_ALIGN

  ; H-Pass table shared by 16x? and 8x? filters

FIR_R0:  dw 14, -3,  2, -1
align SECTION_ALIGN
FIR_R1:  dw 23, 19, -6,  3,   -1,  0,  0,  0

FIR_R2:  dw -7, 20, 20, -6,    3, -1,  0,  0

FIR_R3:  dw  3, -6, 20, 20,   -6,  3, -1,  0

FIR_R4:  dw -1,  3, -6, 20,   20, -6,  3, -1

FIR_R5:  dw  0, -1,  3, -6,   20, 20, -6,  3,   -1,  0,  0,  0
align SECTION_ALIGN
FIR_R6:  dw  0,  0, -1,  3,   -6, 20, 20, -6,    3, -1,  0,  0
align SECTION_ALIGN
FIR_R7:  dw  0,  0,  0, -1,    3, -6, 20, 20,   -6,  3, -1,  0
align SECTION_ALIGN
FIR_R8:  dw                   -1,  3, -6, 20,   20, -6,  3, -1

FIR_R9:  dw                    0, -1,  3, -6,   20, 20, -6,  3,   -1,  0,  0,  0
align SECTION_ALIGN
FIR_R10: dw                    0,  0, -1,  3,   -6, 20, 20, -6,    3, -1,  0,  0
align SECTION_ALIGN
FIR_R11: dw                    0,  0,  0, -1,    3, -6, 20, 20,   -6,  3, -1,  0
align SECTION_ALIGN
FIR_R12: dw                                     -1,  3, -6, 20,   20, -6,  3, -1

FIR_R13: dw                                      0, -1,  3, -6,   20, 20, -6,  3

FIR_R14: dw                                      0,  0, -1,  3,   -6, 20, 20, -7

FIR_R15: dw                                      0,  0,  0, -1,    3, -6, 19, 23

FIR_R16: dw                                                       -1,  2, -3, 14

%endif  ; !USE_TABLES

  ; V-Pass taps

align SECTION_ALIGN
FIR_Cm7: times 4 dw -7
FIR_Cm6: times 4 dw -6
FIR_Cm3: times 4 dw -3
FIR_Cm1: times 4 dw -1
FIR_C2:  times 4 dw  2
FIR_C3:  times 4 dw  3
FIR_C14: times 4 dw 14
FIR_C19: times 4 dw 19
FIR_C20: times 4 dw 20
FIR_C23: times 4 dw 23

TEXT

;//////////////////////////////////////////////////////////////////////
;// Here we go with the Q-Pel mess.
;//  For horizontal passes, we process 4 *output* pixels in parallel
;//  For vertical ones, we process 4 *input* pixels in parallel.
;//////////////////////////////////////////////////////////////////////
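
;// A scalar C sketch of the underlying filter (illustrative only;
;// W and clamp_to_uint8 are hypothetical names). For interior pixels
;// the MPEG-4 quarter-pel filter is the 8-tap FIR {-1,3,-6,20,20,-6,
;// 3,-1}; taps falling outside the block are mirrored back in, which
;// is what folds into the modified first/last coefficient rows
;// (FIR_R0, FIR_R1, ..., FIR_R16) used below:
;//
;//   static const int T[8] = { -1, 3, -6, 20, 20, -6, 3, -1 };
;//   for (int x = 3; x < W - 4; x++) {          /* interior pixels  */
;//       int sum = 16 - rounding;               /* Rounder_QP_MMX   */
;//       for (int k = -3; k <= 4; k++)
;//           sum += T[k + 3] * src[x + k];
;//       dst[x] = clamp_to_uint8(sum >> 5);     /* psraw 5+packuswb */
;//   }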

%ifdef ARCH_IS_X86_64
%macro XVID_MOVQ 3
  lea r9, [%2]
  movq %1, [r9 + %3]
%endmacro
%macro XVID_PADDW 3
  lea r9, [%2]
  paddw %1, [r9 + %3]
%endmacro
%define SRC_PTR prm2
%define DST_PTR prm1
%else
%macro XVID_MOVQ 3
  movq %1, [%2 + %3]
%endmacro
%macro XVID_PADDW 3
  paddw %1, [%2 + %3]
%endmacro
%define SRC_PTR _ESI
%define DST_PTR _EDI
%endif
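
; On x86-64 a RIP-relative address cannot also carry a scaled index,
; so the 64-bit variants first materialize the table base in r9 with
; lea; the 32-bit variants can address [table + index] directly.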

%macro PROLOG_NO_AVRG 0
  mov TMP0, prm3 ; Size
  mov TMP1, prm4 ; BpS
  mov eax, prm5d ; Rnd

%ifndef ARCH_IS_X86_64
  push SRC_PTR
  push DST_PTR
%endif
  push _EBP
  mov _EBP, TMP1

%ifndef ARCH_IS_X86_64
  mov DST_PTR, [_ESP+16 + 0*4] ; Dst
  mov SRC_PTR, [_ESP+16 + 1*4] ; Src
%endif

  and _EAX, 1
  lea TMP1, [Rounder_QP_MMX]
  movq mm7, [TMP1+_EAX*8]  ; rounder
%endmacro

%macro EPILOG_NO_AVRG 0
  pop _EBP
%ifndef ARCH_IS_X86_64
  pop DST_PTR
  pop SRC_PTR
%endif
  ret
%endmacro

%macro PROLOG_AVRG 0
  mov TMP0, prm3 ; Size
  mov TMP1, prm4 ; BpS
  mov eax, prm5d ; Rnd

  push _EBX
  push _EBP
%ifndef ARCH_IS_X86_64
  push SRC_PTR
  push DST_PTR
%endif
  mov _EBP, TMP1

%ifndef ARCH_IS_X86_64
  mov DST_PTR, [_ESP+20 + 0*4] ; Dst
  mov SRC_PTR, [_ESP+20 + 1*4] ; Src
%endif

  and _EAX, 1
  lea TMP1, [Rounder_QP_MMX]
  movq mm7, [TMP1+_EAX*8]  ; rounder
  lea TMP1, [Rounder1_MMX]
  lea _EBX, [TMP1+_EAX*8]     ; *Rounder2
%endmacro

%macro EPILOG_AVRG 0
%ifndef ARCH_IS_X86_64
  pop DST_PTR
  pop SRC_PTR
%endif
  pop _EBP
  pop _EBX
  ret
%endmacro
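
; Rounder setup in the prologs: _EAX = rounding & 1 selects mm7 =
; {16,16,16,16} (rounding==0) or {15,15,15,15} (rounding==1) from
; Rounder_QP_MMX. The AVRG prologs additionally point _EBX at
; Rounder1_MMX (four 1s) or Rounder0_MMX (four 0s) for the halfpel
; mixing done later by MIX/V_MIX.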

;//////////////////////////////////////////////////////////////////////
;//
;// All horizontal passes
;//
;//////////////////////////////////////////////////////////////////////

  ; macros for USE_TABLES

%macro TLOAD 2     ; %1,%2: src pixels
  movzx _EAX, byte [SRC_PTR+%1]
  movzx TMP1, byte [SRC_PTR+%2]
  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

%macro TACCUM2 5   ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_PADDW %4, %2, _EAX*8
  XVID_PADDW %5, %3, _EAX*8
%endmacro

%macro TACCUM3 7   ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_PADDW %5, %2, _EAX*8
  XVID_PADDW %6, %3, _EAX*8
  XVID_PADDW %7, %4, _EAX*8
%endmacro

;//////////////////////////////////////////////////////////////////////

  ; macros without USE_TABLES

%macro LOAD 2     ; %1,%2: src pixels
  movzx _EAX, byte [SRC_PTR+%1]
  movzx TMP1, byte [SRC_PTR+%2]
  XVID_MOVQ mm0, xvid_Expand_mmx, _EAX*8
  XVID_MOVQ mm3, xvid_Expand_mmx, TMP1*8
  pmullw mm0, [FIR_R0 ]
  pmullw mm3, [FIR_R16]
  paddw mm0, mm7
  paddw mm3, mm7
%endmacro

%macro ACCUM2 4   ;src pixel/Taps/dst regs #1-#2
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
  movq mm5, mm4
  pmullw mm4, [%2]
  pmullw mm5, [%2+8]
  paddw %3, mm4
  paddw %4, mm5
%endmacro

%macro ACCUM3 5   ;src pixel/Taps/dst regs #1-#2-#3
  movzx _EAX, byte [SRC_PTR+%1]
  XVID_MOVQ mm4, xvid_Expand_mmx, _EAX*8
  movq mm5, mm4
  movq mm6, mm5
  pmullw mm4, [%2   ]
  pmullw mm5, [%2+ 8]
  pmullw mm6, [%2+16]
  paddw %3, mm4
  paddw %4, mm5
  paddw %5, mm6
%endmacro
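
; The LOAD/ACCUM path expands a pixel to four identical words through
; xvid_Expand_mmx and multiplies them by one, two or three 4-tap rows
; of FIR_Rx; the TLOAD/TACCUM path above merges the expand and the
; multiply into a single pre-multiplied table load, trading pmullw
; for paddw.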

;//////////////////////////////////////////////////////////////////////

%macro MIX 3   ; %1:reg, %2:src, %3:rounder
  pxor mm6, mm6
  movq mm4, [%2]
  movq mm1, %1
  movq mm5, mm4
  punpcklbw %1, mm6
  punpcklbw mm4, mm6
  punpckhbw mm1, mm6
  punpckhbw mm5, mm6
  movq mm6, [%3]   ; rounder #2
  paddusw %1, mm4
  paddusw mm1, mm5
  paddusw %1, mm6
  paddusw mm1, mm6
  psrlw %1, 1
  psrlw mm1, 1
  packuswb %1, mm1
%endmacro
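
; MIX unpacks the 8 bytes of %1 and of [%2] to words, adds them plus
; the rounder from [%3] (1 when rounding==0, 0 otherwise), shifts right
; by one and repacks: per byte, out = (a + b + 1 - rounding) >> 1,
; i.e. the usual MPEG-4 halfpel average.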

;//////////////////////////////////////////////////////////////////////

%macro H_PASS_16  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
   PROLOG_NO_AVRG
%else
   PROLOG_AVRG
%endif

.Loop:

    ;  mm0..mm3 serve as a 4x4 delay line
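    ;  mm0 accumulates output pixels 0..3, mm1 4..7, mm2 8..11 and
    ;  mm3 12..15; each source pixel feeds the (up to three) groups
    ;  its 8-tap footprint overlaps, hence the sliding ACCUM2/ACCUM3
    ;  register targets below.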

%ifndef USE_TABLES

  LOAD 0, 16  ; special case for first/last pixel
  movq mm1, mm7
  movq mm2, mm7

  ACCUM2 1,    FIR_R1, mm0, mm1
  ACCUM2 2,    FIR_R2, mm0, mm1
  ACCUM2 3,    FIR_R3, mm0, mm1
  ACCUM2 4,    FIR_R4, mm0, mm1

  ACCUM3 5,    FIR_R5, mm0, mm1, mm2
  ACCUM3 6,    FIR_R6, mm0, mm1, mm2
  ACCUM3 7,    FIR_R7, mm0, mm1, mm2
  ACCUM2 8,    FIR_R8, mm1, mm2
  ACCUM3 9,    FIR_R9, mm1, mm2, mm3
  ACCUM3 10,   FIR_R10,mm1, mm2, mm3
  ACCUM3 11,   FIR_R11,mm1, mm2, mm3

  ACCUM2 12,   FIR_R12, mm2, mm3
  ACCUM2 13,   FIR_R13, mm2, mm3
  ACCUM2 14,   FIR_R14, mm2, mm3
  ACCUM2 15,   FIR_R15, mm2, mm3

%else

  TLOAD 0, 16  ; special case for first/last pixel
  movq mm1, mm7
  movq mm2, mm7

  TACCUM2 1,    xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
  TACCUM2 2,    xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
  TACCUM2 3,    xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
  TACCUM2 4,    xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1

  TACCUM3 5,    xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0  , mm0, mm1, mm2
  TACCUM3 6,    xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm1, mm2
  TACCUM3 7,    xvid_FIR_0_0_0_1  , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm1, mm2

  TACCUM2 8,                       xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 ,      mm1, mm2

  TACCUM3 9,                       xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0,  mm1, mm2, mm3
  TACCUM3 10,                      xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0,  mm1, mm2, mm3
  TACCUM3 11,                      xvid_FIR_0_0_0_1  , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0,  mm1, mm2, mm3

  TACCUM2 12,  xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
  TACCUM2 13,  xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
  TACCUM2 14,  xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
  TACCUM2 15,  xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3

%endif

  psraw mm0, 5
  psraw mm1, 5
  psraw mm2, 5
  psraw mm3, 5
  packuswb mm0, mm1
  packuswb mm2, mm3

%if (%1==1)
  MIX mm0, SRC_PTR, _EBX
%elif (%1==2)
  MIX mm0, SRC_PTR+1, _EBX
%endif
%if (%2==1)
  MIX mm0, DST_PTR, Rounder1_MMX
%endif

%if (%1==1)
  MIX mm2, SRC_PTR+8, _EBX
%elif (%1==2)
  MIX mm2, SRC_PTR+9, _EBX
%endif
%if (%2==1)
  MIX mm2, DST_PTR+8, Rounder1_MMX
%endif

  lea SRC_PTR, [SRC_PTR+_EBP]

  movq [DST_PTR+0], mm0
  movq [DST_PTR+8], mm2

  add DST_PTR, _EBP
  dec TMP0
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro


;//////////////////////////////////////////////////////////////////////

%macro H_PASS_8  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

.Loop:
    ;  mm0..mm3 serve as a 4x4 delay line

%ifndef USE_TABLES

  LOAD 0, 8  ; special case for first/last pixel
  ACCUM2 1,  FIR_R1,  mm0, mm3
  ACCUM2 2,  FIR_R2,  mm0, mm3
  ACCUM2 3,  FIR_R3,  mm0, mm3
  ACCUM2 4,  FIR_R4,  mm0, mm3

  ACCUM2 5,  FIR_R13,  mm0, mm3
  ACCUM2 6,  FIR_R14,  mm0, mm3
  ACCUM2 7,  FIR_R15,  mm0, mm3

%else

%if 0   ; test with no unrolling

  TLOAD 0, 8  ; special case for first/last pixel
  TACCUM2 1,  xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0  , mm0, mm3
  TACCUM2 2,  xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm3
  TACCUM2 3,  xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm3
  TACCUM2 4,  xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
  TACCUM2 5,  xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, mm0, mm3
  TACCUM2 6,  xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_7, mm0, mm3
  TACCUM2 7,  xvid_FIR_0_0_0_1  , xvid_FIR_3_6_19_23, mm0, mm3

%else  ; test with unrolling (slightly faster, but not by much)

  movzx _EAX, byte [SRC_PTR]
  movzx TMP1, byte [SRC_PTR+8]
  XVID_MOVQ mm0, xvid_FIR_14_3_2_1, _EAX*8
  movzx _EAX, byte [SRC_PTR+1]
  XVID_MOVQ mm3, xvid_FIR_1_2_3_14, TMP1*8
  paddw mm0, mm7
  paddw mm3, mm7

  movzx TMP1, byte [SRC_PTR+2]
  XVID_PADDW mm0, xvid_FIR_23_19_6_3, _EAX*8
  XVID_PADDW mm3, xvid_FIR_1_0_0_0, _EAX*8

  movzx _EAX, byte [SRC_PTR+3]
  XVID_PADDW mm0, xvid_FIR_7_20_20_6, TMP1*8
  XVID_PADDW mm3, xvid_FIR_3_1_0_0, TMP1*8

  movzx TMP1, byte [SRC_PTR+4]
  XVID_PADDW mm0, xvid_FIR_3_6_20_20, _EAX*8
  XVID_PADDW mm3, xvid_FIR_6_3_1_0, _EAX*8

  movzx _EAX, byte [SRC_PTR+5]
  XVID_PADDW mm0, xvid_FIR_1_3_6_20, TMP1*8
  XVID_PADDW mm3, xvid_FIR_20_6_3_1, TMP1*8

  movzx TMP1, byte [SRC_PTR+6]
  XVID_PADDW mm0, xvid_FIR_0_1_3_6, _EAX*8
  XVID_PADDW mm3, xvid_FIR_20_20_6_3, _EAX*8

  movzx _EAX, byte [SRC_PTR+7]
  XVID_PADDW mm0, xvid_FIR_0_0_1_3, TMP1*8
  XVID_PADDW mm3, xvid_FIR_6_20_20_7, TMP1*8

  XVID_PADDW mm0, xvid_FIR_0_0_0_1, _EAX*8
  XVID_PADDW mm3, xvid_FIR_3_6_19_23, _EAX*8

%endif

%endif    ; !USE_TABLES

  psraw mm0, 5
  psraw mm3, 5
  packuswb mm0, mm3

%if (%1==1)
  MIX mm0, SRC_PTR, _EBX
%elif (%1==2)
  MIX mm0, SRC_PTR+1, _EBX
%endif
%if (%2==1)
  MIX mm0, DST_PTR, Rounder1_MMX
%endif

  movq [DST_PTR], mm0

  add DST_PTR, _EBP
  add SRC_PTR, _EBP
  dec TMP0
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////
;// 16x? copy Functions

xvid_H_Pass_16_mmx:
  H_PASS_16 0, 0
ENDFUNC
xvid_H_Pass_Avrg_16_mmx:
  H_PASS_16 1, 0
ENDFUNC
xvid_H_Pass_Avrg_Up_16_mmx:
  H_PASS_16 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_H_Pass_8_mmx:
  H_PASS_8 0, 0
ENDFUNC
xvid_H_Pass_Avrg_8_mmx:
  H_PASS_8 1, 0
ENDFUNC
xvid_H_Pass_Avrg_Up_8_mmx:
  H_PASS_8 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_H_Pass_Add_16_mmx:
  H_PASS_16 0, 1
ENDFUNC
xvid_H_Pass_Avrg_Add_16_mmx:
  H_PASS_16 1, 1
ENDFUNC
xvid_H_Pass_Avrg_Up_Add_16_mmx:
  H_PASS_16 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? avrg Functions

xvid_H_Pass_8_Add_mmx:
  H_PASS_8 0, 1
ENDFUNC
xvid_H_Pass_Avrg_8_Add_mmx:
  H_PASS_8 1, 1
ENDFUNC
xvid_H_Pass_Avrg_Up_8_Add_mmx:
  H_PASS_8 2, 1
ENDFUNC



;//////////////////////////////////////////////////////////////////////
;//
;// All vertical passes
;//
;//////////////////////////////////////////////////////////////////////

%macro V_LOAD 1  ; %1=Last?

  movd mm4, dword [TMP1]
  pxor mm6, mm6
%if (%1==0)
  add TMP1, _EBP
%endif
  punpcklbw mm4, mm6

%endmacro

%macro V_ACC1 2   ; %1:reg, %2:tap
  pmullw mm4, [%2]
  paddw %1, mm4
%endmacro

%macro V_ACC2 4   ; %1-%2: regs, %3-%4: taps
  movq mm5, mm4
  movq mm6, mm4
  pmullw mm5, [%3]
  pmullw mm6, [%4]
  paddw %1, mm5
  paddw %2, mm6
%endmacro

%macro V_ACC2l 4   ; %1-%2: regs, %3-%4: taps
  movq mm5, mm4
  pmullw mm5, [%3]
  pmullw mm4, [%4]
  paddw %1, mm5
  paddw %2, mm4
%endmacro

%macro V_ACC4 8   ; %1-%4: regs, %5-%8: taps
  V_ACC2 %1,%2, %5,%6
  V_ACC2l %3,%4, %7,%8
%endmacro


%macro V_MIX 3  ; %1:dst-reg, %2:src, %3: rounder
  pxor mm6, mm6
  movq mm4, [%2]
  punpcklbw %1, mm6
  punpcklbw mm4, mm6
  paddusw %1, mm4
  paddusw %1, [%3]
  psrlw %1, 1
  packuswb %1, %1
%endmacro

%macro V_STORE 4    ; %1-%2: mix ops, %3: reg, %4:last?

  psraw %3, 5
  packuswb %3, %3

%if (%1==1)
  V_MIX %3, SRC_PTR, _EBX
  add SRC_PTR, _EBP
%elif (%1==2)
  add SRC_PTR, _EBP
  V_MIX %3, SRC_PTR, _EBX
%endif
%if (%2==1)
  V_MIX %3, DST_PTR, Rounder1_MMX
%endif

  movd eax, %3
  mov dword [DST_PTR], eax

%if (%4==0)
  add DST_PTR, _EBP
%endif

%endmacro
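
; Vertical-pass pipeline: V_LOAD fetches 4 horizontally adjacent bytes
; of the next source row into mm4 (TMP1 walks down by BpS); V_ACC1/2/4
; multiply that row by one, two or four of the broadcast column taps
; (FIR_Cxx) and accumulate into the pending output rows; V_STORE then
; finishes one output row: shift by 5, clamp, optional src/dst mixing,
; and a 4-byte store.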

;//////////////////////////////////////////////////////////////////////

%macro V_PASS_16  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

    ; we process one stripe of 4x16 pixels each time.
    ; the size (3rd argument) is meant to be a multiple of 4
    ;  mm0..mm3 serve as a 4x4 delay line

.Loop:

  push DST_PTR
  push SRC_PTR      ; SRC_PTR is preserved for src-mixing
  mov TMP1, SRC_PTR

    ; output rows [0..3], from input rows [0..8]

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

    ; output rows [4..7], from input rows [1..11] (!!)

  mov SRC_PTR, [_ESP]
  lea TMP1, [SRC_PTR+_EBP]

  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
  push SRC_PTR              ; this will be the new value for next round

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm0, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm0, mm1, FIR_C3,  FIR_Cm1

  V_LOAD 0
  V_ACC2 mm0, mm1, FIR_Cm6,  FIR_C3
  V_ACC1 mm2, FIR_Cm1

  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0
    ; output rows [8..11], from input rows [5..15]

  pop SRC_PTR
  lea TMP1, [SRC_PTR+_EBP]

  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
  push SRC_PTR              ; this will be the new value for next round

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm0, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm0, mm1, FIR_C3,  FIR_Cm1

  V_LOAD 0
  V_ACC2 mm0, mm1, FIR_Cm6,  FIR_C3
  V_ACC1 mm2, FIR_Cm1

  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20

  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
  V_ACC1 mm3, FIR_Cm6
  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0


    ; output rows [12..15], from input rows [9..16]

  pop SRC_PTR
  lea TMP1, [SRC_PTR+_EBP]

%if (%1!=0)
  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
%endif

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm3, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 1
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1

  V_STORE %1, %2, mm3, 0
  V_STORE %1, %2, mm2, 0
  V_STORE %1, %2, mm1, 0
  V_STORE %1, %2, mm0, 1

    ; ... next 4 columns

  pop SRC_PTR
  pop DST_PTR
  add SRC_PTR, 4
  add DST_PTR, 4
  sub TMP0, 4
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro

;//////////////////////////////////////////////////////////////////////

%macro V_PASS_8  2   ; %1:src-op (0=NONE,1=AVRG,2=AVRG-UP), %2:dst-op (NONE/AVRG)

%if (%2==0) && (%1==0)
  PROLOG_NO_AVRG
%else
  PROLOG_AVRG
%endif

    ; we process one stripe of 4x8 pixels each time
    ; the size (3rd argument) is meant to be a multiple of 4
    ;  mm0..mm3 serve as a 4x4 delay line
.Loop:

  push DST_PTR
  push SRC_PTR      ; SRC_PTR is preserved for src-mixing
  mov TMP1, SRC_PTR

    ; output rows [0..3], from input rows [0..8]

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_STORE %1, %2, mm0, 0

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_STORE %1, %2, mm1, 0

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
  V_STORE %1, %2, mm2, 0

  V_LOAD 1
  V_ACC1 mm3, FIR_Cm1
  V_STORE %1, %2, mm3, 0

    ; output rows [4..7], from input rows [1..9]

  mov SRC_PTR, [_ESP]
  lea TMP1, [SRC_PTR+_EBP]

%if (%1!=0)
  lea SRC_PTR, [SRC_PTR+4*_EBP]  ; for src-mixing
%endif

  movq mm0, mm7
  movq mm1, mm7
  movq mm2, mm7
  movq mm3, mm7

  V_LOAD 0
  V_ACC1 mm3, FIR_Cm1

  V_LOAD 0
  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3

  V_LOAD 0
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
  V_ACC1 mm3, FIR_Cm6

  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
  V_LOAD 0
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
  V_LOAD 1
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1

  V_STORE %1, %2, mm3, 0
  V_STORE %1, %2, mm2, 0
  V_STORE %1, %2, mm1, 0
  V_STORE %1, %2, mm0, 1

    ; ... next 4 columns

  pop SRC_PTR
  pop DST_PTR
  add SRC_PTR, 4
  add DST_PTR, 4
  sub TMP0, 4
  jg .Loop

%if (%2==0) && (%1==0)
  EPILOG_NO_AVRG
%else
  EPILOG_AVRG
%endif

%endmacro


;//////////////////////////////////////////////////////////////////////
;// 16x? copy Functions

xvid_V_Pass_16_mmx:
  V_PASS_16 0, 0
ENDFUNC
xvid_V_Pass_Avrg_16_mmx:
  V_PASS_16 1, 0
ENDFUNC
xvid_V_Pass_Avrg_Up_16_mmx:
  V_PASS_16 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? copy Functions

xvid_V_Pass_8_mmx:
  V_PASS_8 0, 0
ENDFUNC
xvid_V_Pass_Avrg_8_mmx:
  V_PASS_8 1, 0
ENDFUNC
xvid_V_Pass_Avrg_Up_8_mmx:
  V_PASS_8 2, 0
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 16x? avrg Functions

xvid_V_Pass_Add_16_mmx:
  V_PASS_16 0, 1
ENDFUNC
xvid_V_Pass_Avrg_Add_16_mmx:
  V_PASS_16 1, 1
ENDFUNC
xvid_V_Pass_Avrg_Up_Add_16_mmx:
  V_PASS_16 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////
;// 8x? avrg Functions

xvid_V_Pass_8_Add_mmx:
  V_PASS_8 0, 1
ENDFUNC
xvid_V_Pass_Avrg_8_Add_mmx:
  V_PASS_8 1, 1
ENDFUNC
xvid_V_Pass_Avrg_Up_8_Add_mmx:
  V_PASS_8 2, 1
ENDFUNC

;//////////////////////////////////////////////////////////////////////

%undef SRC_PTR
%undef DST_PTR

NON_EXEC_STACK