;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
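; Each rnd_rv40_*_tbl entry is one 4-word row; the row index ("rnd_bias"
; below) is derived from the chroma MV as ((my & ~1) * 4 + mx) >> 1, so the
; 16 rows cover mx,my in [0,7] and are addressed as rnd_bias*8 bytes.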

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

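; mv0_pixels_mc8: fast path for mx == my == 0 - copies (or, in the avg
; variants, PAVGB-averages via CHROMAMC_AVG) four 8-pixel rows per iteration.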
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   int stride, int h, int mx, int my)
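;
; All 8-wide variants implement the usual bilinear chroma interpolation
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+s] + D*src[i+s+1] + rnd) >> 6
; with A = (8-mx)*(8-my), B = mx*(8-my), C = (8-mx)*my, D = mx*my (s = stride),
; falling back to a 1-D filter (>> 3) when mx or my is 0 and to a plain
; copy/average when both are 0; only rnd differs between h264, vc1 and rv40.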
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

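    ; in factored form: A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y,
    ; the standard bilinear weights (they sum to 64, hence the >> 6 below)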
    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

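    ; m6/m0 carry the horizontally filtered previous row, so each output row
    ; below needs just one new load + horizontal filter before the vertical
    ; blend with the 8-y (m5) and y (m3) weights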
.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

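    ; pack the bilinear weights into word pairs so that a single pmaddwd
    ; applies both horizontal taps of a row at once; each row loaded for the
    ; C/D taps is reused as the A/B row of the next iteration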
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
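; rounding constants: the 2-D paths shift by 6 (h264 adds 32, the vc1 "nornd"
; variants add 28), the 1-D paths shift by 3 (h264: 4, vc1: 3); rv40 uses the
; position-dependent tables defined in the data section above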

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

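    ; m7 = {A,B} and m6 = {C,D} replicated as byte pairs; with the source
    ; bytes interleaved as src[i],src[i+1], one pmaddubsw per row produces
    ; A*src[i] + B*src[i+1] (or C*src[i] + D*src[i+1]) in each output word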
.next2rows:
    movq          m1, [r1+r2*1   ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264