;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

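; Per-(mx,my) rounding constants for RV40 chroma MC.  Each entry is four
; identical words so it can be loaded as a 64-bit broadcast; the entry index
; is ((my & ~1) * 4 + mx) >> 1 (the rnd_bias computed in the macros below).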
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

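; Copy (or, in the avg variants via CHROMAMC_AVG, average against dst) an
; 8-pixel-wide block, four rows per iteration; used for the mx == my == 0
; fast path where no filtering is needed.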
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
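; Rough scalar sketch of the bilinear interpolation implemented below
; (reference only), with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y:
;     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
;               + rnd) >> 6
; rnd is 32 for h264, 28 for the vc1 no-rounding variant and table-driven for
; rv40; the avg variants additionally average the result with dst.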
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4
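    ; A is kept at [rsp] and D at [rsp+8]; B and C stay in mm5/mm6 for the
    ; row loop below.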

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)
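    ; r4d now packs B = x*(8-y) in its high word and A = (8-x)*(8-y) in its
    ; low word; r5d packs D = x*y and C = (8-x)*y.  After the broadcasts
    ; below, each pmaddwd lane yields A*src[i] + B*src[i+1] (or the C/D
    ; counterpart), i.e. the horizontal term for two output pixels at once.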

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
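; The 1D kernels add rnd and shift right by 3, the 2D kernels add rnd and
; shift right by 6: h264 uses the exact rounding constants 4 and 32, the vc1
; "_nornd" variants use 3 and 28 instead.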

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro
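; CHROMAMC_AVG/CHROMAMC_AVG4 select between the put and avg flavours: NOTHING
; for put, DIRECT_AVG to average straight against the destination operand, and
; COPY_AVG to first load the narrower (4- or 2-byte) destination into a
; temporary register before averaging.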

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)
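    ; m7 is loaded with the byte pair {(8-y)*(8-x), (8-y)*x} and m6 with
    ; {y*(8-x), y*x}, broadcast below so that pmaddubsw on interleaved
    ; src[i]/src[i+1] bytes produces each row's weighted contribution to an
    ; output pixel in a single instruction.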

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1   ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264