1;******************************************************************************
2;* x86 optimizations for PNG decoding
3;*
4;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
5;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
28cextern pw_255
29
30SECTION_TEXT
31
;------------------------------------------------------------------------------
; ADD_BYTES_FN nr_xmm_regs
;
; Emits add_bytes_l2(dst, src1, src2, w) for the instruction set selected by
; the preceding INIT_* line:
;
;     for (i = 0; i < w; i++)
;         dst[i] = src1[i] + src2[i];        // 8-bit wrap-around
;
; %1 = nr. of xmm registers used (the vector loop uses m0-m3)
;
; Registers: dstq/src1q/src2q = buffer pointers, waq = incoming length and
; later the end offset of the current loop (its low byte is the scalar
; scratch), wq = full length, iq = running byte offset.
;
; All loop conditions are signed compares of iq against a length, so w <= 0
; performs no memory access at all.
;
; The vector loop uses unaligned loads/stores (movu), so none of the three
; pointers has to be mmsize-aligned.
;
; MMX registers are used by the mmx version and by the 8-byte tail loop of the
; xmm version; no emms is executed here.
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad             ; length arrives as a signed 32-bit value
%endif
    xor                 iq, iq

    ; vector loop: 2*mmsize bytes per iteration
    mov                 wq, waq             ; wq keeps the untouched length
    and                waq, ~(mmsize*2-1)   ; waq = length rounded down to 2*mmsize
    jmp .end_v
.loop_v:
    ; both sources are fully loaded before anything is stored
    movu                m0, [src1q+iq]
    movu                m1, [src1q+iq+mmsize]
    movu                m2, [src2q+iq]
    movu                m3, [src2q+iq+mmsize]
    paddb               m0, m2
    paddb               m1, m3
    movu  [dstq+iq       ], m0
    movu  [dstq+iq+mmsize], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v

%if mmsize == 16
    ; 8-byte MMX loop for what the xmm loop left over
    mov                waq, wq
    and                waq, ~7              ; waq = length rounded down to 8
    jmp .end_l
.loop_l:
    movq               mm0, [src1q+iq]
    paddb              mm0, [src2q+iq]
    movq  [dstq+iq       ], mm0
    add                 iq, 8
.end_l:
    cmp                 iq, waq
    jl .loop_l
%endif

    ; scalar loop for leftover
    jmp .end_s
.loop_s:
    mov                wab, [src1q+iq]      ; waq is free now, reuse its low byte
    add                wab, [src2q+iq]
    mov          [dstq+iq], wab
    inc                 iq
.end_s:
    cmp                 iq, wq
    jl .loop_s
    REP_RET
%endmacro

; The MMX-only build of the function is assembled for 32-bit targets only.
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

INIT_XMM sse2
ADD_BYTES_FN 4
91
;------------------------------------------------------------------------------
; ADD_PAETH_PRED_FN nr_xmm_regs
;
; Emits add_png_paeth_prediction(dst, src, top, w, bpp) for the instruction
; set selected by the preceding INIT_* line.
;
; %1 = nr. of xmm registers used (forwarded to cglobal)
;
; For every processed byte position x the routine stores
;
;     dst[x] = (src[x] + pred(a, b, c)) & 255
;         a = dst[x - bpp]   (left, already reconstructed)
;         b = top[x]         (above)
;         c = top[x - bpp]   (above-left)
;     pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2*c|
;     pred = a  if pa <= min(pb, pc)
;          = b  else if pb <= pc
;          = c  otherwise
;
; Layout of the work:
;   * One step handles mmsize/2 adjacent bytes (movh load, widened to words)
;     and then advances by bpp bytes.
;   * If bpp > mmsize/2 one step cannot cover a whole pixel, so the row is
;     walked (bpp-1)/(mmsize/2) + 1 times, each pass starting mmsize/2 bytes
;     further into the pixel (cntrq counts the passes down to 0).
;
; Register roles inside the loops:
;   dstq  = current position; srcq/topq hold (src - dst) and (top - dst) so
;           [srcq+dstq] / [topq+dstq] address the same offset in those rows
;   endq  = dst + w - (mmsize/2 - 1), last position a step may start at
;   m0    = a (the previous step's output), m1 = b, m2 = c, m7 = 0
;
; Things a caller has to be aware of:
;   * dst[-bpp...] and top[-bpp...] are read as the initial a and c.
;   * The loop body is executed at least once per pass, whatever w is.
;   * Every step stores mmsize/2 bytes; with bpp < mmsize/2 successive stores
;     overlap and the excess is overwritten by the following step. The final
;     store of a pass can extend to dst[w] inclusive.
;     NOTE(review): confirm that callers reserve this slack.
;   * MMX registers are used and no emms is executed here.
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd            bppq, bppd            ; bpp and w arrive as signed 32-bit values
    movsxd              wq, wd
%endif
    lea               endq, [dstq+wq-(mmsize/2-1)]
    sub               topq, dstq            ; topq = top - dst
    sub               srcq, dstq            ; srcq = src - dst
    sub               dstq, bppq            ; start one pixel to the left
    pxor                m7, m7              ; zero, used to widen bytes to words

    PUSH              dstq                  ; keep (dst - bpp) for later passes
    lea              cntrq, [bppq-1]
    shr              cntrq, 2 + mmsize/16   ; cntrq = (bpp-1) / (mmsize/2)
.bpp_loop:
    lea               dstq, [dstq+cntrq*(mmsize/2)]
    movh                m0, [dstq]          ; a = left
    movh                m1, [topq+dstq]     ; becomes c in the first step
    punpcklbw           m0, m7
    punpcklbw           m1, m7
    add               dstq, bppq
.loop:
    mova                m2, m1              ; c = previous b
    movh                m1, [topq+dstq]     ; b = above
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1              ; m3 = c - b
    psubw               m4, m0              ; m4 = c - a
    mova                m5, m3
    paddw               m5, m4              ; m5 = 2*c - a - b
%if cpuflag(ssse3)
    pabsw               m3, m3              ; pa
    pabsw               m4, m4              ; pb
    pabsw               m5, m5              ; pc
%else ; !cpuflag(ssse3)
    ; |x| = max(x, 0 - x); m7 is borrowed as scratch and zeroed again below
    psubw               m7, m5
    pmaxsw              m5, m7              ; pc
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6              ; pa
    pmaxsw              m4, m7              ; pb
    pxor                m7, m7
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5              ; min(pb, pc)
    pcmpgtw             m3, m6              ; m3 = pa > min(pb, pc)  -> "not a"
    pcmpgtw             m4, m5              ; m4 = pb > pc
    mova                m6, m4
    pand                m4, m3              ; m4 = mask selecting c
    pandn               m6, m3              ; m6 = mask selecting b
    pandn               m3, m0              ; m3 = a where a is selected, else 0
    movh                m0, [srcq+dstq]
    pand                m6, m1              ; b where selected
    pand                m2, m4              ; c where selected
    punpcklbw           m0, m7
    paddw               m0, m6
    paddw               m3, m2
    paddw               m0, m3              ; src + selected predictor
    pand                m0, [pw_255]        ; wrap to 8 bits; m0 is the next step's a
    mova                m3, m0
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    cmp               dstq, endq
    jbe .loop                               ; unsigned: dstq and endq are addresses

    mov               dstq, [rsp]           ; back to (dst - bpp) for the next pass
    dec              cntrq
    jge .bpp_loop
    POP               dstq
    RET
%endmacro

; pminsw/pmaxsw on MMX registers are used by the non-ssse3 path.
INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

; Same routine with pabsw replacing the max/negate sequences.
INIT_MMX ssse3
ADD_PAETH_PRED_FN 0
174