1;******************************************************************************
2;* SIMD-optimized quarterpel functions
3;* Copyright (c) 2008 Loren Merritt
4;* Copyright (c) 2003-2013 Michael Niedermayer
5;* Copyright (c) 2013 Daniel Kang
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
; op_avgh  %1=src mmreg, %2=dst mem, %3=scratch mmreg (clobbered)
; Rounded byte-wise average of %1 with the pixels already at %2, written
; back to %2.  Uses movh (x86inc width-dependent partial load/store; movd
; under INIT_MMX) so only the low half/part of the register is transferred.
%macro op_avgh 3
    movh   %3, %2        ; load current destination pixels into scratch
    pavgb  %1, %3        ; %1 = (src + dst + 1) >> 1 per byte
    movh   %2, %1        ; store averaged pixels back to destination
%endmacro
33
; op_avg  %1=src mmreg, %2=dst mem
; Full-register variant of op_avgh: rounded byte-wise average of %1 with
; the pixels at %2, result stored back to %2.  No scratch needed since
; pavgb can take the memory operand directly.
%macro op_avg 2
    pavgb  %1, %2        ; %1 = (src + dst + 1) >> 1 per byte
    mova   %2, %1        ; write result back to destination
%endmacro
38
; op_puth  %1=src mmreg, %2=dst mem [, %3 ignored]
; Partial-width store of %1 to %2 (movh = movd under INIT_MMX).  Accepts
; an optional third argument only so call sites can pass the same scratch
; register as op_avgh; it is unused here.
%macro op_puth 2-3
    movh   %2, %1
%endmacro
42
; op_put  %1=src mmreg, %2=dst mem
; Full-register store of %1 to %2; the "put" counterpart of op_avg.
%macro op_put 2
    mova   %2, %1
%endmacro
46
47; void ff_put/avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
48;                                   int dstStride, int src1Stride, int h)
; PIXELS4_L2 put|avg
; dst = avg(src1, src2) for a 4-pixel-wide block ("put"), or additionally
; averaged with the existing dst contents ("avg", via op_avgh).
;   r0  = dst        r3 = dstStride
;   r1  = src1       r4 = src1Stride
;   r2  = src2       r5d = h (row count)
; src2 has no stride argument: its rows are packed contiguously, 4 bytes
; per row (r2 advances by 4 per row below).
%macro PIXELS4_L2 1
%define OP op_%1h                  ; half-width (4-byte) store / avg-store
cglobal %1_pixels4_l2, 6,6
    movsxdifnidn r3, r3d           ; sign-extend 32-bit stride args if needed
    movsxdifnidn r4, r4d
    test        r5d, 1             ; odd row count?
    je        .loop
    ; Peel one row so the remaining count is even; the main loop consumes
    ; 4 rows per iteration (sub r5d, 4 / jne assumes h is 4k or 4k+1 —
    ; NOTE(review): callers presumably guarantee this; confirm at call sites).
    movd         m0, [r1]
    movd         m1, [r2]
    add          r1, r4
    add          r2, 4             ; one packed src2 row consumed
    pavgb        m0, m1            ; avg(src1 row, src2 row)
    OP           m0, [r0], m3      ; m3 = scratch, used only by op_avgh
    add          r0, r3
    dec         r5d
.loop:
    ; 8-byte loads of 4-byte rows: the high 4 bytes are don't-care.
    ; NOTE(review): relies on a permissible 4-byte overread past each
    ; src1 row — presumably guaranteed by padded buffers; confirm.
    mova         m0, [r1]
    mova         m1, [r1+r4]
    lea          r1, [r1+2*r4]
    pavgb        m0, [r2]          ; only the low 4 result bytes are stored
    pavgb        m1, [r2+4]
    OP           m0, [r0], m3
    OP           m1, [r0+r3], m3
    lea          r0, [r0+2*r3]
    mova         m0, [r1]
    mova         m1, [r1+r4]
    lea          r1, [r1+2*r4]
    pavgb        m0, [r2+8]
    pavgb        m1, [r2+12]
    OP           m0, [r0], m3
    OP           m1, [r0+r3], m3
    lea          r0, [r0+2*r3]
    add          r2, 16            ; 4 packed src2 rows consumed per pass
    sub         r5d, 4
    jne       .loop
    REP_RET                        ; x86inc: rep-ret to avoid AMD branch penalty
%endmacro

INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
90
91; void ff_put/avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
92;                                   int dstStride, int src1Stride, int h)
; PIXELS8_L2 put|avg
; dst = avg(src1, src2) for an 8-pixel-wide block ("put"), or additionally
; averaged with the existing dst contents ("avg", via op_avg).
;   r0  = dst        r3 = dstStride
;   r1  = src1       r4 = src1Stride
;   r2  = src2       r5d = h (row count)
; src2 rows are packed contiguously, 8 bytes per row (r2 advances by 8
; per row).  One 8-pixel row fills a full MMX register, so the full-width
; op_%1 is used instead of the half-width op_%1h of the 4-pixel version.
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
    movsxdifnidn r3, r3d           ; sign-extend 32-bit stride args if needed
    movsxdifnidn r4, r4d
    test        r5d, 1             ; odd row count?
    je        .loop
    ; Peel one row; the main loop handles 4 rows per iteration
    ; (sub r5d, 4 / jne assumes h is 4k or 4k+1 — NOTE(review): confirm
    ; callers guarantee this).
    mova         m0, [r1]
    mova         m1, [r2]
    add          r1, r4
    add          r2, 8             ; one packed src2 row consumed
    pavgb        m0, m1            ; avg(src1 row, src2 row)
    OP           m0, [r0]
    add          r0, r3
    dec         r5d
.loop:
    ; Two rows per half-iteration, four per full pass.
    mova         m0, [r1]
    mova         m1, [r1+r4]
    lea          r1, [r1+2*r4]
    pavgb        m0, [r2]
    pavgb        m1, [r2+8]
    OP           m0, [r0]
    OP           m1, [r0+r3]
    lea          r0, [r0+2*r3]
    mova         m0, [r1]
    mova         m1, [r1+r4]
    lea          r1, [r1+2*r4]
    pavgb        m0, [r2+16]
    pavgb        m1, [r2+24]
    OP           m0, [r0]
    OP           m1, [r0+r3]
    lea          r0, [r0+2*r3]
    add          r2, 32            ; 4 packed src2 rows consumed per pass
    sub         r5d, 4
    jne       .loop
    REP_RET                        ; x86inc: rep-ret to avoid AMD branch penalty
%endmacro

INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
134
135; void ff_put/avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
136;                                    int dstStride, int src1Stride, int h)
; PIXELS16_L2 put|avg
; dst = avg(src1, src2) for a 16-pixel-wide block ("put"), or additionally
; averaged with the existing dst contents ("avg", via op_avg).
;   r0  = dst        r3 = dstStride
;   r1  = src1       r4 = src1Stride
;   r2  = src2       r5d = h (row count)
; src2 rows are packed contiguously, 16 bytes per row (r2 advances by 16
; per row).  Each 16-pixel row is handled as two 8-byte MMX halves
; ([rX] and [rX+8]), so only 2 rows fit per loop iteration (vs. 4 in the
; narrower variants).
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
    movsxdifnidn r3, r3d           ; sign-extend 32-bit stride args if needed
    movsxdifnidn r4, r4d
    test        r5d, 1             ; odd row count?
    je        .loop
    ; Peel one full 16-pixel row (two 8-byte halves); the main loop
    ; handles 2 rows per iteration (sub r5d, 2 / jne assumes h is 2k or
    ; 2k+1 — NOTE(review): confirm callers guarantee this).
    mova         m0, [r1]
    mova         m1, [r1+8]
    pavgb        m0, [r2]
    pavgb        m1, [r2+8]
    add          r1, r4
    add          r2, 16            ; one packed src2 row consumed
    OP           m0, [r0]
    OP           m1, [r0+8]
    add          r0, r3
    dec         r5d
.loop:
    ; Row 0 of the pair: low and high 8-byte halves.
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    pavgb        m0, [r2]
    pavgb        m1, [r2+8]
    OP           m0, [r0]
    OP           m1, [r0+8]
    add          r0, r3
    ; Row 1 of the pair.
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    pavgb        m0, [r2+16]
    pavgb        m1, [r2+24]
    OP           m0, [r0]
    OP           m1, [r0+8]
    add          r0, r3
    add          r2, 32            ; 2 packed src2 rows consumed per pass
    sub         r5d, 2
    jne       .loop
    REP_RET                        ; x86inc: rep-ret to avoid AMD branch penalty
%endmacro

INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
180