;******************************************************************************
;* SIMD-optimized IDCT-related routines
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_80

SECTION .text

;--------------------------------------------------------------------------
; void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                                   ptrdiff_t line_size);
;--------------------------------------------------------------------------
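; A rough C sketch of what this routine computes (illustrative only; cf. the
; reference put_signed_pixels_clamped_c in libavcodec/idctdsp.c):
;
;     for (i = 0; i < 8; i++) {
;         for (j = 0; j < 8; j++)
;             pixels[j] = av_clip_uint8(block[j] + 128);
;         block  += 8;
;         pixels += line_size;
;     }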
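; %1 = block offset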
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    mova     m1, [blockq+mmsize*0+%1]
    mova     m2, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m3, [blockq+mmsize*4+%1]
    mova     m4, [blockq+mmsize*6+%1]
%endif
    packsswb m1, [blockq+mmsize*1+%1]
    packsswb m2, [blockq+mmsize*3+%1]
%if mmsize == 8
    packsswb m3, [blockq+mmsize*5+%1]
    packsswb m4, [blockq+mmsize*7+%1]
%endif
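    ; m0 holds pb_80; adding 0x80 flips the sign bit, mapping the
    ; signed-saturated bytes in [-128,127] to [0,255] (i.e. adding 128)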
    paddb    m1, m0
    paddb    m2, m0
%if mmsize == 8
    paddb    m3, m0
    paddb    m4, m0
    movq     [pixelsq+lsizeq*0], m1
    movq     [pixelsq+lsizeq*1], m2
    movq     [pixelsq+lsizeq*2], m3
    movq     [pixelsq+lsize3q ], m4
%else
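    ; with 16-byte registers each packed result holds two rows;
    ; movhps writes the upper eight bytes as the second row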
    movq     [pixelsq+lsizeq*0], m1
    movhps   [pixelsq+lsizeq*1], m1
    movq     [pixelsq+lsizeq*2], m2
    movhps   [pixelsq+lsize3q ], m2
%endif
%endmacro

%macro PUT_SIGNED_PIXELS_CLAMPED 1
cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3
    mova     m0, [pb_80]
    lea      lsize3q, [lsizeq*3]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0
    lea      pixelsq, [pixelsq+lsizeq*4]
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64
    RET
%endmacro

INIT_MMX mmx
PUT_SIGNED_PIXELS_CLAMPED 0
INIT_XMM sse2
PUT_SIGNED_PIXELS_CLAMPED 3

;--------------------------------------------------------------------------
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;--------------------------------------------------------------------------
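; A rough C sketch of the behavior (illustrative only; cf. the reference
; put_pixels_clamped_c in libavcodec/idctdsp.c):
;
;     for (i = 0; i < 8; i++) {
;         for (j = 0; j < 8; j++)
;             pixels[j] = av_clip_uint8(block[j]);
;         block  += 8;
;         pixels += line_size;
;     }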
; %1 = block offset
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova     m0, [blockq+mmsize*0+%1]
    mova     m1, [blockq+mmsize*2+%1]
%if mmsize == 8
    mova     m2, [blockq+mmsize*4+%1]
    mova     m3, [blockq+mmsize*6+%1]
%endif
    packuswb m0, [blockq+mmsize*1+%1]
    packuswb m1, [blockq+mmsize*3+%1]
%if mmsize == 8
    packuswb m2, [blockq+mmsize*5+%1]
    packuswb m3, [blockq+mmsize*7+%1]
    movq           [pixelsq], m0
    movq    [lsizeq+pixelsq], m1
    movq  [2*lsizeq+pixelsq], m2
    movq   [lsize3q+pixelsq], m3
%else
    movq           [pixelsq], m0
    movhps  [lsizeq+pixelsq], m0
    movq  [2*lsizeq+pixelsq], m1
    movhps [lsize3q+pixelsq], m1
%endif
%endmacro

%macro PUT_PIXELS_CLAMPED 0
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea lsize3q, [lsizeq*3]
    PUT_PIXELS_CLAMPED_HALF 0
    lea pixelsq, [pixelsq+lsizeq*4]
    PUT_PIXELS_CLAMPED_HALF 64
    RET
%endmacro

INIT_MMX mmx
PUT_PIXELS_CLAMPED
INIT_XMM sse2
PUT_PIXELS_CLAMPED

;--------------------------------------------------------------------------
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
;                            ptrdiff_t line_size);
;--------------------------------------------------------------------------
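; A rough C sketch of the behavior (illustrative only; cf. the reference
; add_pixels_clamped_c in libavcodec/idctdsp.c):
;
;     for (i = 0; i < 8; i++) {
;         for (j = 0; j < 8; j++)
;             pixels[j] = av_clip_uint8(pixels[j] + block[j]);
;         block  += 8;
;         pixels += line_size;
;     }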
; %1 = block offset
%macro ADD_PIXELS_CLAMPED 1
    mova       m0, [blockq+mmsize*0+%1]
    mova       m1, [blockq+mmsize*1+%1]
%if mmsize == 8
    mova       m5, [blockq+mmsize*2+%1]
    mova       m6, [blockq+mmsize*3+%1]
%endif
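    ; load two rows of pixels and zero-extend them to words (m4 is zero)
    ; so they can be added to the coefficients with signed saturation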
    movq       m2, [pixelsq]
    movq       m3, [pixelsq+lsizeq]
%if mmsize == 8
    mova       m7, m2
    punpcklbw  m2, m4
    punpckhbw  m7, m4
    paddsw     m0, m2
    paddsw     m1, m7
    mova       m7, m3
    punpcklbw  m3, m4
    punpckhbw  m7, m4
    paddsw     m5, m3
    paddsw     m6, m7
%else
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    paddsw     m0, m2
    paddsw     m1, m3
%endif
    packuswb   m0, m1
%if mmsize == 8
    packuswb   m5, m6
    movq       [pixelsq], m0
    movq       [pixelsq+lsizeq], m5
%else
    movq       [pixelsq], m0
    movhps     [pixelsq+lsizeq], m0
%endif
%endmacro

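; NASM selects multi-line macros by parameter count, so this zero-argument
; wrapper can share its name with the two-row helper defined above.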
%macro ADD_PIXELS_CLAMPED 0
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor       m4, m4
    ADD_PIXELS_CLAMPED 0
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea        pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
    RET
%endmacro

INIT_MMX mmx
ADD_PIXELS_CLAMPED
INIT_XMM sse2
ADD_PIXELS_CLAMPED