;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
%define W6sh2  8867 ; W6 = 35468 =  8867<<2
%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
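; The Wk values are sqrt(2)*cos(k*PI/16) in 16.16 fixed point; each WkSh2 is
; the same constant shifted down by 2 so that a pair of them fits in signed
; 16-bit words for pmaddwd. The "+n"/"-n" terms above record the residual of
; that shift, and the per-row error comments inside IDCT_1D below appear to
; track exactly those residuals.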

%if ARCH_X86_64

SECTION_RODATA

w4_plus_w2: times 4 dw W4sh2, +W2sh2
w4_min_w2:  times 4 dw W4sh2, -W2sh2
w4_plus_w6: times 4 dw W4sh2, +W6sh2
w4_min_w6:  times 4 dw W4sh2, -W6sh2
w1_plus_w3: times 4 dw W1sh2, +W3sh2
w3_min_w1:  times 4 dw W3sh2, -W1sh2
w7_plus_w3: times 4 dw W7sh2, +W3sh2
w3_min_w7:  times 4 dw W3sh2, -W7sh2
w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1:  times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5:  times 4 dw W7sh2, -W5sh2
pw_88:      times 8 dw 0x2008

cextern pw_1
cextern pw_4
cextern pw_512
cextern pw_1019

section .text align=16

; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
    punpckl%1   m%2, m%4, m%5
    punpckh%1   m%3, m%4, m%5
%endmacro
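; e.g. "SBUTTERFLY3 wd, 0, 1, 10, 8" with m10 = {a0..a7}, m8 = {b0..b7}
; yields m0 = {a0,b0,a1,b1,a2,b2,a3,b3} and m1 = {a4,b4,...,a7,b7}, i.e.
; word pairs ready for a single pmaddwd against a packed constant pair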

; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
    paddd       %1,  %5            ; { a0 + b0 }[0-3]
    paddd       %2,  %6            ; { a0 + b0 }[4-7]
    psrad       %1,  %7
    psrad       %2,  %7
    psrad       %3,  %7
    psrad       %4,  %7
    packssdw    %1,  %2            ; row[0]
    packssdw    %3,  %4            ; row[7]
%endmacro
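; in C terms (illustrative sketch only), for 8 lanes of dwords a[] and b[]:
;   sum[n]  = av_clip_int16((a[n] + b[n]) >> shift); // packed into %1
;   diff[n] = av_clip_int16((a[n] - b[n]) >> shift); // packed into %3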

; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
%macro IDCT_1D 2
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifidn %1, col
    paddw       m10,[pw_88]
%endif
%ifidn %1, row
    paddw       m10,[pw_1]
%endif
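    ; the bias is folded into row[0] before the W4 multiply: for rows,
    ; 1 * W4sh2 = 1<<14 = 1 << (15 - 1), the rounding term above; for
    ; columns, pw_88 = 0x2008, where 8 * W4sh2 = 1 << (18 - 1) is the
    ; rounding term and 0x2000 * W4sh2 >> 18 = 512 is (presumably) the
    ; bias converting signed IDCT output to unsigned 10-bit pixels
    ; (values assume the shifts of 15 and 18 used by this file)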
    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd     m2,  m0, [w4_plus_w6]
    pmaddwd     m3,  m1, [w4_plus_w6]
    pmaddwd     m4,  m0, [w4_min_w6]
    pmaddwd     m5,  m1, [w4_min_w6]
    pmaddwd     m6,  m0, [w4_min_w2]
    pmaddwd     m7,  m1, [w4_min_w2]
    pmaddwd     m0, [w4_plus_w2]
    pmaddwd     m1, [w4_plus_w2]

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd     m10, m8, [w4_plus_w6]
    pmaddwd     m11, m9, [w4_plus_w6]
    paddd       m0,  m10           ; a0[0-3]
    paddd       m1,  m11           ; a0[4-7]
    pmaddwd     m10, m8, [w4_min_w6]
    pmaddwd     m11, m9, [w4_min_w6]
    paddd       m6,  m10           ; a3[0-3]
    paddd       m7,  m11           ; a3[4-7]
    pmaddwd     m10, m8, [w4_min_w2]
    pmaddwd     m11, m9, [w4_min_w2]
    pmaddwd     m8, [w4_plus_w2]
    pmaddwd     m9, [w4_plus_w2]
    psubd       m4,  m10           ; a2[0-3] intermediate
    psubd       m5,  m11           ; a2[4-7] intermediate
    psubd       m2,  m8            ; a1[0-3] intermediate
    psubd       m3,  m9            ; a1[4-7] intermediate

    ; load/store
    mova   [r2+  0], m0
    mova   [r2+ 32], m2
    mova   [r2+ 64], m4
    mova   [r2+ 96], m6
    mova        m10,[r2+ 16]       ; { row[1] }[0-7]
    mova        m8, [r2+ 48]       ; { row[3] }[0-7]
    mova        m13,[r2+ 80]       ; { row[5] }[0-7]
    mova        m14,[r2+112]       ; { row[7] }[0-7]
    mova   [r2+ 16], m1
    mova   [r2+ 48], m3
    mova   [r2+ 80], m5
    mova   [r2+112], m7
%ifidn %1, row
    pmullw      m10,[r3+ 16]
    pmullw      m8, [r3+ 48]
    pmullw      m13,[r3+ 80]
    pmullw      m14,[r3+112]
%endif
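    ; (row pass only: rows 1/3/5/7 are dequantized above, fused with the
    ; reload; the caller already applied qmat to rows 0/2/4/6)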

    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd     m2,  m0, [w3_min_w7]
    pmaddwd     m3,  m1, [w3_min_w7]
    pmaddwd     m4,  m0, [w5_min_w1]
    pmaddwd     m5,  m1, [w5_min_w1]
    pmaddwd     m6,  m0, [w7_min_w5]
    pmaddwd     m7,  m1, [w7_min_w5]
    pmaddwd     m0, [w1_plus_w3]
    pmaddwd     m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0,  W5, row[5]);
    ; MAC(b0,  W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2,  W7, row[5]);
    ; MAC(b2,  W3, row[7]);
    ; MAC(b3,  W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]

    pmaddwd     m10, m8, [w1_plus_w5]
    pmaddwd     m11, m9, [w1_plus_w5]
    pmaddwd     m12, m8, [w5_plus_w7]
    pmaddwd     m13, m9, [w5_plus_w7]
    psubd       m2,  m10           ; b1[0-3]
    psubd       m3,  m11           ; b1[4-7]
    paddd       m0,  m12           ; b0[0-3]
    paddd       m1,  m13           ; b0[4-7]
    pmaddwd     m12, m8, [w7_plus_w3]
    pmaddwd     m13, m9, [w7_plus_w3]
    pmaddwd     m8, [w3_min_w1]
    pmaddwd     m9, [w3_min_w1]
    paddd       m4,  m12           ; b2[0-3]
    paddd       m5,  m13           ; b2[4-7]
    paddd       m6,  m8            ; b3[0-3]
    paddd       m7,  m9            ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    mova        m8, [r2+ 0]        ; a0[0-3]
    mova        m9, [r2+16]        ; a0[4-7]
    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
    mova        m0, [r2+32]        ; a1[0-3]
    mova        m1, [r2+48]        ; a1[4-7]
    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
    mova        m1, [r2+64]        ; a2[0-3]
    mova        m2, [r2+80]        ; a2[4-7]
    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
    mova        m2, [r2+96]        ; a3[0-3]
    mova        m3, [r2+112]       ; a3[4-7]
    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
%endmacro

; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
;                                  int16_t *block, const int16_t *qmat);
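; stride is in bytes; the destination holds 10-bit pixels as 16-bit words,
; so each 16-byte store below writes one 8-pixel row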
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
    movsxd      r1,  r1d
    pxor        m15, m15           ; zero

    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
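    ; (the even rows are dequantized below, fused with their load; IDCT_1D's
    ; row variant does the same for the odd rows)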
    mova        m10,[r2+ 0]        ; { row[0] }[0-7]
    mova        m8, [r2+32]        ; { row[2] }[0-7]
    mova        m13,[r2+64]        ; { row[4] }[0-7]
    mova        m12,[r2+96]        ; { row[6] }[0-7]

    pmullw      m10,[r3+ 0]
    pmullw      m8, [r3+32]
    pmullw      m13,[r3+64]
    pmullw      m12,[r3+96]

    IDCT_1D     row, 15

    ; transpose for second part of IDCT
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova   [r2+ 16], m0
    mova   [r2+ 48], m2
    mova   [r2+ 80], m11
    mova   [r2+112], m10
    SWAP         8,  10
    SWAP         1,   8
    SWAP         4,  13
    SWAP         9,  12

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D     col, 18

    ; clip/store
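    ; ProRes clips to [4, 1019] rather than [0, 1023]: code words 0..3 and
    ; 1020..1023 are reserved in 10-bit video. This clip range is the only
    ; difference from the plain "simple" IDCT noted in the header.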
    mova        m3, [pw_4]
    mova        m5, [pw_1019]
    pmaxsw      m8,  m3
    pmaxsw      m0,  m3
    pmaxsw      m1,  m3
    pmaxsw      m2,  m3
    pmaxsw      m4,  m3
    pmaxsw      m11, m3
    pmaxsw      m9,  m3
    pmaxsw      m10, m3
    pminsw      m8,  m5
    pminsw      m0,  m5
    pminsw      m1,  m5
    pminsw      m2,  m5
    pminsw      m4,  m5
    pminsw      m11, m5
    pminsw      m9,  m5
    pminsw      m10, m5

    lea         r2, [r1*3]
    mova  [r0     ], m8
    mova  [r0+r1  ], m0
    mova  [r0+r1*2], m1
    mova  [r0+r2  ], m2
    lea         r0, [r0+r1*4]
    mova  [r0     ], m4
    mova  [r0+r1  ], m11
    mova  [r0+r1*2], m9
    mova  [r0+r2  ], m10
    RET
%endmacro

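; sign-extend a vector of 8 words into two vectors of 4 dwords (%1 = low
; half, %2 = high half); SSE4 uses pmovsxwd directly, while the SSE2 path
; derives the sign mask with pcmpgtw into the scratch register %3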
%macro SIGNEXTEND 2-3
%if cpuflag(sse4) ; dstlow, dsthigh
    movhlps     %2,  %1
    pmovsxwd    %1,  %1
    pmovsxwd    %2,  %2
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
    pxor        %3,  %3
    pcmpgtw     %3,  %1
    mova        %2,  %1
    punpcklwd   %1,  %3
    punpckhwd   %2,  %3
%endif
%endmacro

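; instantiate the function for each ISA; the macro argument is the XMM
; register count declared to cglobal (all 16, since the code uses m8-m15)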
INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
%endif

%endif