1;******************************************************************************
2;* x86-SIMD-optimized IDCT for prores
3;* this is identical to "simple" IDCT written by Michael Niedermayer
4;* except for the clip range
5;*
6;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25; add SECTION_RODATA and proper include before including this file!
26
27%if ARCH_X86_64
28
; Bind the bit-depth-dependent coefficient-table aliases to the table set
; whose label suffix is %1.  Each alias is %undef'd immediately before it
; is redefined, so the macro may be invoked repeatedly with different
; suffixes as the template is instantiated.
; %1 = label suffix appended to every constant name
%macro define_constants 1
    %undef  w4_plus_w2
    %define w4_plus_w2 w4_plus_w2%1
    %undef  w4_min_w2
    %define w4_min_w2  w4_min_w2%1
    %undef  w4_plus_w6
    %define w4_plus_w6 w4_plus_w6%1
    %undef  w4_min_w6
    %define w4_min_w6  w4_min_w6%1
    %undef  w1_plus_w3
    %define w1_plus_w3 w1_plus_w3%1
    %undef  w3_min_w1
    %define w3_min_w1  w3_min_w1%1
    %undef  w7_plus_w3
    %define w7_plus_w3 w7_plus_w3%1
    %undef  w3_min_w7
    %define w3_min_w7  w3_min_w7%1
%endmacro
47
; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
;
; Three-operand (AVX-style) unpack: m%2 receives the low-half interleave of
; m%4/m%5, m%3 the high-half interleave; m%4 and m%5 are left intact.
; NOTE: %2 must not alias %4 or %5 — the low unpack writes m%2 first, so the
; following high unpack would otherwise read already-clobbered source data.
%macro SBUTTERFLY3 5
    punpckl%1   m%2, m%4, m%5
    punpckh%1   m%3, m%4, m%5
%endmacro
54
; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
;
; Butterfly (sum/difference) of two dword vectors, then arithmetic right
; shift by %7 and signed-saturating pack down to words.  The differences
; MUST be computed before the sums: %1/%2 are both an input and the sum
; destination, so reordering would corrupt the subtraction operands.
%macro SUMSUB_SHPK 7
    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
    paddd       %1,  %5            ; { a0 + b0 }[0-3]
    paddd       %2,  %6            ; { a0 + b0 }[4-7]
    psrad       %1,  %7
    psrad       %2,  %7
    psrad       %3,  %7
    psrad       %4,  %7
    packssdw    %1,  %2            ; row[0]
    packssdw    %3,  %4            ; row[7]
%endmacro
70
;------------------------------------------------------------------------------
; One 8-point IDCT pass, processing eight coefficient rows element-wise
; (8 lanes of 16-bit coefficients, widened to dwords by pmaddwd).
;
; %1 = initial bias ("" if nop)
; %2 = number of bits to shift at the end
; %3 = qmat (for prores)
;
; Inputs:  m10 = row[0], m8 = row[2], m13 = row[4], m12 = row[6] (words);
;          rows 1/3/5/7 are loaded from [blockq+16/48/80/112].
; Outputs: packed word rows 0..7 in m8, m0, m1, m2, m4, m11, m9, m10.
; Scratch: m0-m14 (and m15 when %1 is a string); [blockq+0..127] is reused
;          as spill space for the even-part (a0..a3) accumulators.
; Requires x86-64 (16 XMM registers); blockq must point at the coefficients.
;------------------------------------------------------------------------------
%macro IDCT_1D 2-3
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifstr %1
    ; string bias: add a dword rounding constant matching the shift instead
    mova        m15, [pd_round_ %+ %2]
%else
    ; word bias from the named table is folded into row[0] up front
    paddw       m10, [%1]
%endif
    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd     m2,  m0, [w4_plus_w6]
    pmaddwd     m3,  m1, [w4_plus_w6]
    pmaddwd     m4,  m0, [w4_min_w6]
    pmaddwd     m5,  m1, [w4_min_w6]
    pmaddwd     m6,  m0, [w4_min_w2]
    pmaddwd     m7,  m1, [w4_min_w2]
    pmaddwd     m0, [w4_plus_w2]
    pmaddwd     m1, [w4_plus_w2]
%ifstr %1
    ; Adding 1<<(%2-1) for >=15 bits values
    paddd       m2, m15
    paddd       m3, m15
    paddd       m4, m15
    paddd       m5, m15
    paddd       m6, m15
    paddd       m7, m15
    paddd       m0, m15
    paddd       m1, m15
%endif

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd     m10, m8, [w4_plus_w6]
    pmaddwd     m11, m9, [w4_plus_w6]
    paddd       m0,  m10            ; a0[0-3]
    paddd       m1,  m11            ; a0[4-7]
    pmaddwd     m10, m8, [w4_min_w6]
    pmaddwd     m11, m9, [w4_min_w6]
    paddd       m6,  m10           ; a3[0-3]
    paddd       m7,  m11           ; a3[4-7]
    pmaddwd     m10, m8, [w4_min_w2]
    pmaddwd     m11, m9, [w4_min_w2]
    pmaddwd     m8, [w4_plus_w2]
    pmaddwd     m9, [w4_plus_w2]
    psubd       m4,  m10           ; a2[0-3] intermediate
    psubd       m5,  m11           ; a2[4-7] intermediate
    psubd       m2,  m8            ; a1[0-3] intermediate
    psubd       m3,  m9            ; a1[4-7] intermediate

    ; load/store
    ; spill a0..a3 (dwords) into the block buffer to free registers for the
    ; odd-part (b0..b3) computation; reload the odd input rows meanwhile
    mova   [blockq+  0], m0
    mova   [blockq+ 32], m2
    mova   [blockq+ 64], m4
    mova   [blockq+ 96], m6
    mova        m10,[blockq+ 16]       ; { row[1] }[0-7]
    mova        m8, [blockq+ 48]       ; { row[3] }[0-7]
    mova        m13,[blockq+ 80]       ; { row[5] }[0-7]
    mova        m14,[blockq+112]       ; { row[7] }[0-7]
    mova   [blockq+ 16], m1
    mova   [blockq+ 48], m3
    mova   [blockq+ 80], m5
    mova   [blockq+112], m7
%if %0 == 3
    ; prores path: dequantize the odd rows with the caller-supplied qmat
    pmullw      m10,[%3+ 16]
    pmullw      m8, [%3+ 48]
    pmullw      m13,[%3+ 80]
    pmullw      m14,[%3+112]
%endif

    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd     m2,  m0, [w3_min_w7]
    pmaddwd     m3,  m1, [w3_min_w7]
    pmaddwd     m4,  m0, [w5_min_w1]
    pmaddwd     m5,  m1, [w5_min_w1]
    pmaddwd     m6,  m0, [w7_min_w5]
    pmaddwd     m7,  m1, [w7_min_w5]
    pmaddwd     m0, [w1_plus_w3]
    pmaddwd     m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0,  W5, row[5]);
    ; MAC(b0,  W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2,  W7, row[5]);
    ; MAC(b2,  W3, row[7]);
    ; MAC(b3,  W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]

    pmaddwd     m10, m8, [w1_plus_w5]
    pmaddwd     m11, m9, [w1_plus_w5]
    pmaddwd     m12, m8, [w5_plus_w7]
    pmaddwd     m13, m9, [w5_plus_w7]
    psubd       m2,  m10           ; b1[0-3]
    psubd       m3,  m11           ; b1[4-7]
    paddd       m0,  m12            ; b0[0-3]
    paddd       m1,  m13            ; b0[4-7]
    pmaddwd     m12, m8, [w7_plus_w3]
    pmaddwd     m13, m9, [w7_plus_w3]
    pmaddwd     m8, [w3_min_w1]
    pmaddwd     m9, [w3_min_w1]
    paddd       m4,  m12           ; b2[0-3]
    paddd       m5,  m13           ; b2[4-7]
    paddd       m6,  m8            ; b3[0-3]
    paddd       m7,  m9            ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    ; reload the spilled even-part accumulators and combine with b0..b3;
    ; final packed rows land in m8,m0,m1,m2 (rows 0-3) and m10,m9,m11,m4
    ; (rows 7,6,5,4)
    mova        m8, [blockq+ 0]        ; a0[0-3]
    mova        m9, [blockq+16]        ; a0[4-7]
    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
    mova        m0, [blockq+32]        ; a1[0-3]
    mova        m1, [blockq+48]        ; a1[4-7]
    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
    mova        m1, [blockq+64]        ; a2[0-3]
    mova        m2, [blockq+80]        ; a2[4-7]
    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
    mova        m2, [blockq+96]        ; a3[0-3]
    mova        m3, [blockq+112]       ; a3[4-7]
    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
%endmacro
232
; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
;                                  int16_t *block, const int16_t *qmat);

; Full 2-D IDCT: row pass, 8x8 transpose, column pass, then optional
; clip/store.  The bias/shift pairs are forwarded verbatim to IDCT_1D,
; whose signature is (bias, shift[, qmat]) — see the %ifstr %1 and
; pd_round_ %+ %2 uses there — so the parameters are:
;
; %1 = row bias macro ("" if nop)
; %2 = row shift
; %3 = column bias macro ("" if nop)
; %4 = column shift
; %5 = final action (nothing, "store", "put", "add")
;      (only "store" and "put" are implemented below; anything else stores
;       nothing)
; %6 = min pixel value (the literal 0 selects pxor instead of a table load)
; %7 = max pixel value
; %8 = qmat (for prores)
;
; The %2 == 11 path needs 32 bytes of scratch at [rsp] — presumably
; reserved by the caller's cglobal declaration; verify at the call site.

%macro IDCT_FN 4-8
    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
    mova        m10,[blockq+ 0]        ; { row[0] }[0-7]
    mova        m8, [blockq+32]        ; { row[2] }[0-7]
    mova        m13,[blockq+64]        ; { row[4] }[0-7]
    mova        m12,[blockq+96]        ; { row[6] }[0-7]

%if %0 == 8
    ; prores variant: dequantize the even rows here; odd rows are handled
    ; inside IDCT_1D via the qmat argument
    pmullw      m10,[%8+ 0]
    pmullw      m8, [%8+32]
    pmullw      m13,[%8+64]
    pmullw      m12,[%8+96]

    IDCT_1D     %1, %2, %8
%elif %2 == 11
    ; This copies the DC-only shortcut.  When there is only a DC coefficient the
    ; C shifts the value and splats it to all coeffs rather than multiplying and
    ; doing the full IDCT.  This causes a difference on 8-bit because the
    ; coefficient is 16383 rather than 16384 (which you can get with shifting).
    ; Build a per-lane mask: OR together rows 1..7 (everything but the DC
    ; row), so a lane is "DC-only" when all its AC coefficients are zero.
    por      m1,  m8, m13
    por      m1,  m12
    por      m1, [blockq+ 16]       ; { row[1] }[0-7]
    por      m1, [blockq+ 48]       ; { row[3] }[0-7]
    por      m1, [blockq+ 80]       ; { row[5] }[0-7]
    por      m1, [blockq+112]       ; { row[7] }[0-7]
    pxor     m2,  m2
    pcmpeqw  m1,  m2                ; m1 = ~0 in DC-only lanes
    psllw    m2,  m10, 3            ; shortcut value: DC << 3
    pand     m2,  m1                ; keep it in DC-only lanes only
    pcmpeqb  m3,  m3
    pxor     m1,  m3                ; invert: ~0 in lanes with AC energy
    mova    [rsp],    m1            ; stash mask + splat value across IDCT_1D
    mova    [rsp+16], m2

    IDCT_1D  %1,  %2

    ; merge: keep the IDCT result where AC was present, else the DC splat
    mova     m5, [rsp]
    mova     m6, [rsp+16]
    pand     m8,  m5
    por      m8,  m6
    pand     m0,  m5
    por      m0,  m6
    pand     m1,  m5
    por      m1,  m6
    pand     m2,  m5
    por      m2,  m6
    pand     m4,  m5
    por      m4,  m6
    pand     m11, m5
    por      m11, m6
    pand     m9,  m5
    por      m9,  m6
    pand     m10, m5
    por      m10, m6
%else
    IDCT_1D     %1, %2
%endif

    ; transpose for second part of IDCT
    ; (TRANSPOSE8x8W comes from the including context, e.g. x86util)
    ; registers list rows 0..7 as produced by IDCT_1D; m3 is the temporary
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova   [blockq+ 16], m0
    mova   [blockq+ 48], m2
    mova   [blockq+ 80], m11
    mova   [blockq+112], m10
    ; odd rows spilled above; the SWAPs rename the even rows into the
    ; registers IDCT_1D expects: m10=row0, m8=row2, m13=row4, m12=row6
    SWAP         8,  10
    SWAP         1,   8
    SWAP         4,  13
    SWAP         9,  12

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D     %3, %4

    ; clip/store
%if %0 >= 5
%ifidn %5,"store"
    ; No clamping, means pure idct
    mova  [blockq+  0], m8
    mova  [blockq+ 16], m0
    mova  [blockq+ 32], m1
    mova  [blockq+ 48], m2
    mova  [blockq+ 64], m4
    mova  [blockq+ 80], m11
    mova  [blockq+ 96], m9
    mova  [blockq+112], m10
%elifidn %5,"put"
%ifidn %6, 0
    pxor        m3, m3
%else
    mova        m3, [%6]
%endif ; ifidn %6, 0
    mova        m5, [%7]
    ; clamp all eight output rows to [%6, %7]
    pmaxsw      m8,  m3
    pmaxsw      m0,  m3
    pmaxsw      m1,  m3
    pmaxsw      m2,  m3
    pmaxsw      m4,  m3
    pmaxsw      m11, m3
    pmaxsw      m9,  m3
    pmaxsw      m10, m3
    pminsw      m8,  m5
    pminsw      m0,  m5
    pminsw      m1,  m5
    pminsw      m2,  m5
    pminsw      m4,  m5
    pminsw      m11, m5
    pminsw      m9,  m5
    pminsw      m10, m5

    ; write rows to the destination: r0 = pixels, r1 = stride (in bytes)
    lea         r2, [r1*3]
    mova  [r0     ], m8
    mova  [r0+r1  ], m0
    mova  [r0+r1*2], m1
    mova  [r0+r2  ], m2
    lea         r0, [r0+r1*4]
    mova  [r0     ], m4
    mova  [r0+r1  ], m11
    mova  [r0+r1*2], m9
    mova  [r0+r2  ], m10
%endif ; %5 action
%endif; if %0 >= 5
%endmacro
368
369%endif
370