1;*******************************************************************************
2;* SIMD-optimized IDCT functions for HEVC decoding
3;* Copyright (c) 2014 Pierre-Edouard LEPERE
4;* Copyright (c) 2014 James Almer
5;* Copyright (c) 2016 Alexandra Hájková
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
SECTION_RODATA

; 32-bit rounding biases: 1 << (shift - 1).
; pd_64 is the first-pass bias (shift 7); pd_2048/pd_512 are the second-pass
; biases for 8/10 bit (shift = 20 - bit depth, see DEFINE_BIAS below).
pd_64: times 4 dd 64
pd_2048: times 4 dd 2048
pd_512: times 4 dd 512

; 4x4 transform coeffs, packed as 16-bit pairs for pmaddwd
cextern pw_64
pw_64_m64: times 4 dw 64, -64
pw_83_36: times 4 dw 83, 36
pw_36_m83: times 4 dw 36, -83

; 8x8 transform coeffs (odd half), packed as 16-bit pairs for pmaddwd
pw_89_75: times 4 dw 89, 75
pw_50_18: times 4 dw 50, 18

pw_75_m18: times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50

pw_50_m89: times 4 dw 50, -89
pw_18_75: times 4 dw 18, 75

pw_18_m50: times 4 dw 18, -50
pw_75_m89: times 4 dw 75, -89
50
; 16x16 transform coeffs (odd half)
; 8 groups of 4 packed-pair xmm constants (64 bytes each); group k, at
; trans_coeffs16 + k * 64, is consumed by E16_O16 to produce output rows
; k and 15 - k (see TR_16x4)
trans_coeffs16: times 4 dw 90, 87
times 4 dw 80, 70
times 4 dw 57, 43
times 4 dw 25, 9

times 4 dw 87, 57
times 4 dw 9, -43
times 4 dw -80, -90
times 4 dw -70, -25

times 4 dw 80, 9
times 4 dw -70, -87
times 4 dw -25, 57
times 4 dw 90, 43

times 4 dw 70, -43
times 4 dw -87, 9
times 4 dw 90, 25
times 4 dw -80, -57

times 4 dw 57, -80
times 4 dw -25, 90
times 4 dw -9, -87
times 4 dw 43, 70

times 4 dw 43, -90
times 4 dw 57, 25
times 4 dw -87, 70
times 4 dw 9, -80

times 4 dw 25, -70
times 4 dw 90, -80
times 4 dw 43, 9
times 4 dw -57, 87

times 4 dw 9, -25
times 4 dw 43, -57
times 4 dw 70, -80
times 4 dw 87, -90
91
; 32x32 transform coeffs (odd half)
; 16 groups of 8 packed-pair xmm constants (128 bytes each); group k, at
; trans_coeff32 + k * 128, is consumed by E32_O32 to produce output rows
; k and 31 - k (the %%loop in TR_32x4 walks the groups backwards from 15)
trans_coeff32: times 8 dw 90
times 4 dw 88, 85
times 4 dw 82, 78
times 4 dw 73, 67
times 4 dw 61, 54
times 4 dw 46, 38
times 4 dw 31, 22
times 4 dw 13, 4

times 4 dw 90, 82
times 4 dw 67, 46
times 4 dw 22, -4
times 4 dw -31, -54
times 4 dw -73, -85
times 4 dw -90, -88
times 4 dw -78, -61
times 4 dw -38, -13

times 4 dw 88, 67
times 4 dw 31, -13
times 4 dw -54, -82
times 4 dw -90, -78
times 4 dw -46, -4
times 4 dw 38, 73
times 4 dw 90, 85
times 4 dw 61, 22

times 4 dw 85, 46
times 4 dw -13, -67
times 4 dw -90, -73
times 4 dw -22, 38
times 4 dw 82, 88
times 4 dw 54, -4
times 4 dw -61, -90
times 4 dw -78, -31

times 4 dw 82, 22
times 4 dw -54, -90
times 4 dw -61, 13
times 4 dw 78, 85
times 4 dw 31, -46
times 4 dw -90, -67
times 4 dw 4, 73
times 4 dw 88, 38

times 4 dw 78, -4
times 4 dw -82, -73
times 4 dw 13, 85
times 4 dw 67, -22
times 4 dw -88, -61
times 4 dw 31, 90
times 4 dw 54, -38
times 4 dw -90, -46

times 4 dw 73, -31
times 4 dw -90, -22
times 4 dw 78, 67
times 4 dw -38, -90
times 4 dw -13, 82
times 4 dw 61, -46
times 4 dw -88, -4
times 4 dw 85, 54

times 4 dw 67, -54
times 4 dw -78, 38
times 4 dw 85, -22
times 4 dw -90, 4
times 4 dw 90, 13
times 4 dw -88, -31
times 4 dw 82, 46
times 4 dw -73, -61

times 4 dw 61, -73
times 4 dw -46, 82
times 4 dw 31, -88
times 4 dw -13, 90
times 4 dw -4, -90
times 4 dw 22, 85
times 4 dw -38, -78
times 4 dw 54, 67

times 4 dw 54, -85
times 4 dw -4, 88
times 4 dw -46, -61
times 4 dw 82, 13
times 4 dw -90, 38
times 4 dw 67, -78
times 4 dw -22, 90
times 4 dw -31, -73

times 4 dw 46, -90
times 4 dw 38, 54
times 4 dw -90, 31
times 4 dw 61, -88
times 4 dw 22, 67
times 4 dw -85, 13
times 4 dw 73, -82
times 4 dw 4, 78

times 4 dw 38, -88
times 4 dw 73, -4
times 4 dw -67, 90
times 4 dw -46, -31
times 4 dw 85, -78
times 4 dw 13, 61
times 4 dw -90, 54
times 4 dw 22, -82

times 4 dw 31, -78
times 4 dw 90, -61
times 4 dw 4, 54
times 4 dw -88, 82
times 4 dw -38, -22
times 4 dw 73, -90
times 4 dw 67, -13
times 4 dw -46, 85

times 4 dw 22, -61
times 4 dw 85, -90
times 4 dw 73, -38
times 4 dw -4, 46
times 4 dw -78, 90
times 4 dw -82, 54
times 4 dw -13, -31
times 4 dw 67, -88

times 4 dw 13, -38
times 4 dw 61, -78
times 4 dw 88, -90
times 4 dw 85, -73
times 4 dw 54, -31
times 4 dw 4, 22
times 4 dw -46, 67
times 4 dw -82, 90

times 4 dw 4, -13
times 4 dw 22, -31
times 4 dw 38, -46
times 4 dw 54, -61
times 4 dw 67, -73
times 4 dw 78, -82
times 4 dw 85, -88
times 4 dw 90, -90
236
237SECTION .text
238
; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; Fills the whole HxW block with the scaled DC value (DC-only shortcut:
; both 1-D passes collapse to a single rounded shift of coeffs[0]).
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]        ; dc = coeffs[0]
    add               tmpd, (1 << (14 - %3)) + 1 ; combined rounding of both passes
    sar               tmpd, (15 - %3)            ; dc = (dc + round) >> (15 - bitdepth)
    movd               xm0, tmpd
    SPLATW              m0, xm0                  ; broadcast dc to every 16-bit lane
    DEFINE_ARGS coeff, cnt
    mov               cntd, %2
.loop:
    ; each iteration stores 8 * mmsize bytes of the splatted dc
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    add  coeffq, mmsize*8
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec  cntd
    jg  .loop
    RET
%endmacro
266
; Loop-free variant of IDCT_DC for blocks small enough that the whole
; block fits in 4 (mmx) or 8 (xmm) straight-line stores.
; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]        ; dc = coeffs[0]
    add               tmpd, (1 << (14 - %2)) + 1 ; combined rounding of both passes
    sar               tmpd, (15 - %2)            ; dc = (dc + round) >> (15 - bitdepth)
    movd                m0, tmpd
    SPLATW              m0, xm0                  ; broadcast dc to every 16-bit lane
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    ; xmm stores cover 16 bytes each, so 8 stores are needed for 8x8
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro
288
; IDCT 4x4, expects input in m0, m1
; %1 - shift
; %2 - 1/0 - SCALE and Transpose or not
; %3 - 1/0 add constant or not
; output: packed, transposed residuals in m0/m1 when %2 == 1, otherwise the
; four unscaled 32-bit e rows in m0..m3 (see TR_8x4); clobbers m2-m4
%macro TR_4x4 3
    ; interleaves src0 with src2 to m0
    ;         and src1 with scr3 to m2
    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
    ; src1: 10 11 12 13 -->
    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
    ; src3: 30 31 32 33

    SBUTTERFLY wd, 0, 1, 2

    ; even/odd butterflies via multiply-accumulate on the interleaved pairs
    pmaddwd m2, m0, [pw_64]    ; e0
    pmaddwd m3, m1, [pw_83_36] ; o0
    pmaddwd m0, [pw_64_m64]    ; e1
    pmaddwd m1, [pw_36_m83]    ; o1

%if %3 == 1
    ; add the rounding bias 1 << (shift - 1) to the even terms only;
    ; it propagates to both e + o and e - o in the SUMSUB below
    %assign %%add 1 << (%1 - 1)
    mova  m4, [pd_ %+ %%add]
    paddd m2, m4
    paddd m0, m4
%endif

    SUMSUB_BADC d, 3, 2, 1, 0, 4

%if %2 == 1
    psrad m3, %1 ; e0 + o0
    psrad m1, %1 ; e1 + o1
    psrad m2, %1 ; e0 - o0
    psrad m0, %1 ; e1 - o1
    ;clip16
    packssdw m3, m1
    packssdw m0, m2
    ; Transpose
    SBUTTERFLY wd, 3, 0, 1
    SBUTTERFLY wd, 3, 0, 1
    SWAP 3, 1, 0
%else
    SWAP 3, 2, 0
%endif
%endmacro
333
; %1 - bit depth
; defines: shift   = 20 - bit depth (second-pass scale)
;          c_add   = 1 << (shift - 1) (rounding bias)
;          arr_add = name of the pd_* rodata constant holding c_add
%macro DEFINE_BIAS 1
    %assign shift (20 - %1)
    %assign c_add (1 << (shift - 1))
    %define arr_add pd_ %+ c_add
%endmacro
339
; %1 - bit depth
; %2 - register the rounding bias is loaded into
; also defines shift = 20 - bit depth as a side effect (via DEFINE_BIAS)
%macro LOAD_BIAS 2
    DEFINE_BIAS %1
    mova %2, [arr_add]
%endmacro
348
; load a 4x4 block of packed 16-bit coeffs from r0:
; rows %3/%5 go into the low/high halves of %1, rows %4/%6 into %2
; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
%macro LOAD_BLOCK 7
    movq   %1, [r0 + %3 + %7]
    movhps %1, [r0 + %5 + %7]
    movq   %2, [r0 + %4 + %7]
    movhps %2, [r0 + %6 + %7]
%endmacro
358
; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
    mova m0, [coeffsq]
    mova m1, [coeffsq + 16]

    TR_4x4 7, 1, 1      ; first pass, shift 7; TR_4x4 transposes afterwards
    TR_4x4 20 - %1, 1, 1 ; second pass, shift 20 - bitdepth; transposes back

    mova [coeffsq],      m0
    mova [coeffsq + 16], m1
    RET
%endmacro
373
; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
; 4 at one time (4 columns)                        1 e8[1] + o8[1]
; from %5: e8/16 + o8/16, with %1 offset                  ...
; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
; %4 - shift                                       7 e8[0] - o8[0] --> + %2
; %6, %7 - unused; present so the STORE_%6 dispatch in E16_O16 can pass
;          the same argument list to STORE_8 and STORE_16
%macro STORE_8 7
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq     [coeffsq + %1], %5
    movhps   [coeffsq + %2], %5
%endmacro
386
; compute one o8 term from the interleaved odd rows in m4/m5, combine it
; with the e8 term in %7 and either store (8x8) or keep the results in
; registers (16x16/32x32); clobbers m6, m7
; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block_size
; %9 - register to store e8 + o8
; %10 - register to store e8 - o8
%macro E8_O8 10
    pmaddwd m6, m4, %3
    pmaddwd m7, m5, %4

    paddd m6, m7
    paddd m7, m6, %7 ; o8 + e8
    psubd %7, m6     ; e8 - o8
%if %8 == 8
    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
    ; 16x16/32x32: keep intermediate sums in registers for the caller
    SWAP m7, %9
    SWAP %7, %10
%endif
%endmacro
410
; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 add a constant in TR_4x4 or not
; I want to add a constant for 8x8 transform but not for 16x16 and 32x32
%macro TR_8x4 7
    ; load 4 columns of even rows
    LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1

    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only

    ; load 4 columns of odd rows
    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1

    ; interleave the odd rows pairwise for pmaddwd:
    ; 00 01 02 03
    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33
    ; ...         -->
    ;                  m5: 50 70 51 71 52 72 53 73
    ; 70 71 72 73
    SBUTTERFLY wd, 4, 5, 6

    ; one E8_O8 per output row pair k / 7-k
    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8, m15
    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9, m14
    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro
442
; store a packed 4x4 block to r0: low/high halves of %1 go to rows %3/%4,
; low/high halves of %2 to rows %5/%6 (inverse of LOAD_BLOCK's layout order)
; %7 - horizontal offset
%macro STORE_PACKED 7
    movq   [r0 + %3 + %7], %1
    movhps [r0 + %4 + %7], %1
    movq   [r0 + %5 + %7], %2
    movhps [r0 + %6 + %7], %2
%endmacro
449
; transpose 4x4 block packed
; in %1 and %2 registers
; %3 - temporary register
%macro TRANSPOSE_4x4 3
    SBUTTERFLY wd, %1, %2, %3
    SBUTTERFLY dq, %1, %2, %3
%endmacro
457
; transpose the off-diagonal 4x4 blocks i and j and exchange them
; (one step of transposing the full matrix in place); clobbers m4-m7
; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
%macro SWAP_BLOCKS 5
    ; M_j
    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
    TRANSPOSE_4x4 4, 5, 6

    ; M_i
    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; store M_j^T into M_i's slot
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transpose and store M_i
    SWAP m6, m4
    SWAP m7, m5
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro
479
; transpose one diagonal 4x4 block in place; clobbers m4-m6
; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
%macro TRANSPOSE_BLOCK 3
    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro
488
; internal subroutine: transpose the 8x8 int16 matrix at r0 in place.
; Expects the coeffs pointer in r0 (set up by the cglobal caller);
; uses plain `ret` because it is reached via call/TAIL_CALL, not cglobal entry.
%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
    ; M1 M2 ^T = M1^t M3^t
    ; M3 M4      M2^t M4^t

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 16

    ; M2 and M3
    SWAP_BLOCKS 0, 64, 16, 0, 8

    ; M4
    TRANSPOSE_BLOCK 8, 64, 16

    ret
%endmacro
505
; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
    ; first (vertical) pass, shift 7, on the left then right 4 columns
    TR_8x4 0, 7, 32, 1, 16, 8, 1
    TR_8x4 8, 7, 32, 1, 16, 8, 1

    call hevc_idct_transpose_8x8_ %+ cpuname

    ; second pass, shift = 20 - bitdepth (defined by DEFINE_BIAS)
    DEFINE_BIAS %1
    TR_8x4 0, shift, 32, 1, 16, 8, 1
    TR_8x4 8, shift, 32, 1, 16, 8, 1

    ; transpose back and return through the transpose routine
    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro
521
; store intermediate e32 coeffs on stack
; as a 16x4 matrix of 32-bit values
; from %5: e16 + o16, at stack offset %6
; and  %3: e16 - o16, at stack offset %7
; %1, %2 - unused (coeffsq offsets consumed by STORE_8)
; %4 - shift, unused here; both kept so the STORE_%6 dispatch in E16_O16
;      can pass one argument list to STORE_8 and STORE_16 alike
%macro STORE_16 7
    mova [rsp + %6], %5
    mova [rsp + %7], %3
%endmacro
531
; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1: initialize the accumulator (SWAP), 0: accumulate (add)
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
%macro ADD_ROWS 8
    pmaddwd %6, %3, %1
    pmaddwd %7, %4, %2
    paddd   %6, %7
%if %5 == 1
    SWAP %6, %8
%else
    paddd %8, %6
%endif
%endmacro
547
; accumulate one o16 term from the interleaved odd rows in m0-m3, combine
; it with the e16 term in %7, then store either to coeffsq (8-wide) or
; spill the raw 32-bit sums to the stack (16-wide); clobbers m4-m7
; %1 - transform coeffs
; %2, %3 offsets for storing e+o/e-o back to coeffsq (STORE_8 case)
; %4 - shift
; %5 - add (rounding bias); applied only in the STORE_8 case
; %6 - block_size (8 or 16), selects STORE_8 or STORE_16
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o (STORE_16 case)
%macro E16_O16 9
    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m5, m6, m7
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7

%if %6 == 8
    paddd %7, %5
%endif

    paddd m4, m7, %7 ; o16 + e16
    psubd %7, m7     ; e16 - o16
    STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro
567
; 16-point transform of 4 columns
; %1 - horizontal offset in bytes
; %2 - shift
; %3 - add (rounding bias constant, memory operand)
; %4, %5, %6 - forwarded to TR_8x4 (even-row offset, step, odd-row offset);
;              %6 is also the row stride in bytes (32 for 16x16, 64 for 32x32)
; %7 - block size forwarded to E16_O16 (8: store to coeffsq, 16: spill to stack)
; %8 - block size forwarded to TR_8x4 (16 keeps the e8 results in m8-m15)
; %9 - row step for the odd-row loads (1 for 16x16, 2 for 32x32)
; %10 - unused here
%macro TR_16x4 10
    ; produce 8x4 matrix of e16 coeffs
    ; for 4 first rows and store it on stack (128 bytes)
    TR_8x4 %1, 7, %4, %5, %6, %8, 0

    ; load the 8 odd rows (o16 inputs: odd multiples of %9 * %6)
    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1

    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4

    ; one E16_O16 per output row pair k / 15-k; coeff group k is at +k*64
    E16_O16 trans_coeffs16,               0 + %1, 15 * %6 + %1, %2, %3, %7, m8,       0, 15 * 16
    ; m8's e16 value has been consumed above; reuse m8 to hold the bias
    mova m8, %3
    E16_O16 trans_coeffs16 +     64,     %6 + %1, 14 * %6 + %1, %2, m8, %7, m9,      16, 14 * 16
    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1,  9 * %6 + %1, %2, m8, %7, m14, 6 * 16,  9 * 16
    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1,  8 * %6 + %1, %2, m8, %7, m15, 7 * 16,  8 * 16
%endmacro
590
; internal subroutine: transpose the 16x16 int16 matrix at r0 in place.
; Expects the coeffs pointer in r0; plain `ret` (reached via call/TAIL_CALL).
%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
; M5  M6  M7  M8    -->  m2 m6 m10 m14
; M9  M10 M11 M12        m3 m7 m11 m15
; M13 M14 M15 M16        m4 m8 m12 m16

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 32

    ; M5, M2
    SWAP_BLOCKS 0, 128, 32, 0, 8
    ; M9, M3
    SWAP_BLOCKS 0, 256, 32, 0, 16
    ; M13, M4
    SWAP_BLOCKS 0, 384, 32, 0, 24

    ;M6
    TRANSPOSE_BLOCK 8, 128, 32

    ; M10, M7
    SWAP_BLOCKS 8, 256, 32, 128, 16
    ; M14, M8
    SWAP_BLOCKS 8, 384, 32, 128, 24

    ;M11
    TRANSPOSE_BLOCK 16, 256, 32

    ; M15, M12
    SWAP_BLOCKS 16, 384, 32, 256, 24

    ;M16
    TRANSPOSE_BLOCK 24, 384, 32

    ret
%endmacro
627
; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
    ; first (vertical) pass, shift 7, 4 columns per iteration
    ; (8 * r1 is the byte offset of the column group)
    mov r1d, 3
.loop16:
    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
    dec r1d
    jge .loop16

    call hevc_idct_transpose_16x16_ %+ cpuname

    ; second pass, shift/bias from the bit depth (shift, arr_add)
    DEFINE_BIAS %1
    mov r1d, 3
.loop16_2:
    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
    dec r1d
    jge .loop16_2

    ; transpose back and return through the transpose routine
    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro
649
; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
; 4 at one time (4 columns)                        1 e32[1] + o32[1]
; %1 - address to store e32 + o32
; %2 - address to store e32 - o32
; %3 - reg with e32 - o32                                  ...
; %4 - shift                                       30 e32[1] - o32[1]
; %5 - reg with e32 + o32                          31 e32[0] - o32[0] --> %2
%macro STORE_32 5
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq     [%1], %5
    movhps   [%2], %5
%endmacro
664
; accumulate one o32 term from the interleaved odd rows in m0-m7, combine
; it with the e32 coeff spilled on the stack (plus the bias in m14, loaded
; by TR_32x4) and store one row pair; clobbers m8-m12
; %1 - transform coeffs (one 128-byte group of trans_coeff32)
; %2 - address for storing e32 + o32
; %3 - address for storing e32 - o32
; %4 - shift
; %5 - stack offset of the e32 coeff
%macro E32_O32 5
    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m8, m9, m10
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10

    paddd m11, m14, [rsp + %5] ; e32 + rounding bias
    paddd m12, m10, m11 ; o32 + e32
    psubd m11, m10      ; e32 - o32
    STORE_32 %2, %3, m11, %4, m12
%endmacro
681
; 32-point transform of 4 columns; uses r2-r5 as loop registers
; %1 - horizontal offset
; %2 - bitdepth
; %3 - 1 for the first (vertical) pass: shift 7, bias pd_64;
;      0 for the second pass: shift and bias derived from the bit depth
%macro TR_32x4 3
    ; e32: 16-point transform of the even rows, spilled to the stack
    ; (256 bytes) by STORE_16 via E16_O16
    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0

    ; load the 16 odd rows (o32 inputs)
    LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1
    LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1

    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8

%if %3 == 1
    %assign shift 7
    mova m14, [pd_64]
%else
    LOAD_BIAS %2, m14
%endif

    ; walk coeff groups 15..0; group k produces rows k (via r3 + r5 * 4)
    ; and 31 - k (via r4)
    lea r2, [trans_coeff32 + 15 * 128]
    lea r3, [coeffsq + %1]
    lea r4, [r3 + 16 * 64]
    mov r5d, 15 * 16
%%loop:
    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
    sub r2, 128
    add r4, 64
    sub r5d, 16
    jge %%loop
%endmacro
715
; internal subroutine: transpose the 32x32 int16 matrix at r0 in place.
; Expects the coeffs pointer in r0; uses r1-r3 for the loops;
; plain `ret` (reached via call/TAIL_CALL).
%macro TRANSPOSE_32x32 0
cglobal hevc_idct_transpose_32x32, 0, 0, 0
    ; M0  M1 ... M7
    ; M8         M15
    ;
    ; ...
    ;
    ; M56        M63

    TRANSPOSE_BLOCK 0, 0, 64 ; M0
    ; swap the rest of row 0 with column 0
    mov r1d, 7
    mov r2d, 7 * 256
.loop_transpose:
    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
    sub r2d, 256
    dec r1d
    jg .loop_transpose

    TRANSPOSE_BLOCK 8, 256, 64 ; M9
    mov r1d, 6
    mov r2d, 512
    mov r3d, 16
.loop_transpose2:
    SWAP_BLOCKS 8, r2, 64, 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose2

    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
    mov r1d, 5
    mov r2d, 768
    mov r3d, 24
.loop_transpose3:
    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose3

    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
    mov r1d, 4
    mov r2d, 1024
    mov r3d, 32
.loop_transpose4:
    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose4

    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
    mov r1d, 3
    mov r2d, 1280
    mov r3d, 40
.loop_transpose5:
    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose5

    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8

    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8

    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63

    ret
%endmacro
789
; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
%macro IDCT_32x32 1
; 256 bytes of stack for the spilled e32 coeffs (see STORE_16)
cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
    ; first (vertical) pass, 4 columns per iteration
    mov r1d, 7
.loop32:
    TR_32x4 8 * r1, %1, 1
    dec r1d
    jge .loop32

    call hevc_idct_transpose_32x32_ %+ cpuname

    ; second pass, shift/bias from the bit depth
    mov r1d, 7
.loop32_2:
    TR_32x4 8 * r1, %1, 0
    dec r1d
    jge .loop32_2

    ; transpose back and return through the transpose routine
    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
%endmacro
810
; instantiate all DC-only IDCTs for one bit depth
; %1 = bitdepth
%macro INIT_IDCT_DC 1
INIT_MMX mmxext
IDCT_DC_NL  4,      %1
IDCT_DC     8,  2,  %1

INIT_XMM sse2
IDCT_DC_NL  8,      %1
IDCT_DC    16,  4,  %1
IDCT_DC    32, 16,  %1

%if HAVE_AVX2_EXTERNAL
    INIT_YMM avx2
    IDCT_DC    16,  2,  %1
    IDCT_DC    32,  8,  %1
%endif ;HAVE_AVX2_EXTERNAL
%endmacro
827
; instantiate the full IDCTs for one bit depth / instruction set
; %1 = bitdepth, %2 = cpu flags (sse2/avx)
%macro INIT_IDCT 2
INIT_XMM %2
%if %1 == 8
    ; transpose helpers are shared between bit depths, emit them once
    TRANSPOSE_8x8
    %if ARCH_X86_64
        TRANSPOSE_16x16
        TRANSPOSE_32x32
    %endif
%endif
%if ARCH_X86_64
    ; 16x16 and 32x32 need 16 XMM registers, x86-64 only
    IDCT_32x32 %1
    IDCT_16x16 %1
%endif
IDCT_8x8 %1
IDCT_4x4 %1
%endmacro
844
; instantiate everything: DC shortcuts for 8/10/12 bit,
; full IDCTs for 8/10 bit (12 bit not enabled yet)
INIT_IDCT_DC 8
INIT_IDCT_DC 10
INIT_IDCT_DC 12
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx
;INIT_IDCT 12, sse2
;INIT_IDCT 12, avx
854