1;*****************************************************************************
2;* MMX/SSE2-optimized H.264 iDCT
3;*****************************************************************************
4;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2003-2008 x264 project
6;*
7;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8;*          Loren Merritt <lorenm@u.washington.edu>
9;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
11;*
12;* This file is part of FFmpeg.
13;*
14;* FFmpeg is free software; you can redistribute it and/or
15;* modify it under the terms of the GNU Lesser General Public
16;* License as published by the Free Software Foundation; either
17;* version 2.1 of the License, or (at your option) any later version.
18;*
19;* FFmpeg is distributed in the hope that it will be useful,
20;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22;* Lesser General Public License for more details.
23;*
24;* You should have received a copy of the GNU Lesser General Public
25;* License along with FFmpeg; if not, write to the Free Software
26;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27;*****************************************************************************
28
29%include "libavutil/x86/x86util.asm"
30
31SECTION_RODATA
32
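; scan8[i] is the position of 4x4 block i inside the 8-entries-per-row
; non-zero-count cache (the nnzc argument of the add16/add8 functions below),
; so nnzc[scan8[i]] tells the loops whether block i has any coefficients at
; all; the values mirror the scan8[] table used by the C decoder.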
33scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
34           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
35           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
36           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
37           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
38           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
39           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
40           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
41           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
42           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
43           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
44           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
45%ifdef PIC
46%define npicregs 1
47%define scan8 picregq
48%else
49%define npicregs 0
50%define scan8 scan8_mem
51%endif
52
53cextern pw_32
54cextern pw_1
55
56SECTION .text
57
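; IDCT4_ADD: two 1-D passes of the 4x4 integer inverse transform with a
; transpose in between. The +32 bias added before the second pass and the >>6
; inside STORE_DIFFx2 implement the (x + 32) >> 6 rounding, the reconstructed
; residual is added to dst with saturation, and the coefficient block is
; cleared afterwards (callers rely on it being zeroed).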
58; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
59%macro IDCT4_ADD 3
60    ; Load dct coeffs
61    movq         m0, [%2]
62    movq         m1, [%2+8]
63    movq         m2, [%2+16]
64    movq         m3, [%2+24]
65
66    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
67    mova         m6, [pw_32]
%if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    SBUTTERFLY dq, 0, 2, 4
    MOVHL m1, m0
    MOVHL m3, m2
%endif
77    paddw        m0, m6
78    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
79    pxor         m7, m7
80    movq    [%2+ 0], m7
81    movq    [%2+ 8], m7
82    movq    [%2+16], m7
83    movq    [%2+24], m7
84
85    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
86    lea          %1, [%1+%3*2]
87    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
88%endmacro
89
90INIT_MMX mmx
91; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
92cglobal h264_idct_add_8, 3, 3, 0
93    movsxdifnidn r2, r2d
94    IDCT4_ADD    r0, r1, r2
95    RET
96
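; IDCT8_1D: one 1-D pass of the 8x8 inverse transform. Only six of the eight
; input rows are kept in registers; rows 0 and 4 are passed as the memory
; operands %1/%2 and loaded only for the final butterflies, since m0-m7 is all
; we have in MMX (and in the 32-bit SSE2 path).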
97%macro IDCT8_1D 2
98    psraw        m0, m1, 1
99    SWAP 0, 1
100    psraw        m4, m5, 1
101    paddw        m4, m5
102    paddw        m1, m0
103    paddw        m4, m7
104    paddw        m1, m5
105    psubw        m4, m0
106    paddw        m1, m3
107
108    psubw        m0, m3
109    psubw        m5, m3
110    psraw        m3, 1
111    paddw        m0, m7
112    psubw        m5, m7
113    psraw        m7, 1
114    psubw        m0, m3
115    psubw        m5, m7
116
117    psraw        m7, m1, 2
118    SWAP 7,1
119    psraw        m3, m4, 2
120    paddw        m3, m0
121    psraw        m0, 2
122    paddw        m1, m5
123    psraw        m5, 2
124    psubw        m0, m4
125    psubw        m7, m5
126
127    psraw        m5, m6, 1
128    SWAP 5,6
129    psraw        m4, m2, 1
130    paddw        m6, m2
131    psubw        m4, m5
132
133    mova         m2, %1
134    mova         m5, %2
135    SUMSUB_BA    w, 5, 2
136    SUMSUB_BA    w, 6, 5
137    SUMSUB_BA    w, 4, 2
138    SUMSUB_BA    w, 7, 6
139    SUMSUB_BA    w, 0, 4
140    SUMSUB_BA    w, 3, 2
141    SUMSUB_BA    w, 1, 5
142    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
143%endmacro
144
145%macro IDCT8_1D_FULL 1
146    mova         m7, [%1+112]
147    mova         m6, [%1+ 96]
148    mova         m5, [%1+ 80]
149    mova         m3, [%1+ 48]
150    mova         m2, [%1+ 32]
151    mova         m1, [%1+ 16]
152    IDCT8_1D   [%1], [%1+ 64]
153%endmacro
154
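; An 8x8 transform does not fit in 64-bit MMX registers, so the block is
; handled as two halves of four coefficients per row: _START runs the first
; 1-D pass on one half and transposes it into the stack scratch buffer, _END
; runs the second pass on the transposed data and adds the result to dst.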
155; %1=int16_t *block, %2=int16_t *dstblock
156%macro IDCT8_ADD_MMX_START 2
157    IDCT8_1D_FULL %1
158    mova       [%1], m7
159    TRANSPOSE4x4W 0, 1, 2, 3, 7
160    mova         m7, [%1]
161    mova    [%2   ], m0
162    mova    [%2+16], m1
163    mova    [%2+32], m2
164    mova    [%2+48], m3
165    TRANSPOSE4x4W 4, 5, 6, 7, 3
166    mova    [%2+ 8], m4
167    mova    [%2+24], m5
168    mova    [%2+40], m6
169    mova    [%2+56], m7
170%endmacro
171
172; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
173%macro IDCT8_ADD_MMX_END 3-4
174    IDCT8_1D_FULL %2
175    mova    [%2   ], m5
176    mova    [%2+16], m6
177    mova    [%2+32], m7
178
179    pxor         m7, m7
180%if %0 == 4
181    movq   [%4+  0], m7
182    movq   [%4+  8], m7
183    movq   [%4+ 16], m7
184    movq   [%4+ 24], m7
185    movq   [%4+ 32], m7
186    movq   [%4+ 40], m7
187    movq   [%4+ 48], m7
188    movq   [%4+ 56], m7
189    movq   [%4+ 64], m7
190    movq   [%4+ 72], m7
191    movq   [%4+ 80], m7
192    movq   [%4+ 88], m7
193    movq   [%4+ 96], m7
194    movq   [%4+104], m7
195    movq   [%4+112], m7
196    movq   [%4+120], m7
197%endif
198    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
199    lea          %1, [%1+%3*2]
200    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
201    mova         m0, [%2   ]
202    mova         m1, [%2+16]
203    mova         m2, [%2+32]
204    lea          %1, [%1+%3*2]
205    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
206    lea          %1, [%1+%3*2]
207    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
208%endmacro
209
210INIT_MMX mmx
211; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
212cglobal h264_idct8_add_8, 3, 4, 0
213    movsxdifnidn r2, r2d
214    %assign pad 128+4-(stack_offset&7)
215    SUB         rsp, pad
216
217    add   word [r1], 32
218    IDCT8_ADD_MMX_START r1  , rsp
219    IDCT8_ADD_MMX_START r1+8, rsp+64
220    lea          r3, [r0+4]
221    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
222    IDCT8_ADD_MMX_END   r3  , rsp+8, r2
223
224    ADD         rsp, pad
225    RET
226
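; The SSE2 version keeps the whole 8x8 block in xmm registers: one 1-D pass,
; a full 8x8 transpose, then the second pass. On x86-32 rows 0 and 4 are
; spilled into the coefficient buffer for IDCT8_1D; on x86-64 they stay in
; m8/m9.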
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=scratch gpr (receives stride*3)
228%macro IDCT8_ADD_SSE 4
229    IDCT8_1D_FULL %2
230%if ARCH_X86_64
231    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
232%else
233    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
234%endif
235    paddw        m0, [pw_32]
236
237%if ARCH_X86_64 == 0
238    mova    [%2   ], m0
239    mova    [%2+16], m4
240    IDCT8_1D   [%2], [%2+ 16]
241    mova    [%2   ], m6
242    mova    [%2+16], m7
243%else
244    SWAP          0, 8
245    SWAP          4, 9
246    IDCT8_1D     m8, m9
247    SWAP          6, 8
248    SWAP          7, 9
249%endif
250
251    pxor         m7, m7
252    lea          %4, [%3*3]
253    STORE_DIFF   m0, m6, m7, [%1     ]
254    STORE_DIFF   m1, m6, m7, [%1+%3  ]
255    STORE_DIFF   m2, m6, m7, [%1+%3*2]
256    STORE_DIFF   m3, m6, m7, [%1+%4  ]
257%if ARCH_X86_64 == 0
258    mova         m0, [%2   ]
259    mova         m1, [%2+16]
260%else
261    SWAP          0, 8
262    SWAP          1, 9
263%endif
264    mova   [%2+  0], m7
265    mova   [%2+ 16], m7
266    mova   [%2+ 32], m7
267    mova   [%2+ 48], m7
268    mova   [%2+ 64], m7
269    mova   [%2+ 80], m7
270    mova   [%2+ 96], m7
271    mova   [%2+112], m7
272    lea          %1, [%1+%3*4]
273    STORE_DIFF   m4, m6, m7, [%1     ]
274    STORE_DIFF   m5, m6, m7, [%1+%3  ]
275    STORE_DIFF   m0, m6, m7, [%1+%3*2]
276    STORE_DIFF   m1, m6, m7, [%1+%4  ]
277%endmacro
278
279INIT_XMM sse2
280; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
281cglobal h264_idct8_add_8, 3, 4, 10
282    movsxdifnidn  r2, r2d
283    IDCT8_ADD_SSE r0, r1, r2, r3
284    RET
285
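; DC-only fast path: when a block carries nothing but a DC coefficient the
; transform collapses to adding (dc + 32) >> 6 to every pixel. m0 is set to
; the broadcast +dc and m1 to -dc (both packed to bytes), so a paddusb
; followed by a psubusb applies the delta with clamping in either direction.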
286%macro DC_ADD_MMXEXT_INIT 2
287    add          %1, 32
288    sar          %1, 6
289    movd         m0, %1d
290    lea          %1, [%2*3]
291    pshufw       m0, m0, 0
292    pxor         m1, m1
293    psubw        m1, m0
294    packuswb     m0, m0
295    packuswb     m1, m1
296%endmacro
297
298%macro DC_ADD_MMXEXT_OP 4
299    %1           m2, [%2     ]
300    %1           m3, [%2+%3  ]
301    %1           m4, [%2+%3*2]
302    %1           m5, [%2+%4  ]
303    paddusb      m2, m0
304    paddusb      m3, m0
305    paddusb      m4, m0
306    paddusb      m5, m0
307    psubusb      m2, m1
308    psubusb      m3, m1
309    psubusb      m4, m1
310    psubusb      m5, m1
311    %1    [%2     ], m2
312    %1    [%2+%3  ], m3
313    %1    [%2+%3*2], m4
314    %1    [%2+%4  ], m5
315%endmacro
316
317INIT_MMX mmxext
318; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
319%if ARCH_X86_64
320cglobal h264_idct_dc_add_8, 3, 4, 0
321    movsxd       r2, r2d
322    movsx        r3, word [r1]
323    mov  dword [r1], 0
324    DC_ADD_MMXEXT_INIT r3, r2
325    DC_ADD_MMXEXT_OP movh, r0, r2, r3
326    RET
327
328; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
329cglobal h264_idct8_dc_add_8, 3, 4, 0
330    movsxd       r2, r2d
331    movsx        r3, word [r1]
332    mov  dword [r1], 0
333    DC_ADD_MMXEXT_INIT r3, r2
334    DC_ADD_MMXEXT_OP mova, r0, r2, r3
335    lea          r0, [r0+r2*4]
336    DC_ADD_MMXEXT_OP mova, r0, r2, r3
337    RET
338%else
339; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
340cglobal h264_idct_dc_add_8, 2, 3, 0
341    movsx        r2, word [r1]
342    mov  dword [r1], 0
343    mov          r1, r2m
344    DC_ADD_MMXEXT_INIT r2, r1
345    DC_ADD_MMXEXT_OP movh, r0, r1, r2
346    RET
347
348; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
349cglobal h264_idct8_dc_add_8, 2, 3, 0
350    movsx        r2, word [r1]
351    mov  dword [r1], 0
352    mov          r1, r2m
353    DC_ADD_MMXEXT_INIT r2, r1
354    DC_ADD_MMXEXT_OP mova, r0, r1, r2
355    lea          r0, [r0+r1*4]
356    DC_ADD_MMXEXT_OP mova, r0, r1, r2
357    RET
358%endif
359
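; The add16 family walks the 16 luma 4x4 blocks: nnzc[scan8[i]] decides
; whether block i is processed at all, block_offset[i] gives its byte offset
; inside dst, and the coefficients of block i live at block + i*32 bytes.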
360INIT_MMX mmx
361; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
362;                               int16_t *block, int stride,
363;                               const uint8_t nnzc[6 * 8])
364cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
365    movsxdifnidn r3, r3d
366    xor          r5, r5
367%ifdef PIC
368    lea     picregq, [scan8_mem]
369%endif
370.nextblock:
371    movzx        r6, byte [scan8+r5]
372    movzx        r6, byte [r4+r6]
373    test         r6, r6
374    jz .skipblock
375    mov         r6d, dword [r1+r5*4]
376    lea          r6, [r0+r6]
377    IDCT4_ADD    r6, r2, r3
378.skipblock:
379    inc          r5
380    add          r2, 32
381    cmp          r5, 16
382    jl .nextblock
383    REP_RET
384
385; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
386;                               int16_t *block, int stride,
387;                               const uint8_t nnzc[6 * 8])
388cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
389    movsxdifnidn r3, r3d
390    %assign pad 128+4-(stack_offset&7)
391    SUB         rsp, pad
392
393    xor          r5, r5
394%ifdef PIC
395    lea     picregq, [scan8_mem]
396%endif
397.nextblock:
398    movzx        r6, byte [scan8+r5]
399    movzx        r6, byte [r4+r6]
400    test         r6, r6
401    jz .skipblock
402    mov         r6d, dword [r1+r5*4]
403    add          r6, r0
404    add   word [r2], 32
405    IDCT8_ADD_MMX_START r2  , rsp
406    IDCT8_ADD_MMX_START r2+8, rsp+64
407    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
408    mov         r6d, dword [r1+r5*4]
409    lea          r6, [r0+r6+4]
410    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
411.skipblock:
412    add          r5, 4
413    add          r2, 128
414    cmp          r5, 16
415    jl .nextblock
416    ADD         rsp, pad
417    RET
418
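; The mmxext version adds a DC shortcut: if a block has a single non-zero
; coefficient (nnzc == 1) and that coefficient is the DC, the cheap dc_add
; path is taken instead of the full 4x4 transform.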
419INIT_MMX mmxext
420; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
421;                                  int16_t *block, int stride,
422;                                  const uint8_t nnzc[6 * 8])
423cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
424    movsxdifnidn r3, r3d
425    xor          r5, r5
426%ifdef PIC
427    lea     picregq, [scan8_mem]
428%endif
429.nextblock:
430    movzx        r6, byte [scan8+r5]
431    movzx        r6, byte [r4+r6]
432    test         r6, r6
433    jz .skipblock
434    cmp          r6, 1
435    jnz .no_dc
436    movsx        r6, word [r2]
437    test         r6, r6
438    jz .no_dc
439    mov   word [r2], 0
440    DC_ADD_MMXEXT_INIT r6, r3
441%if ARCH_X86_64 == 0
442%define dst2q r1
443%define dst2d r1d
444%endif
445    mov       dst2d, dword [r1+r5*4]
446    lea       dst2q, [r0+dst2q]
447    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
448%if ARCH_X86_64 == 0
449    mov          r1, r1m
450%endif
451    inc          r5
452    add          r2, 32
453    cmp          r5, 16
454    jl .nextblock
455    REP_RET
456.no_dc:
457    mov         r6d, dword [r1+r5*4]
458    add          r6, r0
459    IDCT4_ADD    r6, r2, r3
460.skipblock:
461    inc          r5
462    add          r2, 32
463    cmp          r5, 16
464    jl .nextblock
465    REP_RET
466
467INIT_MMX mmx
468; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
469;                                    int16_t *block, int stride,
470;                                    const uint8_t nnzc[6 * 8])
471cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
472    movsxdifnidn r3, r3d
473    xor          r5, r5
474%ifdef PIC
475    lea     picregq, [scan8_mem]
476%endif
477.nextblock:
478    movzx        r6, byte [scan8+r5]
479    movzx        r6, byte [r4+r6]
480    or          r6w, word [r2]
481    test         r6, r6
482    jz .skipblock
483    mov         r6d, dword [r1+r5*4]
484    add          r6, r0
485    IDCT4_ADD    r6, r2, r3
486.skipblock:
487    inc          r5
488    add          r2, 32
489    cmp          r5, 16
490    jl .nextblock
491    REP_RET
492
493INIT_MMX mmxext
494; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
495;                                       int16_t *block, int stride,
496;                                       const uint8_t nnzc[6 * 8])
497cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
498    movsxdifnidn r3, r3d
499    xor          r5, r5
500%ifdef PIC
501    lea     picregq, [scan8_mem]
502%endif
503.nextblock:
504    movzx        r6, byte [scan8+r5]
505    movzx        r6, byte [r4+r6]
506    test         r6, r6
507    jz .try_dc
508    mov         r6d, dword [r1+r5*4]
509    lea          r6, [r0+r6]
510    IDCT4_ADD    r6, r2, r3
511    inc          r5
512    add          r2, 32
513    cmp          r5, 16
514    jl .nextblock
515    REP_RET
516.try_dc:
517    movsx        r6, word [r2]
518    test         r6, r6
519    jz .skipblock
520    mov   word [r2], 0
521    DC_ADD_MMXEXT_INIT r6, r3
522%if ARCH_X86_64 == 0
523%define dst2q r1
524%define dst2d r1d
525%endif
526    mov       dst2d, dword [r1+r5*4]
527    add       dst2q, r0
528    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
529%if ARCH_X86_64 == 0
530    mov          r1, r1m
531%endif
532.skipblock:
533    inc          r5
534    add          r2, 32
535    cmp          r5, 16
536    jl .nextblock
537    REP_RET
538
539; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
540;                                  int16_t *block, int stride,
541;                                  const uint8_t nnzc[6 * 8])
542cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
543    movsxdifnidn r3, r3d
544    %assign pad 128+4-(stack_offset&7)
545    SUB         rsp, pad
546
547    xor          r5, r5
548%ifdef PIC
549    lea     picregq, [scan8_mem]
550%endif
551.nextblock:
552    movzx        r6, byte [scan8+r5]
553    movzx        r6, byte [r4+r6]
554    test         r6, r6
555    jz .skipblock
556    cmp          r6, 1
557    jnz .no_dc
558    movsx        r6, word [r2]
559    test         r6, r6
560    jz .no_dc
561    mov   word [r2], 0
562    DC_ADD_MMXEXT_INIT r6, r3
563%if ARCH_X86_64 == 0
564%define dst2q r1
565%define dst2d r1d
566%endif
567    mov       dst2d, dword [r1+r5*4]
568    lea       dst2q, [r0+dst2q]
569    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
570    lea       dst2q, [dst2q+r3*4]
571    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
572%if ARCH_X86_64 == 0
573    mov          r1, r1m
574%endif
575    add          r5, 4
576    add          r2, 128
577    cmp          r5, 16
578    jl .nextblock
579
580    ADD         rsp, pad
581    RET
582.no_dc:
583    mov         r6d, dword [r1+r5*4]
584    add          r6, r0
585    add   word [r2], 32
586    IDCT8_ADD_MMX_START r2  , rsp
587    IDCT8_ADD_MMX_START r2+8, rsp+64
588    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
589    mov         r6d, dword [r1+r5*4]
590    lea          r6, [r0+r6+4]
591    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
592.skipblock:
593    add          r5, 4
594    add          r2, 128
595    cmp          r5, 16
596    jl .nextblock
597
598    ADD         rsp, pad
599    RET
600
601INIT_XMM sse2
602; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
603;                                int16_t *block, int stride,
604;                                const uint8_t nnzc[6 * 8])
605cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
606    movsxdifnidn r3, r3d
607    xor          r5, r5
608%ifdef PIC
609    lea     picregq, [scan8_mem]
610%endif
611.nextblock:
612    movzx        r6, byte [scan8+r5]
613    movzx        r6, byte [r4+r6]
614    test         r6, r6
615    jz .skipblock
616    cmp          r6, 1
617    jnz .no_dc
618    movsx        r6, word [r2]
619    test         r6, r6
620    jz .no_dc
621INIT_MMX cpuname
622    mov   word [r2], 0
623    DC_ADD_MMXEXT_INIT r6, r3
624%if ARCH_X86_64 == 0
625%define dst2q r1
626%define dst2d r1d
627%endif
628    mov       dst2d, dword [r1+r5*4]
629    add       dst2q, r0
630    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
631    lea       dst2q, [dst2q+r3*4]
632    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
633%if ARCH_X86_64 == 0
634    mov          r1, r1m
635%endif
636    add          r5, 4
637    add          r2, 128
638    cmp          r5, 16
639    jl .nextblock
640    REP_RET
641.no_dc:
642INIT_XMM cpuname
643    mov       dst2d, dword [r1+r5*4]
644    add       dst2q, r0
645    IDCT8_ADD_SSE dst2q, r2, r3, r6
646%if ARCH_X86_64 == 0
647    mov          r1, r1m
648%endif
649.skipblock:
650    add          r5, 4
651    add          r2, 128
652    cmp          r5, 16
653    jl .nextblock
654    REP_RET
655
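; Chroma: dest is an array of plane pointers (uint8_t **), the chroma blocks
; start at index 16 (block + 512 bytes), and this helper handles the four 4x4
; blocks of one chroma plane; the caller bumps the dest pointer by gprsize to
; move on to the second plane.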
656INIT_MMX mmx
657h264_idct_add8_mmx_plane:
658    movsxdifnidn r3, r3d
659.nextblock:
660    movzx        r6, byte [scan8+r5]
661    movzx        r6, byte [r4+r6]
662    or          r6w, word [r2]
663    test         r6, r6
664    jz .skipblock
665%if ARCH_X86_64
666    mov         r0d, dword [r1+r5*4]
667    add          r0, [dst2q]
668%else
669    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
670    mov          r0, [r0]
671    add          r0, dword [r1+r5*4]
672%endif
673    IDCT4_ADD    r0, r2, r3
674.skipblock:
675    inc          r5
676    add          r2, 32
677    test         r5, 3
678    jnz .nextblock
679    rep ret
680
681; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
682;                              int16_t *block, int stride,
683;                              const uint8_t nnzc[6 * 8])
684cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
685    movsxdifnidn r3, r3d
686    mov          r5, 16
687    add          r2, 512
688%ifdef PIC
689    lea     picregq, [scan8_mem]
690%endif
691%if ARCH_X86_64
692    mov       dst2q, r0
693%endif
694    call         h264_idct_add8_mmx_plane
695    mov          r5, 32
696    add          r2, 384
697%if ARCH_X86_64
698    add       dst2q, gprsize
699%else
700    add        r0mp, gprsize
701%endif
702    call         h264_idct_add8_mmx_plane
703    RET ; TODO: check rep ret after a function call
704
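; 4:2:2 variant: each chroma plane carries eight 4x4 blocks instead of four,
; so the 4-block plane helper is called twice per plane.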
705cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
706; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
707    movsxdifnidn r3, r3d
708%ifdef PIC
709    lea     picregq, [scan8_mem]
710%endif
711%if ARCH_X86_64
712    mov       dst2q, r0
713%endif
714
715    mov          r5, 16  ; i
716    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
717
718    call         h264_idct_add8_mmx_plane
719    add r5, 4
720    call         h264_idct_add8_mmx_plane
721
722%if ARCH_X86_64
723    add       dst2q, gprsize ; dest[1]
724%else
725    add        r0mp, gprsize
726%endif
727
728    add r5, 4   ; set to 32
729    add r2, 256 ; set to i * 16 * sizeof(dctcoef)
730
731    call         h264_idct_add8_mmx_plane
732    add r5, 4
733    call         h264_idct_add8_mmx_plane
734
735    RET ; TODO: check rep ret after a function call
736
737h264_idct_add8_mmxext_plane:
738    movsxdifnidn r3, r3d
739.nextblock:
740    movzx        r6, byte [scan8+r5]
741    movzx        r6, byte [r4+r6]
742    test         r6, r6
743    jz .try_dc
744%if ARCH_X86_64
745    mov         r0d, dword [r1+r5*4]
746    add          r0, [dst2q]
747%else
748    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
749    mov          r0, [r0]
750    add          r0, dword [r1+r5*4]
751%endif
752    IDCT4_ADD    r0, r2, r3
753    inc          r5
754    add          r2, 32
755    test         r5, 3
756    jnz .nextblock
757    rep ret
758.try_dc:
759    movsx        r6, word [r2]
760    test         r6, r6
761    jz .skipblock
762    mov   word [r2], 0
763    DC_ADD_MMXEXT_INIT r6, r3
764%if ARCH_X86_64
765    mov         r0d, dword [r1+r5*4]
766    add          r0, [dst2q]
767%else
768    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
769    mov          r0, [r0]
770    add          r0, dword [r1+r5*4]
771%endif
772    DC_ADD_MMXEXT_OP movh, r0, r3, r6
773.skipblock:
774    inc          r5
775    add          r2, 32
776    test         r5, 3
777    jnz .nextblock
778    rep ret
779
780INIT_MMX mmxext
781; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
782;                                 int16_t *block, int stride,
783;                                 const uint8_t nnzc[6 * 8])
784cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
785    movsxdifnidn r3, r3d
786    mov          r5, 16
787    add          r2, 512
788%if ARCH_X86_64
789    mov       dst2q, r0
790%endif
791%ifdef PIC
792    lea     picregq, [scan8_mem]
793%endif
794    call h264_idct_add8_mmxext_plane
795    mov          r5, 32
796    add          r2, 384
797%if ARCH_X86_64
798    add       dst2q, gprsize
799%else
800    add        r0mp, gprsize
801%endif
802    call h264_idct_add8_mmxext_plane
803    RET ; TODO: check rep ret after a function call
804
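; Applies the DC of two horizontally adjacent 4x4 blocks (block[0] and
; block[32]) to an 8x4 pixel area in one pass; used by the SSE2
; add16intra/add8 loops below.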
805; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
806h264_idct_dc_add8_mmxext:
807    movsxdifnidn r3, r3d
808    movd         m0, [r2   ]          ;  0 0 X D
809    mov word [r2+ 0], 0
810    punpcklwd    m0, [r2+32]          ;  x X d D
811    mov word [r2+32], 0
812    paddsw       m0, [pw_32]
813    psraw        m0, 6
814    punpcklwd    m0, m0               ;  d d D D
815    pxor         m1, m1               ;  0 0 0 0
816    psubw        m1, m0               ; -d-d-D-D
817    packuswb     m0, m1               ; -d-d-D-D d d D D
818    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
819    punpcklwd    m0, m0               ;  d d d d D D D D
820    lea          r6, [r3*3]
821    DC_ADD_MMXEXT_OP movq, r0, r3, r6
822    ret
823
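; Runs two 4x4 inverse transforms at once by packing block n into the low and
; block n+1 into the high halves of the xmm registers (movq + movhps loads),
; then adds the result to an 8x4 pixel area.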
824ALIGN 16
825INIT_XMM sse2
826; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
827h264_add8x4_idct_sse2:
828    movsxdifnidn r3, r3d
829    movq   m0, [r2+ 0]
830    movq   m1, [r2+ 8]
831    movq   m2, [r2+16]
832    movq   m3, [r2+24]
833    movhps m0, [r2+32]
834    movhps m1, [r2+40]
835    movhps m2, [r2+48]
836    movhps m3, [r2+56]
837    IDCT4_1D w,0,1,2,3,4,5
838    TRANSPOSE2x4x4W 0,1,2,3,4
839    paddw m0, [pw_32]
840    IDCT4_1D w,0,1,2,3,4,5
841    pxor  m7, m7
842    mova [r2+ 0], m7
843    mova [r2+16], m7
844    mova [r2+32], m7
845    mova [r2+48], m7
846    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
847    lea   r0, [r0+r3*2]
848    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
849    ret
850
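; %2 is a scan8[] value used directly as an offset into nnzc: a single word
; load checks two horizontally adjacent blocks at once, and [r1+%1*8] reads
; block_offset[2*%1], the offset of the first block of the pair.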
851%macro add16_sse2_cycle 2
852    movzx       r0, word [r4+%2]
853    test        r0, r0
854    jz .cycle%1end
855    mov        r0d, dword [r1+%1*8]
856%if ARCH_X86_64
857    add         r0, r5
858%else
859    add         r0, r0m
860%endif
861    call        h264_add8x4_idct_sse2
862.cycle%1end:
863%if %1 < 7
864    add         r2, 64
865%endif
866%endmacro
867
868; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
869;                                int16_t *block, int stride,
870;                                const uint8_t nnzc[6 * 8])
871cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
872    movsxdifnidn r3, r3d
873%if ARCH_X86_64
874    mov         r5, r0
875%endif
    ; unrolling the loop gives an average performance gain of 20-25%
878    add16_sse2_cycle 0, 0xc
879    add16_sse2_cycle 1, 0x14
880    add16_sse2_cycle 2, 0xe
881    add16_sse2_cycle 3, 0x16
882    add16_sse2_cycle 4, 0x1c
883    add16_sse2_cycle 5, 0x24
884    add16_sse2_cycle 6, 0x1e
885    add16_sse2_cycle 7, 0x26
886REP_RET
887
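; Intra variant: the luma DC coefficients are transformed separately, so a
; block can have a non-zero DC even when nnzc says it is empty; in that case
; only the DC is applied via h264_idct_dc_add8_mmxext.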
888%macro add16intra_sse2_cycle 2
889    movzx       r0, word [r4+%2]
890    test        r0, r0
891    jz .try%1dc
892    mov        r0d, dword [r1+%1*8]
893%if ARCH_X86_64
894    add         r0, r7
895%else
896    add         r0, r0m
897%endif
898    call        h264_add8x4_idct_sse2
899    jmp .cycle%1end
900.try%1dc:
901    movsx       r0, word [r2   ]
902    or         r0w, word [r2+32]
903    jz .cycle%1end
904    mov        r0d, dword [r1+%1*8]
905%if ARCH_X86_64
906    add         r0, r7
907%else
908    add         r0, r0m
909%endif
910    call        h264_idct_dc_add8_mmxext
911.cycle%1end:
912%if %1 < 7
913    add         r2, 64
914%endif
915%endmacro
916
917; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
918;                                     int16_t *block, int stride,
919;                                     const uint8_t nnzc[6 * 8])
920cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
921    movsxdifnidn r3, r3d
922%if ARCH_X86_64
923    mov         r7, r0
924%endif
925    add16intra_sse2_cycle 0, 0xc
926    add16intra_sse2_cycle 1, 0x14
927    add16intra_sse2_cycle 2, 0xe
928    add16intra_sse2_cycle 3, 0x16
929    add16intra_sse2_cycle 4, 0x1c
930    add16intra_sse2_cycle 5, 0x24
931    add16intra_sse2_cycle 6, 0x1e
932    add16intra_sse2_cycle 7, 0x26
933REP_RET
934
935%macro add8_sse2_cycle 2
936    movzx       r0, word [r4+%2]
937    test        r0, r0
938    jz .try%1dc
939%if ARCH_X86_64
940    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
941    add         r0, [r7]
942%else
943    mov         r0, r0m
944    mov         r0, [r0]
945    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
946%endif
947    call        h264_add8x4_idct_sse2
948    jmp .cycle%1end
949.try%1dc:
950    movsx       r0, word [r2   ]
951    or         r0w, word [r2+32]
952    jz .cycle%1end
953%if ARCH_X86_64
954    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
955    add         r0, [r7]
956%else
957    mov         r0, r0m
958    mov         r0, [r0]
959    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
960%endif
961    call        h264_idct_dc_add8_mmxext
962.cycle%1end:
963%if %1 == 1
964    add         r2, 384+64
965%elif %1 < 3
966    add         r2, 64
967%endif
968%endmacro
969
970; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
971;                               int16_t *block, int stride,
972;                               const uint8_t nnzc[6 * 8])
973cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
974    movsxdifnidn r3, r3d
975    add          r2, 512
976%if ARCH_X86_64
977    mov          r7, r0
978%endif
979    add8_sse2_cycle 0, 0x34
980    add8_sse2_cycle 1, 0x3c
981%if ARCH_X86_64
982    add          r7, gprsize
983%else
984    add        r0mp, gprsize
985%endif
986    add8_sse2_cycle 2, 0x5c
987    add8_sse2_cycle 3, 0x64
988REP_RET
989
; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
991
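; Inverse 4x4 Hadamard transform of the 16 luma DC coefficients (WALSH4_1D,
; transpose, WALSH4_1D), followed by dequantization, roughly
; output[16*i] = (dc[i] * qmul + 128) >> 8, with STORE_WORDS scattering each
; result into the DC slot of its 4x4 block (one word every 32 bytes).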
992%macro WALSH4_1D 5
993    SUMSUB_BADC w, %4, %3, %2, %1, %5
994    SUMSUB_BADC w, %4, %2, %3, %1, %5
995    SWAP %1, %4, %3
996%endmacro
997
998%macro DEQUANT 1-3
999%if cpuflag(sse2)
1000    movd      xmm4, t3d
1001    movq      xmm5, [pw_1]
1002    pshufd    xmm4, xmm4, 0
1003    movq2dq   xmm0, m0
1004    movq2dq   xmm1, m1
1005    movq2dq   xmm2, m2
1006    movq2dq   xmm3, m3
1007    punpcklwd xmm0, xmm5
1008    punpcklwd xmm1, xmm5
1009    punpcklwd xmm2, xmm5
1010    punpcklwd xmm3, xmm5
1011    pmaddwd   xmm0, xmm4
1012    pmaddwd   xmm1, xmm4
1013    pmaddwd   xmm2, xmm4
1014    pmaddwd   xmm3, xmm4
1015    psrad     xmm0, %1
1016    psrad     xmm1, %1
1017    psrad     xmm2, %1
1018    psrad     xmm3, %1
1019    packssdw  xmm0, xmm1
1020    packssdw  xmm2, xmm3
1021%else
1022    mova        m7, [pw_1]
1023    mova        m4, %1
1024    punpcklwd   %1, m7
1025    punpckhwd   m4, m7
1026    mova        m5, %2
1027    punpcklwd   %2, m7
1028    punpckhwd   m5, m7
1029    movd        m7, t3d
1030    punpckldq   m7, m7
1031    pmaddwd     %1, m7
1032    pmaddwd     %2, m7
1033    pmaddwd     m4, m7
1034    pmaddwd     m5, m7
1035    psrad       %1, %3
1036    psrad       %2, %3
1037    psrad       m4, %3
1038    psrad       m5, %3
1039    packssdw    %1, m4
1040    packssdw    %2, m5
1041%endif
1042%endmacro
1043
1044%macro STORE_WORDS 5-9
1045%if cpuflag(sse)
1046    movd  t0d, %1
1047    psrldq  %1, 4
1048    movd  t1d, %1
1049    psrldq  %1, 4
1050    mov [t2+%2*32], t0w
1051    mov [t2+%4*32], t1w
1052    shr   t0d, 16
1053    shr   t1d, 16
1054    mov [t2+%3*32], t0w
1055    mov [t2+%5*32], t1w
1056    movd  t0d, %1
1057    psrldq  %1, 4
1058    movd  t1d, %1
1059    mov [t2+%6*32], t0w
1060    mov [t2+%8*32], t1w
1061    shr   t0d, 16
1062    shr   t1d, 16
1063    mov [t2+%7*32], t0w
1064    mov [t2+%9*32], t1w
1065%else
1066    movd  t0d, %1
1067    psrlq  %1, 32
1068    movd  t1d, %1
1069    mov [t2+%2*32], t0w
1070    mov [t2+%4*32], t1w
1071    shr   t0d, 16
1072    shr   t1d, 16
1073    mov [t2+%3*32], t0w
1074    mov [t2+%5*32], t1w
1075%endif
1076%endmacro
1077
1078%macro DEQUANT_STORE 1
1079%if cpuflag(sse2)
1080    DEQUANT     %1
1081    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
1082    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
1083%else
1084    DEQUANT     m0, m1, %1
1085    STORE_WORDS m0,  0,  1,  4,  5
1086    STORE_WORDS m1,  2,  3,  6,  7
1087
1088    DEQUANT     m2, m3, %1
1089    STORE_WORDS m2,  8,  9, 12, 13
1090    STORE_WORDS m3, 10, 11, 14, 15
1091%endif
1092%endmacro
1093
1094%macro IDCT_DC_DEQUANT 1
1095cglobal h264_luma_dc_dequant_idct, 3, 4, %1
1096    ; manually spill XMM registers for Win64 because
1097    ; the code here is initialized with INIT_MMX
1098    WIN64_SPILL_XMM %1
1099    movq        m3, [r1+24]
1100    movq        m2, [r1+16]
1101    movq        m1, [r1+ 8]
1102    movq        m0, [r1+ 0]
1103    WALSH4_1D    0,1,2,3,4
1104    TRANSPOSE4x4W 0,1,2,3,4
1105    WALSH4_1D    0,1,2,3,4
1106
1107; shift, tmp, output, qmul
1108%if WIN64
1109    DECLARE_REG_TMP 0,3,1,2
1110    ; we can't avoid this, because r0 is the shift register (ecx) on win64
1111    xchg        r0, t2
1112%elif ARCH_X86_64
1113    DECLARE_REG_TMP 3,1,0,2
1114%else
1115    DECLARE_REG_TMP 1,3,0,2
1116%endif
1117
1118    cmp        t3d, 32767
1119    jg .big_qmul
1120    add        t3d, 128 << 16
1121    DEQUANT_STORE 8
1122    RET
1123.big_qmul:
1124    bsr        t0d, t3d
1125    add        t3d, 128 << 16
1126    mov        t1d, 7
1127    cmp        t0d, t1d
1128    cmovg      t0d, t1d
1129    inc        t1d
1130    shr        t3d, t0b
1131    sub        t1d, t0d
1132%if cpuflag(sse2)
1133    movd      xmm6, t1d
1134    DEQUANT_STORE xmm6
1135%else
1136    movd        m6, t1d
1137    DEQUANT_STORE m6
1138%endif
1139    RET
1140%endmacro
1141
1142INIT_MMX mmx
1143IDCT_DC_DEQUANT 0
1144INIT_MMX sse2
1145IDCT_DC_DEQUANT 7
1146
1147%ifdef __NASM_VER__
1148%if __NASM_MAJOR__ >= 2 && __NASM_MINOR__ >= 4
%unmacro STORE_DIFFx2 8 ; drop the x86util.asm definition so the local variant below is used (yasm has no %unmacro, hence the NASM version check)
1150%endif
1151%endif
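; Replacement STORE_DIFFx2: adds the shifted residuals to four destination
; pixels per row, using movd loads/stores as required by the xmm 4x4 idct_add
; below.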
1152%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
1153    movd       %3, [%7]
1154    movd       %4, [%7+%8]
1155    psraw      %1, %6
1156    psraw      %2, %6
1157    punpcklbw  %3, %5
1158    punpcklbw  %4, %5
1159    paddw      %3, %1
1160    paddw      %4, %2
1161    packuswb   %3, %5
1162    packuswb   %4, %5
1163    movd     [%7], %3
1164    movd  [%7+%8], %4
1165%endmacro
1166
1167%macro DC_ADD_INIT 1
1168    add      %1d, 32
1169    sar      %1d, 6
1170    movd     m0, %1d
1171    pshuflw  m0, m0, 0
1172    lea      %1, [3*stride_q]
1173    pxor     m1, m1
1174    psubw    m1, m0
1175    packuswb m0, m0
1176    packuswb m1, m1
1177%endmacro
1178
1179%macro IDCT_XMM 1
1180
1181INIT_XMM %1
1182
1183cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
1184    movsxdifnidn stride_q, stride_d
1185    IDCT4_ADD    dst_q, block_q, stride_q
1186RET
1187
1188cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
1189    movsxdifnidn stride_q, stride_d
1190    movsx             r3d, word [block_q]
1191    mov   dword [block_q], 0
1192    DC_ADD_INIT r3
1193    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
1194RET
1195
1196%endmacro
1197
1198IDCT_XMM sse2
1199IDCT_XMM avx
1200