;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
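; scan8_mem maps a running 4x4-block counter to that block's byte offset in
; the caller's nnzc (non-zero count) cache, which is laid out with 8 entries
; per row; the 16 luma blocks come first, followed by the two chroma planes.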
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
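; With PIC, [scan8+r5] cannot combine RIP-relative addressing with an index
; register, so an extra GPR (picregq) is reserved and loaded with the address
; of the table; without PIC, scan8 refers to scan8_mem directly.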
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text
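; IDCT4_1D (from x86util.asm) performs one pass of the standard H.264 4x4
; inverse-transform butterfly on packed words:
;   e0 = d0 + d2            e1 = d0 - d2
;   e2 = (d1 >> 1) - d3     e3 = d1 + (d3 >> 1)
;   f0 = e0 + e3, f1 = e1 + e2, f2 = e1 - e2, f3 = e0 - e3
; IDCT4_ADD runs it once, transposes, adds the +32 rounding bias, runs it
; again in the other direction, and lets STORE_DIFFx2 do the final >>6 and
; the saturated add to dst.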
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
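; IDCT8_1D is one pass of the 8x8 inverse transform.  It expects rows 1/3/5/7
; in m1/m3/m5/m7 and rows 2/6 in m2/m6; rows 0 and 4 are passed via %1/%2
; (memory operands on MMX and 32-bit SSE2, xmm8/xmm9 on x86-64) so that the
; main butterfly fits within eight registers.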
%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
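; The MMX 8x8 add processes the block as two halves through a 128-byte
; scratch buffer on the stack: each IDCT8_ADD_MMX_START runs the first 1-D
; pass on one half and transposes it into the buffer, and each
; IDCT8_ADD_MMX_END runs the second pass from the buffer and adds four
; columns of the result to dst and dst+4 respectively.  The first _END
; invocation (the 4-argument form) also clears the original coefficient
; block.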
INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
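; The SSE2 version handles the whole 8x8 block in one pass per direction.  On
; x86-64 rows 0 and 4 are kept in xmm8/xmm9 across the second IDCT8_1D; on
; x86-32 they are spilled through the coefficient buffer instead, which is
; zeroed afterwards in either case.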
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
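; DC-only blocks: DC_ADD_MMXEXT_INIT computes dc = (block[0] + 32) >> 6 and
; broadcasts clip(dc, 0, 255) into m0 and clip(-dc, 0, 255) into m1 (it also
; leaves 3*stride in %1 for the row addressing in _OP).  DC_ADD_MMXEXT_OP
; then applies the signed dc to four rows of pixels at a time with a
; saturating add of m0 followed by a saturating subtract of m1.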
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif
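; The add16/add16intra/add8/add4 entry points below loop over the blocks of a
; macroblock: scan8 translates the block counter into an index into the nnzc
; (non-zero count) cache, and all-zero blocks are skipped.  The mmxext and
; sse2 variants additionally route blocks that only have a DC coefficient to
; the much cheaper dc_add code instead of the full transform.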
INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET
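; Applies the DC terms of two horizontally adjacent 4x4 blocks (block[0] and
; block[32]) to an 8-pixel-wide strip in one call; used as the DC-only
; fallback by the SSE2 add16intra/add8 loops below.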
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
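; Transforms two horizontally adjacent 4x4 blocks at once: movq/movhps pack
; them into the low/high halves of each xmm register, so a single SSE2
; IDCT4_1D pass does the work of two MMX-sized passes.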
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor  m7, m7
    mova [r2+ 0], m7
    mova [r2+16], m7
    mova [r2+32], m7
    mova [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r5
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov         r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov         r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add         r2, 384+64
%elif %1 < 3
    add         r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
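; The 16 luma DC coefficients are transformed with the 4x4 Hadamard (Walsh)
; butterfly, applied twice with a transpose in between; WALSH4_1D is a pure
; add/subtract butterfly, so no >>1 terms are needed here.  Dequantization
; interleaves each coefficient with the constant 1 so that one pmaddwd
; against (qmul, 128) yields c * qmul + 128, and psrad completes the
; (c * qmul + 128) >> 8 rounding; qmul values above 32767 are pre-shifted in
; .big_qmul so they still fit in a signed 16-bit multiplier.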
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endmacro
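; STORE_WORDS scatters the dequantized DC values back into the caller's
; coefficient array: destinations are 32 bytes (one 16-coefficient 4x4 block)
; apart, i.e. the DC slot of each of the sixteen luma blocks.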
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
%if cpuflag(sse2)
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd        m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7