;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

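; A second base pointer plus a precomputed 3*stride are used because x86
; addressing can only scale an index register by 1, 2, 4 or 8: with
; base3 = base+3*stride and stride3 = 3*stride, all eight rows 0..7 are
; reachable without extra address arithmetic.
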
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

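; Per byte, sat(%1-%2) | sat(%2-%1) equals |%1-%2| (one of the two saturating
; differences is always zero), so the result %4 = sat(|%1-%2| - %3) is nonzero
; exactly where |%1-%2| > %3.
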
; out: %4 = 0xff where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

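; Both saturated values sat(sat(%1-%2)-%3) and sat(sat(%2-%1)-%3) are zero,
; and hence equal, only when |%1-%2| <= %3; callers rely on that polarity,
; e.g. adding 1 to tc (psubb of the 0xff mask) where |p2-p0| < beta.
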
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro

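; m7 is the per-pixel filtering mask of the H.264 loop filter: 0xff where
; |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta all hold, 0x00 elsewhere
; (the three DIFF_GT results are OR'ed, then inverted by the pcmpeqb against zero).
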
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

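; This implements the H.264 normal (bS < 4) filter update for p0/q0,
;   delta = Clip3(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3)
;   p0'   = Clip1(p0 + delta),  q0' = Clip1(q0 - delta)
; using byte averages only: after the pminub against m7 (= tc & mask), m3 holds
; max(delta,0) and m6 holds max(-delta,0), and the saturating add/sub pairs
; supply the final clip to [0,255].
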
; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

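; This matches the spec form q1' = q1 + Clip3(-tc0, tc0,
; (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1): since 2*q1 is even it can equally be
; subtracted after the shift, so clamping the averaged value to
; [q1-tc0, q1+tc0] with pmaxub/pminub is equivalent.
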
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
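; pix points at the first row below the edge (q0); the function filters one
; 16-pixel-wide horizontal luma edge (rows p1,p0 above it, q0,q1 below).
; tc0 holds four bytes, one per group of 4 columns; a value of -1 disables
; filtering for that group, which is what the pcmpeqb/pandn pair below tests.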
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7,  r1d
    lea    r8,  [r7+r7*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12*HAVE_ALIGNED_STACK

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
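; The pand/pxor sequences above are branchless byte selects, r = ((a^b) & m) ^ b,
; picking a where the mask is 0xff and b where it is 0x00: mask0 chooses between
; filtered and original p0, and mask1p (a subset of mask0) chooses the
; strong-filter results for p0/p1/p2 over the simple (2*p1+p0+q1+2)>>2 value.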

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7,  r1d
    lea    r8,  [r7*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

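; CHROMA_V_START: turns alpha/beta into alpha-1/beta-1 and points t5 at
; pix-2*stride (the p1 row) for filtering across a horizontal chroma edge.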
%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

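; CHROMA_H_START: same alpha/beta adjustment for a vertical chroma edge; t5 is
; left pointing two pixels left of the edge at the first row and r0 at the
; fourth row (t6 = 3*stride), so PASS8ROWS(t5, r0, r1, t6) covers all 8 rows.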
%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub   rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add   rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
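; The (p0^q1)&1 term makes the double pavgb rounding exact:
; avg(p1, avg(p0,q1) - ((p0+q1)&1)) == (p0 + q1 + 2*p1 + 2) >> 2 for all byte
; values, i.e. exactly the H.264 intra chroma filter (the macro is reused with
; swapped arguments for q0).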

%define t5 r4
%define t6 r5

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir,    int edges,    int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
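; For each pair of neighbouring 4x4 blocks this computes the boundary strength
; used by the deblocker: bS = 2 if either block has non-zero coefficients (nnz),
; otherwise 1 if the reference frames differ or any mv component differs by 4
; or more quarter-pel units (vertical threshold 2 in field mode, hence pb_3_1),
; otherwise 0. Intra edges (bS = 3 or 4) are handled outside this function.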
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd    %1
%define stepd     %2
%define mask_mvd  %3
%define dir       %4
%define d_idx     %5
%define mask_dir  %6
%define bidir     %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7
    pminub           m0, m7
    psllw            m1, 1
    pxor             m2, m2
    pmaxub           m1, m0
    punpcklbw        m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp    dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET