;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max_10 pw_1023
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_m2:           times 8 dw -2
pd_1:            times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro
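
; The interleave sequence above is the standard SSE2 transpose idiom: bytes are
; interleaved pairwise (punpcklbw), then by words and by dwords, so each of
; m0..m3 ends up holding one *column* of the 8x4 input block, zero-extended to
; words. This is what lets the vertical-edge filters below reuse the same
; row-oriented filter bodies as the horizontal-edge ones.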

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m2
    packuswb         m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m1
    pshufd           m1, m1, 0x39
    movd             %6, m1
    pshufd           m1, m1, 0x39
    movd             %7, m1
    pshufd           m1, m1, 0x39
    movd             %8, m1
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8; %9 = clamp upper bound ([pw_pixel_max_*])
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro
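
; Note: this 8x8 transpose (and the luma filter body further down) uses xmm
; registers above xmm7 (m9, m13, m15 here), which exist only on x86-64; that is
; why the luma filters are wrapped in %if ARCH_X86_64 below, while the chroma
; filters, which fit in m0..m6, are built unconditionally.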


; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m4
    packuswb         m1, m5
    packuswb         m2, m6
    packuswb         m3, m7
    TRANSPOSE2x4x4B   0, 1, 2, 3, 4

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8; %9 = clamp upper bound ([pw_pixel_max_*])
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro
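
; Both MASKED_COPY variants are a branchless select; in rough C terms
; (mask lanes are all-ones where the filtered value is kept):
;     dst = (new & mask) | (dst & ~mask);
; MASKED_COPY keeps the mask in m11 and clobbers m10, MASKED_COPY2 takes the
; mask as %3 and clobbers it instead.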

ALIGN 16
; input: p1, p0, q0, q1 in m0..m3; tc pair at [tcq]. Output: filtered p0 in m1, q0 in m2
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro
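
; For reference, CHROMA_DEBLOCK_BODY corresponds to the scalar chroma filter
; (names p1/p0/q0/q1/tc as in the HEVC spec text, not symbols in this file):
;     delta = av_clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc);
;     p0' = p0 + delta;
;     q0' = q0 - delta;
; For 10/12-bit input, tc is scaled by << (BIT_DEPTH - 8) before clipping and
; the callers clamp the results to [0, pixel_max] with CLIPW afterwards.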

; input: p3..p0 in m0..m3, q0..q3 in m4..m7; beta in betad (r2), tc pair at [tcq] (r3).
; Output: filtered p2..q2 in m1..m6
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3, 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3, 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3), 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r6, m15;
    ;end weak / strong decision
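
; The block above is the luma on/off and strong/weak decision for two 4-row
; segments at once. With dp = |p2 - 2*p1 + p0| and dq = |q2 - 2*q1 + q0|
; (rows 0 and 3 of each segment), a segment is filtered at all only if
; (dp0 + dq0) + (dp3 + dq3) < beta (mask in r13), and is a strong-filter
; candidate if 2*(dpX + dqX) < (beta >> 2) for both rows (partial mask in r6,
; completed by the beta_3 and tc25 checks below).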

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations
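
; tc25 = (5*tc + 1) >> 1 is the threshold of the remaining strong-filter
; conditions: together with the beta_2 check above, the strong filter is used
; only where, for rows 0 and 3,
;     |p3 - p0| + |q3 - q0| < (beta >> 3)  and  |p0 - q0| < (5*tc + 1) >> 1.
; Both comparisons are evaluated just below and ANDed into r6.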

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and             r6, r11; strong mask, beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov             r11, r6;
    shr             r11, 1;
    and             r6, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and             r6, 5; 0b101
    mov             r11, r6; strong mask
    shr             r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl             r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or              r6, r11; final strong mask, bits 1 and 0
    jz      .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2; ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;   2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;         p0 +   q0
    paddw            m8, m5;         p0 +   q0 +   q1
    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    psubw            m8, m4; ((p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3) - q0
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;   p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;   p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 +  q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12
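
; Summary of the strong-filter results computed above (each clipped to
; +/- 2*tc around the original sample before being merged with MASKED_COPY):
;     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;     q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
;     q1' = (p0 + q0 + q1 + q2 + 2) >> 2
;     q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3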

.weakfilter:
    not             r6; strong mask -> weak mask
    and             r6, r13; final weak filtering mask, bits 0 and 1
    jz             .store

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and             r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; ((beta + (beta >> 1)) >> 3)

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)


    psllw           m10, m9, 2; 8 * tc (m9 holds 2*tc here)
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)
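
; Weak-filter path: delta0 computed above is
;     delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4
; and is only applied where |delta0| < 10*tc (the pcmpgtw/pand on m10/m11
; above); it has just been clipped to [-tc, tc]. p0/q0 are adjusted by
; +/- delta0 at the end of the macro; the optional p1/q1 adjustments below use
; the half-strength clip [-tc/2, tc/2].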

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; (((q2 + q0 + 1) >> 1) - q1) - delta0
    psraw            m8, 1;   ((((q2 + q0 + 1) >> 1) - q1) - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'
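
; p1' and q1' above are only stored where the per-side activity is low enough,
; i.e. where the dp (resp. dq) sum of rows 0 and 3 is below
; (beta + (beta >> 1)) >> 3; betaq was reduced to that value in the weak-filter
; setup, and r7d/r8d (dp) and r9d/r10d (dq) hold the sums computed earlier.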

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
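
; For a vertical edge the p/q samples of each line are adjacent within a row,
; so p1, p0, q0 and q1 are *columns* of the loaded block. The _v_ variants
; below therefore back pixq up by two samples, transpose 8 rows x 4 samples
; into m0..m3, run the same CHROMA_DEBLOCK_BODY as the horizontal variants,
; and transpose back on store (with CLIPW clamping for 10/12-bit input).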
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
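
; For a horizontal edge, p1/p0 are the two rows above pixq and q0/q1 the two
; rows starting at pixq, so plain row loads suffice. The chroma filter only
; modifies p0 and q0, hence only those two rows are written back.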
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];    q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb         m1, m2
    movh [pix0q+strideq], m1
    movhps       [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_10]
    CLIPW           m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_12]
    CLIPW           m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
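
; LUMA_DEBLOCK_BODY jumps to .bypassluma when nothing needs filtering and to
; .store when the weak-filter pass is skipped; both labels are defined below in
; each caller, right after the macro expansion, so the bit-depth specific store
; code also serves as the macro's fall-through target.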
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
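
; The _h_ luma variants load four rows above the edge (p3..p0) and four rows
; below it (q0..q3) directly into m0..m7. Only p2..q2 can be modified, so the
; store code writes back six rows and leaves the p3/q3 rows untouched.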
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];               p3
    movq             m1, [pix0q +     strideq]; p2
    movq             m2, [pix0q + 2 * strideq]; p1
    movq             m3, [pix0q + src3strideq]; p0
    movq             m4, [pixq];                q0
    movq             m5, [pixq +     strideq];  q1
    movq             m6, [pixq + 2 * strideq];  q2
    movq             m7, [pixq + src3strideq];  q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb          m1, m2
    packuswb          m3, m4
    packuswb          m5, m6
    movh   [pix0q +     strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq  +     strideq], m5
    movhps [pixq  + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             10, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_10]
    CLIPW                         m2, m8, [pw_pixel_max_10]
    CLIPW                         m3, m8, [pw_pixel_max_10]
    CLIPW                         m4, m8, [pw_pixel_max_10]
    CLIPW                         m5, m8, [pw_pixel_max_10]
    CLIPW                         m6, m8, [pw_pixel_max_10]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             12, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_12]
    CLIPW                         m2, m8, [pw_pixel_max_12]
    CLIPW                         m3, m8, [pw_pixel_max_12]
    CLIPW                         m4, m8, [pw_pixel_max_12]
    CLIPW                         m5, m8, [pw_pixel_max_12]
    CLIPW                         m6, m8, [pw_pixel_max_12]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif
