;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_pixel_max_10: times 8 dw ((1 << 10)-1)
pw_m2:           times 8 dw -2
pd_1:            times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
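; For instance (reference only, matching how the functions below invoke it):
;   PASS8ROWS(p, p + 3*stride, stride, 3*stride)
; expands to [p], [p+stride], [p+2*stride], ..., [p+7*stride].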

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m2
    packuswb         m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m1
    pshufd           m1, m1, 0x39
    movd             %6, m1
    pshufd           m1, m1, 0x39
    movd             %7, m1
    pshufd           m1, m1, 0x39
    movd             %8, m1
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8 (%9 = pixel max for clipping)
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro


; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m4
    packuswb         m1, m5
    packuswb         m2, m6
    packuswb         m3, m7
    TRANSPOSE2x4x4B   0, 1, 2, 3, 4

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8 (%9 = pixel max for clipping)
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro
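
; Both copy macros implement the same select, sketched here in scalar C form
; (reference only, not assembled; %1 is dst, %2 is src, and the mask is
; all-ones in lanes where the filtered value should be written):
;     dst = (src & mask) | (dst & ~mask);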

ALIGN 16
; in: p1, p0, q0, q1 in m0..m3, tc pair at [tcq]. out: filtered p0 in m1, q0 in m2
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro
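
; Scalar reference for the chroma filter realized above (a sketch only; tc is
; the per-block threshold, scaled by BIT_DEPTH - 8 as in the %if block above):
;     delta = av_clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc);
;     p0   += delta;
;     q0   -= delta;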

; in: p3..q3 in m0..m7, beta in betad, tc pair at [tcq]. out: filtered p2..q2 in m1..m6
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
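    ; Summary of the strong-filter conditions gathered into r6 above, in
    ; scalar form (a sketch only; d is the per-line activity sum from the top):
    ;     strong = 2*d               < (beta >> 2)
    ;           && |p3-p0| + |q3-q0| < (beta >> 3)
    ;           && |p0-q0|           < (5*tc + 1) >> 1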
    mov             r11, r6;
    shr             r11, 1;
    and             r6, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and             r6, 5; 0b101
    mov             r11, r6; strong mask
    shr             r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl             r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or              r6, r11; final strong mask, bits 1 and 0
    jz      .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2; ((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;   2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;         p0 +   q0
    paddw            m8, m5;         p0 +   q0 +   q1
    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;   p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;   p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 +  q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not             r6; strong mask -> weak mask
    and             r6, r13; final weak filtering mask, bits 0 and 1
    jz             .store

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and             r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; (beta + (beta >> 1)) >> 3

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)

    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0
    psraw            m8, 1;   (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro
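
; Scalar reference for the normal (weak) filter applied in the .weakfilter
; path above (a sketch only; side_thresh is shorthand for the
; (beta + (beta >> 1)) >> 3 value computed there):
;     delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
;     if (abs(delta0) < 10 * tc) {
;         delta0 = av_clip(delta0, -tc, tc);
;         p0 += delta0;
;         q0 -= delta0;
;         if (dp < side_thresh)
;             p1 += av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -(tc >> 1), tc >> 1);
;         if (dq < side_thresh)
;             q1 += av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -(tc >> 1), tc >> 1);
;     }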

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];    q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb         m1, m2
    movh [pix0q+strideq], m1
    movhps       [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_10]
    CLIPW           m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_12]
    CLIPW           m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, strideq, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];               p3
    movq             m1, [pix0q +     strideq]; p2
    movq             m2, [pix0q + 2 * strideq]; p1
    movq             m3, [pix0q + src3strideq]; p0
    movq             m4, [pixq];                q0
    movq             m5, [pixq +     strideq];  q1
    movq             m6, [pixq + 2 * strideq];  q2
    movq             m7, [pixq + src3strideq];  q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb          m1, m2
    packuswb          m3, m4
    packuswb          m5, m6
    movh   [pix0q +     strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq  +     strideq], m5
    movhps [pixq  + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             10, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_10]
    CLIPW                         m2, m8, [pw_pixel_max_10]
    CLIPW                         m3, m8, [pw_pixel_max_10]
    CLIPW                         m4, m8, [pw_pixel_max_10]
    CLIPW                         m5, m8, [pw_pixel_max_10]
    CLIPW                         m6, m8, [pw_pixel_max_10]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             12, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_12]
    CLIPW                         m2, m8, [pw_pixel_max_12]
    CLIPW                         m3, m8, [pw_pixel_max_12]
    CLIPW                         m4, m8, [pw_pixel_max_12]
    CLIPW                         m5, m8, [pw_pixel_max_12]
    CLIPW                         m6, m8, [pw_pixel_max_12]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif