1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; Copyright (c) 2017-2021, The rav1e contributors
4; Copyright (c) 2021, Nathan Egge
5; All rights reserved.
6;
7; Redistribution and use in source and binary forms, with or without
8; modification, are permitted provided that the following conditions are met:
9;
10; 1. Redistributions of source code must retain the above copyright notice, this
11;    list of conditions and the following disclaimer.
12;
13; 2. Redistributions in binary form must reproduce the above copyright notice,
14;    this list of conditions and the following disclaimer in the documentation
15;    and/or other materials provided with the distribution.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28%include "config.asm"
29%include "ext/x86/x86inc.asm"
30
31SECTION_RODATA
32
; DUP8 val[, val...]
; Expands each argument into 8 copies as 16-bit words, i.e. one full
; 128-bit XMM register's worth of data per value (used for pmullw taps).
%macro DUP8 1-*
    %rep %0
        times 8 dw %1
        %rotate 1
    %endrep
%endmacro
39
; Primary tap pairs {4,2} and {3,3}, each word duplicated 8x for pmullw.
; The pair is selected by bit 4 of the bpc-normalized primary strength
; ("and prid, 16" below), giving a byte offset of 0 or 32 into this table.
pri_taps:  DUP8 4, 2, 3, 3
; Per-direction byte offsets into the padded pixel buffer (32-byte row
; pitch, 2 bytes per pixel).  For a direction d, dirq points at entry d;
; bytes [dirq+4]/[dirq+5] are the primary-tap offsets and
; [dirq+0..1]/[dirq+8..9] the secondary-tap offsets, which is why the
; first four rows are repeated at the end as wrap-around.
dir_table: db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4
           db  0 * 32 + 2,  0 * 32 + 4
           db  0 * 32 + 2,  1 * 32 + 4
           db  1 * 32 + 2,  2 * 32 + 4
           db  1 * 32 + 0,  2 * 32 + 2
           db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4

; Scale factors indexed by (bdmax >> 11)*8 in cdef_dir: entry 0 for
; 10 bpc, entry 1 for 12 bpc (presumably multipliers that reduce the
; samples to a common 8-bit range for the direction search — the use
; site is past this chunk; confirm against the rest of the file).
dir_shift: times 4 dw 0x4000
           times 4 dw 0x1000

pw_128:    times 4 dw 128
pw_2048:   times 8 dw 2048   ; pmulhrsw rounding factor (arithmetic >> 4)
; Padding value for unavailable border pixels.  Also doubles as the
; pshufb mask used to splat the low strength byte to all words: the
; word 0xC000 is bytes {0x00, 0xC0}, so pshufb replicates source byte 0
; into every even byte and zeroes every odd byte.
pw_m16384: times 8 dw -16384

cextern cdef_dir_8bpc_ssse3.main
cextern cdef_dir_8bpc_sse4.main
cextern shufw_6543210x
64
65SECTION .text
66
; Pick two scratch GPRs (t0/t1) per ABI: t0 carries the PIC base
; (LEA t0, dir_table below), t1 is a row pointer during block loads.
; Registers are chosen so they don't collide with the named args of
; each calling convention.
%if ARCH_X86_32
DECLARE_REG_TMP 5, 3
%elif WIN64
DECLARE_REG_TMP 8, 4
%else
DECLARE_REG_TMP 8, 6
%endif
74
;-----------------------------------------------------------------------
; CDEF_FILTER w, h
; Emits the tail of a cdef_filter_<w>x<h>_16bpc function: dispatches on
; the primary/secondary strengths into one of three row loops
; (pri+sec, pri-only, sec-only).  For square sizes (w == h) it also
; emits the shared per-call subroutines .pri / .sec / .pri_sec; the
; non-square sizes reuse the WxW function's subroutines via the
; mangle(...) calls below.  Each subroutine call filters 8 pixels
; (two rows for w==4, one row for w==8).
;
; Expected state from the cglobal prologue: the padded source block is
; at px (32-byte row pitch), m7 holds pw_m16384 (used as a pshufb
; word-splat mask), and pri_shift / sec_shift are stack slots used as
; memory operands for the variable psrlw shifts.
;-----------------------------------------------------------------------
%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
    mova            m8, [base+pw_2048]
%else
    ; x86-32 only has 8 XMM registers; spill m8-m10 to memory/stack.
    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
    %define         m8  [base+pw_2048]
    %define         m9  [rsp+16*1+gprsize]
    %define        m10  [rsp+16*2+gprsize]
%endif
    movifnidn     prid, r5m
    movifnidn     secd, r6m
    test          prid, prid
    jz .sec_only
    movd            m6, r5m
%if ARCH_X86_32
    mov       [rsp+24], pridmpd
%endif
    bsr        pridmpd, prid  ; pridmp = ulog2(pri)
    lea           tmpd, [priq*4]
    cmp     dword r10m, 0x3ff ; if (bpc == 10)
    cmove         prid, tmpd  ;     pri <<= 2
    mov           tmpd, r8m   ; damping
    mov           dird, r7m
    and           prid, 16    ; bit 4 selects the {4,2} vs {3,3} tap pair
    pshufb          m6, m7    ; splat
    lea           dirq, [base+dir_table+dirq*2]
    lea           priq, [base+pri_taps+priq*2]
    test          secd, secd
    jz .pri_only
    ; Both strengths active: save the splatted pri strength to [rsp]
    ; (m6 is reused for the sec strength), compute both shift amounts
    ; (pri_shift = max(0, damping - ulog2(pri)), sec_shift =
    ; damping - ulog2(sec)) and run the combined row loop.
    mova         [rsp], m6
    movd            m6, secd
    tzcnt         secd, secd
    sub        pridmpd, tmpd
    sub           tmpd, secd
    pshufb          m6, m7
    xor           secd, secd
    neg        pridmpd
    cmovs      pridmpd, secd  ; clamp pri_shift to >= 0
%if ARCH_X86_32
    mov  [pri_shift+4], secd
    mov  [sec_shift+4], secd
%endif
    mov  [pri_shift+0], pridmpq
    mov  [sec_shift+0], tmpq
    lea           tmpq, [px]
%if WIN64
    ; xmm9/xmm10 are callee-saved on Win64 but not covered by the
    ; cglobal declaration (9 regs); park them in the home space.
    movaps         r4m, m9
    movaps         r6m, m10
%elif ARCH_X86_32
    mov        pridmpd, [rsp+24]
%endif
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps          m9, r4m
    movaps         m10, r6m
%endif
    jmp .end
.pri_only:
    ; secd is known to be 0 here (tested above), so cmovs clamps the
    ; shift to 0 when damping - ulog2(pri) goes negative.
    sub           tmpd, pridmpd
    cmovs         tmpd, secd
%if ARCH_X86_32
    mov        pridmpd, [rsp+24]
    mov  [pri_shift+4], secd
%endif
    mov  [pri_shift+0], tmpq
    lea           tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.end:
    RET
.sec_only:
    mov           tmpd, r8m ; damping
    movd            m6, r6m
    tzcnt         secd, secd
    mov           dird, r7m
    pshufb          m6, m7
    sub           tmpd, secd
    lea           dirq, [base+dir_table+dirq*2]
%if ARCH_X86_32
    mov  [sec_shift+4], prid ; prid is 0 here; zeroes the upper half
%endif
    mov  [sec_shift+0], tmpq
    lea           tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
%if %1 == %2
 %if ARCH_X86_64
  DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
 %else
  DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
 %endif
ALIGN function_align
; .pri: filter 8 pixels with the two primary taps only.
; The constrain pattern used throughout (psrlw/psubusw/pminsw/psignw)
; computes: sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift))).
.pri:
    movsx         offq, byte [dirq+4]    ; off_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0p0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0p1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+5]    ; off_k1
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m0, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0p0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1p0
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1p1
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    psignw          m2, m3               ; constrain(diff_k0p1)
    pabsw           m3, m4               ; adiff_k1p0
    paddw           m0, m2               ; constrain(diff_k0)
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m0, [priq+16*0]      ; pri_tap_k0
    pmullw          m7, [priq+16*1]      ; pri_tap_k1
    paddw           m0, m7               ; sum
    ; (sum - (sum < 0) + 8) >> 4, via sign correction + pmulhrsw 2048
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
ALIGN function_align
; .sec: filter 8 pixels with the four secondary taps only
; (tap weights 2 and 1; the k0 sum is doubled via paddw m0, m0).
.sec:
    movsx         offq, byte [dirq+8]    ; off1_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0s1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+0]    ; off2_k0
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k0s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k0s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+9]    ; off1_k1
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    psignw          m2, m3               ; constrain(diff_k0s1)
    pabsw           m3, m4               ; adiff_k0s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k1s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k1s1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+1]    ; off2_k1
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k0s3)
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    paddw           m0, m7
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    psignw          m2, m3               ; constrain(diff_k1s1)
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
    psignw          m4, m5               ; constrain(diff_k1s3)
    paddw           m0, m4               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
ALIGN function_align
; .pri_sec: combined primary + secondary filtering.  In addition to the
; weighted sum, a running max (m9, over the raw tap values) and min
; (m10, over pabsw'd tap values) is kept for the final clamp.  The
; -16384 border padding is neutral in both: it never wins pmaxsw (very
; negative) and its absolute value 16384 exceeds any valid sample, so
; it never wins pminsw either.
.pri_sec:
    movsx         offq, byte [dirq+8]    ; off2_k0
%if %1 == 4
    movq            m1, [dstq+strideq*0]
    movhps          m1, [dstq+strideq*1]
    movq            m2, [tmpq+offq+32*0] ; k0s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0s1
    movhps          m3, [tmpq+offq+32*1]
%else
    mova            m1, [dstq]
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+0]    ; off3_k0
    pabsw           m4, m2
%if ARCH_X86_64
    pabsw          m10, m3
    pmaxsw          m9, m2, m3
    pminsw         m10, m4
%else
    pabsw           m7, m3
    pmaxsw          m5, m2, m3
    pminsw          m4, m7
    mova            m9, m5
    mova           m10, m4
%endif
    psubw           m2, m1               ; diff_k0s0
    psubw           m3, m1               ; diff_k0s1
    pabsw           m4, m2               ; adiff_k0s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m0, m6, m5
    pabsw           m5, m3               ; adiff_k0s1
    pminsw          m0, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m0, m2               ; constrain(diff_k0s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k0s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k0s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+9]    ; off2_k1
    pabsw           m7, m4
    psignw          m2, m3
    pabsw           m3, m5               ; constrain(diff_k0s1)
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m7
    pmaxsw          m9, m5
    pminsw         m10, m3
%else
    pminsw          m7, m10
    pminsw          m7, m3
    pmaxsw          m3, m9, m4
    pmaxsw          m3, m5
    mova           m10, m7
    mova            m9, m3
%endif
    psubw           m4, m1               ; diff_k0s2
    psubw           m5, m1               ; diff_k0s3
    paddw           m0, m2
    pabsw           m3, m4               ; adiff_k0s2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k0s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k0s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k1s0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k1s1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+1]    ; off3_k1
    paddw           m0, m7
    pabsw           m7, m2
    psignw          m4, m5               ; constrain(diff_k0s3)
    pabsw           m5, m3
%if ARCH_X86_64
    pmaxsw          m9, m2
    pminsw         m10, m7
    pmaxsw          m9, m3
    pminsw         m10, m5
%else
    pminsw          m7, m10
    pminsw          m7, m5
    pmaxsw          m5, m9, m2
    pmaxsw          m5, m3
    mova           m10, m7
    mova            m9, m5
%endif
    paddw           m0, m4               ; constrain(diff_k0)
    psubw           m2, m1               ; diff_k1s0
    psubw           m3, m1               ; diff_k1s1
    paddw           m0, m0               ; sec_tap_k0
    pabsw           m4, m2               ; adiff_k1s0
    psrlw           m5, m4, [sec_shift+gprsize]
    psubusw         m7, m6, m5
    pabsw           m5, m3               ; adiff_k1s1
    pminsw          m7, m4
    psrlw           m4, m5, [sec_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k1s0)
    psubusw         m2, m6, m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1s2
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1s3
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+4]    ; off1_k0
    paddw           m0, m7
    pabsw           m7, m4
    psignw          m2, m3               ; constrain(diff_k1s1)
    pabsw           m3, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m7
    pmaxsw          m9, m5
    pminsw         m10, m3
%else
    pminsw          m7, m10
    pminsw          m7, m3
    pmaxsw          m3, m9, m4
    pmaxsw          m3, m5
    mova           m10, m7
    mova            m9, m3
%endif
    psubw           m4, m1               ; diff_k1s2
    psubw           m5, m1               ; diff_k1s3
    pabsw           m3, m4               ; adiff_k1s2
    paddw           m0, m2
    psrlw           m2, m3, [sec_shift+gprsize]
    psubusw         m7, m6, m2
    pabsw           m2, m5               ; adiff_k1s3
    pminsw          m7, m3
    psrlw           m3, m2, [sec_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1s2)
    psubusw         m4, m6, m3
    pminsw          m4, m2
    paddw           m0, m7
%if %1 == 4
    movq            m2, [tmpq+offq+32*0] ; k0p0
    movhps          m2, [tmpq+offq+32*1]
    neg           offq
    movq            m3, [tmpq+offq+32*0] ; k0p1
    movhps          m3, [tmpq+offq+32*1]
%else
    movu            m2, [tmpq+offq]
    neg           offq
    movu            m3, [tmpq+offq]
%endif
    movsx         offq, byte [dirq+5]    ; off1_k1
    pabsw           m7, m2
    psignw          m4, m5               ; constrain(diff_k1s3)
    pabsw           m5, m3
%if ARCH_X86_64
    pmaxsw          m9, m2
    pminsw         m10, m7
    pmaxsw          m9, m3
    pminsw         m10, m5
%else
    pminsw          m7, m10
    pminsw          m7, m5
    pmaxsw          m5, m9, m2
    pmaxsw          m5, m3
    mova           m10, m7
    mova            m9, m5
%endif
    ; Primary taps: strength comes from the saved splat at [rsp].
    psubw           m2, m1               ; diff_k0p0
    psubw           m3, m1               ; diff_k0p1
    paddw           m0, m4
    pabsw           m4, m2               ; adiff_k0p0
    psrlw           m5, m4, [pri_shift+gprsize]
    psubusw         m7, [rsp+gprsize], m5
    pabsw           m5, m3               ; adiff_k0p1
    pminsw          m7, m4
    psrlw           m4, m5, [pri_shift+gprsize]
    psignw          m7, m2               ; constrain(diff_k0p0)
    psubusw         m2, [rsp+gprsize], m4
    pminsw          m2, m5
%if %1 == 4
    movq            m4, [tmpq+offq+32*0] ; k1p0
    movhps          m4, [tmpq+offq+32*1]
    neg           offq
    movq            m5, [tmpq+offq+32*0] ; k1p1
    movhps          m5, [tmpq+offq+32*1]
%else
    movu            m4, [tmpq+offq]
    neg           offq
    movu            m5, [tmpq+offq]
%endif
    psignw          m2, m3               ; constrain(diff_k0p1)
    pabsw           m3, m4
    paddw           m7, m2               ; constrain(diff_k0)
    pabsw           m2, m5
%if ARCH_X86_64
    pmaxsw          m9, m4
    pminsw         m10, m3
    pmaxsw          m9, m5
    pminsw         m10, m2
%else
    pminsw          m3, m10
    pminsw          m3, m2
    pmaxsw          m2, m9, m4
    pmaxsw          m2, m5
    mova           m10, m3
    mova            m9, m2
%endif
    psubw           m4, m1               ; diff_k1p0
    psubw           m5, m1               ; diff_k1p1
    pabsw           m3, m4               ; adiff_k1p0
    pmullw          m7, [priq+16*0]      ; pri_tap_k0
    paddw           m0, m7
    psrlw           m2, m3, [pri_shift+gprsize]
    psubusw         m7, [rsp+16*0+gprsize], m2
    pabsw           m2, m5               ; adiff_k1p1
    pminsw          m7, m3
    psrlw           m3, m2, [pri_shift+gprsize]
    psignw          m7, m4               ; constrain(diff_k1p0)
    psubusw         m4, [rsp+16*0+gprsize], m3
    pminsw          m4, m2
    psignw          m4, m5               ; constrain(diff_k1p1)
    paddw           m7, m4               ; constrain(diff_k1)
    pmullw          m7, [priq+16*1]      ; pri_tap_k1
    paddw           m0, m7               ; sum
    psraw           m2, m0, 15
    paddw           m0, m2
    pmulhrsw        m0, m8
    paddw           m0, m1
    ; Clamp the result to [min, max] over the source pixel and all taps.
%if ARCH_X86_64
    pmaxsw          m9, m1
    pminsw          m0, m9
%else
    pmaxsw          m2, m9, m1
    pminsw          m0, m2
%endif
    pminsw          m1, m10
    pmaxsw          m0, m1
%if %1 == 4
    add           tmpq, 32*2
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea           dstq, [dstq+strideq*2]
%else
    add           tmpq, 32
    mova        [dstq], m0
    add           dstq, strideq
%endif
    ret
%endif
%endmacro
645
INIT_XMM ssse3
; cdef_filter_4x4_16bpc: copy the 4x4 block plus a 2-row/2-pixel border
; into the padded buffer at px (32-byte row pitch, rows -2..5), filling
; unavailable border pixels with pw_m16384, then run CDEF_FILTER 4, 4.
; Edge availability comes from the r9m flags: 1=LEFT, 2=RIGHT, 4=TOP,
; 8=BOTTOM.  pri_shift/sec_shift are stack slots just below px used by
; the filter loops as variable psrlw shift operands.
%if ARCH_X86_64
cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
                                               pri, sec, edge
    %define         px  rsp+32*4
%else
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
    %define       botq  topq
    %define         px  rsp+32*5
%endif
    %define       base  t0-dir_table
    %define  pri_shift  px-16*6
    %define  sec_shift  px-16*5
    mov          edged, r9m
    LEA             t0, dir_table
    ; 16-byte loads cover the 8-byte row plus 8 bytes to its right;
    ; the right padding is overwritten below when HAVE_RIGHT is unset.
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             t1, [dstq+strideq*2]
    movu            m2, [t1  +strideq*0]
    movu            m3, [t1  +strideq*1]
    movddup         m7, [base+pw_m16384] ; border padding value
    mova   [px+32*0+0], m0
    mova   [px+32*1+0], m1
    mova   [px+32*2+0], m2
    mova   [px+32*3+0], m3
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd   [px-32*2-4], m0
    movd   [px-32*1-4], m1
    jmp .top_done
.no_top:
    ; No top rows: pad them entirely, then fall through to also pad the
    ; top-left corner (invalid regardless of HAVE_LEFT).
    mova   [px-32*2+0], m7
    mova   [px-32*1+0], m7
.top_no_left:
    movd   [px-32*2-4], m7
    movd   [px-32*1-4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova   [px+32*4+0], m0
    mova   [px+32*5+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd   [px+32*4-4], m0
    movd   [px+32*5-4], m1
    jmp .bottom_done
.no_bottom:
    mova   [px+32*4+0], m7
    mova   [px+32*5+0], m7
.bottom_no_left:
    movd   [px+32*4-4], m7
    movd   [px+32*5-4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd   [px+32*0-4], m0
    movd   [px+32*1-4], m1
    movd   [px+32*2-4], m2
    movd   [px+32*3-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
    CDEF_FILTER      4, 4
733
; cdef_filter_4x8_16bpc: same padding scheme as the 4x4 version above
; (rows -2..9 this time).  The px/base/pri_shift/sec_shift %defines
; from the 4x4 function are still in effect (NASM %define is
; file-scoped), and CDEF_FILTER 4, 8 reuses the 4x4 function's
; .pri/.sec/.pri_sec subroutines via mangled calls.
%if ARCH_X86_64
cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                                               pri, sec, edge
%else
cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov          edged, r9m
    LEA             t0, dir_table
    movu            m0, [dstq+strideq*0]
    movu            m1, [dstq+strideq*1]
    lea             t1, [dstq+strideq*2]
    movu            m2, [t1  +strideq*0]
    movu            m3, [t1  +strideq*1]
    lea             t1, [t1  +strideq*2]
    movu            m4, [t1  +strideq*0]
    movu            m5, [t1  +strideq*1]
    lea             t1, [t1  +strideq*2]
    movu            m6, [t1  +strideq*0]
    movu            m7, [t1  +strideq*1]
    mova   [px+32*0+0], m0
    mova   [px+32*1+0], m1
    mova   [px+32*2+0], m2
    mova   [px+32*3+0], m3
    mova   [px+32*4+0], m4
    mova   [px+32*5+0], m5
    mova   [px+32*6+0], m6
    mova   [px+32*7+0], m7
    movddup         m7, [base+pw_m16384] ; border padding value
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    movu            m0, [topq+strideq*0]
    movu            m1, [topq+strideq*1]
    mova   [px-32*2+0], m0
    mova   [px-32*1+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd   [px-32*2-4], m0
    movd   [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova   [px-32*2+0], m7
    mova   [px-32*1+0], m7
.top_no_left:
    movd   [px-32*2-4], m7
    movd   [px-32*1-4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    movu            m0, [botq+strideq*0]
    movu            m1, [botq+strideq*1]
    mova   [px+32*8+0], m0
    mova   [px+32*9+0], m1
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd   [px+32*8-4], m0
    movd   [px+32*9-4], m1
    jmp .bottom_done
.no_bottom:
    mova   [px+32*8+0], m7
    mova   [px+32*9+0], m7
.bottom_no_left:
    movd   [px+32*8-4], m7
    movd   [px+32*9-4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd   [px+32*0-4], m0
    movd   [px+32*1-4], m1
    movd   [px+32*2-4], m2
    movd   [px+32*3-4], m3
    movd            m0, [leftq+4*4]
    movd            m1, [leftq+4*5]
    movd            m2, [leftq+4*6]
    movd            m3, [leftq+4*7]
    movd   [px+32*4-4], m0
    movd   [px+32*5-4], m1
    movd   [px+32*6-4], m2
    movd   [px+32*7-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER      4, 8
832
; cdef_filter_8x8_16bpc: copy the 8x8 block (16 bytes per row) plus a
; 2-row/2-pixel border into the padded buffer at px (32-byte row pitch,
; rows -2..9), pad unavailable edges with pw_m16384, then run
; CDEF_FILTER 8, 8.  Edge flag bits (r9m): 1=LEFT, 2=RIGHT, 4=TOP,
; 8=BOTTOM.  The px/base/pri_shift/sec_shift %defines from the 4x4
; function above remain in effect.
;
; FIX: the HAVE_TOP path used `mova` for the two [topq+...+16] loads
; even though only the low 4 bytes (the 2 right-padding pixels) are
; ever stored.  `movd` matches the dst-row and bottom-edge paths,
; avoids a 12-byte over-read past the needed data, and cannot fault on
; an unaligned top pointer the way a 16-byte aligned load can.
%if ARCH_X86_64
cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                                               pri, sec, edge
%else
cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov          edged, r9m
    LEA             t0, dir_table
    ; Each row: 16 bytes of pixels plus a 4-byte (2-pixel) right-border
    ; load at +16; the latter is overwritten below if HAVE_RIGHT is unset.
    mova            m0, [dstq+strideq*0+ 0]
    movd            m1, [dstq+strideq*0+16]
    mova            m2, [dstq+strideq*1+ 0]
    movd            m3, [dstq+strideq*1+16]
    lea             t1, [dstq+strideq*2]
    mova            m4, [t1  +strideq*0+ 0]
    movd            m5, [t1  +strideq*0+16]
    mova            m6, [t1  +strideq*1+ 0]
    movd            m7, [t1  +strideq*1+16]
    lea             t1, [t1  +strideq*2]
    mova  [px+32*0+ 0], m0
    movd  [px+32*0+16], m1
    mova  [px+32*1+ 0], m2
    movd  [px+32*1+16], m3
    mova  [px+32*2+ 0], m4
    movd  [px+32*2+16], m5
    mova  [px+32*3+ 0], m6
    movd  [px+32*3+16], m7
    mova            m0, [t1  +strideq*0+ 0]
    movd            m1, [t1  +strideq*0+16]
    mova            m2, [t1  +strideq*1+ 0]
    movd            m3, [t1  +strideq*1+16]
    lea             t1, [t1  +strideq*2]
    mova            m4, [t1  +strideq*0+ 0]
    movd            m5, [t1  +strideq*0+16]
    mova            m6, [t1  +strideq*1+ 0]
    movd            m7, [t1  +strideq*1+16]
    mova  [px+32*4+ 0], m0
    movd  [px+32*4+16], m1
    mova  [px+32*5+ 0], m2
    movd  [px+32*5+16], m3
    mova  [px+32*6+ 0], m4
    movd  [px+32*6+16], m5
    mova  [px+32*7+ 0], m6
    movd  [px+32*7+16], m7
    movddup         m7, [base+pw_m16384] ; border padding value
    test         edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn     topq, topmp
    mova            m0, [topq+strideq*0+ 0]
    movd            m1, [topq+strideq*0+16] ; was mova: only 4 bytes are stored
    mova            m2, [topq+strideq*1+ 0]
    movd            m3, [topq+strideq*1+16] ; was mova: only 4 bytes are stored
    mova  [px-32*2+ 0], m0
    movd  [px-32*2+16], m1
    mova  [px-32*1+ 0], m2
    movd  [px-32*1+16], m3
    test         edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd            m0, [topq+strideq*0-4]
    movd            m1, [topq+strideq*1-4]
    movd   [px-32*2-4], m0
    movd   [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova  [px-32*2+ 0], m7
    movd  [px-32*2+16], m7
    mova  [px-32*1+ 0], m7
    movd  [px-32*1+16], m7
.top_no_left:
    movd  [px-32*2- 4], m7
    movd  [px-32*1- 4], m7
.top_done:
    test         edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn     botq, r4mp
    mova            m0, [botq+strideq*0+ 0]
    movd            m1, [botq+strideq*0+16]
    mova            m2, [botq+strideq*1+ 0]
    movd            m3, [botq+strideq*1+16]
    mova  [px+32*8+ 0], m0
    movd  [px+32*8+16], m1
    mova  [px+32*9+ 0], m2
    movd  [px+32*9+16], m3
    test         edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd            m0, [botq+strideq*0-4]
    movd            m1, [botq+strideq*1-4]
    movd  [px+32*8- 4], m0
    movd  [px+32*9- 4], m1
    jmp .bottom_done
.no_bottom:
    mova  [px+32*8+ 0], m7
    movd  [px+32*8+16], m7
    mova  [px+32*9+ 0], m7
    movd  [px+32*9+16], m7
.bottom_no_left:
    movd  [px+32*8- 4], m7
    movd  [px+32*9- 4], m7
.bottom_done:
    test         edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn    leftq, r2mp
    movd            m0, [leftq+4*0]
    movd            m1, [leftq+4*1]
    movd            m2, [leftq+4*2]
    movd            m3, [leftq+4*3]
    movd  [px+32*0- 4], m0
    movd  [px+32*1- 4], m1
    movd  [px+32*2- 4], m2
    movd  [px+32*3- 4], m3
    movd            m0, [leftq+4*4]
    movd            m1, [leftq+4*5]
    movd            m2, [leftq+4*6]
    movd            m3, [leftq+4*7]
    movd  [px+32*4- 4], m0
    movd  [px+32*5- 4], m1
    movd  [px+32*6- 4], m2
    movd  [px+32*7- 4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test         edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER      8, 8
959
;-----------------------------------------------------------------------
; int cdef_dir_16bpc(const pixel *src, ptrdiff_t stride,
;                    unsigned *var, int bdmax)
; Finds the dominant edge direction of an 8x8 block for CDEF.
; Strategy: scale the 16-bit samples down into 8-bit range with pmulhuw,
; precompute the row sums the direction search needs, then tail-jump into
; the 8 bpc cdef_dir's .main entry, which performs the actual search.
;-----------------------------------------------------------------------
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
    lea             r6, [dir_shift]    ; per-bitdepth scale-factor table (defined earlier in file)
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r6+bdmaxq*8]  ; broadcast the pmulhuw scale for this bitdepth
    lea             r6, [strideq*3]
    mova            m0, [srcq+strideq*0] ; rows 0-3 of the 8x8 block
    mova            m1, [srcq+strideq*1]
    mova            m2, [srcq+strideq*2]
    mova            m3, [srcq+r6       ]
    lea           srcq, [srcq+strideq*4]
    mova            m4, [srcq+strideq*0] ; rows 4-7
    mova            m5, [srcq+strideq*1]
    mova            m6, [srcq+strideq*2]
    REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 ; scale all samples into 8-bit range
    pmulhuw         m7, [srcq+r6       ]
    pxor            m8, m8
    packuswb        m9, m0, m1         ; pack pairs of rows from words into bytes
    packuswb       m10, m2, m3
    packuswb       m11, m4, m5
    packuswb       m12, m6, m7
    REPX {psadbw x, m8}, m9, m10, m11, m12 ; psadbw vs. zero = sum of the 8 bytes per qword,
                                           ; i.e. one pixel sum per row
    packssdw        m9, m10            ; gather all eight row sums into m9 for .main
    packssdw       m11, m12
    packssdw        m9, m11
    ; Tail-call into the bitdepth-shared direction search.
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%else
; x86-32: only 8 xmm regs, so half the scaled rows are centered (minus 128)
; and spilled to fixed stack slots instead of staying in registers.
cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
    mov         bdmaxd, bdmaxm
    LEA             r2, dir_shift      ; PIC-safe load of the scale-factor table address
    shr         bdmaxd, 11             ; 0 for 10bpc, 1 for 12bpc
    movddup         m7, [r2+bdmaxq*8]  ; broadcast the pmulhuw scale for this bitdepth
    lea             r3, [strideq*3]
    pmulhuw         m3, m7, [srcq+strideq*0] ; rows 0-3, scaled into 8-bit range
    pmulhuw         m4, m7, [srcq+strideq*1]
    pmulhuw         m5, m7, [srcq+strideq*2]
    pmulhuw         m6, m7, [srcq+r3       ]
    movddup         m1, [r2-dir_shift+pw_128] ; pw_128 addressed relative to dir_shift (PIC)
    lea           srcq, [srcq+strideq*4]
    pxor            m0, m0
    packuswb        m2, m3, m4
    psubw           m3, m1             ; center samples around zero (subtract 128 per word)
    psubw           m4, m1
    mova    [esp+0x00], m3             ; spill centered rows to fixed slots — presumably the
    mova    [esp+0x10], m4             ; layout the 8bpc .main expects; confirm against it
    packuswb        m3, m5, m6
    psadbw          m2, m0             ; row sums of rows 0-3 (psadbw vs. zero)
    psadbw          m3, m0
    psubw           m5, m1
    psubw           m6, m1
    packssdw        m2, m3
    mova    [esp+0x20], m5
    mova    [esp+0x50], m6             ; NOTE(review): non-contiguous 0x50 slot mirrors the
                                       ; 8bpc .main's stack layout — verify there
    pmulhuw         m4, m7, [srcq+strideq*0] ; rows 4-7, scaled into 8-bit range
    pmulhuw         m5, m7, [srcq+strideq*1]
    pmulhuw         m6, m7, [srcq+strideq*2]
    pmulhuw         m7,     [srcq+r3       ]
    packuswb        m3, m4, m5
    packuswb        m1, m6, m7
    psadbw          m3, m0             ; row sums of rows 4-7
    psadbw          m1, m0
    packssdw        m3, m1
    movddup         m1, [r2-dir_shift+pw_128] ; reload pw_128 (m1 was reused above)
    LEA             r2, shufw_6543210x ; constant address .main expects in r2 on x86-32
    ; Tail-call into the bitdepth-shared direction search.
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%endif
%endmacro
1028
; Instantiate cdef_dir_16bpc once per supported SIMD level. x86inc's
; INIT_XMM selects 128-bit registers and appends the ISA suffix
; (_ssse3 / _sse4) to the function name emitted by cglobal.
INIT_XMM ssse3
CDEF_DIR

INIT_XMM sse4
CDEF_DIR
1034