1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31%macro JMP_TABLE 2-*
32 %xdefine %1_jmptable %%table
33 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
34 %%table:
35 %rep %0 - 1
36    dd %%base %+ .%2 - %%table
37  %rotate 1
38 %endrep
39%endmacro
40
41%macro CDEF_FILTER_JMP_TABLE 1
42JMP_TABLE cdef_filter_%1_8bpc, \
43    d6k0, d6k1, d7k0, d7k1, \
44    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
45    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
46    d0k0, d0k1, d1k0, d1k1
47%endmacro
48
49SECTION_RODATA 32
50
51pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
52blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
53               dd 0x80, 0x00, 0x00
54blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
55blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
56               dd 0x00, 0x00
57blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
58               dd 0x0000
59blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
60               dd 0x0000, 0x0000
61blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
62blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
63div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
64shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
65shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
66pw_128:        times 2 dw 128
67pw_2048:       times 2 dw 2048
68tap_table:     ; masks for 8 bit shifts
69               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
70               ; weights
71               db  4,  2,  3,  3,  2,  1
72               db -1 * 16 + 1, -2 * 16 + 2
73               db  0 * 16 + 1, -1 * 16 + 2
74               db  0 * 16 + 1,  0 * 16 + 2
75               db  0 * 16 + 1,  1 * 16 + 2
76               db  1 * 16 + 1,  2 * 16 + 2
77               db  1 * 16 + 0,  2 * 16 + 1
78               db  1 * 16 + 0,  2 * 16 + 0
79               db  1 * 16 + 0,  2 * 16 - 1
80               ; the last 6 are repeats of the first 6 so we don't need to & 7
81               db -1 * 16 + 1, -2 * 16 + 2
82               db  0 * 16 + 1, -1 * 16 + 2
83               db  0 * 16 + 1,  0 * 16 + 2
84               db  0 * 16 + 1,  1 * 16 + 2
85               db  1 * 16 + 1,  2 * 16 + 2
86               db  1 * 16 + 0,  2 * 16 + 1
87
88CDEF_FILTER_JMP_TABLE 4x4
89CDEF_FILTER_JMP_TABLE 4x8
90CDEF_FILTER_JMP_TABLE 8x8
91
92SECTION .text
93
94%macro PREP_REGS 2 ; w, h
95    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
96    mov           dird, r6m
97    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
98    lea           dirq, [tableq+dirq*2*4]
99%if %1 == 4
100 %if %2 == 4
101  DEFINE_ARGS dst, stride, left, top, pri, sec, \
102              table, dir, dirjmp, dst4, stride3, k
103 %else
104  DEFINE_ARGS dst, stride, left, top, pri, sec, \
105              table, dir, dirjmp, dst4, dst8, stride3, k
106    lea          dst8q, [dstq+strideq*8]
107 %endif
108%else
109  DEFINE_ARGS dst, stride, h, top1, pri, sec, \
110              table, dir, dirjmp, top2, dst4, stride3, k
111    mov             hq, -8
112    lea          top1q, [top1q+strideq*0]
113    lea          top2q, [top1q+strideq*1]
114%endif
115    lea          dst4q, [dstq+strideq*4]
116%if %1 == 4
117    lea       stride3q, [strideq*3]
118%endif
119%endmacro
120
121%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
122    mov             kd, 1
123    pxor           m15, m15                     ; sum
124%if %2 == 8
125    pxor           m12, m12
126 %if %1 == 4
127    movd           xm4, [dstq +strideq*0]
128    movd           xm6, [dstq +strideq*1]
129    movd           xm5, [dstq +strideq*2]
130    movd           xm7, [dstq +stride3q ]
131    vinserti128     m4, [dst4q+strideq*0], 1
132    vinserti128     m6, [dst4q+strideq*1], 1
133    vinserti128     m5, [dst4q+strideq*2], 1
134    vinserti128     m7, [dst4q+stride3q ], 1
135    punpckldq       m4, m6
136    punpckldq       m5, m7
137 %else
138    movq           xm4, [dstq+strideq*0]
139    movq           xm5, [dstq+strideq*1]
140    vinserti128     m4, [dstq+strideq*2], 1
141    vinserti128     m5, [dstq+stride3q ], 1
142 %endif
143    punpcklqdq      m4, m5
144%else
145    movd           xm4, [dstq+strideq*0]
146    movd           xm5, [dstq+strideq*1]
147    vinserti128     m4, [dstq+strideq*2], 1
148    vinserti128     m5, [dstq+stride3q ], 1
149    punpckldq       m4, m5
150%endif
151%if %3 == 1
152    mova            m7, m4                      ; min
153    mova            m8, m4                      ; max
154%endif
155%endmacro
156
157%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
158                                 ; mul_tap, w, h, clip
159    ; load p0/p1
160    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
161    add        dirjmpq, tableq
162    call       dirjmpq
163
164%if %8 == 1
165    pmaxub          m7, m5
166    pminub          m8, m5
167    pmaxub          m7, m6
168    pminub          m8, m6
169%endif
170
171    ; accumulate sum[m15] over p0/p1
172%if %7 == 4
173    punpcklbw       m5, m6
174    punpcklbw       m6, m4, m4
175    psubusb         m9, m5, m6
176    psubusb         m5, m6, m5
177    por             m9, m5     ; abs_diff_p01(p01 - px)
178    pcmpeqb         m5, m9
179    por             m5, %5
180    psignb          m6, %5, m5
181    psrlw           m5, m9, %2 ; emulate 8-bit shift
182    pand            m5, %3
183    psubusb         m5, %4, m5
184    pminub          m5, m9
185    pmaddubsw       m5, m6
186    paddw          m15, m5
187%else
188    psubusb         m9, m5, m4
189    psubusb         m5, m4, m5
190    psubusb        m11, m6, m4
191    psubusb         m6, m4, m6
192    por             m9, m5      ; abs_diff_p0(p0 - px)
193    por            m11, m6      ; abs_diff_p1(p1 - px)
194    pcmpeqb         m5, m9
195    pcmpeqb         m6, m11
196    punpckhbw      m10, m9, m11
197    punpcklbw       m9, m11
198    por             m5, %5
199    por            m11, m6, %5
200    punpckhbw       m6, m5, m11
201    punpcklbw       m5, m11
202    psignb         m11, %5, m6
203    psrlw           m6, m10, %2 ; emulate 8-bit shift
204    pand            m6, %3
205    psubusb         m6, %4, m6
206    pminub          m6, m10
207    pmaddubsw       m6, m11
208    paddw          m12, m6
209    psignb         m11, %5, m5
210    psrlw           m5, m9, %2  ; emulate 8-bit shift
211    pand            m5, %3
212    psubusb         m5, %4, m5
213    pminub          m5, m9
214    pmaddubsw       m5, m11
215    paddw          m15, m5
216%endif
217%endmacro
218
219%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
220%if %2 == 4
221 %if %5 == 1
222    punpcklbw       m4, %3
223 %endif
224    pcmpgtw         %3, m15
225    paddw          m15, %3
226    pmulhrsw       m15, %4
227 %if %5 == 0
228    packsswb       m15, m15
229    paddb           m4, m15
230 %else
231    paddw           m4, m15
232    packuswb        m4, m4 ; clip px in [0x0,0xff]
233    pminub          m4, m7
234    pmaxub          m4, m8
235 %endif
236    vextracti128   xm5, m4, 1
237    movd   [dstq+strideq*0], xm4
238    movd   [dstq+strideq*2], xm5
239    pextrd [dstq+strideq*1], xm4, 1
240    pextrd [dstq+stride3q ], xm5, 1
241%else
242    pcmpgtw         m6, %3, m12
243    pcmpgtw         m5, %3, m15
244    paddw          m12, m6
245    paddw          m15, m5
246 %if %5 == 1
247    punpckhbw       m5, m4, %3
248    punpcklbw       m4, %3
249 %endif
250    pmulhrsw       m12, %4
251    pmulhrsw       m15, %4
252 %if %5 == 0
253    packsswb       m15, m12
254    paddb           m4, m15
255 %else
256    paddw           m5, m12
257    paddw           m4, m15
258    packuswb        m4, m5 ; clip px in [0x0,0xff]
259    pminub          m4, m7
260    pmaxub          m4, m8
261 %endif
262    vextracti128   xm5, m4, 1
263 %if %1 == 4
264    movd   [dstq +strideq*0], xm4
265    movd   [dst4q+strideq*0], xm5
266    pextrd [dstq +strideq*1], xm4, 1
267    pextrd [dst4q+strideq*1], xm5, 1
268    pextrd [dstq +strideq*2], xm4, 2
269    pextrd [dst4q+strideq*2], xm5, 2
270    pextrd [dstq +stride3q ], xm4, 3
271    pextrd [dst4q+stride3q ], xm5, 3
272 %else
273    movq   [dstq+strideq*0], xm4
274    movq   [dstq+strideq*2], xm5
275    movhps [dstq+strideq*1], xm4
276    movhps [dstq+stride3q ], xm5
277 %endif
278%endif
279%endmacro
280
281%macro BORDER_PREP_REGS 2 ; w, h
282    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
283    mov           dird, r6m
284    lea           dirq, [tableq+dirq*2+14]
285%if %1*%2*2/mmsize > 1
286 %if %1 == 4
287    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
288 %else
289    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
290 %endif
291    mov             hd, %1*%2*2/mmsize
292%else
293    DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
294%endif
295    lea           stkq, [px]
296    pxor           m11, m11
297%endmacro
298
299%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
300    mov             kd, 1
301%if %1 == 4
302    movq           xm4, [stkq+32*0]
303    movhps         xm4, [stkq+32*1]
304    movq           xm5, [stkq+32*2]
305    movhps         xm5, [stkq+32*3]
306    vinserti128     m4, xm5, 1
307%else
308    mova           xm4, [stkq+32*0]             ; px
309    vinserti128     m4, [stkq+32*1], 1
310%endif
311    pxor           m15, m15                     ; sum
312%if %3 == 1
313    mova            m7, m4                      ; max
314    mova            m8, m4                      ; min
315%endif
316%endmacro
317
318%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
319                                 ; mul_tap, w, clip
320    ; load p0/p1
321    movsx         offq, byte [dirq+kq+%1]       ; off1
322%if %6 == 4
323    movq           xm5, [stkq+offq*2+32*0]      ; p0
324    movq           xm6, [stkq+offq*2+32*2]
325    movhps         xm5, [stkq+offq*2+32*1]
326    movhps         xm6, [stkq+offq*2+32*3]
327    vinserti128     m5, xm6, 1
328%else
329    movu           xm5, [stkq+offq*2+32*0]      ; p0
330    vinserti128     m5, [stkq+offq*2+32*1], 1
331%endif
332    neg           offq                          ; -off1
333%if %6 == 4
334    movq           xm6, [stkq+offq*2+32*0]      ; p1
335    movq           xm9, [stkq+offq*2+32*2]
336    movhps         xm6, [stkq+offq*2+32*1]
337    movhps         xm9, [stkq+offq*2+32*3]
338    vinserti128     m6, xm9, 1
339%else
340    movu           xm6, [stkq+offq*2+32*0]      ; p1
341    vinserti128     m6, [stkq+offq*2+32*1], 1
342%endif
343%if %7 == 1
344    ; out of bounds values are set to a value that is a both a large unsigned
345    ; value and a negative signed value.
346    ; use signed max and unsigned min to remove them
347    pmaxsw          m7, m5                      ; max after p0
348    pminuw          m8, m5                      ; min after p0
349    pmaxsw          m7, m6                      ; max after p1
350    pminuw          m8, m6                      ; min after p1
351%endif
352
353    ; accumulate sum[m15] over p0/p1
354    ; calculate difference before converting
355    psubw           m5, m4                      ; diff_p0(p0 - px)
356    psubw           m6, m4                      ; diff_p1(p1 - px)
357
358    ; convert to 8-bits with signed saturation
359    ; saturating to large diffs has no impact on the results
360    packsswb        m5, m6
361
362    ; group into pairs so we can accumulate using maddubsw
363    pshufb          m5, m12
364    pabsb           m9, m5
365    psignb         m10, %5, m5
366    psrlw           m5, m9, %2                  ; emulate 8-bit shift
367    pand            m5, %3
368    psubusb         m5, %4, m5
369
370    ; use unsigned min since abs diff can equal 0x80
371    pminub          m5, m9
372    pmaddubsw       m5, m10
373    paddw          m15, m5
374%endmacro
375
376%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
377    pcmpgtw         m9, m11, m15
378    paddw          m15, m9
379    pmulhrsw       m15, %2
380    paddw           m4, m15
381%if %3 == 1
382    pminsw          m4, m7
383    pmaxsw          m4, m8
384%endif
385    packuswb        m4, m4
386    vextracti128   xm5, m4, 1
387%if %1 == 4
388    movd [dstq+strideq*0], xm4
389    pextrd [dstq+strideq*1], xm4, 1
390    movd [dstq+strideq*2], xm5
391    pextrd [dstq+stride3q], xm5, 1
392%else
393    movq [dstq+strideq*0], xm4
394    movq [dstq+strideq*1], xm5
395%endif
396%endmacro
397
398%macro CDEF_FILTER 2 ; w, h
399INIT_YMM avx2
400cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
401                                    pri, sec, dir, damping, edge
402%assign stack_offset_entry stack_offset
403    mov          edged, edgem
404    cmp          edged, 0xf
405    jne .border_block
406
407    PUSH            r9
408    PUSH           r10
409    PUSH           r11
410%if %2 == 4
411 %assign regs_used 12
412 %if STACK_ALIGNMENT < 32
413    PUSH  r%+regs_used
414  %assign regs_used regs_used+1
415 %endif
416    ALLOC_STACK 0x60, 16
417    pmovzxbw       xm0, [leftq+1]
418    vpermq          m0, m0, q0110
419    psrldq          m1, m0, 4
420    vpalignr        m2, m0, m0, 12
421    movu    [rsp+0x10], m0
422    movu    [rsp+0x28], m1
423    movu    [rsp+0x40], m2
424%elif %1 == 4
425    PUSH           r12
426 %assign regs_used 13
427 %if STACK_ALIGNMENT < 32
428    PUSH  r%+regs_used
429   %assign regs_used regs_used+1
430 %endif
431    ALLOC_STACK 8*2+%1*%2*1, 16
432    pmovzxwd        m0, [leftq]
433    mova    [rsp+0x10], m0
434%else
435    PUSH           r12
436    PUSH           r13
437 %assign regs_used 14
438 %if STACK_ALIGNMENT < 32
439    PUSH  r%+regs_used
440  %assign regs_used regs_used+1
441 %endif
442    ALLOC_STACK 8*2+%1*%2*2+32, 16
443    lea            r11, [strideq*3]
444    movu           xm4, [dstq+strideq*2]
445    pmovzxwq        m0, [leftq+0]
446    pmovzxwq        m1, [leftq+8]
447    vinserti128     m4, [dstq+r11], 1
448    pmovzxbd        m2, [leftq+1]
449    pmovzxbd        m3, [leftq+9]
450    mova    [rsp+0x10], m0
451    mova    [rsp+0x30], m1
452    mova    [rsp+0x50], m2
453    mova    [rsp+0x70], m3
454    mova    [rsp+0x90], m4
455%endif
456
457 DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
458    mov       dampingd, r7m
459    xor          zerod, zerod
460    movifnidn     prid, prim
461    sub       dampingd, 31
462    movifnidn  secdmpd, secdmpm
463    test          prid, prid
464    jz .sec_only
465    movd           xm0, prid
466    lzcnt      pridmpd, prid
467    add        pridmpd, dampingd
468    cmovs      pridmpd, zerod
469    mov        [rsp+0], pridmpq                 ; pri_shift
470    test       secdmpd, secdmpd
471    jz .pri_only
472    movd           xm1, secdmpd
473    lzcnt      secdmpd, secdmpd
474    add        secdmpd, dampingd
475    cmovs      secdmpd, zerod
476    mov        [rsp+8], secdmpq                 ; sec_shift
477
478 DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
479    lea         tableq, [tap_table]
480    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
481    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
482
483    ; pri/sec_taps[k] [4 total]
484 DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
485    vpbroadcastb    m0, xm0                     ; pri_strength
486    vpbroadcastb    m1, xm1                     ; sec_strength
487    and           prid, 1
488    lea           priq, [tableq+priq*2+8]       ; pri_taps
489    lea           secq, [tableq+12]             ; sec_taps
490
491    PREP_REGS       %1, %2
492%if %1*%2 > mmsize
493.v_loop:
494%endif
495    LOAD_BLOCK      %1, %2, 1
496.k_loop:
497    vpbroadcastb    m2, [priq+kq]                          ; pri_taps
498    vpbroadcastb    m3, [secq+kq]                          ; sec_taps
499    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
500    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
501    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
502    dec             kq
503    jge .k_loop
504
505    vpbroadcastd   m10, [pw_2048]
506    pxor            m9, m9
507    ADJUST_PIXEL    %1, %2, m9, m10, 1
508%if %1*%2 > mmsize
509    mov           dstq, dst4q
510    lea          top1q, [rsp+0x90]
511    lea          top2q, [rsp+0xA0]
512    lea          dst4q, [dst4q+strideq*4]
513    add             hq, 4
514    jl .v_loop
515%endif
516    RET
517
518.pri_only:
519 DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
520    lea         tableq, [tap_table]
521    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
522    ; pri/sec_taps[k] [4 total]
523 DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
524    vpbroadcastb    m0, xm0                     ; pri_strength
525    and           prid, 1
526    lea           priq, [tableq+priq*2+8]       ; pri_taps
527    PREP_REGS       %1, %2
528    vpbroadcastd    m3, [pw_2048]
529    pxor            m1, m1
530%if %1*%2 > mmsize
531.pri_v_loop:
532%endif
533    LOAD_BLOCK      %1, %2
534.pri_k_loop:
535    vpbroadcastb    m2, [priq+kq]                       ; pri_taps
536    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
537    dec             kq
538    jge .pri_k_loop
539    ADJUST_PIXEL    %1, %2, m1, m3
540%if %1*%2 > mmsize
541    mov           dstq, dst4q
542    lea          top1q, [rsp+0x90]
543    lea          top2q, [rsp+0xA0]
544    lea          dst4q, [dst4q+strideq*4]
545    add             hq, 4
546    jl .pri_v_loop
547%endif
548    RET
549
550.sec_only:
551 DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
552    movd           xm1, secdmpd
553    lzcnt      secdmpd, secdmpd
554    add        secdmpd, dampingd
555    cmovs      secdmpd, zerod
556    mov        [rsp+8], secdmpq                 ; sec_shift
557 DEFINE_ARGS dst, stride, left, top, _, secdmp, table
558    lea         tableq, [tap_table]
559    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
560    ; pri/sec_taps[k] [4 total]
561 DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
562    vpbroadcastb    m1, xm1                     ; sec_strength
563    lea           secq, [tableq+12]             ; sec_taps
564    PREP_REGS       %1, %2
565    vpbroadcastd    m2, [pw_2048]
566    pxor            m0, m0
567%if %1*%2 > mmsize
568.sec_v_loop:
569%endif
570    LOAD_BLOCK      %1, %2
571.sec_k_loop:
572    vpbroadcastb    m3, [secq+kq]                       ; sec_taps
573    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
574    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
575    dec             kq
576    jge .sec_k_loop
577    ADJUST_PIXEL    %1, %2, m0, m2
578%if %1*%2 > mmsize
579    mov           dstq, dst4q
580    lea          top1q, [rsp+0x90]
581    lea          top2q, [rsp+0xA0]
582    lea          dst4q, [dst4q+strideq*4]
583    add             hq, 4
584    jl .sec_v_loop
585%endif
586    RET
587
588.d0k0:
589%if %1 == 4
590 %if %2 == 4
591    vpbroadcastq    m6, [dstq+strideq*1-1]
592    vpbroadcastq   m10, [dstq+strideq*2-1]
593    movd           xm5, [topq+strideq*1+1]
594    movd           xm9, [dstq+strideq*0+1]
595    psrldq         m11, m6, 2
596    psrldq         m12, m10, 2
597    vinserti128     m6, [dstq+stride3q -1], 1
598    vinserti128    m10, [dstq+strideq*4-1], 1
599    vpblendd        m5, m11, 0x10
600    vpblendd        m9, m12, 0x10
601    movu           m11, [blend_4x4+16]
602    punpckldq       m6, m10
603    punpckldq       m5, m9
604    vpblendvb       m6, [rsp+gprsize+0x28], m11
605 %else
606    movd           xm5, [topq +strideq*1+1]
607    movq           xm6, [dstq +strideq*1-1]
608    movq          xm10, [dstq +stride3q -1]
609    movq          xm11, [dst4q+strideq*1-1]
610    pinsrd         xm5, [dstq +strideq*0+1], 1
611    movhps         xm6, [dstq +strideq*2-1]
612    movhps        xm10, [dst4q+strideq*0-1]
613    movhps        xm11, [dst4q+strideq*2-1]
614    psrldq         xm9, xm6, 2
615    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
616    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
617    psrldq         xm9, xm11, 2
618    psrldq        xm10, 2
619    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
620    movd           xm9, [dst4q+stride3q -1]
621    pinsrd         xm9, [dst4q+strideq*4-1], 1
622    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
623    pmovzxbw        m9, [leftq+3]
624    vinserti128     m6, xm11, 1
625    movu           m11, [blend_4x8_0+4]
626    vinserti128     m5, xm10, 1
627    vpblendvb       m6, m9, m11
628 %endif
629%else
630    lea            r13, [blend_8x8_0+16]
631    movq           xm5, [top2q         +1]
632    vbroadcasti128 m10, [dstq+strideq*1-1]
633    vbroadcasti128 m11, [dstq+strideq*2-1]
634    movhps         xm5, [dstq+strideq*0+1]
635    vinserti128     m6, m10, [dstq+stride3q -1], 1
636    vinserti128     m9, m11, [dstq+strideq*4-1], 1
637    psrldq         m10, 2
638    psrldq         m11, 2
639    punpcklqdq      m6, m9
640    movu            m9, [r13+hq*2*1+16*1]
641    punpcklqdq     m10, m11
642    vpblendd        m5, m10, 0xF0
643    vpblendvb       m6, [rsp+gprsize+80+hq*8+64+8*1], m9
644%endif
645    ret
646.d1k0:
647.d2k0:
648.d3k0:
649%if %1 == 4
650 %if %2 == 4
651    movq           xm6, [dstq+strideq*0-1]
652    movq           xm9, [dstq+strideq*1-1]
653    vinserti128     m6, [dstq+strideq*2-1], 1
654    vinserti128     m9, [dstq+stride3q -1], 1
655    movu           m11, [rsp+gprsize+0x10]
656    pcmpeqd        m12, m12
657    psrldq          m5, m6, 2
658    psrldq         m10, m9, 2
659    psrld          m12, 24
660    punpckldq       m6, m9
661    punpckldq       m5, m10
662    vpblendvb       m6, m11, m12
663 %else
664    movq           xm6, [dstq +strideq*0-1]
665    movq           xm9, [dstq +strideq*2-1]
666    movhps         xm6, [dstq +strideq*1-1]
667    movhps         xm9, [dstq +stride3q -1]
668    movq          xm10, [dst4q+strideq*0-1]
669    movhps        xm10, [dst4q+strideq*1-1]
670    psrldq         xm5, xm6, 2
671    psrldq        xm11, xm9, 2
672    shufps         xm5, xm11, q2020
673    movq          xm11, [dst4q+strideq*2-1]
674    movhps        xm11, [dst4q+stride3q -1]
675    shufps         xm6, xm9, q2020
676    shufps         xm9, xm10, xm11, q2020
677    vinserti128     m6, xm9, 1
678    pmovzxbw        m9, [leftq+1]
679    psrldq        xm10, 2
680    psrldq        xm11, 2
681    shufps        xm10, xm11, q2020
682    vpbroadcastd   m11, [blend_4x8_0+4]
683    vinserti128     m5, xm10, 1
684    vpblendvb       m6, m9, m11
685 %endif
686%else
687    movu           xm5, [dstq+strideq*0-1]
688    movu           xm9, [dstq+strideq*1-1]
689    vinserti128     m5, [dstq+strideq*2-1], 1
690    vinserti128     m9, [dstq+stride3q -1], 1
691    movu           m10, [blend_8x8_0+16]
692    punpcklqdq      m6, m5, m9
693    vpblendvb       m6, [rsp+gprsize+80+hq*8+64], m10
694    psrldq          m5, 2
695    psrldq          m9, 2
696    punpcklqdq      m5, m9
697%endif
698    ret
699.d4k0:
700%if %1 == 4
701 %if %2 == 4
702    vpbroadcastq   m10, [dstq+strideq*1-1]
703    vpbroadcastq   m11, [dstq+strideq*2-1]
704    movd           xm6, [topq+strideq*1-1]
705    movd           xm9, [dstq+strideq*0-1]
706    psrldq          m5, m10, 2
707    psrldq         m12, m11, 2
708    vpblendd        m6, m10, 0x10
709    vpblendd        m9, m11, 0x10
710    movu           m10, [blend_4x4]
711    vinserti128     m5, [dstq+stride3q +1], 1
712    vinserti128    m12, [dstq+strideq*4+1], 1
713    punpckldq       m6, m9
714    punpckldq       m5, m12
715    vpblendvb       m6, [rsp+gprsize+0x40], m10
716 %else
717    movd           xm6, [topq +strideq*1-1]
718    movq           xm9, [dstq +strideq*1-1]
719    movq          xm10, [dstq +stride3q -1]
720    movq          xm11, [dst4q+strideq*1-1]
721    pinsrd         xm6, [dstq +strideq*0-1], 1
722    movhps         xm9, [dstq +strideq*2-1]
723    movhps        xm10, [dst4q+strideq*0-1]
724    movhps        xm11, [dst4q+strideq*2-1]
725    psrldq         xm5, xm9, 2
726    shufps         xm6, xm9, q2010
727    psrldq         xm9, xm10, 2
728    shufps         xm5, xm9, q2020
729    shufps        xm10, xm11, q2020
730    movd           xm9, [dst4q+stride3q +1]
731    vinserti128     m6, xm10, 1
732    pinsrd         xm9, [dst4q+strideq*4+1], 1
733    psrldq        xm11, 2
734    pmovzxbw       m10, [leftq-1]
735    shufps        xm11, xm9, q1020
736    movu            m9, [blend_4x8_0]
737    vinserti128     m5, xm11, 1
738    vpblendvb       m6, m10, m9
739 %endif
740%else
741    lea            r13, [blend_8x8_0+8]
742    movq           xm6, [top2q         -1]
743    vbroadcasti128  m5, [dstq+strideq*1-1]
744    vbroadcasti128  m9, [dstq+strideq*2-1]
745    movhps         xm6, [dstq+strideq*0-1]
746    movu           m11, [r13+hq*2*1+16*1]
747    punpcklqdq     m10, m5, m9
748    vinserti128     m5, [dstq+stride3q -1], 1
749    vinserti128     m9, [dstq+strideq*4-1], 1
750    vpblendd        m6, m10, 0xF0
751    vpblendvb       m6, [rsp+gprsize+80+hq*8+64-8*1], m11
752    psrldq          m5, 2
753    psrldq          m9, 2
754    punpcklqdq      m5, m9
755%endif
756    ret
757.d5k0:
758.d6k0:
759.d7k0:
760%if %1 == 4
761 %if %2 == 4
762    movd           xm6, [topq+strideq*1  ]
763    vpbroadcastd    m5, [dstq+strideq*1  ]
764    vpbroadcastd    m9, [dstq+strideq*2  ]
765    vpblendd       xm6, [dstq+strideq*0-4], 0x2
766    vpblendd        m5, m9, 0x22
767    vpblendd        m6, m5, 0x30
768    vinserti128     m5, [dstq+stride3q    ], 1
769    vpblendd        m5, [dstq+strideq*4-20], 0x20
770 %else
771    movd           xm6, [topq +strideq*1]
772    movd           xm5, [dstq +strideq*1]
773    movd           xm9, [dstq +stride3q ]
774    movd          xm10, [dst4q+strideq*1]
775    movd          xm11, [dst4q+stride3q ]
776    pinsrd         xm6, [dstq +strideq*0], 1
777    pinsrd         xm5, [dstq +strideq*2], 1
778    pinsrd         xm9, [dst4q+strideq*0], 1
779    pinsrd        xm10, [dst4q+strideq*2], 1
780    pinsrd        xm11, [dst4q+strideq*4], 1
781    punpcklqdq     xm6, xm5
782    punpcklqdq     xm5, xm9
783    punpcklqdq     xm9, xm10
784    punpcklqdq    xm10, xm11
785    vinserti128     m6, xm9, 1
786    vinserti128     m5, xm10, 1
787 %endif
788%else
789    movq           xm6, [top2q         ]
790    movq           xm5, [dstq+strideq*1]
791    movq           xm9, [dstq+stride3q ]
792    movhps         xm6, [dstq+strideq*0]
793    movhps         xm5, [dstq+strideq*2]
794    movhps         xm9, [dstq+strideq*4]
795    vinserti128     m6, xm5, 1
796    vinserti128     m5, xm9, 1
797%endif
798    ret
799.d0k1:
800%if %1 == 4
801 %if %2 == 4
802    movd           xm6, [dstq +strideq*2-2]
803    movd           xm9, [dstq +stride3q -2]
804    movd           xm5, [topq +strideq*0+2]
805    movd          xm10, [topq +strideq*1+2]
806    pinsrw         xm6, [leftq+4], 0
807    pinsrw         xm9, [leftq+6], 0
808    vinserti128     m5, [dstq +strideq*0+2], 1
809    vinserti128    m10, [dstq +strideq*1+2], 1
810    vinserti128     m6, [dst4q+strideq*0-2], 1
811    vinserti128     m9, [dst4q+strideq*1-2], 1
812    punpckldq       m5, m10
813    punpckldq       m6, m9
814 %else
815    movq           xm6, [dstq +strideq*2-2]
816    movd          xm10, [dst4q+strideq*2-2]
817    movd           xm5, [topq +strideq*0+2]
818    movq           xm9, [dst4q+strideq*0-2]
819    movhps         xm6, [dstq +stride3q -2]
820    pinsrw        xm10, [dst4q+stride3q   ], 3
821    pinsrd         xm5, [topq +strideq*1+2], 1
822    movhps         xm9, [dst4q+strideq*1-2]
823    pinsrd        xm10, [dst8q+strideq*0-2], 2
824    pinsrd         xm5, [dstq +strideq*0+2], 2
825    pinsrd        xm10, [dst8q+strideq*1-2], 3
826    pinsrd         xm5, [dstq +strideq*1+2], 3
827    shufps        xm11, xm6, xm9, q3131
828    shufps         xm6, xm9, q2020
829    movu            m9, [blend_4x8_3+8]
830    vinserti128     m6, xm10, 1
831    vinserti128     m5, xm11, 1
832    vpblendvb       m6, [rsp+gprsize+16+8], m9
833 %endif
834%else
835    lea            r13, [blend_8x8_1+16]
836    movq           xm6, [dstq +strideq*2-2]
837    movq           xm9, [dstq +stride3q -2]
838    movq           xm5, [top1q          +2]
839    movq          xm10, [top2q          +2]
840    movu           m11, [r13+hq*2*2+16*2]
841    vinserti128     m6, [dst4q+strideq*0-2], 1
842    vinserti128     m9, [dst4q+strideq*1-2], 1
843    vinserti128     m5, [dstq +strideq*0+2], 1
844    vinserti128    m10, [dstq +strideq*1+2], 1
845    punpcklqdq      m6, m9
846    punpcklqdq      m5, m10
847    vpblendvb       m6, [rsp+gprsize+16+hq*8+64+8*2], m11
848%endif
849    ret
850.d1k1:
851%if %1 == 4
852 %if %2 == 4
853    vpbroadcastq    m6, [dstq+strideq*1-2]
854    vpbroadcastq    m9, [dstq+strideq*2-2]
855    movd           xm5, [topq+strideq*1+2]
856    movd          xm10, [dstq+strideq*0+2]
857    psrldq         m11, m6, 4
858    psrldq         m12, m9, 4
859    vpblendd        m5, m11, 0x10
860    movq          xm11, [leftq+2]
861    vinserti128     m6, [dstq+stride3q -2], 1
862    punpckldq     xm11, xm11
863    vpblendd       m10, m12, 0x10
864    pcmpeqd        m12, m12
865    pmovzxwd       m11, xm11
866    psrld          m12, 16
867    punpckldq       m6, m9
868    vpbroadcastd    m9, [dstq+strideq*4-2]
869    vpblendvb       m6, m11, m12
870    punpckldq       m5, m10
871    vpblendd        m6, m9, 0x20
872 %else
873    movd           xm5, [topq +strideq*1+2]
874    movq           xm6, [dstq +strideq*1-2]
875    movq           xm9, [dstq +stride3q -2]
876    movq          xm10, [dst4q+strideq*1-2]
877    movd          xm11, [dst4q+stride3q -2]
878    pinsrd         xm5, [dstq +strideq*0+2], 1
879    movhps         xm6, [dstq +strideq*2-2]
880    movhps         xm9, [dst4q+strideq*0-2]
881    movhps        xm10, [dst4q+strideq*2-2]
882    pinsrd        xm11, [dst4q+strideq*4-2], 1
883    shufps         xm5, xm6, q3110
884    shufps         xm6, xm9, q2020
885    shufps         xm9, xm10, q3131
886    shufps        xm10, xm11, q1020
887    movu           m11, [blend_4x8_2+4]
888    vinserti128     m6, xm10, 1
889    vinserti128     m5, xm9, 1
890    vpblendvb       m6, [rsp+gprsize+16+4], m11
891 %endif
892%else
893    lea            r13, [blend_8x8_1+16]
894    movq           xm5, [top2q         +2]
895    vbroadcasti128  m6, [dstq+strideq*1-2]
896    vbroadcasti128  m9, [dstq+strideq*2-2]
897    movhps         xm5, [dstq+strideq*0+2]
898    shufps         m10, m6, m9, q2121
899    vinserti128     m6, [dstq+stride3q -2], 1
900    vinserti128     m9, [dstq+strideq*4-2], 1
901    movu           m11, [r13+hq*2*1+16*1]
902    vpblendd        m5, m10, 0xF0
903    punpcklqdq      m6, m9
904    vpblendvb       m6, [rsp+gprsize+16+hq*8+64+8*1], m11
905%endif
906    ret
907.d2k1:
908%if %1 == 4
909 %if %2 == 4
910    movq          xm11, [leftq]
911    movq           xm6, [dstq+strideq*0-2]
912    movq           xm9, [dstq+strideq*1-2]
913    vinserti128     m6, [dstq+strideq*2-2], 1
914    vinserti128     m9, [dstq+stride3q -2], 1
915    punpckldq     xm11, xm11
916    psrldq          m5, m6, 4
917    psrldq         m10, m9, 4
918    pmovzxwd       m11, xm11
919    punpckldq       m6, m9
920    punpckldq       m5, m10
921    pblendw         m6, m11, 0x05
922 %else
923    movq           xm5, [dstq +strideq*0-2]
924    movq           xm9, [dstq +strideq*2-2]
925    movq          xm10, [dst4q+strideq*0-2]
926    movq          xm11, [dst4q+strideq*2-2]
927    movhps         xm5, [dstq +strideq*1-2]
928    movhps         xm9, [dstq +stride3q -2]
929    movhps        xm10, [dst4q+strideq*1-2]
930    movhps        xm11, [dst4q+stride3q -2]
931    shufps         xm6, xm5, xm9, q2020
932    shufps         xm5, xm9, q3131
933    shufps         xm9, xm10, xm11, q2020
934    shufps        xm10, xm11, q3131
935    pmovzxwd       m11, [leftq]
936    vinserti128     m6, xm9, 1
937    vinserti128     m5, xm10, 1
938    pblendw         m6, m11, 0x55
939 %endif
940%else
941    mova           m11, [rsp+gprsize+16+hq*8+64]
942    movu           xm5, [dstq+strideq*0-2]
943    movu           xm9, [dstq+strideq*1-2]
944    vinserti128     m5, [dstq+strideq*2-2], 1
945    vinserti128     m9, [dstq+stride3q -2], 1
946    shufps          m6, m5, m9, q1010
947    shufps          m5, m9, q2121
948    pblendw         m6, m11, 0x11
949%endif
950    ret
951.d3k1:
952%if %1 == 4
953 %if %2 == 4
954    vpbroadcastq   m11, [dstq+strideq*1-2]
955    vpbroadcastq   m12, [dstq+strideq*2-2]
956    movd           xm6, [topq+strideq*1-2]
957    movd           xm9, [dstq+strideq*0-2]
958    pblendw        m11, [leftq-16+2], 0x01
959    pblendw        m12, [leftq-16+4], 0x01
960    pinsrw         xm9, [leftq- 0+0], 0
961    psrldq          m5, m11, 4
962    psrldq         m10, m12, 4
963    vinserti128     m5, [dstq+stride3q +2], 1
964    vinserti128    m10, [dstq+strideq*4+2], 1
965    vpblendd        m6, m11, 0x10
966    vpblendd        m9, m12, 0x10
967    punpckldq       m6, m9
968    punpckldq       m5, m10
969 %else
970    movd           xm6, [topq +strideq*1-2]
971    movq           xm5, [dstq +strideq*1-2]
972    movq           xm9, [dstq +stride3q -2]
973    movq          xm10, [dst4q+strideq*1-2]
974    movd          xm11, [dst4q+stride3q +2]
975    pinsrw         xm6, [dstq +strideq*0  ], 3
976    movhps         xm5, [dstq +strideq*2-2]
977    movhps         xm9, [dst4q+strideq*0-2]
978    movhps        xm10, [dst4q+strideq*2-2]
979    pinsrd        xm11, [dst4q+strideq*4+2], 1
980    shufps         xm6, xm5, q2010
981    shufps         xm5, xm9, q3131
982    shufps         xm9, xm10, q2020
983    shufps        xm10, xm11, q1031
984    movu           m11, [blend_4x8_2]
985    vinserti128     m6, xm9, 1
986    vinserti128     m5, xm10, 1
987    vpblendvb       m6, [rsp+gprsize+16-4], m11
988 %endif
989%else
990    lea            r13, [blend_8x8_1+8]
991    movq           xm6, [top2q         -2]
992    vbroadcasti128  m5, [dstq+strideq*1-2]
993    vbroadcasti128 m10, [dstq+strideq*2-2]
994    movhps         xm6, [dstq+strideq*0-2]
995    punpcklqdq      m9, m5, m10
996    vinserti128     m5, [dstq+stride3q -2], 1
997    vinserti128    m10, [dstq+strideq*4-2], 1
998    movu           m11, [r13+hq*2*1+16*1]
999    vpblendd        m6, m9, 0xF0
1000    shufps          m5, m10, q2121
1001    vpblendvb       m6, [rsp+gprsize+16+hq*8+64-8*1], m11
1002%endif
1003    ret
1004.d4k1:
1005%if %1 == 4
1006 %if %2 == 4
1007    vinserti128     m6, [dstq +strideq*0-2], 1
1008    vinserti128     m9, [dstq +strideq*1-2], 1
1009    movd           xm5, [dstq +strideq*2+2]
1010    movd          xm10, [dstq +stride3q +2]
1011    pblendw         m6, [leftq-16+0], 0x01
1012    pblendw         m9, [leftq-16+2], 0x01
1013    vinserti128     m5, [dst4q+strideq*0+2], 1
1014    vinserti128    m10, [dst4q+strideq*1+2], 1
1015    vpblendd        m6, [topq +strideq*0-2], 0x01
1016    vpblendd        m9, [topq +strideq*1-2], 0x01
1017    punpckldq       m5, m10
1018    punpckldq       m6, m9
1019 %else
1020    movd           xm6, [topq +strideq*0-2]
1021    movq           xm5, [dstq +strideq*2-2]
1022    movq           xm9, [dst4q+strideq*0-2]
1023    movd          xm10, [dst4q+strideq*2+2]
1024    pinsrd         xm6, [topq +strideq*1-2], 1
1025    movhps         xm5, [dstq +stride3q -2]
1026    movhps         xm9, [dst4q+strideq*1-2]
1027    pinsrd        xm10, [dst4q+stride3q +2], 1
1028    pinsrd         xm6, [dstq +strideq*0-2], 2
1029    pinsrd        xm10, [dst8q+strideq*0+2], 2
1030    pinsrd         xm6, [dstq +strideq*1-2], 3
1031    pinsrd        xm10, [dst8q+strideq*1+2], 3
1032    shufps        xm11, xm5, xm9, q2020
1033    shufps         xm5, xm9, q3131
1034    movu            m9, [blend_4x8_3]
1035    vinserti128     m6, xm11, 1
1036    vinserti128     m5, xm10, 1
1037    vpblendvb       m6, [rsp+gprsize+16-8], m9
1038 %endif
1039%else
1040    lea            r13, [blend_8x8_1]
1041    movu           m11, [r13+hq*2*2+16*2]
1042    movq           xm6, [top1q          -2]
1043    movq           xm9, [top2q          -2]
1044    movq           xm5, [dstq +strideq*2+2]
1045    movq          xm10, [dstq +stride3q +2]
1046    vinserti128     m6, [dstq +strideq*0-2], 1
1047    vinserti128     m9, [dstq +strideq*1-2], 1
1048    vinserti128     m5, [dst4q+strideq*0+2], 1
1049    vinserti128    m10, [dst4q+strideq*1+2], 1
1050    punpcklqdq      m6, m9
1051    vpblendvb       m6, [rsp+gprsize+16+hq*8+64-8*2], m11
1052    punpcklqdq      m5, m10
1053%endif
1054    ret
1055.d5k1:
1056%if %1 == 4
1057 %if %2 == 4
1058    movd           xm6, [topq +strideq*0-1]
1059    movd           xm9, [topq +strideq*1-1]
1060    movd           xm5, [dstq +strideq*2+1]
1061    movd          xm10, [dstq +stride3q +1]
1062    pcmpeqd        m12, m12
1063    pmovzxbw       m11, [leftq-8+1]
1064    psrld          m12, 24
1065    vinserti128     m6, [dstq +strideq*0-1], 1
1066    vinserti128     m9, [dstq +strideq*1-1], 1
1067    vinserti128     m5, [dst4q+strideq*0+1], 1
1068    vinserti128    m10, [dst4q+strideq*1+1], 1
1069    punpckldq       m6, m9
1070    pxor            m9, m9
1071    vpblendd       m12, m9, 0x0F
1072    punpckldq       m5, m10
1073    vpblendvb       m6, m11, m12
1074 %else
1075    movd           xm6, [topq +strideq*0-1]
1076    movq           xm5, [dstq +strideq*2-1]
1077    movq           xm9, [dst4q+strideq*0-1]
1078    movd          xm10, [dst4q+strideq*2+1]
1079    pinsrd         xm6, [topq +strideq*1-1], 1
1080    movhps         xm5, [dstq +stride3q -1]
1081    movhps         xm9, [dst4q+strideq*1-1]
1082    pinsrd        xm10, [dst4q+stride3q +1], 1
1083    pinsrd         xm6, [dstq +strideq*0-1], 2
1084    pinsrd        xm10, [dst8q+strideq*0+1], 2
1085    pinsrd         xm6, [dstq +strideq*1-1], 3
1086    pinsrd        xm10, [dst8q+strideq*1+1], 3
1087    shufps        xm11, xm5, xm9, q2020
1088    vinserti128     m6, xm11, 1
1089    pmovzxbw       m11, [leftq-3]
1090    psrldq         xm5, 2
1091    psrldq         xm9, 2
1092    shufps         xm5, xm9, q2020
1093    movu            m9, [blend_4x8_1]
1094    vinserti128     m5, xm10, 1
1095    vpblendvb       m6, m11, m9
1096 %endif
1097%else
1098    lea            r13, [blend_8x8_0]
1099    movu           m11, [r13+hq*2*2+16*2]
1100    movq           xm6, [top1q          -1]
1101    movq           xm9, [top2q          -1]
1102    movq           xm5, [dstq +strideq*2+1]
1103    movq          xm10, [dstq +stride3q +1]
1104    vinserti128     m6, [dstq +strideq*0-1], 1
1105    vinserti128     m9, [dstq +strideq*1-1], 1
1106    vinserti128     m5, [dst4q+strideq*0+1], 1
1107    vinserti128    m10, [dst4q+strideq*1+1], 1
1108    punpcklqdq      m6, m9
1109    punpcklqdq      m5, m10
1110    vpblendvb       m6, [rsp+gprsize+80+hq*8+64-8*2], m11
1111%endif
1112    ret
1113.d6k1:
1114%if %1 == 4
1115 %if %2 == 4
1116    movd           xm6, [topq +strideq*0]
1117    movd           xm9, [topq +strideq*1]
1118    movd           xm5, [dstq +strideq*2]
1119    movd          xm10, [dstq +stride3q ]
1120    vinserti128     m6, [dstq +strideq*0], 1
1121    vinserti128     m9, [dstq +strideq*1], 1
1122    vinserti128     m5, [dst4q+strideq*0], 1
1123    vinserti128    m10, [dst4q+strideq*1], 1
1124    punpckldq       m6, m9
1125    punpckldq       m5, m10
1126 %else
1127    movd           xm5, [dstq +strideq*2]
1128    movd           xm6, [topq +strideq*0]
1129    movd           xm9, [dst4q+strideq*2]
1130    pinsrd         xm5, [dstq +stride3q ], 1
1131    pinsrd         xm6, [topq +strideq*1], 1
1132    pinsrd         xm9, [dst4q+stride3q ], 1
1133    pinsrd         xm5, [dst4q+strideq*0], 2
1134    pinsrd         xm6, [dstq +strideq*0], 2
1135    pinsrd         xm9, [dst8q+strideq*0], 2
1136    pinsrd         xm5, [dst4q+strideq*1], 3
1137    pinsrd         xm6, [dstq +strideq*1], 3
1138    pinsrd         xm9, [dst8q+strideq*1], 3
1139    vinserti128     m6, xm5, 1
1140    vinserti128     m5, xm9, 1
1141 %endif
1142%else
1143    movq           xm5, [dstq +strideq*2]
1144    movq           xm9, [dst4q+strideq*0]
1145    movq           xm6, [top1q          ]
1146    movq          xm10, [dstq +strideq*0]
1147    movhps         xm5, [dstq +stride3q ]
1148    movhps         xm9, [dst4q+strideq*1]
1149    movhps         xm6, [top2q          ]
1150    movhps        xm10, [dstq +strideq*1]
1151    vinserti128     m5, xm9, 1
1152    vinserti128     m6, xm10, 1
1153%endif
1154    ret
1155.d7k1:
1156%if %1 == 4
1157 %if %2 == 4
1158    movd           xm5, [dstq +strideq*2-1]
1159    movd           xm9, [dstq +stride3q -1]
1160    movd           xm6, [topq +strideq*0+1]
1161    movd          xm10, [topq +strideq*1+1]
1162    pinsrb         xm5, [leftq+ 5], 0
1163    pinsrb         xm9, [leftq+ 7], 0
1164    vinserti128     m6, [dstq +strideq*0+1], 1
1165    vinserti128    m10, [dstq +strideq*1+1], 1
1166    vinserti128     m5, [dst4q+strideq*0-1], 1
1167    vinserti128     m9, [dst4q+strideq*1-1], 1
1168    punpckldq       m6, m10
1169    punpckldq       m5, m9
1170 %else
1171    movd           xm6, [topq +strideq*0+1]
1172    movq           xm9, [dstq +strideq*2-1]
1173    movq          xm10, [dst4q+strideq*0-1]
1174    movd          xm11, [dst4q+strideq*2-1]
1175    pinsrd         xm6, [topq +strideq*1+1], 1
1176    movhps         xm9, [dstq +stride3q -1]
1177    movhps        xm10, [dst4q+strideq*1-1]
1178    pinsrd        xm11, [dst4q+stride3q -1], 1
1179    pinsrd         xm6, [dstq +strideq*0+1], 2
1180    pinsrd        xm11, [dst8q+strideq*0-1], 2
1181    pinsrd         xm6, [dstq +strideq*1+1], 3
1182    pinsrd        xm11, [dst8q+strideq*1-1], 3
1183    shufps         xm5, xm9, xm10, q2020
1184    vinserti128     m5, xm11, 1
1185    pmovzxbw       m11, [leftq+5]
1186    psrldq         xm9, 2
1187    psrldq        xm10, 2
1188    shufps         xm9, xm10, q2020
1189    movu           m10, [blend_4x8_1+8]
1190    vinserti128     m6, xm9, 1
1191    vpblendvb       m5, m11, m10
1192 %endif
1193%else
1194    lea            r13, [blend_8x8_0+16]
1195    movq           xm5, [dstq +strideq*2-1]
1196    movq           xm9, [dst4q+strideq*0-1]
1197    movq           xm6, [top1q          +1]
1198    movq          xm10, [dstq +strideq*0+1]
1199    movhps         xm5, [dstq +stride3q -1]
1200    movhps         xm9, [dst4q+strideq*1-1]
1201    movhps         xm6, [top2q          +1]
1202    movhps        xm10, [dstq +strideq*1+1]
1203    movu           m11, [r13+hq*2*2+16*2]
1204    vinserti128     m5, xm9, 1
1205    vinserti128     m6, xm10, 1
1206    vpblendvb       m5, [rsp+gprsize+80+hq*8+64+8*2], m11
1207%endif
1208    ret
1209
1210.border_block:
1211 DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
1212%define rstk rsp
1213%assign stack_offset stack_offset_entry
1214%if %1 == 4 && %2 == 8
1215    PUSH            r9
1216 %assign regs_used 10
1217%else
1218 %assign regs_used 9
1219%endif
1220%if STACK_ALIGNMENT < 32
1221    PUSH  r%+regs_used
1222 %assign regs_used regs_used+1
1223%endif
1224    ALLOC_STACK 2*16+(%2+4)*32, 16
1225%define px rsp+2*16+2*32
1226
1227    pcmpeqw        m14, m14
1228    psllw          m14, 15                  ; 0x8000
1229
1230    ; prepare pixel buffers - body/right
1231%if %1 == 4
1232    INIT_XMM avx2
1233%endif
1234%if %2 == 8
1235    lea          dst4q, [dstq+strideq*4]
1236%endif
1237    lea       stride3q, [strideq*3]
1238    test         edgeb, 2                   ; have_right
1239    jz .no_right
1240    pmovzxbw        m1, [dstq+strideq*0]
1241    pmovzxbw        m2, [dstq+strideq*1]
1242    pmovzxbw        m3, [dstq+strideq*2]
1243    pmovzxbw        m4, [dstq+stride3q]
1244    mova     [px+0*32], m1
1245    mova     [px+1*32], m2
1246    mova     [px+2*32], m3
1247    mova     [px+3*32], m4
1248%if %2 == 8
1249    pmovzxbw        m1, [dst4q+strideq*0]
1250    pmovzxbw        m2, [dst4q+strideq*1]
1251    pmovzxbw        m3, [dst4q+strideq*2]
1252    pmovzxbw        m4, [dst4q+stride3q]
1253    mova     [px+4*32], m1
1254    mova     [px+5*32], m2
1255    mova     [px+6*32], m3
1256    mova     [px+7*32], m4
1257%endif
1258    jmp .body_done
1259.no_right:
1260%if %1 == 4
1261    movd           xm1, [dstq+strideq*0]
1262    movd           xm2, [dstq+strideq*1]
1263    movd           xm3, [dstq+strideq*2]
1264    movd           xm4, [dstq+stride3q]
1265    pmovzxbw       xm1, xm1
1266    pmovzxbw       xm2, xm2
1267    pmovzxbw       xm3, xm3
1268    pmovzxbw       xm4, xm4
1269    movq     [px+0*32], xm1
1270    movq     [px+1*32], xm2
1271    movq     [px+2*32], xm3
1272    movq     [px+3*32], xm4
1273%else
1274    pmovzxbw       xm1, [dstq+strideq*0]
1275    pmovzxbw       xm2, [dstq+strideq*1]
1276    pmovzxbw       xm3, [dstq+strideq*2]
1277    pmovzxbw       xm4, [dstq+stride3q]
1278    mova     [px+0*32], xm1
1279    mova     [px+1*32], xm2
1280    mova     [px+2*32], xm3
1281    mova     [px+3*32], xm4
1282%endif
1283    movd [px+0*32+%1*2], xm14
1284    movd [px+1*32+%1*2], xm14
1285    movd [px+2*32+%1*2], xm14
1286    movd [px+3*32+%1*2], xm14
1287%if %2 == 8
1288 %if %1 == 4
1289    movd           xm1, [dst4q+strideq*0]
1290    movd           xm2, [dst4q+strideq*1]
1291    movd           xm3, [dst4q+strideq*2]
1292    movd           xm4, [dst4q+stride3q]
1293    pmovzxbw       xm1, xm1
1294    pmovzxbw       xm2, xm2
1295    pmovzxbw       xm3, xm3
1296    pmovzxbw       xm4, xm4
1297    movq     [px+4*32], xm1
1298    movq     [px+5*32], xm2
1299    movq     [px+6*32], xm3
1300    movq     [px+7*32], xm4
1301 %else
1302    pmovzxbw       xm1, [dst4q+strideq*0]
1303    pmovzxbw       xm2, [dst4q+strideq*1]
1304    pmovzxbw       xm3, [dst4q+strideq*2]
1305    pmovzxbw       xm4, [dst4q+stride3q]
1306    mova     [px+4*32], xm1
1307    mova     [px+5*32], xm2
1308    mova     [px+6*32], xm3
1309    mova     [px+7*32], xm4
1310 %endif
1311    movd [px+4*32+%1*2], xm14
1312    movd [px+5*32+%1*2], xm14
1313    movd [px+6*32+%1*2], xm14
1314    movd [px+7*32+%1*2], xm14
1315%endif
1316.body_done:
1317
1318    ; top
1319    test         edgeb, 4                    ; have_top
1320    jz .no_top
1321    test         edgeb, 1                    ; have_left
1322    jz .top_no_left
1323    test         edgeb, 2                    ; have_right
1324    jz .top_no_right
1325    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
1326    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
1327    movu  [px-2*32-%1], m1
1328    movu  [px-1*32-%1], m2
1329    jmp .top_done
1330.top_no_right:
1331    pmovzxbw        m1, [topq+strideq*0-%1]
1332    pmovzxbw        m2, [topq+strideq*1-%1]
1333    movu [px-2*32-%1*2], m1
1334    movu [px-1*32-%1*2], m2
1335    movd [px-2*32+%1*2], xm14
1336    movd [px-1*32+%1*2], xm14
1337    jmp .top_done
1338.top_no_left:
1339    test         edgeb, 2                   ; have_right
1340    jz .top_no_left_right
1341    pmovzxbw        m1, [topq+strideq*0]
1342    pmovzxbw        m2, [topq+strideq*1]
1343    mova   [px-2*32+0], m1
1344    mova   [px-1*32+0], m2
1345    movd   [px-2*32-4], xm14
1346    movd   [px-1*32-4], xm14
1347    jmp .top_done
1348.top_no_left_right:
1349%if %1 == 4
1350    movd           xm1, [topq+strideq*0]
1351    pinsrd         xm1, [topq+strideq*1], 1
1352    pmovzxbw       xm1, xm1
1353    movq   [px-2*32+0], xm1
1354    movhps [px-1*32+0], xm1
1355%else
1356    pmovzxbw       xm1, [topq+strideq*0]
1357    pmovzxbw       xm2, [topq+strideq*1]
1358    mova   [px-2*32+0], xm1
1359    mova   [px-1*32+0], xm2
1360%endif
1361    movd   [px-2*32-4], xm14
1362    movd   [px-1*32-4], xm14
1363    movd [px-2*32+%1*2], xm14
1364    movd [px-1*32+%1*2], xm14
1365    jmp .top_done
1366.no_top:
1367    movu   [px-2*32-%1], m14
1368    movu   [px-1*32-%1], m14
1369.top_done:
1370
1371    ; left
1372    test         edgeb, 1                   ; have_left
1373    jz .no_left
1374    pmovzxbw       xm1, [leftq+ 0]
1375%if %2 == 8
1376    pmovzxbw       xm2, [leftq+ 8]
1377%endif
1378    movd   [px+0*32-4], xm1
1379    pextrd [px+1*32-4], xm1, 1
1380    pextrd [px+2*32-4], xm1, 2
1381    pextrd [px+3*32-4], xm1, 3
1382%if %2 == 8
1383    movd   [px+4*32-4], xm2
1384    pextrd [px+5*32-4], xm2, 1
1385    pextrd [px+6*32-4], xm2, 2
1386    pextrd [px+7*32-4], xm2, 3
1387%endif
1388    jmp .left_done
1389.no_left:
1390    movd   [px+0*32-4], xm14
1391    movd   [px+1*32-4], xm14
1392    movd   [px+2*32-4], xm14
1393    movd   [px+3*32-4], xm14
1394%if %2 == 8
1395    movd   [px+4*32-4], xm14
1396    movd   [px+5*32-4], xm14
1397    movd   [px+6*32-4], xm14
1398    movd   [px+7*32-4], xm14
1399%endif
1400.left_done:
1401
1402    ; bottom
1403    DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
1404    test         edgeb, 8                   ; have_bottom
1405    jz .no_bottom
1406    lea          dst8q, [dstq+%2*strideq]
1407    test         edgeb, 1                   ; have_left
1408    jz .bottom_no_left
1409    test         edgeb, 2                   ; have_right
1410    jz .bottom_no_right
1411    pmovzxbw        m1, [dst8q-(%1/2)]
1412    pmovzxbw        m2, [dst8q+strideq-(%1/2)]
1413    movu   [px+(%2+0)*32-%1], m1
1414    movu   [px+(%2+1)*32-%1], m2
1415    jmp .bottom_done
1416.bottom_no_right:
1417    pmovzxbw        m1, [dst8q-%1]
1418    pmovzxbw        m2, [dst8q+strideq-%1]
1419    movu  [px+(%2+0)*32-%1*2], m1
1420    movu  [px+(%2+1)*32-%1*2], m2
1421%if %1 == 8
1422    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
1423%endif
1424    movd  [px+(%2+0)*32+%1*2], xm14
1425    movd  [px+(%2+1)*32+%1*2], xm14
1426    jmp .bottom_done
1427.bottom_no_left:
1428    test          edgeb, 2                  ; have_right
1429    jz .bottom_no_left_right
1430    pmovzxbw        m1, [dst8q]
1431    pmovzxbw        m2, [dst8q+strideq]
1432    mova   [px+(%2+0)*32+0], m1
1433    mova   [px+(%2+1)*32+0], m2
1434    movd   [px+(%2+0)*32-4], xm14
1435    movd   [px+(%2+1)*32-4], xm14
1436    jmp .bottom_done
1437.bottom_no_left_right:
1438%if %1 == 4
1439    movd           xm1, [dst8q]
1440    pinsrd         xm1, [dst8q+strideq], 1
1441    pmovzxbw       xm1, xm1
1442    movq   [px+(%2+0)*32+0], xm1
1443    movhps [px+(%2+1)*32+0], xm1
1444%else
1445    pmovzxbw       xm1, [dst8q]
1446    pmovzxbw       xm2, [dst8q+strideq]
1447    mova   [px+(%2+0)*32+0], xm1
1448    mova   [px+(%2+1)*32+0], xm2
1449%endif
1450    movd   [px+(%2+0)*32-4], xm14
1451    movd   [px+(%2+1)*32-4], xm14
1452    movd  [px+(%2+0)*32+%1*2], xm14
1453    movd  [px+(%2+1)*32+%1*2], xm14
1454    jmp .bottom_done
1455.no_bottom:
1456    movu   [px+(%2+0)*32-%1], m14
1457    movu   [px+(%2+1)*32-%1], m14
1458.bottom_done:
1459
1460    ; actual filter
1461    INIT_YMM avx2
1462    DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
1463%undef edged
1464    ; register to shuffle values into after packing
1465    vbroadcasti128 m12, [shufb_lohi]
1466
1467    mov       dampingd, r7m
1468    xor          zerod, zerod
1469    movifnidn     prid, prim
1470    sub       dampingd, 31
1471    movifnidn  secdmpd, secdmpm
1472    test          prid, prid
1473    jz .border_sec_only
1474    movd           xm0, prid
1475    lzcnt      pridmpd, prid
1476    add        pridmpd, dampingd
1477    cmovs      pridmpd, zerod
1478    mov        [rsp+0], pridmpq                 ; pri_shift
1479    test       secdmpd, secdmpd
1480    jz .border_pri_only
1481    movd           xm1, secdmpd
1482    lzcnt      secdmpd, secdmpd
1483    add        secdmpd, dampingd
1484    cmovs      secdmpd, zerod
1485    mov        [rsp+8], secdmpq                 ; sec_shift
1486
1487    DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
1488    lea         tableq, [tap_table]
1489    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
1490    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
1491
1492    ; pri/sec_taps[k] [4 total]
1493    DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
1494    vpbroadcastb    m0, xm0                     ; pri_strength
1495    vpbroadcastb    m1, xm1                     ; sec_strength
1496    and           prid, 1
1497    lea           priq, [tableq+priq*2+8]       ; pri_taps
1498    lea           secq, [tableq+12]             ; sec_taps
1499
1500    BORDER_PREP_REGS %1, %2
1501%if %1*%2*2/mmsize > 1
1502.border_v_loop:
1503%endif
1504    BORDER_LOAD_BLOCK %1, %2, 1
1505.border_k_loop:
1506    vpbroadcastb    m2, [priq+kq]               ; pri_taps
1507    vpbroadcastb    m3, [secq+kq]               ; sec_taps
1508    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
1509    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
1510    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
1511    dec             kq
1512    jge .border_k_loop
1513
1514    vpbroadcastd   m10, [pw_2048]
1515    BORDER_ADJUST_PIXEL %1, m10, 1
1516%if %1*%2*2/mmsize > 1
1517 %define vloop_lines (mmsize/(%1*2))
1518    lea           dstq, [dstq+strideq*vloop_lines]
1519    add           stkq, 32*vloop_lines
1520    dec             hd
1521    jg .border_v_loop
1522%endif
1523    RET
1524
1525.border_pri_only:
1526 DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
1527    lea         tableq, [tap_table]
1528    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
1529 DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
1530    vpbroadcastb    m0, xm0                     ; pri_strength
1531    and           prid, 1
1532    lea           priq, [tableq+priq*2+8]       ; pri_taps
1533    BORDER_PREP_REGS %1, %2
1534    vpbroadcastd    m1, [pw_2048]
1535%if %1*%2*2/mmsize > 1
1536.border_pri_v_loop:
1537%endif
1538    BORDER_LOAD_BLOCK %1, %2
1539.border_pri_k_loop:
1540    vpbroadcastb    m2, [priq+kq]               ; pri_taps
1541    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
1542    dec             kq
1543    jge .border_pri_k_loop
1544    BORDER_ADJUST_PIXEL %1, m1
1545%if %1*%2*2/mmsize > 1
1546 %define vloop_lines (mmsize/(%1*2))
1547    lea           dstq, [dstq+strideq*vloop_lines]
1548    add           stkq, 32*vloop_lines
1549    dec             hd
1550    jg .border_pri_v_loop
1551%endif
1552    RET
1553
1554.border_sec_only:
1555 DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
1556    movd           xm1, secdmpd
1557    lzcnt      secdmpd, secdmpd
1558    add        secdmpd, dampingd
1559    cmovs      secdmpd, zerod
1560    mov        [rsp+8], secdmpq                 ; sec_shift
1561 DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
1562    lea         tableq, [tap_table]
1563    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
1564 DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
1565    vpbroadcastb    m1, xm1                     ; sec_strength
1566    lea           secq, [tableq+12]             ; sec_taps
1567    BORDER_PREP_REGS %1, %2
1568    vpbroadcastd    m0, [pw_2048]
1569%if %1*%2*2/mmsize > 1
1570.border_sec_v_loop:
1571%endif
1572    BORDER_LOAD_BLOCK %1, %2
1573.border_sec_k_loop:
1574    vpbroadcastb    m3, [secq+kq]               ; sec_taps
1575    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
1576    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
1577    dec             kq
1578    jge .border_sec_k_loop
1579    BORDER_ADJUST_PIXEL %1, m0
1580%if %1*%2*2/mmsize > 1
1581 %define vloop_lines (mmsize/(%1*2))
1582    lea           dstq, [dstq+strideq*vloop_lines]
1583    add           stkq, 32*vloop_lines
1584    dec             hd
1585    jg .border_sec_v_loop
1586%endif
1587    RET
1588%endmacro
1589
1590CDEF_FILTER 8, 8
1591CDEF_FILTER 4, 8
1592CDEF_FILTER 4, 4
1593
1594INIT_YMM avx2
1595cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
1596    lea       stride3q, [strideq*3]
1597    movq           xm0, [srcq+strideq*0]
1598    movq           xm1, [srcq+strideq*1]
1599    movq           xm2, [srcq+strideq*2]
1600    movq           xm3, [srcq+stride3q ]
1601    lea           srcq, [srcq+strideq*4]
1602    vpbroadcastq    m4, [srcq+stride3q ]
1603    vpbroadcastq    m5, [srcq+strideq*2]
1604    vpblendd        m0, m4, 0xf0
1605    vpblendd        m1, m5, 0xf0
1606    vpbroadcastq    m4, [srcq+strideq*1]
1607    vpbroadcastq    m5, [srcq+strideq*0]
1608    vpblendd        m2, m4, 0xf0
1609    vpblendd        m3, m5, 0xf0
1610    pxor            m4, m4
1611    punpcklbw       m0, m4
1612    punpcklbw       m1, m4
1613    punpcklbw       m2, m4
1614    punpcklbw       m3, m4
1615cglobal_label .main
1616    vpbroadcastd    m4, [pw_128]
1617    PROLOGUE 3, 4, 15
1618    psubw           m0, m4
1619    psubw           m1, m4
1620    psubw           m2, m4
1621    psubw           m3, m4
1622
1623    ; shuffle registers to generate partial_sum_diag[0-1] together
1624    vperm2i128      m7, m0, m0, 0x01
1625    vperm2i128      m6, m1, m1, 0x01
1626    vperm2i128      m5, m2, m2, 0x01
1627    vperm2i128      m4, m3, m3, 0x01
1628
1629    ; start with partial_sum_hv[0-1]
1630    paddw           m8, m0, m1
1631    paddw           m9, m2, m3
1632    phaddw         m10, m0, m1
1633    phaddw         m11, m2, m3
1634    paddw           m8, m9
1635    phaddw         m10, m11
1636    vextracti128   xm9, m8, 1
1637    vextracti128  xm11, m10, 1
1638    paddw          xm8, xm9                 ; partial_sum_hv[1]
1639    phaddw        xm10, xm11                ; partial_sum_hv[0]
1640    vinserti128     m8, xm10, 1
1641    vpbroadcastd    m9, [div_table+44]
1642    pmaddwd         m8, m8
1643    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]
1644
1645    ; create aggregates [lower half]:
1646    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
1647    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
1648    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
1649    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
1650    ; and [upper half]:
1651    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
1652    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
1653    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
1654    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
1655    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
1656
1657    pslldq          m9, m1, 2
1658    psrldq         m10, m1, 14
1659    pslldq         m11, m2, 4
1660    psrldq         m12, m2, 12
1661    pslldq         m13, m3, 6
1662    psrldq         m14, m3, 10
1663    paddw           m9, m11
1664    paddw          m10, m12
1665    paddw           m9, m13
1666    paddw          m10, m14
1667    pslldq         m11, m4, 8
1668    psrldq         m12, m4, 8
1669    pslldq         m13, m5, 10
1670    psrldq         m14, m5, 6
1671    paddw           m9, m11
1672    paddw          m10, m12
1673    paddw           m9, m13
1674    paddw          m10, m14
1675    pslldq         m11, m6, 12
1676    psrldq         m12, m6, 4
1677    pslldq         m13, m7, 14
1678    psrldq         m14, m7, 2
1679    paddw           m9, m11
1680    paddw          m10, m12
1681    paddw           m9, m13
1682    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
1683    vbroadcasti128 m14, [shufw_6543210x]
1684    vbroadcasti128 m13, [div_table+16]
1685    vbroadcasti128 m12, [div_table+0]
1686    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
1687    pshufb         m10, m14
1688    punpckhwd      m11, m9, m10
1689    punpcklwd       m9, m10
1690    pmaddwd        m11, m11
1691    pmaddwd         m9, m9
1692    pmulld         m11, m13
1693    pmulld          m9, m12
1694    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]
1695
1696    ; merge horizontally and vertically for partial_sum_alt[0-3]
1697    paddw          m10, m0, m1
1698    paddw          m11, m2, m3
1699    paddw          m12, m4, m5
1700    paddw          m13, m6, m7
1701    phaddw          m0, m4
1702    phaddw          m1, m5
1703    phaddw          m2, m6
1704    phaddw          m3, m7
1705
1706    ; create aggregates [lower half]:
1707    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
1708    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
1709    ; and [upper half]:
1710    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
1711    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
1712    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
1713
1714    pslldq          m4, m11, 2
1715    psrldq         m11, 14
1716    pslldq          m5, m12, 4
1717    psrldq         m12, 12
1718    pslldq          m6, m13, 6
1719    psrldq         m13, 10
1720    paddw           m4, m10
1721    paddw          m11, m12
1722    vpbroadcastd   m12, [div_table+44]
1723    paddw           m5, m6
1724    paddw          m11, m13                 ; partial_sum_alt[3/2] right
1725    vbroadcasti128 m13, [div_table+32]
1726    paddw           m4, m5                  ; partial_sum_alt[3/2] left
1727    pshuflw         m5, m11, q3012
1728    punpckhwd       m6, m11, m4
1729    punpcklwd       m4, m5
1730    pmaddwd         m6, m6
1731    pmaddwd         m4, m4
1732    pmulld          m6, m12
1733    pmulld          m4, m13
1734    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]
1735
1736    ; create aggregates [lower half]:
1737    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
1738    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
1739    ; and [upper half]:
1740    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
1741    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
1742    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
1743
1744    pslldq          m5, m1, 2
1745    psrldq          m1, 14
1746    pslldq          m6, m2, 4
1747    psrldq          m2, 12
1748    pslldq          m7, m3, 6
1749    psrldq          m3, 10
1750    paddw           m5, m0
1751    paddw           m1, m2
1752    paddw           m6, m7
1753    paddw           m1, m3                  ; partial_sum_alt[0/1] right
1754    paddw           m5, m6                  ; partial_sum_alt[0/1] left
1755    pshuflw         m0, m1, q3012
1756    punpckhwd       m1, m5
1757    punpcklwd       m5, m0
1758    pmaddwd         m1, m1
1759    pmaddwd         m5, m5
1760    pmulld          m1, m12
1761    pmulld          m5, m13
1762    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]
1763
1764    mova           xm0, [pd_47130256+ 16]
1765    mova            m1, [pd_47130256]
1766    phaddd          m9, m8
1767    phaddd          m5, m4
1768    phaddd          m9, m5
1769    vpermd          m0, m9                  ; cost[0-3]
1770    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]
1771
1772    ; now find the best cost
1773    pmaxsd         xm2, xm0, xm1
1774    pshufd         xm3, xm2, q1032
1775    pmaxsd         xm2, xm3
1776    pshufd         xm3, xm2, q2301
1777    pmaxsd         xm2, xm3 ; best cost
1778
1779    ; find the idx using minpos
1780    ; make everything other than the best cost negative via subtraction
1781    ; find the min of unsigned 16-bit ints to sort out the negative values
1782    psubd          xm4, xm1, xm2
1783    psubd          xm3, xm0, xm2
1784    packssdw       xm3, xm4
1785    phminposuw     xm3, xm3
1786
1787    ; convert idx to 32-bits
1788    psrld          xm3, 16
1789    movd           eax, xm3
1790
1791    ; get idx^4 complement
1792    vpermd          m3, m1
1793    psubd          xm2, xm3
1794    psrld          xm2, 10
1795    movd        [varq], xm2
1796    RET
1797
1798%endif ; ARCH_X86_64
1799