1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
; Emit a dispatch table of 32-bit offsets, each relative to the table start,
; pointing at local labels of the AVX2 function %1.
; %1     = function name stem (also names the table as %1_jmptable)
; %2...  = local label names; one dd entry is emitted per label
%macro JMP_TABLE 2-*
 %xdefine %1_jmptable %%table                   ; export table under a stable name
 %xdefine %%base mangle(private_prefix %+ _%1_avx2)
 %%table:
 %rep %0 - 1
    dd %%base %+ .%2 - %%table                  ; offset of label .%2 from table base
  %rotate 1
 %endrep
%endmacro
40
; Jump table for cdef_filter_%1_8bpc: 28 dword entries, two (k=0/1) per
; direction row. PREP_REGS indexes it as [table + dir*8], and
; ACCUMULATE_TAP_BYTE reads [dirq + k*4 + tap_sel*8] with tap_sel 0/2/4
; selecting dir-2 / dir / dir+2. The leading d6/d7 and trailing d0/d1 rows
; duplicate entries so those dir+-2 lookups never need a mod-8 wrap.
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
    d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
    d0k0, d0k1, d1k0, d1k1
%endmacro
48
SECTION_RODATA 32

; NOTE(review): pd_47130256, div_table, shufw_6543210x, shufb_lohi and pw_128
; are not referenced in this chunk — presumably used by cdef_dir; confirm
; against the rest of the file.
pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
; blend_* masks: 0x80 bytes select the second source in vpblendvb
blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
               dd 0x80, 0x00, 0x00
blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
               dd 0x00, 0x00
blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000
blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
               dd 0x0000, 0x0000
blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
pw_128:        times 2 dw 128
pw_2048:       times 2 dw 2048
; tap_table layout (offsets grounded in how it is indexed below):
;   +0..7:   shift masks, indexed by damping shift  ([tableq+pridmpq/secdmpq])
;   +8..11:  pri tap pairs {4,2} / {3,3}, selected by pri&1 ([tableq+priq*2+8])
;   +12..13: sec taps {2,1}                               ([tableq+12])
;   +14..:   per-direction tap offsets, encoded as dy*16+dx, used by the
;            border path ([tableq+dirq*2+14]; px rows are 32 bytes = 16 words)
tap_table:     ; masks for 8 bit shifts
               db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
               ; weights
               db  4,  2,  3,  3,  2,  1
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1
               db  1 * 16 + 0,  2 * 16 + 0
               db  1 * 16 + 0,  2 * 16 - 1
               ; the last 6 are repeats of the first 6 so we don't need to & 7
               db -1 * 16 + 1, -2 * 16 + 2
               db  0 * 16 + 1, -1 * 16 + 2
               db  0 * 16 + 1,  0 * 16 + 2
               db  0 * 16 + 1,  1 * 16 + 2
               db  1 * 16 + 1,  2 * 16 + 2
               db  1 * 16 + 0,  2 * 16 + 1

CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8
92SECTION .text
93
; Set up per-block registers for the full-edge (non-border) filter path.
; %1 = block width, %2 = block height.
; On exit, dirq points into the jump table at the entry group for `dir`
; (8 bytes, i.e. two dword entries, per direction row).
; Fix: dropped the no-op `lea top1q, [top1q+strideq*0]` (adds stride*0 back
; into the same register — a dead instruction).
%macro PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m                     ; dir = 8th argument (r7m)
    lea         tableq, [cdef_filter_%1x%2_8bpc_jmptable]
    lea           dirq, [tableq+dirq*2*4]       ; table + dir*8
%if %1 == 4
 %if %2 == 4
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, stride3, k
 %else
  DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \
              table, dir, dirjmp, dst4, stride3, k
    lea          dst4q, [dstq+strideq*4]        ; second 4-row half of the 4x8 block
 %endif
%else
  DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \
              table, dir, dirjmp, top2, stride3, k
    mov             hq, -8                      ; row counter: two passes of 4 (hq += 4, jl)
    lea          top2q, [top1q+strideq*1]       ; second row above the block
%endif
%if %1 == 4
    ; for %1 == 8, stride3 (r11) was already set by the entry code
    ; (lea r11, [strideq*3] in the 8x8 setup)
    lea       stride3q, [strideq*3]
%endif
%endmacro
119
; Load the %1x%2 source block from dst into m4 (packed bytes, rows
; interleaved to match the tap-load layout) and reset the accumulators.
; %3 (init_min_max): when 1, seed the clip bounds from the center pixels.
; NOTE(review): m7/m8 comments were swapped — ACCUMULATE_TAP_BYTE updates
; them with pmaxub m7 / pminub m8, and ADJUST_PIXEL clamps with
; pminub m4,m7 / pmaxub m4,m8, so m7 is the running MAX and m8 the MIN
; (matching BORDER_LOAD_BLOCK).
%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1                       ; k loops 1 -> 0 in .k_loop
    pxor           m15, m15                     ; sum
%if %2 == 8
    pxor           m12, m12                     ; second sum accumulator for h == 8
 %if %1 == 4
    ; 4x8: four rows per half, second half (dst4q) in the upper lane
    movd           xm4, [dstq +strideq*0]
    movd           xm6, [dstq +strideq*1]
    movd           xm5, [dstq +strideq*2]
    movd           xm7, [dstq +stride3q ]
    vinserti128     m4, [dst4q+strideq*0], 1
    vinserti128     m6, [dst4q+strideq*1], 1
    vinserti128     m5, [dst4q+strideq*2], 1
    vinserti128     m7, [dst4q+stride3q ], 1
    punpckldq       m4, m6
    punpckldq       m5, m7
 %else
    ; 8x8 (4 rows per pass): rows 0/2 in xm4, rows 1/3 via m5
    movq           xm4, [dstq+strideq*0]
    movq           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
 %endif
    punpcklqdq      m4, m5
%else
    ; 4x4: one dword per row
    movd           xm4, [dstq+strideq*0]
    movd           xm5, [dstq+strideq*1]
    vinserti128     m4, [dstq+strideq*2], 1
    vinserti128     m5, [dstq+stride3q ], 1
    punpckldq       m4, m5
%endif
%if %3 == 1
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro
155
; Fetch one (dir, k) tap pair through the jump table and accumulate
; constrain(diff) * tap_weight into the word sums (m15, and m12 for h == 8).
; The called .dXkY routine leaves p0 in m5 and p1 in m6; px stays in m4.
; %1 tap_offset: 0 / 2 / 4 selects the dir-2 / dir / dir+2 table group
; %2 damping shift (memory operand), %3 its 8-bit shift mask,
; %4 strength (broadcast bytes), %5 mul_tap weights, %6 w, %7 h,
; %8 clip: when 1, fold p0/p1 into the running max (m7) / min (m8)
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, h, clip
    ; load p0/p1
    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]      ; table-relative offset of .dXkY
    add        dirjmpq, tableq
    call       dirjmpq

%if %8 == 1
    ; running max/min over the taps for the final clamp in ADJUST_PIXEL
    pmaxub          m7, m5
    pminub          m8, m5
    pmaxub          m7, m6
    pminub          m8, m6
%endif

    ; accumulate sum[m15] over p0/p1
%if %7 == 4
    ; h == 4: interleave p0/p1 and a duplicated px so one pass covers both
    punpcklbw       m5, m6
    punpcklbw       m6, m4, m4
    psubusb         m9, m5, m6
    psubusb         m5, m6, m5
    por             m9, m5     ; abs_diff_p01(p01 - px)
    pcmpeqb         m5, m9     ; 0xff where px >= p01, i.e. diff <= 0
    por             m5, %5
    psignb          m6, %5, m5 ; tap weights with the sign of the diff
    psrlw           m5, m9, %2 ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5 ; constrain: max(strength - (absdiff >> shift), 0)
    pminub          m5, m9     ; ... min'ed with absdiff
    pmaddubsw       m5, m6
    paddw          m15, m5
%else
    ; h == 8: p0 and p1 handled separately, high/low halves split into m12/m15
    psubusb         m9, m5, m4
    psubusb         m5, m4, m5
    psubusb        m11, m6, m4
    psubusb         m6, m4, m6
    por             m9, m5      ; abs_diff_p0(p0 - px)
    por            m11, m6      ; abs_diff_p1(p1 - px)
    pcmpeqb         m5, m9      ; sign masks (0xff where diff <= 0)
    pcmpeqb         m6, m11
    punpckhbw      m10, m9, m11
    punpcklbw       m9, m11
    por             m5, %5
    por            m11, m6, %5
    punpckhbw       m6, m5, m11
    punpcklbw       m5, m11
    psignb         m11, %5, m6
    psrlw           m6, m10, %2 ; emulate 8-bit shift
    pand            m6, %3
    psubusb         m6, %4, m6
    pminub          m6, m10
    pmaddubsw       m6, m11
    paddw          m12, m6
    psignb         m11, %5, m5
    psrlw           m5, m9, %2  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5
    pminub          m5, m9
    pmaddubsw       m5, m11
    paddw          m15, m5
%endif
%endmacro
217
; Round the accumulated sums and apply them to the pixels, then store.
; Rounding is (sum - (sum < 0) + 8) >> 4: pcmpgtw adds -1 to negative sums,
; and pmulhrsw by 2048 computes (x*2048 + 0x4000) >> 15 == (x + 8) >> 4.
; %1 w, %2 h, %3 zero register (clobbered in the h == 4 path!),
; %4 pw_2048 constant, %5 clip: clamp result to [min(m8), max(m7)].
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
%if %2 == 4
 %if %5 == 1
    punpcklbw       m4, %3                      ; widen px to words for paddw
 %endif
    pcmpgtw         %3, m15
    paddw          m15, %3                      ; sum -= (sum < 0)
    pmulhrsw       m15, %4                      ; (sum + 8) >> 4
 %if %5 == 0
    packsswb       m15, m15
    paddb           m4, m15
 %else
    paddw           m4, m15
    packuswb        m4, m4 ; clip px in [0x0,0xff]
    pminub          m4, m7                      ; clamp to tap max
    pmaxub          m4, m8                      ; clamp to tap min
 %endif
    vextracti128   xm5, m4, 1
    movd   [dstq+strideq*0], xm4
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+strideq*1], xm4, 1
    pextrd [dstq+stride3q ], xm5, 1
%else
    ; h == 8: two word accumulators (m12 high halves, m15 low halves)
    pcmpgtw         m6, %3, m12
    pcmpgtw         m5, %3, m15
    paddw          m12, m6
    paddw          m15, m5
 %if %5 == 1
    punpckhbw       m5, m4, %3
    punpcklbw       m4, %3
 %endif
    pmulhrsw       m12, %4
    pmulhrsw       m15, %4
 %if %5 == 0
    packsswb       m15, m12
    paddb           m4, m15
 %else
    paddw           m5, m12
    paddw           m4, m15
    packuswb        m4, m5 ; clip px in [0x0,0xff]
    pminub          m4, m7
    pmaxub          m4, m8
 %endif
    vextracti128   xm5, m4, 1
 %if %1 == 4
    ; 4x8: low lane -> first 4 rows (dstq), high lane -> last 4 (dst4q)
    movd   [dstq +strideq*0], xm4
    movd   [dst4q+strideq*0], xm5
    pextrd [dstq +strideq*1], xm4, 1
    pextrd [dst4q+strideq*1], xm5, 1
    pextrd [dstq +strideq*2], xm4, 2
    pextrd [dst4q+strideq*2], xm5, 2
    pextrd [dstq +stride3q ], xm4, 3
    pextrd [dst4q+stride3q ], xm5, 3
 %else
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+stride3q ], xm5
 %endif
%endif
%endmacro
279
; Register setup for the border (partial-edge) path, which filters out of a
; padded intermediate buffer `px` (a stack buffer defined by the border entry
; code, not visible in this chunk) instead of dst directly.
; dirq points at the per-direction byte offsets in tap_table: +14 skips the
; 8 shift masks and 6 tap weights, and each direction row is 2 bytes.
%macro BORDER_PREP_REGS 2 ; w, h
    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
    mov           dird, r7m                     ; dir = 8th argument (r7m)
    lea           dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
 %if %1 == 4
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
 %else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
 %endif
    mov             hd, %1*%2*2/mmsize          ; number of register-sized row groups
%else
    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
%endif
    lea           stkq, [px]                    ; cursor into the padded pixel buffer
    pxor           m11, m11                     ; zero, reused for rounding sign
%endmacro
297
; Load the center pixels (as words) from the padded buffer into m4 and reset
; the sum; rows in the buffer have a 32-byte pitch (hence the 32*N offsets).
; %3 (init_min_max): when 1, seed the clip bounds for ACCUMULATE_TAP_WORD.
%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
    mov             kd, 1                       ; k loops 1 -> 0
%if %1 == 4
    ; w == 4: pack four 8-byte rows into one ymm
    movq           xm4, [stkq+32*0]
    movhps         xm4, [stkq+32*1]
    movq           xm5, [stkq+32*2]
    movhps         xm5, [stkq+32*3]
    vinserti128     m4, xm5, 1
%else
    ; w == 8: two 16-byte rows
    mova           xm4, [stkq+32*0]             ; px
    vinserti128     m4, [stkq+32*1], 1
%endif
    pxor           m15, m15                     ; sum
%if %3 == 1
    mova            m7, m4                      ; max
    mova            m8, m4                      ; min
%endif
%endmacro
316
; Border-path tap accumulation: reads p0/p1 at +-off from the padded word
; buffer (32-byte row pitch; the signed byte offset encodes dy*16+dx, and
; offq*2 converts words to bytes), then accumulates
; constrain(diff) * tap_weight into m15.
; %1 tap_offset selects the direction row, %2 damping shift, %3 shift mask,
; %4 strength, %5 mul_tap weights, %6 w, %7 clip (track m7 max / m8 min).
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
                                 ; mul_tap, w, clip
    ; load p0/p1
    movsx         offq, byte [dirq+kq+%1]       ; off1
%if %6 == 4
    movq           xm5, [stkq+offq*2+32*0]      ; p0
    movq           xm6, [stkq+offq*2+32*2]
    movhps         xm5, [stkq+offq*2+32*1]
    movhps         xm6, [stkq+offq*2+32*3]
    vinserti128     m5, xm6, 1
%else
    movu           xm5, [stkq+offq*2+32*0]      ; p0
    vinserti128     m5, [stkq+offq*2+32*1], 1
%endif
    neg           offq                          ; -off1
%if %6 == 4
    movq           xm6, [stkq+offq*2+32*0]      ; p1
    movq           xm9, [stkq+offq*2+32*2]
    movhps         xm6, [stkq+offq*2+32*1]
    movhps         xm9, [stkq+offq*2+32*3]
    vinserti128     m6, xm9, 1
%else
    movu           xm6, [stkq+offq*2+32*0]      ; p1
    vinserti128     m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
    ; out of bounds values are set to a value that is a both a large unsigned
    ; value and a negative signed value.
    ; use signed max and unsigned min to remove them
    pmaxsw          m7, m5                      ; max after p0
    pminuw          m8, m5                      ; min after p0
    pmaxsw          m7, m6                      ; max after p1
    pminuw          m8, m6                      ; min after p1
%endif

    ; accumulate sum[m15] over p0/p1
    ; calculate difference before converting
    psubw           m5, m4                      ; diff_p0(p0 - px)
    psubw           m6, m4                      ; diff_p1(p1 - px)

    ; convert to 8-bits with signed saturation
    ; saturating to large diffs has no impact on the results
    packsswb        m5, m6

    ; group into pairs so we can accumulate using maddubsw
    pshufb          m5, m12
    pabsb           m9, m5
    psignb         m10, %5, m5                  ; tap weights signed by the diff
    psrlw           m5, m9, %2                  ; emulate 8-bit shift
    pand            m5, %3
    psubusb         m5, %4, m5                  ; constrain: max(strength - (absdiff >> shift), 0)

    ; use unsigned min since abs diff can equal 0x80
    pminub          m5, m9
    pmaddubsw       m5, m10
    paddw          m15, m5
%endmacro
374
; Border-path epilogue: round the sum as (sum - (sum < 0) + 8) >> 4
; (pmulhrsw by 2048 == (x + 8) >> 4; m11 is the zero register), add it to
; the word pixels in m4, optionally clamp to the tap min/max, and store.
; %1 w, %2 pw_2048 constant, %3 clip: clamp to [min(m8), max(m7)]
; (signed max / unsigned min pairing matches ACCUMULATE_TAP_WORD's
; out-of-bounds encoding).
%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
    pcmpgtw         m9, m11, m15
    paddw          m15, m9                      ; sum -= (sum < 0)
    pmulhrsw       m15, %2                      ; (sum + 8) >> 4
    paddw           m4, m15
%if %3 == 1
    pminsw          m4, m7                      ; clamp to tap max
    pmaxsw          m4, m8                      ; clamp to tap min
%endif
    packuswb        m4, m4
    vextracti128   xm5, m4, 1
%if %1 == 4
    movd   [dstq+strideq*0], xm4
    pextrd [dstq+strideq*1], xm4, 1
    movd   [dstq+strideq*2], xm5
    pextrd [dstq+stride3q ], xm5, 1
%else
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*1], xm5
%endif
%endmacro
396
397%macro CDEF_FILTER 2 ; w, h
398INIT_YMM avx2
399cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \
400                                          pri, sec, dir, damping, edge
401%assign stack_offset_entry stack_offset
402    mov          edged, edgem
403    cmp          edged, 0xf
404    jne .border_block
405
406    PUSH           r10
407    PUSH           r11
408%if %2 == 4
409 %assign regs_used 12
410 %if STACK_ALIGNMENT < 32
411    PUSH  r%+regs_used
412  %assign regs_used regs_used+1
413 %endif
414    ALLOC_STACK   0x60, 16
415    pmovzxbw       xm0, [leftq+1]
416    vpermq          m0, m0, q0110
417    psrldq          m1, m0, 4
418    vpalignr        m2, m0, m0, 12
419    movu    [rsp+0x10], m0
420    movu    [rsp+0x28], m1
421    movu    [rsp+0x40], m2
422%elif %1 == 4
423    PUSH           r12
424 %assign regs_used 13
425 %if STACK_ALIGNMENT < 32
426    PUSH  r%+regs_used
427   %assign regs_used regs_used+1
428 %endif
429    ALLOC_STACK 8*2+%1*%2*1, 16
430    pmovzxwd        m0, [leftq]
431    mova    [rsp+0x10], m0
432%else
433    PUSH           r12
434    PUSH           r13
435 %assign regs_used 14
436 %if STACK_ALIGNMENT < 32
437    PUSH  r%+regs_used
438  %assign regs_used regs_used+1
439 %endif
440    ALLOC_STACK 8*4+%1*%2*2+32, 16
441    lea            r11, [strideq*3]
442    movu           xm4, [dstq+strideq*2]
443    pmovzxwq        m0, [leftq+0]
444    pmovzxwq        m1, [leftq+8]
445    vinserti128     m4, [dstq+r11], 1
446    pmovzxbd        m2, [leftq+1]
447    pmovzxbd        m3, [leftq+9]
448    mov       [rsp+16], botq
449    mova    [rsp+0x20], m0
450    mova    [rsp+0x40], m1
451    mova    [rsp+0x60], m2
452    mova    [rsp+0x80], m3
453    mova    [rsp+0xa0], m4
454    lea           botq, [dstq+strideq*4]
455%endif
456
457 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
458    mov       dampingd, r8m
459    xor          zerod, zerod
460    movifnidn     prid, prim
461    sub       dampingd, 31
462    movifnidn  secdmpd, secdmpm
463    test          prid, prid
464    jz .sec_only
465    movd           xm0, prid
466    lzcnt      pridmpd, prid
467    add        pridmpd, dampingd
468    cmovs      pridmpd, zerod
469    mov        [rsp+0], pridmpq                 ; pri_shift
470    test       secdmpd, secdmpd
471    jz .pri_only
472    movd           xm1, secdmpd
473    lzcnt      secdmpd, secdmpd
474    add        secdmpd, dampingd
475    mov        [rsp+8], secdmpq                 ; sec_shift
476
477 DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
478    lea         tableq, [tap_table]
479    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
480    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
481
482    ; pri/sec_taps[k] [4 total]
483 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir
484    vpbroadcastb    m0, xm0                     ; pri_strength
485    vpbroadcastb    m1, xm1                     ; sec_strength
486    and           prid, 1
487    lea           priq, [tableq+priq*2+8]       ; pri_taps
488    lea           secq, [tableq+12]             ; sec_taps
489
490    PREP_REGS       %1, %2
491%if %1*%2 > mmsize
492.v_loop:
493%endif
494    LOAD_BLOCK      %1, %2, 1
495.k_loop:
496    vpbroadcastb    m2, [priq+kq]                          ; pri_taps
497    vpbroadcastb    m3, [secq+kq]                          ; sec_taps
498    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
499    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
500    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
501    dec             kq
502    jge .k_loop
503
504    vpbroadcastd   m10, [pw_2048]
505    pxor            m9, m9
506    ADJUST_PIXEL    %1, %2, m9, m10, 1
507%if %1*%2 > mmsize
508    lea           dstq, [dstq+strideq*4]
509    lea          top1q, [rsp+0xa0]
510    lea          top2q, [rsp+0xb0]
511    mov           botq, [rsp+16]
512    add             hq, 4
513    jl .v_loop
514%endif
515    RET
516
517.pri_only:
518 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp
519    lea         tableq, [tap_table]
520    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
521    ; pri/sec_taps[k] [4 total]
522 DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir
523    vpbroadcastb    m0, xm0                     ; pri_strength
524    and           prid, 1
525    lea           priq, [tableq+priq*2+8]       ; pri_taps
526    PREP_REGS       %1, %2
527    vpbroadcastd    m3, [pw_2048]
528    pxor            m1, m1
529%if %1*%2 > mmsize
530.pri_v_loop:
531%endif
532    LOAD_BLOCK      %1, %2
533.pri_k_loop:
534    vpbroadcastb    m2, [priq+kq]                       ; pri_taps
535    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
536    dec             kq
537    jge .pri_k_loop
538    ADJUST_PIXEL    %1, %2, m1, m3
539%if %1*%2 > mmsize
540    lea           dstq, [dstq+strideq*4]
541    lea          top1q, [rsp+0xa0]
542    lea          top2q, [rsp+0xb0]
543    mov           botq, [rsp+16]
544    add             hq, 4
545    jl .pri_v_loop
546%endif
547    RET
548
549.sec_only:
550 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping
551    movd           xm1, secdmpd
552    lzcnt      secdmpd, secdmpd
553    add        secdmpd, dampingd
554    mov        [rsp+8], secdmpq                 ; sec_shift
555 DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table
556    lea         tableq, [tap_table]
557    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
558    ; pri/sec_taps[k] [4 total]
559 DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir
560    vpbroadcastb    m1, xm1                     ; sec_strength
561    lea           secq, [tableq+12]             ; sec_taps
562    PREP_REGS       %1, %2
563    vpbroadcastd    m2, [pw_2048]
564    pxor            m0, m0
565%if %1*%2 > mmsize
566.sec_v_loop:
567%endif
568    LOAD_BLOCK      %1, %2
569.sec_k_loop:
570    vpbroadcastb    m3, [secq+kq]                       ; sec_taps
571    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
572    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
573    dec             kq
574    jge .sec_k_loop
575    ADJUST_PIXEL    %1, %2, m0, m2
576%if %1*%2 > mmsize
577    lea           dstq, [dstq+strideq*4]
578    lea          top1q, [rsp+0xa0]
579    lea          top2q, [rsp+0xb0]
580    mov           botq, [rsp+16]
581    add             hq, 4
582    jl .sec_v_loop
583%endif
584    RET
585
586.d0k0:
587%if %1 == 4
588 %if %2 == 4
589    vpbroadcastq    m6, [dstq+strideq*1-1]
590    vpbroadcastq   m10, [dstq+strideq*2-1]
591    movd           xm5, [topq+strideq*1+1]
592    movd           xm9, [dstq+strideq*0+1]
593    psrldq         m11, m6, 2
594    psrldq         m12, m10, 2
595    vinserti128     m6, [dstq+stride3q -1], 1
596    vinserti128    m10, [botq          -1], 1
597    vpblendd        m5, m11, 0x10
598    vpblendd        m9, m12, 0x10
599    movu           m11, [blend_4x4+16]
600    punpckldq       m6, m10
601    punpckldq       m5, m9
602    vpblendvb       m6, [rsp+gprsize+0x28], m11
603 %else
604    movd           xm5, [topq +strideq*1+1]
605    movq           xm6, [dstq +strideq*1-1]
606    movq          xm10, [dstq +stride3q -1]
607    movq          xm11, [dst4q+strideq*1-1]
608    pinsrd         xm5, [dstq +strideq*0+1], 1
609    movhps         xm6, [dstq +strideq*2-1]
610    movhps        xm10, [dst4q+strideq*0-1]
611    movhps        xm11, [dst4q+strideq*2-1]
612    psrldq         xm9, xm6, 2
613    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
614    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
615    psrldq         xm9, xm11, 2
616    psrldq        xm10, 2
617    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
618    movd           xm9, [dst4q+stride3q -1]
619    pinsrd         xm9, [botq           -1], 1
620    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
621    pmovzxbw        m9, [leftq+3]
622    vinserti128     m6, xm11, 1
623    movu           m11, [blend_4x8_0+4]
624    vinserti128     m5, xm10, 1
625    vpblendvb       m6, m9, m11
626 %endif
627%else
628    lea            r13, [blend_8x8_0+16]
629    movq           xm5, [top2q         +1]
630    vbroadcasti128 m10, [dstq+strideq*1-1]
631    vbroadcasti128 m11, [dstq+strideq*2-1]
632    movhps         xm5, [dstq+strideq*0+1]
633    vinserti128     m6, m10, [dstq+stride3q-1], 1
634    vinserti128     m9, m11, [botq         -1], 1
635    psrldq         m10, 2
636    psrldq         m11, 2
637    punpcklqdq      m6, m9
638    movu            m9, [r13+hq*2*1+16*1]
639    punpcklqdq     m10, m11
640    vpblendd        m5, m10, 0xF0
641    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9
642%endif
643    ret
644.d1k0:
645.d2k0:
646.d3k0:
647%if %1 == 4
648 %if %2 == 4
649    movq           xm6, [dstq+strideq*0-1]
650    movq           xm9, [dstq+strideq*1-1]
651    vinserti128     m6, [dstq+strideq*2-1], 1
652    vinserti128     m9, [dstq+stride3q -1], 1
653    movu           m11, [rsp+gprsize+0x10]
654    pcmpeqd        m12, m12
655    psrldq          m5, m6, 2
656    psrldq         m10, m9, 2
657    psrld          m12, 24
658    punpckldq       m6, m9
659    punpckldq       m5, m10
660    vpblendvb       m6, m11, m12
661 %else
662    movq           xm6, [dstq +strideq*0-1]
663    movq           xm9, [dstq +strideq*2-1]
664    movhps         xm6, [dstq +strideq*1-1]
665    movhps         xm9, [dstq +stride3q -1]
666    movq          xm10, [dst4q+strideq*0-1]
667    movhps        xm10, [dst4q+strideq*1-1]
668    psrldq         xm5, xm6, 2
669    psrldq        xm11, xm9, 2
670    shufps         xm5, xm11, q2020
671    movq          xm11, [dst4q+strideq*2-1]
672    movhps        xm11, [dst4q+stride3q -1]
673    shufps         xm6, xm9, q2020
674    shufps         xm9, xm10, xm11, q2020
675    vinserti128     m6, xm9, 1
676    pmovzxbw        m9, [leftq+1]
677    psrldq        xm10, 2
678    psrldq        xm11, 2
679    shufps        xm10, xm11, q2020
680    vpbroadcastd   m11, [blend_4x8_0+4]
681    vinserti128     m5, xm10, 1
682    vpblendvb       m6, m9, m11
683 %endif
684%else
685    movu           xm5, [dstq+strideq*0-1]
686    movu           xm9, [dstq+strideq*1-1]
687    vinserti128     m5, [dstq+strideq*2-1], 1
688    vinserti128     m9, [dstq+stride3q -1], 1
689    movu           m10, [blend_8x8_0+16]
690    punpcklqdq      m6, m5, m9
691    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64], m10
692    psrldq          m5, 2
693    psrldq          m9, 2
694    punpcklqdq      m5, m9
695%endif
696    ret
697.d4k0:
698%if %1 == 4
699 %if %2 == 4
700    vpbroadcastq   m10, [dstq+strideq*1-1]
701    vpbroadcastq   m11, [dstq+strideq*2-1]
702    movd           xm6, [topq+strideq*1-1]
703    movd           xm9, [dstq+strideq*0-1]
704    psrldq          m5, m10, 2
705    psrldq         m12, m11, 2
706    vpblendd        m6, m10, 0x10
707    vpblendd        m9, m11, 0x10
708    movu           m10, [blend_4x4]
709    vinserti128     m5, [dstq+stride3q +1], 1
710    vinserti128    m12, [botq          +1], 1
711    punpckldq       m6, m9
712    punpckldq       m5, m12
713    vpblendvb       m6, [rsp+gprsize+0x40], m10
714 %else
715    movd           xm6, [topq +strideq*1-1]
716    movq           xm9, [dstq +strideq*1-1]
717    movq          xm10, [dstq +stride3q -1]
718    movq          xm11, [dst4q+strideq*1-1]
719    pinsrd         xm6, [dstq +strideq*0-1], 1
720    movhps         xm9, [dstq +strideq*2-1]
721    movhps        xm10, [dst4q+strideq*0-1]
722    movhps        xm11, [dst4q+strideq*2-1]
723    psrldq         xm5, xm9, 2
724    shufps         xm6, xm9, q2010
725    psrldq         xm9, xm10, 2
726    shufps         xm5, xm9, q2020
727    shufps        xm10, xm11, q2020
728    movd           xm9, [dst4q+stride3q +1]
729    vinserti128     m6, xm10, 1
730    pinsrd         xm9, [botq           +1], 1
731    psrldq        xm11, 2
732    pmovzxbw       m10, [leftq-1]
733    shufps        xm11, xm9, q1020
734    movu            m9, [blend_4x8_0]
735    vinserti128     m5, xm11, 1
736    vpblendvb       m6, m10, m9
737 %endif
738%else
739    lea            r13, [blend_8x8_0+8]
740    movq           xm6, [top2q         -1]
741    vbroadcasti128  m5, [dstq+strideq*1-1]
742    vbroadcasti128  m9, [dstq+strideq*2-1]
743    movhps         xm6, [dstq+strideq*0-1]
744    movu           m11, [r13+hq*2*1+16*1]
745    punpcklqdq     m10, m5, m9
746    vinserti128     m5, [dstq+stride3q -1], 1
747    vinserti128     m9, [botq          -1], 1
748    vpblendd        m6, m10, 0xF0
749    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11
750    psrldq          m5, 2
751    psrldq          m9, 2
752    punpcklqdq      m5, m9
753%endif
754    ret
755.d5k0:
756.d6k0:
757.d7k0:
758%if %1 == 4
759 %if %2 == 4
760    movd           xm6, [topq+strideq*1  ]
761    vpbroadcastd    m5, [dstq+strideq*1  ]
762    vpbroadcastd    m9, [dstq+strideq*2  ]
763    vpblendd       xm6, [dstq+strideq*0-4], 0x2
764    vpblendd        m5, m9, 0x22
765    vpblendd        m6, m5, 0x30
766    vinserti128     m5, [dstq+stride3q   ], 1
767    vpblendd        m5, [botq         -20], 0x20
768 %else
769    movd           xm6, [topq +strideq*1]
770    movd           xm5, [dstq +strideq*1]
771    movd           xm9, [dstq +stride3q ]
772    movd          xm10, [dst4q+strideq*1]
773    movd          xm11, [dst4q+stride3q ]
774    pinsrd         xm6, [dstq +strideq*0], 1
775    pinsrd         xm5, [dstq +strideq*2], 1
776    pinsrd         xm9, [dst4q+strideq*0], 1
777    pinsrd        xm10, [dst4q+strideq*2], 1
778    pinsrd        xm11, [botq           ], 1
779    punpcklqdq     xm6, xm5
780    punpcklqdq     xm5, xm9
781    punpcklqdq     xm9, xm10
782    punpcklqdq    xm10, xm11
783    vinserti128     m6, xm9, 1
784    vinserti128     m5, xm10, 1
785 %endif
786%else
787    movq           xm6, [top2q         ]
788    movq           xm5, [dstq+strideq*1]
789    movq           xm9, [dstq+stride3q ]
790    movhps         xm6, [dstq+strideq*0]
791    movhps         xm5, [dstq+strideq*2]
792    movhps         xm9, [botq          ]
793    vinserti128     m6, xm5, 1
794    vinserti128     m5, xm9, 1
795%endif
796    ret
797.d0k1:
798%if %1 == 4
799 %if %2 == 4
800    movd           xm6, [dstq+strideq*2-2]
801    movd           xm9, [dstq+stride3q -2]
802    movd           xm5, [topq+strideq*0+2]
803    movd          xm10, [topq+strideq*1+2]
804    pinsrw         xm6, [leftq+4], 0
805    pinsrw         xm9, [leftq+6], 0
806    vinserti128     m5, [dstq+strideq*0+2], 1
807    vinserti128    m10, [dstq+strideq*1+2], 1
808    vinserti128     m6, [botq+strideq*0-2], 1
809    vinserti128     m9, [botq+strideq*1-2], 1
810    punpckldq       m5, m10
811    punpckldq       m6, m9
812 %else
813    movq           xm6, [dstq +strideq*2-2]
814    movd          xm10, [dst4q+strideq*2-2]
815    movd           xm5, [topq +strideq*0+2]
816    movq           xm9, [dst4q+strideq*0-2]
817    movhps         xm6, [dstq +stride3q -2]
818    pinsrw        xm10, [dst4q+stride3q   ], 3
819    pinsrd         xm5, [topq +strideq*1+2], 1
820    movhps         xm9, [dst4q+strideq*1-2]
821    pinsrd        xm10, [botq +strideq*0-2], 2
822    pinsrd         xm5, [dstq +strideq*0+2], 2
823    pinsrd        xm10, [botq +strideq*1-2], 3
824    pinsrd         xm5, [dstq +strideq*1+2], 3
825    shufps        xm11, xm6, xm9, q3131
826    shufps         xm6, xm9, q2020
827    movu            m9, [blend_4x8_3+8]
828    vinserti128     m6, xm10, 1
829    vinserti128     m5, xm11, 1
830    vpblendvb       m6, [rsp+gprsize+0x10+8], m9
831 %endif
832%else
833    lea            r13, [blend_8x8_1+16]
834    movq           xm6, [dstq+strideq*2-2]
835    movq           xm9, [dstq+stride3q -2]
836    movq           xm5, [top1q         +2]
837    movq          xm10, [top2q         +2]
838    movu           m11, [r13+hq*2*2+16*2]
839    vinserti128     m6, [botq+strideq*0-2], 1
840    vinserti128     m9, [botq+strideq*1-2], 1
841    vinserti128     m5, [dstq+strideq*0+2], 1
842    vinserti128    m10, [dstq+strideq*1+2], 1
843    punpcklqdq      m6, m9
844    punpcklqdq      m5, m10
845    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11
846%endif
847    ret
.d1k1:
; Direction 1, tap k=1: gather the two tap vectors into m5/m6 (m9-m12 as
; temporaries). Taps sit one row above/below at column offsets +2/-2;
; out-of-block lanes are spliced from topq/botq/leftq (4-wide variants) or
; from edge columns previously spilled to the stack (8-wide variant —
; spill set up outside this view, TODO confirm layout).
%if %1 == 4
 %if %2 == 4
    vpbroadcastq    m6, [dstq+strideq*1-2]
    vpbroadcastq    m9, [dstq+strideq*2-2]
    movd           xm5, [topq+strideq*1+2]
    movd          xm10, [dstq+strideq*0+2]
    psrldq         m11, m6, 4
    psrldq         m12, m9, 4
    vpblendd        m5, m11, 0x10
    movq          xm11, [leftq+2]
    vinserti128     m6, [dstq+stride3q-2], 1
    punpckldq     xm11, xm11
    vpblendd       m10, m12, 0x10
    pcmpeqd        m12, m12
    pmovzxwd       m11, xm11
    psrld          m12, 16                  ; per-dword mask 0x0000ffff
    punpckldq       m6, m9
    vpbroadcastd    m9, [botq-2]
    vpblendvb       m6, m11, m12            ; splice left-edge pixels into low words
    punpckldq       m5, m10
    vpblendd        m6, m9, 0x20            ; splice bottom row
 %else
    ; 4x8: assemble two 4-row halves in xmm, then merge into ymm lanes.
    movd           xm5, [topq +strideq*1+2]
    movq           xm6, [dstq +strideq*1-2]
    movq           xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q -2]
    pinsrd         xm5, [dstq +strideq*0+2], 1
    movhps         xm6, [dstq +strideq*2-2]
    movhps         xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           -2], 1
    shufps         xm5, xm6, q3110
    shufps         xm6, xm9, q2020
    shufps         xm9, xm10, q3131
    shufps        xm10, xm11, q1020
    movu           m11, [blend_4x8_2+4]     ; byte mask selecting left-edge lanes
    vinserti128     m6, xm10, 1
    vinserti128     m5, xm9, 1
    vpblendvb       m6, [rsp+gprsize+0x10+4], m11
 %endif
%else
    ; 8x8: whole rows per xmm lane; edge columns blended from stack spill.
    lea            r13, [blend_8x8_1+16]
    movq           xm5, [top2q         +2]
    vbroadcasti128  m6, [dstq+strideq*1-2]
    vbroadcasti128  m9, [dstq+strideq*2-2]
    movhps         xm5, [dstq+strideq*0+2]
    shufps         m10, m6, m9, q2121
    vinserti128     m6, [dstq+stride3q -2], 1
    vinserti128     m9, [botq          -2], 1
    movu           m11, [r13+hq*2*1+16*1]
    vpblendd        m5, m10, 0xF0
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11
%endif
    ret
.d2k1:
; Direction 2, tap k=1: both taps lie on the block's own rows, at column
; offsets -2 (m6) and +2 (m5 — derived from the -2 loads via psrldq/shufps
; rather than reloaded). Only the left edge needs splicing: pblendw pulls
; leftq pixels (or the stack spill for 8-wide) into the low word(s).
%if %1 == 4
 %if %2 == 4
    movq          xm11, [leftq]
    movq           xm6, [dstq+strideq*0-2]
    movq           xm9, [dstq+strideq*1-2]
    vinserti128     m6, [dstq+strideq*2-2], 1
    vinserti128     m9, [dstq+stride3q -2], 1
    punpckldq     xm11, xm11
    psrldq          m5, m6, 4                ; +2-column taps from the -2 loads
    psrldq         m10, m9, 4
    pmovzxwd       m11, xm11
    punpckldq       m6, m9
    punpckldq       m5, m10
    pblendw         m6, m11, 0x05            ; splice left-edge pixels
 %else
    movq           xm5, [dstq +strideq*0-2]
    movq           xm9, [dstq +strideq*2-2]
    movq          xm10, [dst4q+strideq*0-2]
    movq          xm11, [dst4q+strideq*2-2]
    movhps         xm5, [dstq +strideq*1-2]
    movhps         xm9, [dstq +stride3q -2]
    movhps        xm10, [dst4q+strideq*1-2]
    movhps        xm11, [dst4q+stride3q -2]
    shufps         xm6, xm5, xm9, q2020
    shufps         xm5, xm9, q3131
    shufps         xm9, xm10, xm11, q2020
    shufps        xm10, xm11, q3131
    pmovzxwd       m11, [leftq]
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    pblendw         m6, m11, 0x55            ; splice left-edge pixels
 %endif
%else
    mova           m11, [rsp+gprsize+0x20+hq*8+64]
    movu           xm5, [dstq+strideq*0-2]
    movu           xm9, [dstq+strideq*1-2]
    vinserti128     m5, [dstq+strideq*2-2], 1
    vinserti128     m9, [dstq+stride3q -2], 1
    shufps          m6, m5, m9, q1010
    shufps          m5, m9, q2121
    pblendw         m6, m11, 0x11            ; splice spilled left-edge column
%endif
    ret
.d3k1:
; Direction 3, tap k=1: mirror of d1k1 — taps one row above/below at
; column offsets -2 (m6, pulling from topq on top) and +2 (m5, pulling
; from botq at the bottom). Left-edge pixels come from leftq (4-wide) or
; the stack spill (8-wide).
%if %1 == 4
 %if %2 == 4
    vpbroadcastq   m11, [dstq+strideq*1-2]
    vpbroadcastq   m12, [dstq+strideq*2-2]
    movd           xm6, [topq+strideq*1-2]
    movd           xm9, [dstq+strideq*0-2]
    pblendw        m11, [leftq-16+2], 0x01   ; splice left-edge word
    pblendw        m12, [leftq-16+4], 0x01
    pinsrw         xm9, [leftq- 0+0], 0
    psrldq          m5, m11, 4
    psrldq         m10, m12, 4
    vinserti128     m5, [dstq+stride3q +2], 1
    vinserti128    m10, [botq          +2], 1
    vpblendd        m6, m11, 0x10
    vpblendd        m9, m12, 0x10
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    movd           xm6, [topq +strideq*1-2]
    movq           xm5, [dstq +strideq*1-2]
    movq           xm9, [dstq +stride3q -2]
    movq          xm10, [dst4q+strideq*1-2]
    movd          xm11, [dst4q+stride3q +2]
    pinsrw         xm6, [dstq +strideq*0  ], 3
    movhps         xm5, [dstq +strideq*2-2]
    movhps         xm9, [dst4q+strideq*0-2]
    movhps        xm10, [dst4q+strideq*2-2]
    pinsrd        xm11, [botq           +2], 1
    shufps         xm6, xm5, q2010
    shufps         xm5, xm9, q3131
    shufps         xm9, xm10, q2020
    shufps        xm10, xm11, q1031
    movu           m11, [blend_4x8_2]        ; byte mask selecting left-edge lanes
    vinserti128     m6, xm9, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-4], m11
 %endif
%else
    lea            r13, [blend_8x8_1+8]
    movq           xm6, [top2q         -2]
    vbroadcasti128  m5, [dstq+strideq*1-2]
    vbroadcasti128 m10, [dstq+strideq*2-2]
    movhps         xm6, [dstq+strideq*0-2]
    punpcklqdq      m9, m5, m10
    vinserti128     m5, [dstq+stride3q -2], 1
    vinserti128    m10, [botq          -2], 1
    movu           m11, [r13+hq*2*1+16*1]
    vpblendd        m6, m9, 0xF0
    shufps          m5, m10, q2121
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11
%endif
    ret
.d4k1:
; Direction 4, tap k=1: taps two rows above (column -2, from topq at the
; top edge) and two rows below (column +2, from botq at the bottom edge).
; Note the 4x4 variant blends directly from memory operands (topq rows)
; instead of separate loads.
%if %1 == 4
 %if %2 == 4
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    movd           xm5, [dstq+strideq*2+2]
    movd          xm10, [dstq+stride3q +2]
    pblendw         m6, [leftq-16+0], 0x01   ; splice left-edge word
    pblendw         m9, [leftq-16+2], 0x01
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128    m10, [botq+strideq*1+2], 1
    vpblendd        m6, [topq+strideq*0-2], 0x01
    vpblendd        m9, [topq+strideq*1-2], 0x01
    punpckldq       m5, m10
    punpckldq       m6, m9
 %else
    movd           xm6, [topq +strideq*0-2]
    movq           xm5, [dstq +strideq*2-2]
    movq           xm9, [dst4q+strideq*0-2]
    movd          xm10, [dst4q+strideq*2+2]
    pinsrd         xm6, [topq +strideq*1-2], 1
    movhps         xm5, [dstq +stride3q -2]
    movhps         xm9, [dst4q+strideq*1-2]
    pinsrd        xm10, [dst4q+stride3q +2], 1
    pinsrd         xm6, [dstq +strideq*0-2], 2
    pinsrd        xm10, [botq +strideq*0+2], 2
    pinsrd         xm6, [dstq +strideq*1-2], 3
    pinsrd        xm10, [botq +strideq*1+2], 3
    shufps        xm11, xm5, xm9, q2020
    shufps         xm5, xm9, q3131
    movu            m9, [blend_4x8_3]        ; byte mask selecting left-edge lanes
    vinserti128     m6, xm11, 1
    vinserti128     m5, xm10, 1
    vpblendvb       m6, [rsp+gprsize+0x10-8], m9
 %endif
%else
    lea            r13, [blend_8x8_1]
    movu           m11, [r13+hq*2*2+16*2]
    movq           xm6, [top1q         -2]
    movq           xm9, [top2q         -2]
    movq           xm5, [dstq+strideq*2+2]
    movq          xm10, [dstq+stride3q +2]
    vinserti128     m6, [dstq+strideq*0-2], 1
    vinserti128     m9, [dstq+strideq*1-2], 1
    vinserti128     m5, [botq+strideq*0+2], 1
    vinserti128    m10, [botq+strideq*1+2], 1
    punpcklqdq      m6, m9
    vpblendvb       m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11
    punpcklqdq      m5, m10
%endif
    ret
.d5k1:
; Direction 5, tap k=1: like d4k1 but with half the horizontal slope —
; column offsets -1/+1, rows two above/below (topq/botq at the edges).
; Left-edge bytes come from leftq via pmovzxbw and a vpblendvb mask.
%if %1 == 4
 %if %2 == 4
    movd           xm6, [topq+strideq*0-1]
    movd           xm9, [topq+strideq*1-1]
    movd           xm5, [dstq+strideq*2+1]
    movd          xm10, [dstq+stride3q +1]
    pcmpeqd        m12, m12
    pmovzxbw       m11, [leftq-8+1]
    psrld          m12, 24                   ; per-dword mask 0x000000ff
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128    m10, [botq+strideq*1+1], 1
    punpckldq       m6, m9
    pxor            m9, m9
    vpblendd       m12, m9, 0x0F             ; mask only in the upper lane
    punpckldq       m5, m10
    vpblendvb       m6, m11, m12             ; splice left-edge pixels
 %else
    movd           xm6, [topq +strideq*0-1]
    movq           xm5, [dstq +strideq*2-1]
    movq           xm9, [dst4q+strideq*0-1]
    movd          xm10, [dst4q+strideq*2+1]
    pinsrd         xm6, [topq +strideq*1-1], 1
    movhps         xm5, [dstq +stride3q -1]
    movhps         xm9, [dst4q+strideq*1-1]
    pinsrd        xm10, [dst4q+stride3q +1], 1
    pinsrd         xm6, [dstq +strideq*0-1], 2
    pinsrd        xm10, [botq +strideq*0+1], 2
    pinsrd         xm6, [dstq +strideq*1-1], 3
    pinsrd        xm10, [botq +strideq*1+1], 3
    shufps        xm11, xm5, xm9, q2020
    vinserti128     m6, xm11, 1
    pmovzxbw       m11, [leftq-3]
    psrldq         xm5, 2                    ; shift -1 loads to the +1 column
    psrldq         xm9, 2
    shufps         xm5, xm9, q2020
    movu            m9, [blend_4x8_1]        ; byte mask selecting left-edge lanes
    vinserti128     m5, xm10, 1
    vpblendvb       m6, m11, m9
 %endif
%else
    lea            r13, [blend_8x8_0]
    movu           m11, [r13+hq*2*2+16*2]
    movq           xm6, [top1q         -1]
    movq           xm9, [top2q         -1]
    movq           xm5, [dstq+strideq*2+1]
    movq          xm10, [dstq+stride3q +1]
    vinserti128     m6, [dstq+strideq*0-1], 1
    vinserti128     m9, [dstq+strideq*1-1], 1
    vinserti128     m5, [botq+strideq*0+1], 1
    vinserti128    m10, [botq+strideq*1+1], 1
    punpcklqdq      m6, m9
    punpcklqdq      m5, m10
    vpblendvb       m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11
%endif
    ret
.d6k1:
; Direction 6 (vertical), tap k=1: column offset 0, rows two above/below.
; Rows come straight from topq/dstq/botq — no horizontal shift means no
; left/right edge blending is required.
%if %1 == 4
 %if %2 == 4
    movd           xm6, [topq+strideq*0]
    movd           xm9, [topq+strideq*1]
    movd           xm5, [dstq+strideq*2]
    movd          xm10, [dstq+stride3q ]
    vinserti128     m6, [dstq+strideq*0], 1
    vinserti128     m9, [dstq+strideq*1], 1
    vinserti128     m5, [botq+strideq*0], 1
    vinserti128    m10, [botq+strideq*1], 1
    punpckldq       m6, m9
    punpckldq       m5, m10
 %else
    movd           xm5, [dstq +strideq*2]
    movd           xm6, [topq +strideq*0]
    movd           xm9, [dst4q+strideq*2]
    pinsrd         xm5, [dstq +stride3q ], 1
    pinsrd         xm6, [topq +strideq*1], 1
    pinsrd         xm9, [dst4q+stride3q ], 1
    pinsrd         xm5, [dst4q+strideq*0], 2
    pinsrd         xm6, [dstq +strideq*0], 2
    pinsrd         xm9, [botq +strideq*0], 2
    pinsrd         xm5, [dst4q+strideq*1], 3
    pinsrd         xm6, [dstq +strideq*1], 3
    pinsrd         xm9, [botq +strideq*1], 3
    vinserti128     m6, xm5, 1
    vinserti128     m5, xm9, 1
 %endif
%else
    movq           xm5, [dstq+strideq*2]
    movq           xm9, [botq+strideq*0]
    movq           xm6, [top1q         ]
    movq          xm10, [dstq+strideq*0]
    movhps         xm5, [dstq+stride3q ]
    movhps         xm9, [botq+strideq*1]
    movhps         xm6, [top2q         ]
    movhps        xm10, [dstq+strideq*1]
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
%endif
    ret
.d7k1:
; Direction 7, tap k=1: mirror of d5k1 — column offsets +1 going up
; (topq side) and -1 going down (botq side). Left-edge bytes are spliced
; into m5 (the downward tap) from leftq or the stack spill.
%if %1 == 4
 %if %2 == 4
    movd           xm5, [dstq+strideq*2-1]
    movd           xm9, [dstq+stride3q -1]
    movd           xm6, [topq+strideq*0+1]
    movd          xm10, [topq+strideq*1+1]
    pinsrb         xm5, [leftq+ 5], 0       ; left-edge byte into lane 0
    pinsrb         xm9, [leftq+ 7], 0
    vinserti128     m6, [dstq+strideq*0+1], 1
    vinserti128    m10, [dstq+strideq*1+1], 1
    vinserti128     m5, [botq+strideq*0-1], 1
    vinserti128     m9, [botq+strideq*1-1], 1
    punpckldq       m6, m10
    punpckldq       m5, m9
 %else
    movd           xm6, [topq +strideq*0+1]
    movq           xm9, [dstq +strideq*2-1]
    movq          xm10, [dst4q+strideq*0-1]
    movd          xm11, [dst4q+strideq*2-1]
    pinsrd         xm6, [topq +strideq*1+1], 1
    movhps         xm9, [dstq +stride3q -1]
    movhps        xm10, [dst4q+strideq*1-1]
    pinsrd        xm11, [dst4q+stride3q -1], 1
    pinsrd         xm6, [dstq +strideq*0+1], 2
    pinsrd        xm11, [botq +strideq*0-1], 2
    pinsrd         xm6, [dstq +strideq*1+1], 3
    pinsrd        xm11, [botq +strideq*1-1], 3
    shufps         xm5, xm9, xm10, q2020
    vinserti128     m5, xm11, 1
    pmovzxbw       m11, [leftq+5]
    psrldq         xm9, 2                    ; shift -1 loads to the +1 column
    psrldq        xm10, 2
    shufps         xm9, xm10, q2020
    movu           m10, [blend_4x8_1+8]      ; byte mask selecting left-edge lanes
    vinserti128     m6, xm9, 1
    vpblendvb       m5, m11, m10
 %endif
%else
    lea            r13, [blend_8x8_0+16]
    movq           xm5, [dstq+strideq*2-1]
    movq           xm9, [botq+strideq*0-1]
    movq           xm6, [top1q         +1]
    movq          xm10, [dstq+strideq*0+1]
    movhps         xm5, [dstq+stride3q -1]
    movhps         xm9, [botq+strideq*1-1]
    movhps         xm6, [top2q         +1]
    movhps        xm10, [dstq+strideq*1+1]
    movu           m11, [r13+hq*2*2+16*2]
    vinserti128     m5, xm9, 1
    vinserti128     m6, xm10, 1
    vpblendvb       m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11
%endif
    ret
1207
.border_block:
; Slow path taken when the block touches a frame border. Visible pixels
; are first widened to 16 bits and copied into a padded buffer at px
; (32-byte row pitch, 2 extra rows above and below, 2 extra columns each
; side); positions with no valid neighbor — per the edge flags in edgeb —
; are filled with the sentinel 0x8000 (m14). The filter then runs entirely
; out of that buffer via the BORDER_* helper macros (defined outside this
; view).
 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge
%define rstk rsp
%assign stack_offset stack_offset_entry
%assign regs_used 10
%if STACK_ALIGNMENT < 32
    PUSH  r%+regs_used
 %assign regs_used regs_used+1
%endif
    ALLOC_STACK 2*16+(%2+4)*32, 16          ; 2 qword shifts + (%2+4) padded rows
%define px rsp+2*16+2*32

    pcmpeqw        m14, m14
    psllw          m14, 15                  ; 0x8000

    ; prepare pixel buffers - body/right
%if %1 == 4
    INIT_XMM avx2
%endif
%if %2 == 8
    lea          dst4q, [dstq+strideq*4]
%endif
    lea       stride3q, [strideq*3]
    test         edgeb, 2                   ; have_right
    jz .no_right
    pmovzxbw        m1, [dstq+strideq*0]
    pmovzxbw        m2, [dstq+strideq*1]
    pmovzxbw        m3, [dstq+strideq*2]
    pmovzxbw        m4, [dstq+stride3q]
    mova     [px+0*32], m1
    mova     [px+1*32], m2
    mova     [px+2*32], m3
    mova     [px+3*32], m4
%if %2 == 8
    pmovzxbw        m1, [dst4q+strideq*0]
    pmovzxbw        m2, [dst4q+strideq*1]
    pmovzxbw        m3, [dst4q+strideq*2]
    pmovzxbw        m4, [dst4q+stride3q]
    mova     [px+4*32], m1
    mova     [px+5*32], m2
    mova     [px+6*32], m3
    mova     [px+7*32], m4
%endif
    jmp .body_done
.no_right:
    ; no right edge: store only %1 pixels per row, then sentinel the
    ; two columns just past the block.
%if %1 == 4
    movd           xm1, [dstq+strideq*0]
    movd           xm2, [dstq+strideq*1]
    movd           xm3, [dstq+strideq*2]
    movd           xm4, [dstq+stride3q]
    pmovzxbw       xm1, xm1
    pmovzxbw       xm2, xm2
    pmovzxbw       xm3, xm3
    pmovzxbw       xm4, xm4
    movq     [px+0*32], xm1
    movq     [px+1*32], xm2
    movq     [px+2*32], xm3
    movq     [px+3*32], xm4
%else
    pmovzxbw       xm1, [dstq+strideq*0]
    pmovzxbw       xm2, [dstq+strideq*1]
    pmovzxbw       xm3, [dstq+strideq*2]
    pmovzxbw       xm4, [dstq+stride3q]
    mova     [px+0*32], xm1
    mova     [px+1*32], xm2
    mova     [px+2*32], xm3
    mova     [px+3*32], xm4
%endif
    movd [px+0*32+%1*2], xm14
    movd [px+1*32+%1*2], xm14
    movd [px+2*32+%1*2], xm14
    movd [px+3*32+%1*2], xm14
%if %2 == 8
 %if %1 == 4
    movd           xm1, [dst4q+strideq*0]
    movd           xm2, [dst4q+strideq*1]
    movd           xm3, [dst4q+strideq*2]
    movd           xm4, [dst4q+stride3q]
    pmovzxbw       xm1, xm1
    pmovzxbw       xm2, xm2
    pmovzxbw       xm3, xm3
    pmovzxbw       xm4, xm4
    movq     [px+4*32], xm1
    movq     [px+5*32], xm2
    movq     [px+6*32], xm3
    movq     [px+7*32], xm4
 %else
    pmovzxbw       xm1, [dst4q+strideq*0]
    pmovzxbw       xm2, [dst4q+strideq*1]
    pmovzxbw       xm3, [dst4q+strideq*2]
    pmovzxbw       xm4, [dst4q+stride3q]
    mova     [px+4*32], xm1
    mova     [px+5*32], xm2
    mova     [px+6*32], xm3
    mova     [px+7*32], xm4
 %endif
    movd [px+4*32+%1*2], xm14
    movd [px+5*32+%1*2], xm14
    movd [px+6*32+%1*2], xm14
    movd [px+7*32+%1*2], xm14
%endif
.body_done:

    ; top: fill the two rows above the block from topq, with sentinels
    ; substituted on whichever of left/right is absent.
    test         edgeb, 4                    ; have_top
    jz .no_top
    test         edgeb, 1                    ; have_left
    jz .top_no_left
    test         edgeb, 2                    ; have_right
    jz .top_no_right
    pmovzxbw        m1, [topq+strideq*0-(%1/2)]
    pmovzxbw        m2, [topq+strideq*1-(%1/2)]
    movu  [px-2*32-%1], m1
    movu  [px-1*32-%1], m2
    jmp .top_done
.top_no_right:
    pmovzxbw        m1, [topq+strideq*0-%1]
    pmovzxbw        m2, [topq+strideq*1-%1]
    movu [px-2*32-%1*2], m1
    movu [px-1*32-%1*2], m2
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.top_no_left:
    test         edgeb, 2                   ; have_right
    jz .top_no_left_right
    pmovzxbw        m1, [topq+strideq*0]
    pmovzxbw        m2, [topq+strideq*1]
    mova   [px-2*32+0], m1
    mova   [px-1*32+0], m2
    movd   [px-2*32-4], xm14
    movd   [px-1*32-4], xm14
    jmp .top_done
.top_no_left_right:
%if %1 == 4
    movd           xm1, [topq+strideq*0]
    pinsrd         xm1, [topq+strideq*1], 1
    pmovzxbw       xm1, xm1
    movq   [px-2*32+0], xm1
    movhps [px-1*32+0], xm1
%else
    pmovzxbw       xm1, [topq+strideq*0]
    pmovzxbw       xm2, [topq+strideq*1]
    mova   [px-2*32+0], xm1
    mova   [px-1*32+0], xm2
%endif
    movd   [px-2*32-4], xm14
    movd   [px-1*32-4], xm14
    movd [px-2*32+%1*2], xm14
    movd [px-1*32+%1*2], xm14
    jmp .top_done
.no_top:
    movu   [px-2*32-%1], m14
    movu   [px-1*32-%1], m14
.top_done:

    ; left: two sentinel/real columns to the left of each body row.
    test         edgeb, 1                   ; have_left
    jz .no_left
    pmovzxbw       xm1, [leftq+ 0]
%if %2 == 8
    pmovzxbw       xm2, [leftq+ 8]
%endif
    movd   [px+0*32-4], xm1
    pextrd [px+1*32-4], xm1, 1
    pextrd [px+2*32-4], xm1, 2
    pextrd [px+3*32-4], xm1, 3
%if %2 == 8
    movd   [px+4*32-4], xm2
    pextrd [px+5*32-4], xm2, 1
    pextrd [px+6*32-4], xm2, 2
    pextrd [px+7*32-4], xm2, 3
%endif
    jmp .left_done
.no_left:
    movd   [px+0*32-4], xm14
    movd   [px+1*32-4], xm14
    movd   [px+2*32-4], xm14
    movd   [px+3*32-4], xm14
%if %2 == 8
    movd   [px+4*32-4], xm14
    movd   [px+5*32-4], xm14
    movd   [px+6*32-4], xm14
    movd   [px+7*32-4], xm14
%endif
.left_done:

    ; bottom: fill the two rows below the block from botq, mirroring
    ; the have_left/have_right handling used for the top rows.
 DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge
    test         edgeb, 8                   ; have_bottom
    jz .no_bottom
    test         edgeb, 1                   ; have_left
    jz .bottom_no_left
    test         edgeb, 2                   ; have_right
    jz .bottom_no_right
    pmovzxbw        m1, [botq+strideq*0-(%1/2)]
    pmovzxbw        m2, [botq+strideq*1-(%1/2)]
    movu   [px+(%2+0)*32-%1], m1
    movu   [px+(%2+1)*32-%1], m2
    jmp .bottom_done
.bottom_no_right:
    pmovzxbw        m1, [botq+strideq*0-%1]
    pmovzxbw        m2, [botq+strideq*1-%1]
    movu  [px+(%2+0)*32-%1*2], m1
    movu  [px+(%2+1)*32-%1*2], m2
%if %1 == 8
    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
%endif
    movd  [px+(%2+0)*32+%1*2], xm14
    movd  [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.bottom_no_left:
    test          edgeb, 2                  ; have_right
    jz .bottom_no_left_right
    pmovzxbw        m1, [botq+strideq*0]
    pmovzxbw        m2, [botq+strideq*1]
    mova   [px+(%2+0)*32+0], m1
    mova   [px+(%2+1)*32+0], m2
    movd   [px+(%2+0)*32-4], xm14
    movd   [px+(%2+1)*32-4], xm14
    jmp .bottom_done
.bottom_no_left_right:
%if %1 == 4
    movd           xm1, [botq+strideq*0]
    pinsrd         xm1, [botq+strideq*1], 1
    pmovzxbw       xm1, xm1
    movq   [px+(%2+0)*32+0], xm1
    movhps [px+(%2+1)*32+0], xm1
%else
    pmovzxbw       xm1, [botq+strideq*0]
    pmovzxbw       xm2, [botq+strideq*1]
    mova   [px+(%2+0)*32+0], xm1
    mova   [px+(%2+1)*32+0], xm2
%endif
    movd   [px+(%2+0)*32-4], xm14
    movd   [px+(%2+1)*32-4], xm14
    movd  [px+(%2+0)*32+%1*2], xm14
    movd  [px+(%2+1)*32+%1*2], xm14
    jmp .bottom_done
.no_bottom:
    movu   [px+(%2+0)*32-%1], m14
    movu   [px+(%2+1)*32-%1], m14
.bottom_done:

    ; actual filter
 INIT_YMM avx2
 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
    ; register to shuffle values into after packing
    vbroadcasti128 m12, [shufb_lohi]

    mov       dampingd, r8m
    xor          zerod, zerod
    movifnidn     prid, prim
    sub       dampingd, 31
    movifnidn  secdmpd, secdmpm
    test          prid, prid
    jz .border_sec_only                     ; pri_strength == 0
    movd           xm0, prid
    lzcnt      pridmpd, prid
    add        pridmpd, dampingd            ; damping - 31 + lzcnt(pri)
    cmovs      pridmpd, zerod               ; clamp shift to >= 0
    mov        [rsp+0], pridmpq                 ; pri_shift
    test       secdmpd, secdmpd
    jz .border_pri_only                     ; sec_strength == 0
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd
    mov        [rsp+8], secdmpq                 ; sec_shift

 DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask

    ; pri/sec_taps[k] [4 total]
 DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3
    vpbroadcastb    m0, xm0                     ; pri_strength
    vpbroadcastb    m1, xm1                     ; sec_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8]       ; pri_taps
    lea           secq, [tableq+12]             ; sec_taps

    BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
    ; accumulate primary (dir) and both secondary (dir±2) taps per k
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
    dec             kq
    jge .border_k_loop

    vpbroadcastd   m10, [pw_2048]
    BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_v_loop
%endif
    RET
1515
.border_pri_only:
; Border-path specialization for sec_strength == 0: only the primary
; taps (offset 0*2 = dir) are accumulated, skipping both secondary
; directions and the sec shift/mask setup.
 DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
 DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3
    vpbroadcastb    m0, xm0                     ; pri_strength
    and           prid, 1
    lea           priq, [tableq+priq*2+8]       ; pri_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m1, [pw_2048]               ; rounding constant
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
    vpbroadcastb    m2, [priq+kq]               ; pri_taps
    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
    dec             kq
    jge .border_pri_k_loop
    BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_pri_v_loop
%endif
    RET
1544
.border_sec_only:
; Border-path specialization for pri_strength == 0: only the two
; secondary tap directions (offsets 2*2 and 6*2) are accumulated.
 DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3
    movd           xm1, secdmpd
    lzcnt      secdmpd, secdmpd
    add        secdmpd, dampingd                ; damping - 31 + lzcnt(sec)
    mov        [rsp+8], secdmpq                 ; sec_shift
 DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3
    lea         tableq, [tap_table]
    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
 DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3
    vpbroadcastb    m1, xm1                     ; sec_strength
    lea           secq, [tableq+12]             ; sec_taps
    BORDER_PREP_REGS %1, %2
    vpbroadcastd    m0, [pw_2048]               ; rounding constant
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
    BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
    vpbroadcastb    m3, [secq+kq]               ; sec_taps
    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
    dec             kq
    jge .border_sec_k_loop
    BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
 %define vloop_lines (mmsize/(%1*2))
    lea           dstq, [dstq+strideq*vloop_lines]
    add           stkq, 32*vloop_lines
    dec             hd
    jg .border_sec_v_loop
%endif
    RET
1578%endmacro
1579
; Instantiate cdef_filter for each supported block size (width, height).
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
1583
INIT_YMM avx2
; cdef_dir_8bpc(src, stride, var): finds the dominant edge direction of an
; 8x8 block. Computes 8 directional cost sums (costs 0-7 via diagonal,
; horizontal/vertical and "alt" partial sums), picks the maximum with
; phminposuw, returns the winning direction index in eax and stores
; (best_cost - cost[dir^4]) >> 10 to *var.
; NOTE(review): rows are packed two-per-ymm (rows 0-3 in low lanes, rows
; 7-4 broadcast into high lanes) so both halves are processed at once.
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
    lea       stride3q, [strideq*3]
    movq           xm0, [srcq+strideq*0]
    movq           xm1, [srcq+strideq*1]
    movq           xm2, [srcq+strideq*2]
    movq           xm3, [srcq+stride3q ]
    lea           srcq, [srcq+strideq*4]
    vpbroadcastq    m4, [srcq+stride3q ]
    vpbroadcastq    m5, [srcq+strideq*2]
    vpblendd        m0, m4, 0xf0
    vpblendd        m1, m5, 0xf0
    vpbroadcastq    m4, [srcq+strideq*1]
    vpbroadcastq    m5, [srcq+strideq*0]
    vpblendd        m2, m4, 0xf0
    vpblendd        m3, m5, 0xf0
    pxor            m4, m4
    punpcklbw       m0, m4                  ; widen pixels to 16 bits
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
; secondary entry point: rows already loaded/widened in m0-m3
cglobal_label .main
    vpbroadcastd    m4, [pw_128]
    PROLOGUE 3, 4, 15
    psubw           m0, m4                  ; center samples around 0 (-128)
    psubw           m1, m4
    psubw           m2, m4
    psubw           m3, m4

    ; shuffle registers to generate partial_sum_diag[0-1] together
    vperm2i128      m7, m0, m0, 0x01
    vperm2i128      m6, m1, m1, 0x01
    vperm2i128      m5, m2, m2, 0x01
    vperm2i128      m4, m3, m3, 0x01

    ; start with partial_sum_hv[0-1]
    paddw           m8, m0, m1              ; column sums (vertical)
    paddw           m9, m2, m3
    phaddw         m10, m0, m1              ; row sums (horizontal)
    phaddw         m11, m2, m3
    paddw           m8, m9
    phaddw         m10, m11
    vextracti128   xm9, m8, 1
    vextracti128  xm11, m10, 1
    paddw          xm8, xm9                 ; partial_sum_hv[1]
    phaddw        xm10, xm11                ; partial_sum_hv[0]
    vinserti128     m8, xm10, 1
    vpbroadcastd    m9, [div_table+44]
    pmaddwd         m8, m8                  ; square the sums
    pmulld          m8, m9                  ; cost6[2a-d] | cost2[a-d]

    ; create aggregates [lower half]:
    ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
    ;      m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
    ; m10=             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
    ;      m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
    ; and [upper half]:
    ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
    ;      m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
    ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
    ;      m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
    ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m9, m1, 2
    psrldq         m10, m1, 14
    pslldq         m11, m2, 4
    psrldq         m12, m2, 12
    pslldq         m13, m3, 6
    psrldq         m14, m3, 10
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m4, 8
    psrldq         m12, m4, 8
    pslldq         m13, m5, 10
    psrldq         m14, m5, 6
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14
    pslldq         m11, m6, 12
    psrldq         m12, m6, 4
    pslldq         m13, m7, 14
    psrldq         m14, m7, 2
    paddw           m9, m11
    paddw          m10, m12
    paddw           m9, m13
    paddw          m10, m14                 ; partial_sum_diag[0/1][8-14,zero]
    vbroadcasti128 m14, [shufw_6543210x]
    vbroadcasti128 m13, [div_table+16]
    vbroadcasti128 m12, [div_table+0]
    paddw           m9, m0                  ; partial_sum_diag[0/1][0-7]
    pshufb         m10, m14
    punpckhwd      m11, m9, m10
    punpcklwd       m9, m10
    pmaddwd        m11, m11                 ; square and scale by per-diagonal
    pmaddwd         m9, m9                  ; divisors from div_table
    pmulld         m11, m13
    pmulld          m9, m12
    paddd           m9, m11                 ; cost0[a-d] | cost4[a-d]

    ; merge horizontally and vertically for partial_sum_alt[0-3]
    paddw          m10, m0, m1
    paddw          m11, m2, m3
    paddw          m12, m4, m5
    paddw          m13, m6, m7
    phaddw          m0, m4
    phaddw          m1, m5
    phaddw          m2, m6
    phaddw          m3, m7

    ; create aggregates [lower half]:
    ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
    ; m11=              m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
    ; and [upper half]:
    ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
    ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
    ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m4, m11, 2
    psrldq         m11, 14
    pslldq          m5, m12, 4
    psrldq         m12, 12
    pslldq          m6, m13, 6
    psrldq         m13, 10
    paddw           m4, m10
    paddw          m11, m12
    vpbroadcastd   m12, [div_table+44]
    paddw           m5, m6
    paddw          m11, m13                 ; partial_sum_alt[3/2] right
    vbroadcasti128 m13, [div_table+32]
    paddw           m4, m5                  ; partial_sum_alt[3/2] left
    pshuflw         m5, m11, q3012
    punpckhwd       m6, m11, m4
    punpcklwd       m4, m5
    pmaddwd         m6, m6
    pmaddwd         m4, m4
    pmulld          m6, m12
    pmulld          m4, m13
    paddd           m4, m6                  ; cost7[a-d] | cost5[a-d]

    ; create aggregates [lower half]:
    ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
    ; m1 =             m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
    ; and [upper half]:
    ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
    ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
    ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd

    pslldq          m5, m1, 2
    psrldq          m1, 14
    pslldq          m6, m2, 4
    psrldq          m2, 12
    pslldq          m7, m3, 6
    psrldq          m3, 10
    paddw           m5, m0
    paddw           m1, m2
    paddw           m6, m7
    paddw           m1, m3                  ; partial_sum_alt[0/1] right
    paddw           m5, m6                  ; partial_sum_alt[0/1] left
    pshuflw         m0, m1, q3012
    punpckhwd       m1, m5
    punpcklwd       m5, m0
    pmaddwd         m1, m1
    pmaddwd         m5, m5
    pmulld          m1, m12
    pmulld          m5, m13
    paddd           m5, m1                  ; cost1[a-d] | cost3[a-d]

    ; reduce each cost's 4 partial dwords and reorder into direction order
    mova           xm0, [pd_47130256+ 16]
    mova            m1, [pd_47130256]
    phaddd          m9, m8
    phaddd          m5, m4
    phaddd          m9, m5
    vpermd          m0, m9                  ; cost[0-3]
    vpermd          m1, m9                  ; cost[4-7] | cost[0-3]

    ; now find the best cost
    pmaxsd         xm2, xm0, xm1
    pshufd         xm3, xm2, q1032
    pmaxsd         xm2, xm3
    pshufd         xm3, xm2, q2301
    pmaxsd         xm2, xm3 ; best cost

    ; find the idx using minpos
    ; make everything other than the best cost negative via subtraction
    ; find the min of unsigned 16-bit ints to sort out the negative values
    psubd          xm4, xm1, xm2
    psubd          xm3, xm0, xm2
    packssdw       xm3, xm4
    phminposuw     xm3, xm3

    ; convert idx to 32-bits
    psrld          xm3, 16
    movd           eax, xm3                 ; return best direction index

    ; get idx^4 complement
    vpermd          m3, m1
    psubd          xm2, xm3                 ; best_cost - cost[dir^4]
    psrld          xm2, 10
    movd        [varq], xm2                 ; *var = variance measure
    RET
1787
1788%endif ; ARCH_X86_64
1789