; Copyright © 2018-2020, VideoLAN and dav1d authors
; Copyright © 2018-2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

; dav1d_obmc_masks[] with 64-x interleaved
obmc_masks:     db  0,  0,  0,  0
                ; 2
                db 45, 19, 64,  0
                ; 4
                db 39, 25, 50, 14, 59,  5, 64,  0
                ; 8
                db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
                ; 16
                db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
                db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
                ; 32
                db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
                db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
                db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
                db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
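; Each pair is (64-x, x), matching a punpcklbw-interleaved (dst, overlap)
; pixel pair so pmaddubsw can blend both in one multiply-add. Scalar C
; sketch of the blend these masks are built for (illustrative only,
; assuming the usual OBMC rounding; not the dav1d C API):
;   for (int i = 0; i < w; i++)
;       dst[i] = (dst[i] * (64 - m[i]) + tmp[i] * m[i] + 32) >> 6;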

warp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
warp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
bilin_h_shuf4:  db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
bilin_h_shuf8:  db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
pb_8x0_8x8:     db  0,  0,  0,  0,  0,  0,  0,  0,  8,  8,  8,  8,  8,  8,  8,  8
bdct_lb_dw:     db  0,  0,  0,  0,  4,  4,  4,  4,  8,  8,  8,  8, 12, 12, 12, 12
wswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
rescale_mul:    dd  0,  1,  2,  3,  4,  5,  6,  7
resize_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7
                db  7,  7,  7,  7,  7,  7,  7,  7

wm_420_sign:    dd 0x01020102, 0x01010101
wm_422_sign:    dd 0x80808080, 0x7f7f7f7f

pb_64:   times 4 db 64
pw_m256: times 2 dw -256
pw_15:   times 2 dw 15
pw_32:   times 2 dw 32
pw_34:   times 2 dw 34
pw_258:  times 2 dw 258
pw_512:  times 2 dw 512
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32:           dd 32
pd_63:           dd 63
pd_512:          dd 512
pd_32768:        dd 32768
pd_0x3ff:        dd 0x3ff
pd_0x4000:       dd 0x4000
pq_0x40000000:   dq 0x40000000

cextern mc_subpel_filters
cextern mc_warp_filter
cextern resize_filter

%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
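; The -%3 bias makes tzcnt-based indexing work without a subtract: for the
; smallest width %3 (2 or 4), tzcnt(%3)*2 equals %3, so entry 0 lands
; exactly at %%table and each doubling of w advances one dw entry.
; Dispatch (see .put/.prep below) is then simply:
;   movzx wd, word [base+wq*2+offset] ; wq = tzcnt(w)
;   add   wq, base
;   jmp   wq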

%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
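; %4 is a bitmask selecting which sub-tables to emit: bit 0 = .h, bit 1 = .v,
; bit 2 = .hv (7 = all three, 3 = h and v, 1 = h only). Entries are 16-bit
; offsets relative to the shared %1_%3 base label, as in BASE_JMP_TABLE.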

%macro BIDIR_JMP_TABLE 1-*
    %xdefine %1_table (%%table - 2*%2)
    %xdefine %%base %1_table
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

%macro SCALED_JMP_TABLE 1-*
    %xdefine %1_table (%%table - %2)
    %xdefine %%base mangle(private_prefix %+ _%1)
%%table:
    %rep %0 - 1
        dw %%base %+ .w%2 - %%base
        %rotate 1
    %endrep
    %rotate 1
%%dy_1024:
    %xdefine %1_dy1_table (%%dy_1024 - %2)
    %rep %0 - 1
        dw %%base %+ .dy1_w%2 - %%base
        %rotate 1
    %endrep
    %rotate 1
%%dy_2048:
    %xdefine %1_dy2_table (%%dy_2048 - %2)
    %rep %0 - 1
        dw %%base %+ .dy2_w%2 - %%base
        %rotate 1
    %endrep
%endmacro
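; Besides the generic table, entry points specialized for vertical steps of
; 1024 and 2048 are tabulated; in the scaled-mc fixed-point scheme these
; appear to correspond to 1:1 and 2:1 vertical scaling, which get dedicated
; .dy1_/.dy2_ loops.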

%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
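; Example expansion (SUFFIX is _avx2 in this file):
;   table_offset(put, _bilin_h) -> put_bilin_h_avx2_table - put_avx2
; i.e. the distance from the shared dispatch base label to a per-width
; table, so one register (r7/r6/r8 below) serves both the load and the jump.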

BASE_JMP_TABLE   put,  avx2,           2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE   prep, avx2,              4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE     put,  8tap,  avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE     prep, 8tap,  avx2, 1,    4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled_avx2,   4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  avg_avx2,                4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_avg_avx2,              4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  mask_avx2,               4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_420_avx2,         4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_422_avx2,         4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  w_mask_444_avx2,         4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE  blend_avx2,              4, 8, 16, 32
BIDIR_JMP_TABLE  blend_v_avx2,         2, 4, 8, 16, 32
BIDIR_JMP_TABLE  blend_h_avx2,         2, 4, 8, 16, 32, 32, 32

SECTION .text

INIT_XMM avx2
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
    movifnidn          mxyd, r6m ; mx
    lea                  r7, [put_avx2]
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    movzx                wd, word [r7+wq*2+table_offset(put,)]
    add                  wq, r7
    jmp                  wq
.put_w2:
    movzx               r6d, word [srcq+ssq*0]
    movzx               r7d, word [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6w
    mov        [dstq+dsq*1], r7w
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    mov                 r6d, [srcq+ssq*0]
    mov                 r7d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6d
    mov        [dstq+dsq*1], r7d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    mov                  r6, [srcq+ssq*0]
    mov                  r7, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r6
    mov        [dstq+dsq*1], r7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
INIT_YMM avx2
.put_w32:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+ssq*0+32*0]
    movu                 m1, [srcq+ssq*0+32*1]
    movu                 m2, [srcq+ssq*1+32*0]
    movu                 m3, [srcq+ssq*1+32*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+32*0], m0
    mova  [dstq+dsq*0+32*1], m1
    mova  [dstq+dsq*1+32*0], m2
    mova  [dstq+dsq*1+32*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w64
    RET
.put_w128:
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
    movu                 m2, [srcq+32*2]
    movu                 m3, [srcq+32*3]
    add                srcq, ssq
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    mova        [dstq+32*2], m2
    mova        [dstq+32*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128
    RET
.h:
    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
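    ; Scalar C sketch of this pass (illustrative only):
    ;   for (int x = 0; x < w; x++)
    ;       dst[x] = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
    ; mx*255 + 16 == (mx << 8) | (16 - mx): the two 4-bit weights packed
    ; as a byte pair for pmaddubsw.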
    imul               mxyd, 255
    vbroadcasti128       m4, [bilin_h_shuf8]
    add                mxyd, 16
    movd                xm5, mxyd
    mov                mxyd, r7m ; my
    vpbroadcastw         m5, xm5
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_h)]
    vpbroadcastd         m3, [pw_2048]
    add                  wq, r7
    jmp                  wq
.h_w2:
    movd                xm0, [srcq+ssq*0]
    pinsrd              xm0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
    pmulhrsw            xm0, xm3
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    mova                xm4, [bilin_h_shuf4]
.h_w4_loop:
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
    pmulhrsw            xm0, xm3
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    movu                xm0, [srcq+ssq*0]
    movu                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pshufb              xm1, xm4
    pmaddubsw           xm0, xm5
    pmaddubsw           xm1, xm5
    pmulhrsw            xm0, xm3
    pmulhrsw            xm1, xm3
    packuswb            xm0, xm1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                xm0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*1+8*0], 1
    movu                xm1, [srcq+ssq*0+8*1]
    vinserti128          m1, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    add                srcq, ssq
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w32
    RET
.h_w64:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    movu                 m1, [srcq+8*4]
    movu                 m2, [srcq+8*5]
    add                srcq, ssq
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmulhrsw             m1, m3
    pmulhrsw             m2, m3
    packuswb             m1, m2
    mova        [dstq+32*0], m0
    mova        [dstq+32*1], m1
    add                dstq, dsq
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    mov                  r6, -32*3
.h_w128_loop:
    movu                 m0, [srcq+r6+32*3+8*0]
    movu                 m1, [srcq+r6+32*3+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmulhrsw             m0, m3
    pmulhrsw             m1, m3
    packuswb             m0, m1
    mova     [dstq+r6+32*3], m0
    add                  r6, 32
    jle .h_w128_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w128
    RET
.v:
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
    imul               mxyd, 255
    vpbroadcastd         m5, [pw_2048]
    add                mxyd, 16
    add                  wq, r7
    movd                xm4, mxyd
    vpbroadcastw         m4, xm4
    jmp                  wq
.v_w2:
    movd                xm0,      [srcq+ssq*0]
.v_w2_loop:
    pinsrw              xm1, xm0, [srcq+ssq*1], 1 ; 0 1
    lea                srcq,      [srcq+ssq*2]
    pinsrw              xm0, xm1, [srcq+ssq*0], 0 ; 2 1
    pshuflw             xm1, xm1, q2301           ; 1 0
    punpcklbw           xm1, xm0
    pmaddubsw           xm1, xm4
    pmulhrsw            xm1, xm5
    packuswb            xm1, xm1
    pextrw     [dstq+dsq*0], xm1, 1
    pextrw     [dstq+dsq*1], xm1, 0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                xm0, [srcq+ssq*0]
.v_w4_loop:
    vpbroadcastd        xm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm1, xm2, xm0, 0x01 ; 0 1
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm2, xm0, 0x02      ; 1 2
    punpcklbw           xm1, xm2
    pmaddubsw           xm1, xm4
    pmulhrsw            xm1, xm5
    packuswb            xm1, xm1
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm0, [srcq+ssq*0]
.v_w8_loop:
    movq                xm2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw           xm1, xm0, xm2
    movq                xm0, [srcq+ssq*0]
    punpcklbw           xm2, xm0
    pmaddubsw           xm1, xm4
    pmaddubsw           xm2, xm4
    pmulhrsw            xm1, xm5
    pmulhrsw            xm2, xm5
    packuswb            xm1, xm2
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    movu                xm0, [srcq+ssq*0]
.v_w16_loop:
    vbroadcasti128       m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd             m2, m3, m0, 0x0f ; 0 1
    vbroadcasti128       m0, [srcq+ssq*0]
    vpblendd             m3, m0, 0xf0     ; 1 2
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32:
%macro PUT_BILIN_V_W32 0
    movu                 m0, [srcq+ssq*0]
%%loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklbw            m1, m0, m3
    punpckhbw            m2, m0, m3
    movu                 m0, [srcq+ssq*0]
    pmaddubsw            m1, m4
    pmaddubsw            m2, m4
    pmulhrsw             m1, m5
    pmulhrsw             m2, m5
    packuswb             m1, m2
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m2, m4
    pmaddubsw            m3, m4
    pmulhrsw             m2, m5
    pmulhrsw             m3, m5
    packuswb             m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg %%loop
%endmacro
    PUT_BILIN_V_W32
    RET
.v_w64:
    movu                 m0, [srcq+32*0]
    movu                 m1, [srcq+32*1]
.v_w64_loop:
    add                srcq, ssq
    movu                 m3, [srcq+32*0]
    punpcklbw            m2, m0, m3
    punpckhbw            m0, m3
    pmaddubsw            m2, m4
    pmaddubsw            m0, m4
    pmulhrsw             m2, m5
    pmulhrsw             m0, m5
    packuswb             m2, m0
    mova                 m0, m3
    movu                 m3, [srcq+32*1]
    mova        [dstq+32*0], m2
    punpcklbw            m2, m1, m3
    punpckhbw            m1, m3
    pmaddubsw            m2, m4
    pmaddubsw            m1, m4
    pmulhrsw             m2, m5
    pmulhrsw             m1, m5
    packuswb             m2, m1
    mova                 m1, m3
    mova        [dstq+32*1], m2
    add                dstq, dsq
    dec                  hd
    jg .v_w64_loop
    RET
.v_w128:
    lea                 r6d, [hq+(3<<8)]
    mov                  r4, srcq
    mov                  r7, dstq
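    ; r6d packs two counters: h in the low byte and the number of remaining
    ; 32-byte column strips in the high byte; movzx hd, r6b reloads h and
    ; sub r6d, 1<<8 steps the strip counter.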
.v_w128_loop:
    PUT_BILIN_V_W32
    add                  r4, 32
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
    jg .v_w128_loop
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
    movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
    WIN64_SPILL_XMM       8
    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
    vpbroadcastd         m7, [pw_15]
    movd                xm6, mxyd
    add                  wq, r7
    paddb                m5, m5
    vpbroadcastw         m6, xm6
    jmp                  wq
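    ; my << 12 would let pmulhw return (my * diff) >> 4 directly, but it
    ; overflows int16 for my >= 8. Shifting by 11 and doubling the
    ; horizontal weights instead (paddb m5, m5) keeps an extra bit in the
    ; intermediates:
    ;   pmulhw(2*diff, my << 11) == (my * diff) >> 4
    ;   pavgw (2*v0,   15)       == v0 + 8
    ; so the final psrlw 4 matches the formula above exactly.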
.hv_w2:
    vpbroadcastd        xm0, [srcq+ssq*0]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
.hv_w2_loop:
    movd                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pinsrd              xm1, [srcq+ssq*0], 1
    pshufb              xm1, xm4
    pmaddubsw           xm1, xm5             ; 1 _ 2 _
    shufps              xm2, xm0, xm1, q1032 ; 0 _ 1 _
    mova                xm0, xm1
    psubw               xm1, xm2
    pmulhw              xm1, xm6
    pavgw               xm2, xm7
    paddw               xm1, xm2
    psrlw               xm1, 4
    packuswb            xm1, xm1
    pextrw     [dstq+dsq*0], xm1, 0
    pextrw     [dstq+dsq*1], xm1, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                xm4, [bilin_h_shuf4]
    movddup             xm0, [srcq+ssq*0]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm5
.hv_w4_loop:
    movq                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps              xm1, [srcq+ssq*0]
    pshufb              xm1, xm4
    pmaddubsw           xm1, xm5             ; 1 2
    shufps              xm2, xm0, xm1, q1032 ; 0 1
    mova                xm0, xm1
    psubw               xm1, xm2
    pmulhw              xm1, xm6
    pavgw               xm2, xm7
    paddw               xm1, xm2
    psrlw               xm1, 4
    packuswb            xm1, xm1
    movd       [dstq+dsq*0], xm1
    pextrd     [dstq+dsq*1], xm1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128       m0, [srcq+ssq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128          m1, [srcq+ssq*0], 1
    pshufb               m1, m4
    pmaddubsw            m1, m5           ; 1 2
    vperm2i128           m2, m0, m1, 0x21 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhw               m1, m6
    pavgw                m2, m7
    paddw                m1, m2
    psrlw                m1, 4
    vextracti128        xm2, m1, 1
    packuswb            xm1, xm2
    movq       [dstq+dsq*0], xm1
    movhps     [dstq+dsq*1], xm1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    movu                 m0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*0+8*1], 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w16_loop:
    movu                xm2, [srcq+ssq*1+8*0]
    vinserti128          m2, [srcq+ssq*1+8*1], 1
    lea                srcq, [srcq+ssq*2]
    movu                xm3, [srcq+ssq*0+8*0]
    vinserti128          m3, [srcq+ssq*0+8*1], 1
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    psubw                m1, m2, m0
    pmulhw               m1, m6
    pavgw                m0, m7
    paddw                m1, m0
    pmaddubsw            m0, m3, m5
    psubw                m3, m0, m2
    pmulhw               m3, m6
    pavgw                m2, m7
    paddw                m3, m2
    psrlw                m1, 4
    psrlw                m3, 4
    packuswb             m1, m3
    vpermq               m1, m1, q3120
    mova         [dstq+dsq*0], xm1
    vextracti128 [dstq+dsq*1], m1, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w128:
    lea                 r6d, [hq+(3<<16)]
    jmp .hv_w32_start
.hv_w64:
    lea                 r6d, [hq+(1<<16)]
.hv_w32_start:
    mov                  r4, srcq
    mov                  r7, dstq
.hv_w32:
%if WIN64
    movaps              r4m, xmm8
%endif
.hv_w32_loop0:
    movu                 m0, [srcq+8*0]
    movu                 m1, [srcq+8*1]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w32_loop:
    add                srcq, ssq
    movu                 m2, [srcq+8*0]
    movu                 m3, [srcq+8*1]
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    psubw                m8, m2, m0
    pmulhw               m8, m6
    pavgw                m0, m7
    paddw                m8, m0
    mova                 m0, m2
    psubw                m2, m3, m1
    pmulhw               m2, m6
    pavgw                m1, m7
    paddw                m2, m1
    mova                 m1, m3
    psrlw                m8, 4
    psrlw                m2, 4
    packuswb             m8, m2
    mova             [dstq], m8
    add                dstq, dsq
    dec                  hd
    jg .hv_w32_loop
    add                  r4, 32
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<16
    jg .hv_w32_loop0
%if WIN64
    movaps             xmm8, r4m
%endif
    RET

cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    lea                  r6, [prep%+SUFFIX]
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    movd                xm0, [srcq+strideq*0]
    pinsrd              xm0, [srcq+strideq*1], 1
    pinsrd              xm0, [srcq+strideq*2], 2
    pinsrd              xm0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pmovzxbw             m0, xm0
    psllw                m0, 4
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    movq                xm1, [srcq+strideq*2]
    movhps              xm1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pmovzxbw             m0, xm0
    pmovzxbw             m1, xm1
    psllw                m0, 4
    psllw                m1, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    add                tmpq, 32*2
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    pmovzxbw             m0, [srcq+strideq*0]
    pmovzxbw             m1, [srcq+strideq*1]
    pmovzxbw             m2, [srcq+strideq*2]
    pmovzxbw             m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 4
    jg .prep_w16
    RET
.prep_w32:
    pmovzxbw             m0, [srcq+strideq*0+16*0]
    pmovzxbw             m1, [srcq+strideq*0+16*1]
    pmovzxbw             m2, [srcq+strideq*1+16*0]
    pmovzxbw             m3, [srcq+strideq*1+16*1]
    lea                srcq, [srcq+strideq*2]
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 2
    jg .prep_w32
    RET
.prep_w64:
    pmovzxbw             m0, [srcq+16*0]
    pmovzxbw             m1, [srcq+16*1]
    pmovzxbw             m2, [srcq+16*2]
    pmovzxbw             m3, [srcq+16*3]
    add                srcq, strideq
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    dec                  hd
    jg .prep_w64
    RET
.prep_w128:
    pmovzxbw             m0, [srcq+16*0]
    pmovzxbw             m1, [srcq+16*1]
    pmovzxbw             m2, [srcq+16*2]
    pmovzxbw             m3, [srcq+16*3]
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    pmovzxbw             m0, [srcq+16*4]
    pmovzxbw             m1, [srcq+16*5]
    pmovzxbw             m2, [srcq+16*6]
    pmovzxbw             m3, [srcq+16*7]
    add                tmpq, 32*8
    add                srcq, strideq
    psllw                m0, 4
    psllw                m1, 4
    psllw                m2, 4
    psllw                m3, 4
    mova        [tmpq-32*4], m0
    mova        [tmpq-32*3], m1
    mova        [tmpq-32*2], m2
    mova        [tmpq-32*1], m3
    dec                  hd
    jg .prep_w128
    RET
.h:
    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
    ; = (16 - mx) * src[x] + mx * src[x + 1]
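    ; Same packed weights as the put path, but prep keeps the full
    ; 4-bit-scaled intermediate instead of rounding down to pixels.
    ; Scalar C sketch (illustrative only):
    ;   for (int x = 0; x < w; x++)
    ;       tmp[x] = (int16_t)((16 - mx) * src[x] + mx * src[x + 1]);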
    imul               mxyd, 255
    vbroadcasti128       m4, [bilin_h_shuf8]
    add                mxyd, 16
    movd                xm5, mxyd
    mov                mxyd, r6m ; my
    vpbroadcastw         m5, xm5
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.h_w4:
    vbroadcasti128       m4, [bilin_h_shuf4]
.h_w4_loop:
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    movq                xm1, [srcq+strideq*2]
    movhps              xm1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vinserti128          m0, xm1, 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
.h_w8_loop:
    movu                xm0, [srcq+strideq*0]
    vinserti128          m0, [srcq+strideq*1], 1
    movu                xm1, [srcq+strideq*2]
    vinserti128          m1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    add                tmpq, 32*2
    sub                  hd, 4
    jg .h_w8_loop
    RET
.h_w16:
.h_w16_loop:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    movu                xm1, [srcq+strideq*1+8*0]
    vinserti128          m1, [srcq+strideq*1+8*1], 1
    movu                xm2, [srcq+strideq*2+8*0]
    vinserti128          m2, [srcq+strideq*2+8*1], 1
    movu                xm3, [srcq+stride3q +8*0]
    vinserti128          m3, [srcq+stride3q +8*1], 1
    lea                srcq, [srcq+strideq*4]
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 4
    jg .h_w16_loop
    RET
.h_w32:
.h_w32_loop:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    movu                xm1, [srcq+strideq*0+8*2]
    vinserti128          m1, [srcq+strideq*0+8*3], 1
    movu                xm2, [srcq+strideq*1+8*0]
    vinserti128          m2, [srcq+strideq*1+8*1], 1
    movu                xm3, [srcq+strideq*1+8*2]
    vinserti128          m3, [srcq+strideq*1+8*3], 1
    lea                srcq, [srcq+strideq*2]
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    sub                  hd, 2
    jg .h_w32_loop
    RET
.h_w64:
    movu                xm0, [srcq+8*0]
    vinserti128          m0, [srcq+8*1], 1
    movu                xm1, [srcq+8*2]
    vinserti128          m1, [srcq+8*3], 1
    movu                xm2, [srcq+8*4]
    vinserti128          m2, [srcq+8*5], 1
    movu                xm3, [srcq+8*6]
    vinserti128          m3, [srcq+8*7], 1
    add                srcq, strideq
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    add                tmpq, 32*4
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    movu                xm0, [srcq+8*0]
    vinserti128          m0, [srcq+8*1], 1
    movu                xm1, [srcq+8*2]
    vinserti128          m1, [srcq+8*3], 1
    movu                xm2, [srcq+8*4]
    vinserti128          m2, [srcq+8*5], 1
    movu                xm3, [srcq+8*6]
    vinserti128          m3, [srcq+8*7], 1
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq+32*0], m0
    mova        [tmpq+32*1], m1
    mova        [tmpq+32*2], m2
    mova        [tmpq+32*3], m3
    movu                xm0, [srcq+8* 8]
    vinserti128          m0, [srcq+8* 9], 1
    movu                xm1, [srcq+8*10]
    vinserti128          m1, [srcq+8*11], 1
    movu                xm2, [srcq+8*12]
    vinserti128          m2, [srcq+8*13], 1
    movu                xm3, [srcq+8*14]
    vinserti128          m3, [srcq+8*15], 1
    add                tmpq, 32*8
    add                srcq, strideq
    pshufb               m0, m4
    pshufb               m1, m4
    pshufb               m2, m4
    pshufb               m3, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
    pmaddubsw            m2, m5
    pmaddubsw            m3, m5
    mova        [tmpq-32*4], m0
    mova        [tmpq-32*3], m1
    mova        [tmpq-32*2], m2
    mova        [tmpq-32*1], m3
    dec                  hd
    jg .h_w128
    RET
.v:
    WIN64_SPILL_XMM       7
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    imul               mxyd, 255
    add                mxyd, 16
    add                  wq, r6
    lea            stride3q, [strideq*3]
    movd                xm6, mxyd
    vpbroadcastw         m6, xm6
    jmp                  wq
.v_w4:
    movd                xm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastd         m1, [srcq+strideq*2]
    vpbroadcastd        xm2, [srcq+strideq*1]
    vpbroadcastd         m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd             m1, m0, 0x05     ; 0 2 2 2
    vpbroadcastd         m0, [srcq+strideq*0]
    vpblendd             m3, m2, 0x0f     ; 1 1 3 3
    vpblendd             m2, m1, m0, 0xa0 ; 0 2 2 4
    vpblendd             m1, m3, 0xaa     ; 0 1 2 3
    vpblendd             m2, m3, 0x55     ; 1 2 3 4
    punpcklbw            m1, m2
    pmaddubsw            m1, m6
    mova             [tmpq], m1
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm0, [srcq+strideq*0]
.v_w8_loop:
    vpbroadcastq         m1, [srcq+strideq*2]
    vpbroadcastq         m2, [srcq+strideq*1]
    vpbroadcastq         m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd             m1, m0, 0x03     ; 0 2 2 2
    vpbroadcastq         m0, [srcq+strideq*0]
    vpblendd             m2, m3, 0xcc     ; 1 3 1 3
    vpblendd             m3, m2, m1, 0xf0 ; 1 3 2 2
    vpblendd             m2, m1, 0x0f     ; 0 2 1 3
    vpblendd             m3, m0, 0xc0     ; 1 3 2 4
    punpcklbw            m1, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m1, m6
    pmaddubsw            m2, m6
    mova        [tmpq+32*0], m1
    mova        [tmpq+32*1], m2
    add                tmpq, 32*2
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti128       m0, [srcq+strideq*0]
.v_w16_loop:
    vbroadcasti128       m1, [srcq+strideq*1]
    vbroadcasti128       m2, [srcq+strideq*2]
    vbroadcasti128       m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    shufpd               m4, m0, m2, 0x0c ; 0 2
    vbroadcasti128       m0, [srcq+strideq*0]
    shufpd               m1, m3, 0x0c     ; 1 3
    shufpd               m2, m0, 0x0c     ; 2 4
    punpcklbw            m3, m4, m1
    punpcklbw            m5, m1, m2
    punpckhbw            m4, m1
    punpckhbw            m1, m2
    pmaddubsw            m3, m6
    pmaddubsw            m5, m6
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    mova        [tmpq+32*0], m3
    mova        [tmpq+32*1], m5
    mova        [tmpq+32*2], m4
    mova        [tmpq+32*3], m1
    add                tmpq, 32*4
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    vpermq               m0, [srcq+strideq*0], q3120
.v_w32_loop:
    vpermq               m1, [srcq+strideq*1], q3120
    vpermq               m2, [srcq+strideq*2], q3120
    vpermq               m3, [srcq+stride3q ], q3120
    lea                srcq, [srcq+strideq*4]
    punpcklbw            m4, m0, m1
    punpckhbw            m5, m0, m1
    vpermq               m0, [srcq+strideq*0], q3120
    pmaddubsw            m4, m6
    pmaddubsw            m5, m6
    mova        [tmpq+32*0], m4
    mova        [tmpq+32*1], m5
    punpcklbw            m4, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    punpcklbw            m5, m2, m3
    punpckhbw            m2, m3
    pmaddubsw            m5, m6
    pmaddubsw            m2, m6
    mova        [tmpq+32*2], m4
    mova        [tmpq+32*3], m1
    add                tmpq, 32*8
    punpcklbw            m1, m3, m0
    punpckhbw            m3, m0
    pmaddubsw            m1, m6
    pmaddubsw            m3, m6
    mova        [tmpq-32*4], m5
    mova        [tmpq-32*3], m2
    mova        [tmpq-32*2], m1
    mova        [tmpq-32*1], m3
    sub                  hd, 4
    jg .v_w32_loop
    RET
.v_w64:
    vpermq               m0, [srcq+strideq*0+32*0], q3120
    vpermq               m1, [srcq+strideq*0+32*1], q3120
.v_w64_loop:
    vpermq               m2, [srcq+strideq*1+32*0], q3120
    vpermq               m3, [srcq+strideq*1+32*1], q3120
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m4, m0, m2
    punpckhbw            m0, m2
    pmaddubsw            m4, m6
    pmaddubsw            m0, m6
    mova        [tmpq+32*0], m4
    mova        [tmpq+32*1], m0
    punpcklbw            m4, m1, m3
    punpckhbw            m5, m1, m3
    vpermq               m0, [srcq+strideq*0+32*0], q3120
    vpermq               m1, [srcq+strideq*0+32*1], q3120
    pmaddubsw            m4, m6
    pmaddubsw            m5, m6
    mova        [tmpq+32*2], m4
    mova        [tmpq+32*3], m5
    add                tmpq, 32*8
    punpcklbw            m4, m2, m0
    punpckhbw            m2, m0
    punpcklbw            m5, m3, m1
    punpckhbw            m3, m1
    pmaddubsw            m4, m6
    pmaddubsw            m2, m6
    pmaddubsw            m5, m6
    pmaddubsw            m3, m6
    mova        [tmpq-32*4], m4
    mova        [tmpq-32*3], m2
    mova        [tmpq-32*2], m5
    mova        [tmpq-32*1], m3
    sub                  hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    lea                 r6d, [hq+(3<<8)]
    mov                  r3, srcq
    mov                  r5, tmpq
.v_w128_loop0:
    vpermq               m0, [srcq+strideq*0], q3120
.v_w128_loop:
    vpermq               m1, [srcq+strideq*1], q3120
    lea                srcq, [srcq+strideq*2]
    punpcklbw            m2, m0, m1
    punpckhbw            m3, m0, m1
    vpermq               m0, [srcq+strideq*0], q3120
    pmaddubsw            m2, m6
    pmaddubsw            m3, m6
    punpcklbw            m4, m1, m0
    punpckhbw            m1, m0
    pmaddubsw            m4, m6
    pmaddubsw            m1, m6
    mova        [tmpq+32*0], m2
    mova        [tmpq+32*1], m3
    mova        [tmpq+32*8], m4
    mova        [tmpq+32*9], m1
    add                tmpq, 32*16
    sub                  hd, 2
    jg .v_w128_loop
    add                  r3, 32
    add                  r5, 64
    movzx                hd, r6b
    mov                srcq, r3
    mov                tmpq, r5
    sub                 r6d, 1<<8
    jg .v_w128_loop0
    RET
.hv:
    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
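    ; With m6 = my << 11, pmulhrsw(diff, m6) evaluates to
    ; (((diff * (my << 11)) >> 14) + 1) >> 1 == (my * diff + 8) >> 4,
    ; so the rounded shift in the formula above comes for free.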
    %assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM       7
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl                mxyd, 11
    movd                xm6, mxyd
    vpbroadcastw         m6, xm6
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.hv_w4:
    vbroadcasti128       m4, [bilin_h_shuf4]
    vpbroadcastq         m0, [srcq+strideq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w4_loop:
    movq                xm1, [srcq+strideq*1]
    movhps              xm1, [srcq+strideq*2]
    movq                xm2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    movhps              xm2, [srcq+strideq*0]
    vinserti128          m1, xm2, 1
    pshufb               m1, m4
    pmaddubsw            m1, m5        ; 1 2 3 4
    vpblendd             m2, m1, m0, 0xc0
    vpermq               m2, m2, q2103 ; 0 1 2 3
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m6
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    vbroadcasti128       m0, [srcq+strideq*0]
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w8_loop:
    movu                xm1, [srcq+strideq*1]
    vinserti128          m1, [srcq+strideq*2], 1
    movu                xm2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vinserti128          m2, [srcq+strideq*0], 1
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5           ; 1 2
    vperm2i128           m3, m0, m1, 0x21 ; 0 1
    pmaddubsw            m0, m2, m5       ; 3 4
    vperm2i128           m2, m1, m0, 0x21 ; 2 3
    psubw                m1, m3
    pmulhrsw             m1, m6
    paddw                m1, m3
    psubw                m3, m0, m2
    pmulhrsw             m3, m6
    paddw                m3, m2
    mova        [tmpq+32*0], m1
    mova        [tmpq+32*1], m3
    add                tmpq, 32*2
    sub                  hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w16_loop:
    movu                xm1, [srcq+strideq*1+8*0]
    vinserti128          m1, [srcq+strideq*1+8*1], 1
    lea                srcq, [srcq+strideq*2]
    movu                xm2, [srcq+strideq*0+8*0]
    vinserti128          m2, [srcq+strideq*0+8*1], 1
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    psubw                m3, m1, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    pmaddubsw            m0, m2, m5
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+32*0], m3
    mova        [tmpq+32*1], m2
    add                tmpq, 32*2
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    movu                xm0, [srcq+8*0]
    vinserti128          m0, [srcq+8*1], 1
    movu                xm1, [srcq+8*2]
    vinserti128          m1, [srcq+8*3], 1
    pshufb               m0, m4
    pshufb               m1, m4
    pmaddubsw            m0, m5
    pmaddubsw            m1, m5
.hv_w32_loop:
    add                srcq, strideq
    movu                xm2, [srcq+8*0]
    vinserti128          m2, [srcq+8*1], 1
    pshufb               m2, m4
    pmaddubsw            m2, m5
    psubw                m3, m2, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    mova                 m0, m2
    movu                xm2, [srcq+8*2]
    vinserti128          m2, [srcq+8*3], 1
    pshufb               m2, m4
    pmaddubsw            m2, m5
    mova        [tmpq+32*0], m3
    psubw                m3, m2, m1
    pmulhrsw             m3, m6
    paddw                m3, m1
    mova                 m1, m2
    mova        [tmpq+32*1], m3
    add                tmpq, 32*2
    dec                  hd
    jg .hv_w32_loop
    RET
.hv_w128:
    lea                 r3d, [hq+(7<<8)]
    mov                 r6d, 256
    jmp .hv_w64_start
.hv_w64:
    lea                 r3d, [hq+(3<<8)]
    mov                 r6d, 128
.hv_w64_start:
%if WIN64
    PUSH                 r7
%endif
    mov                  r5, srcq
    mov                  r7, tmpq
.hv_w64_loop0:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    pshufb               m0, m4
    pmaddubsw            m0, m5
.hv_w64_loop:
    movu                xm1, [srcq+strideq*1+8*0]
    vinserti128          m1, [srcq+strideq*1+8*1], 1
    lea                srcq, [srcq+strideq*2]
    movu                xm2, [srcq+strideq*0+8*0]
    vinserti128          m2, [srcq+strideq*0+8*1], 1
    pshufb               m1, m4
    pshufb               m2, m4
    pmaddubsw            m1, m5
    psubw                m3, m1, m0
    pmulhrsw             m3, m6
    paddw                m3, m0
    pmaddubsw            m0, m2, m5
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+r6*0], m3
    mova        [tmpq+r6*1], m2
    lea                tmpq, [tmpq+r6*2]
    sub                  hd, 2
    jg .hv_w64_loop
    add                  r5, 16
    add                  r7, 32
    movzx                hd, r3b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r3d, 1<<8
    jg .hv_w64_loop0
%if WIN64
    POP                  r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
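; Each constant packs two row offsets into subpel_filters[]: the 8-tap
; variant in the high 16 bits and the 4-tap variant (used for w <= 4) in
; the low 16 bits, scaled by 15 since each variant covers 15 subpel
; positions. SHARP has no 4-tap counterpart and reuses the regular rows.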

%macro FN 4 ; fn, type, type_h, type_v
cglobal %1_%2
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d
%else
    mov                 t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
%endif
%endmacro
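; Each FN instance is a small entry point that loads the packed h/v filter
; offsets into t0d/t1d and tail-jumps into the shared body below; the last
; (regular/regular) variant simply falls through into it.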

%if WIN64
DECLARE_REG_TMP 4, 5
%else
DECLARE_REG_TMP 7, 8
%endif

%define PUT_8TAP_FN FN put_8tap,

PUT_8TAP_FN sharp,          SHARP,   SHARP
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
PUT_8TAP_FN regular,        REGULAR, REGULAR

cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
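    ; mx * 0x010101 replicates the subpel position into bytes 0-2; adding
    ; the packed FILTER_* offsets then yields, in a single register:
    ;   byte 0:    4-tap filter row  (movzx mxd, mxb in the w <= 4 paths)
    ;   byte 1:    mx itself         (test mxd, 0xf00 checks mx != 0)
    ;   bytes 2-3: 8-tap filter row  (shr mxd, 16 in the w >= 8 paths)
    ; and likewise for my.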
1477    lea                  r8, [put_avx2]
1478    movsxd               wq, wm
1479    movifnidn            hd, hm
1480    test                mxd, 0xf00
1481    jnz .h
1482    test                myd, 0xf00
1483    jnz .v
1484    tzcnt                wd, wd
1485    movzx                wd, word [r8+wq*2+table_offset(put,)]
1486    add                  wq, r8
1487    lea                  r6, [ssq*3]
1488    lea                  r7, [dsq*3]
1489%if WIN64
1490    pop                  r8
1491%endif
1492    jmp                  wq
.h:
    test                myd, 0xf00
    jnz .hv
    vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
    WIN64_SPILL_XMM      11
    cmp                  wd, 4
    jl .h_w2
    vbroadcasti128       m6, [subpel_h_shufA]
    je .h_w4
    tzcnt                wd, wd
    vbroadcasti128       m7, [subpel_h_shufB]
    vbroadcasti128       m8, [subpel_h_shufC]
    shr                 mxd, 16
    sub                srcq, 3
    movzx                wd, word [r8+wq*2+table_offset(put, _8tap_h)]
    vpbroadcastd         m9, [r8+mxq*8+subpel_filters-put_avx2+0]
    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+4]
    add                  wq, r8
    jmp                  wq
.h_w2:
    movzx               mxd, mxb
    dec                srcq
    mova                xm4, [subpel_h_shuf4]
    vpbroadcastd        xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
.h_w2_loop:
    movq                xm0, [srcq+ssq*0]
    movhps              xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm4
    pmaddubsw           xm0, xm3
    phaddw              xm0, xm0
    paddw               xm0, xm5
    psraw               xm0, 6
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd        xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
.h_w4_loop:
    movq                xm0, [srcq+ssq*0]
    movq                xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb              xm0, xm6
    pshufb              xm1, xm6
    pmaddubsw           xm0, xm3
    pmaddubsw           xm1, xm3
    phaddw              xm0, xm1
    paddw               xm0, xm5
    psraw               xm0, 6
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h_w8:
%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
    pshufb              m%2, m%1, m7
    pshufb              m%3, m%1, m8
    pshufb              m%1, m6
    pmaddubsw           m%4, m%2, m9
    pmaddubsw           m%2, m10
    pmaddubsw           m%3, m10
    pmaddubsw           m%1, m9
    paddw               m%3, m%4
    paddw               m%1, m%2
    phaddw              m%1, m%3
    paddw               m%1, m5
    psraw               m%1, 6
%endmacro
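; PUT_8TAP_H evaluates the 8-tap filter as two 4-tap halves: the three
; shuffles produce the sliding 4-pixel windows, pmaddubsw/phaddw sum them,
; and pw_34 (2 + (8 << 2)) folds both rounding offsets into a single add
; before the final arithmetic shift by 6.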
    movu                xm0, [srcq+ssq*0]
    vinserti128          m0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            0, 1, 2, 3
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                xm0, [srcq+ssq*0+8*0]
    vinserti128          m0, [srcq+ssq*1+8*0], 1
    movu                xm1, [srcq+ssq*0+8*1]
    vinserti128          m1, [srcq+ssq*1+8*1], 1
    PUT_8TAP_H            0, 2, 3, 4
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_H            1, 2, 3, 4
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    xor                 r6d, r6d
    jmp .h_start
.h_w64:
    mov                  r6, -32*1
    jmp .h_start
.h_w128:
    mov                  r6, -32*3
.h_start:
    sub                srcq, r6
    sub                dstq, r6
    mov                  r4, r6
.h_loop:
    movu                 m0, [srcq+r6+8*0]
    movu                 m1, [srcq+r6+8*1]
    PUT_8TAP_H            0, 2, 3, 4
    PUT_8TAP_H            1, 2, 3, 4
    packuswb             m0, m1
    mova          [dstq+r6], m0
    add                  r6, 32
    jle .h_loop
    add                srcq, ssq
    add                dstq, dsq
    mov                  r6, r4
    dec                  hd
    jg .h_loop
    RET
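; Vertical 8-tap filtering. Rows are kept interleaved in pairs
; (01 12, 23 34, ...) so that each pmaddubsw applies two filter taps at
; once; pmulhrsw with pw_512 is a round-to-nearest shift by 6
; ((x*512 + 0x4000) >> 15 == (x + 32) >> 6).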
.v:
    %assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM      16
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    tzcnt               r6d, wd
    movzx               r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
    vpbroadcastd         m7, [pw_512]
    lea                 myq, [r8+myq*8+subpel_filters-put_avx2]
    vpbroadcastw         m8, [myq+0]
    vpbroadcastw         m9, [myq+2]
    vpbroadcastw        m10, [myq+4]
    vpbroadcastw        m11, [myq+6]
    add                  r6, r8
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
    jmp                  r6
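; m8-m11 hold the filter coefficients as broadcast 16-bit pairs
; (k0k1, k2k3, k4k5, k6k7), matching the interleaved row layout; for
; h < 6 the low byte of the filter id (the 4-tap variant) is selected
; via the cmovs above.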
.v_w2:
    movd                xm2, [srcq+ssq*0]
    pinsrw              xm2, [srcq+ssq*1], 2
    pinsrw              xm2, [srcq+ssq*2], 4
    add                srcq, ss3q
    pinsrw              xm2, [srcq+ssq*0], 6 ; 0 1 2 3
    movd                xm3, [srcq+ssq*1]
    vpbroadcastd        xm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm3, xm1, 0x02       ; 4 5
    vpblendd            xm1, xm0, 0x02       ; 5 6
    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
    punpcklbw           xm3, xm1             ; 45 56
    punpcklbw           xm1, xm2, xm4        ; 01 12
    punpckhbw           xm2, xm4             ; 23 34
.v_w2_loop:
    pmaddubsw           xm5, xm1, xm8        ; a0 b0
    mova                xm1, xm2
    pmaddubsw           xm2, xm9             ; a1 b1
    paddw               xm5, xm2
    mova                xm2, xm3
    pmaddubsw           xm3, xm10            ; a2 b2
    paddw               xm5, xm3
    vpbroadcastd        xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm4, xm0, 0x02       ; 7 8
    punpcklbw           xm3, xm4             ; 67 78
    pmaddubsw           xm4, xm3, xm11       ; a3 b3
    paddw               xm5, xm4
    pmulhrsw            xm5, xm7
    packuswb            xm5, xm5
    pextrw     [dstq+dsq*0], xm5, 0
    pextrw     [dstq+dsq*1], xm5, 2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movd                xm2, [srcq+ssq*0]
    pinsrd              xm2, [srcq+ssq*1], 1
    pinsrd              xm2, [srcq+ssq*2], 2
    add                srcq, ss3q
    pinsrd              xm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd                xm3, [srcq+ssq*1]
    vpbroadcastd        xm1, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm3, xm1, 0x02       ; 4 5
    vpblendd            xm1, xm0, 0x02       ; 5 6
    palignr             xm4, xm3, xm2, 4     ; 1 2 3 4
    punpcklbw           xm3, xm1             ; 45 56
    punpcklbw           xm1, xm2, xm4        ; 01 12
    punpckhbw           xm2, xm4             ; 23 34
.v_w4_loop:
    pmaddubsw           xm5, xm1, xm8        ; a0 b0
    mova                xm1, xm2
    pmaddubsw           xm2, xm9             ; a1 b1
    paddw               xm5, xm2
    mova                xm2, xm3
    pmaddubsw           xm3, xm10            ; a2 b2
    paddw               xm5, xm3
    vpbroadcastd        xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpblendd            xm3, xm0, xm4, 0x02  ; 6 7
    vpbroadcastd        xm0, [srcq+ssq*0]
    vpblendd            xm4, xm0, 0x02       ; 7 8
    punpcklbw           xm3, xm4             ; 67 78
    pmaddubsw           xm4, xm3, xm11       ; a3 b3
    paddw               xm5, xm4
    pmulhrsw            xm5, xm7
    packuswb            xm5, xm5
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm1, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    vpbroadcastq         m2, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m5, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpbroadcastq         m6, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m1, m4, 0x30
    vpblendd             m4, m2, 0x30
    punpcklbw            m1, m4      ; 01 12
    vpblendd             m2, m5, 0x30
    vpblendd             m5, m3, 0x30
    punpcklbw            m2, m5      ; 23 34
    vpblendd             m3, m6, 0x30
    vpblendd             m6, m0, 0x30
    punpcklbw            m3, m6      ; 45 56
.v_w8_loop:
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddubsw            m5, m1, m8  ; a0 b0
    mova                 m1, m2
    pmaddubsw            m2, m9      ; a1 b1
    paddw                m5, m2
    mova                 m2, m3
    pmaddubsw            m3, m10     ; a2 b2
    paddw                m5, m3
    vpblendd             m3, m0, m4, 0x30
    vpbroadcastq         m0, [srcq+ssq*0]
    vpblendd             m4, m0, 0x30
    punpcklbw            m3, m4      ; 67 78
    pmaddubsw            m4, m3, m11 ; a3 b3
    paddw                m5, m4
    pmulhrsw             m5, m7
    vextracti128        xm4, m5, 1
    packuswb            xm5, xm4
    movq       [dstq+dsq*0], xm5
    movhps     [dstq+dsq*1], xm5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
.v_w32:
.v_w64:
.v_w128:
    lea                 r6d, [wq*8-128]
    mov                  r4, srcq
    mov                  r7, dstq
    lea                 r6d, [hq+r6*2]
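; w >= 16 is processed in 16-pixel-wide columns: r6d packs the row count
; in its low byte and the remaining column count in the upper bits,
; restored/decremented at the bottom of .v_w16_loop0.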
.v_w16_loop0:
    vbroadcasti128       m4, [srcq+ssq*0]
    vbroadcasti128       m5, [srcq+ssq*1]
    vbroadcasti128       m6, [srcq+ssq*2]
    add                srcq, ss3q
    vbroadcasti128       m0, [srcq+ssq*0]
    vbroadcasti128       m1, [srcq+ssq*1]
    vbroadcasti128       m2, [srcq+ssq*2]
    add                srcq, ss3q
    vbroadcasti128       m3, [srcq+ssq*0]
    shufpd               m4, m0, 0x0c
    shufpd               m5, m1, 0x0c
    punpcklbw            m1, m4, m5 ; 01
    punpckhbw            m4, m5     ; 34
    shufpd               m6, m2, 0x0c
    punpcklbw            m2, m5, m6 ; 12
    punpckhbw            m5, m6     ; 45
    shufpd               m0, m3, 0x0c
    punpcklbw            m3, m6, m0 ; 23
    punpckhbw            m6, m0     ; 56
.v_w16_loop:
    vbroadcasti128      m12, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vbroadcasti128      m13, [srcq+ssq*0]
    pmaddubsw           m14, m1, m8  ; a0
    pmaddubsw           m15, m2, m8  ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddubsw            m3, m9      ; a1
    pmaddubsw            m4, m9      ; b1
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddubsw            m5, m10     ; a2
    pmaddubsw            m6, m10     ; b2
    paddw               m14, m5
    paddw               m15, m6
    shufpd               m6, m0, m12, 0x0d
    shufpd               m0, m12, m13, 0x0c
    punpcklbw            m5, m6, m0  ; 67
    punpckhbw            m6, m0      ; 78
    pmaddubsw           m12, m5, m11 ; a3
    pmaddubsw           m13, m6, m11 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7
    pmulhrsw            m15, m7
    packuswb            m14, m15
    vpermq              m14, m14, q3120
    mova         [dstq+dsq*0], xm14
    vextracti128 [dstq+dsq*1], m14, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
    jg .v_w16_loop0
    RET
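; 2D (horizontal + vertical) filtering. The horizontal pass keeps unpacked
; 16-bit intermediates scaled by pmulhrsw with pw_8192 (a rounding shift
; by 2), and the vertical pass accumulates in 32 bits, rounding with
; pd_512 before the final shift by 10.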
.hv:
    %assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM      16
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m7, [r8+mxq*8+subpel_filters-put_avx2+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    vpbroadcastd         m8, [pw_8192]
    vpbroadcastd         m9, [pd_512]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
    cmp                  wd, 4
    je .hv_w4
    vbroadcasti128       m6, [subpel_h_shuf4]
    movq                xm2, [srcq+ssq*0]
    movhps              xm2, [srcq+ssq*1]
    movq                xm0, [srcq+ssq*2]
    add                srcq, ss3q
    movhps              xm0, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpbroadcastq         m4, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m1, [srcq+ssq*0]
    vpblendd             m2, m3, 0x30
    vpblendd             m0, m1, 0x30
    vpblendd             m2, m4, 0xc0
    pshufb               m2, m6
    pshufb               m0, m6
    pmaddubsw            m2, m7
    pmaddubsw            m0, m7
    phaddw               m2, m0
    pmulhrsw             m2, m8
    vextracti128        xm3, m2, 1
    palignr             xm4, xm3, xm2, 4
    punpcklwd           xm1, xm2, xm4  ; 01 12
    punpckhwd           xm2, xm4       ; 23 34
    pshufd              xm0, xm3, q2121
    punpcklwd           xm3, xm0       ; 45 56
.hv_w2_loop:
    movq                xm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps              xm4, [srcq+ssq*0]
    pshufb              xm4, xm6
    pmaddubsw           xm4, xm7
    pmaddwd             xm5, xm1, xm10 ; a0 b0
    mova                xm1, xm2
    pmaddwd             xm2, xm11      ; a1 b1
    paddd               xm5, xm2
    mova                xm2, xm3
    pmaddwd             xm3, xm12      ; a2 b2
    phaddw              xm4, xm4
    pmulhrsw            xm4, xm8
    paddd               xm5, xm3
    palignr             xm3, xm4, xm0, 12
    mova                xm0, xm4
    punpcklwd           xm3, xm0       ; 67 78
    pmaddwd             xm4, xm3, xm13 ; a3 b3
    paddd               xm5, xm9
    paddd               xm5, xm4
    psrad               xm5, 10
    packssdw            xm5, xm5
    packuswb            xm5, xm5
    pextrw     [dstq+dsq*0], xm5, 0
    pextrw     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    mova                 m6, [subpel_h_shuf4]
    vpbroadcastq         m2, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    vpbroadcastq         m0, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m5, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vpblendd             m2, m4, 0xcc ; 0 1
    vpbroadcastq         m4, [srcq+ssq*2]
    add                srcq, ss3q
    vpbroadcastq         m1, [srcq+ssq*0]
    vpblendd             m0, m5, 0xcc ; 2 3
    vpblendd             m3, m4, 0xcc ; 4 5
    pshufb               m2, m6
    pshufb               m0, m6
    pshufb               m3, m6
    pshufb               m1, m6
    pmaddubsw            m2, m7
    pmaddubsw            m0, m7
    pmaddubsw            m3, m7
    pmaddubsw            m1, m7
    phaddw               m2, m0
    phaddw               m3, m1
    pmulhrsw             m2, m8
    pmulhrsw             m3, m8
    palignr              m4, m3, m2, 4
    punpcklwd            m1, m2, m4   ; 01 12
    punpckhwd            m2, m4       ; 23 34
    pshufd               m0, m3, q2121
    punpcklwd            m3, m0       ; 45 56
.hv_w4_loop:
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m1, m10  ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m11      ; a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, m12      ; a2 b2
    paddd                m5, m3
    vpbroadcastq         m3, [srcq+ssq*0]
    vpblendd             m4, m3, 0xcc ; 7 8
    pshufb               m4, m6
    pmaddubsw            m4, m7
    phaddw               m4, m4
    pmulhrsw             m4, m8
    palignr              m3, m4, m0, 12
    mova                 m0, m4
    punpcklwd            m3, m0       ; 67 78
    pmaddwd              m4, m3, m13  ; a3 b3
    paddd                m5, m9
    paddd                m5, m4
    psrad                m5, 10
    vextracti128        xm4, m5, 1
    packssdw            xm5, xm4
    packuswb            xm5, xm5
    pshuflw             xm5, xm5, q3120
    movd       [dstq+dsq*0], xm5
    pextrd     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr                 mxd, 16
    sub                srcq, 3
    vpbroadcastd        m10, [r8+mxq*8+subpel_filters-put_avx2+0]
    vpbroadcastd        m11, [r8+mxq*8+subpel_filters-put_avx2+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastq         m0, [r8+myq*8+subpel_filters-put_avx2]
    lea                ss3q, [ssq*3]
    sub                srcq, ss3q
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    pshufd              m12, m0, q0000
    pshufd              m13, m0, q1111
    pshufd              m14, m0, q2222
    pshufd              m15, m0, q3333
    lea                 r6d, [wq*8-64]
    mov                  r4, srcq
    mov                  r7, dstq
    lea                 r6d, [hq+r6*4]
.hv_w8_loop0:
    vbroadcasti128       m7, [subpel_h_shufA]
    movu                xm4, [srcq+ssq*0]
    vbroadcasti128       m8, [subpel_h_shufB]
    movu                xm5, [srcq+ssq*1]
    vbroadcasti128       m9, [subpel_h_shufC]
    movu                xm6, [srcq+ssq*2]
    add                srcq, ss3q
    vbroadcasti128       m0, [srcq+ssq*0]
    vpblendd             m4, m0, 0xf0        ; 0 3
    vinserti128          m5, [srcq+ssq*1], 1 ; 1 4
    vinserti128          m6, [srcq+ssq*2], 1 ; 2 5
    add                srcq, ss3q
    vinserti128          m0, [srcq+ssq*0], 1 ; 3 6
%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
    pshufb               %3, %1, %6
    pshufb               %4, %1, %7
    pshufb               %1, %5
    pmaddubsw            %2, %3, m10
    pmaddubsw            %4, m11
    pmaddubsw            %3, m11
    pmaddubsw            %1, m10
    paddw                %2, %4
    paddw                %1, %3
    phaddw               %1, %2
%endmacro
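; Same shuffle/madd scheme as PUT_8TAP_H, but without the rounding
; add/shift; the result stays a raw 16-bit sum for the vertical pass
; (scaled afterwards with pw_8192).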
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9
    HV_H_W8              m0, m1, m2, m3, m7, m8, m9
    vpbroadcastd         m7, [pw_8192]
    vpermq               m4, m4, q3120
    vpermq               m5, m5, q3120
    vpermq               m6, m6, q3120
    pmulhrsw             m0, m7
    pmulhrsw             m4, m7
    pmulhrsw             m5, m7
    pmulhrsw             m6, m7
    vpermq               m7, m0, q3120
    punpcklwd            m1, m4, m5  ; 01
    punpckhwd            m4, m5      ; 34
    punpcklwd            m2, m5, m6  ; 12
    punpckhwd            m5, m6      ; 45
    punpcklwd            m3, m6, m7  ; 23
    punpckhwd            m6, m7      ; 56
.hv_w8_loop:
    vextracti128        r6m, m0, 1 ; not enough registers
    movu                xm0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128          m0, [srcq+ssq*0], 1 ; 7 8
    pmaddwd              m8, m1, m12 ; a0
    pmaddwd              m9, m2, m12 ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddwd              m3, m13     ; a1
    pmaddwd              m4, m13     ; b1
    paddd                m8, m3
    paddd                m9, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m14     ; a2
    pmaddwd              m6, m14     ; b2
    paddd                m8, m5
    paddd                m9, m6
    vbroadcasti128       m6, [subpel_h_shufB]
    vbroadcasti128       m7, [subpel_h_shufC]
    vbroadcasti128       m5, [subpel_h_shufA]
    HV_H_W8              m0, m5, m6, m7, m5, m6, m7
    vpbroadcastd         m5, [pw_8192]
    vpbroadcastd         m7, [pd_512]
    vbroadcasti128       m6, r6m
    pmulhrsw             m0, m5
    paddd                m8, m7
    paddd                m9, m7
    vpermq               m7, m0, q3120    ; 7 8
    shufpd               m6, m6, m7, 0x04 ; 6 7
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, m15 ; a3
    paddd                m8, m7
    pmaddwd              m7, m6, m15 ; b3
    paddd                m7, m9
    psrad                m8, 10
    psrad                m7, 10
    packssdw             m8, m7
    vextracti128        xm7, m8, 1
    packuswb            xm8, xm7
    pshufd              xm7, xm8, q3120
    movq       [dstq+dsq*0], xm7
    movhps     [dstq+dsq*1], xm7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    add                  r4, 8
    add                  r7, 8
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
    jg .hv_w8_loop0
    RET

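; prep_8tap: identical filter math to put_8tap, but the output is the
; 16-bit intermediate (scaled with pw_8192) stored to the tmp buffer
; instead of being packed back to 8-bit pixels.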
%macro PREP_8TAP_H 0
    pshufb               m1, m0, m5
    pshufb               m2, m0, m6
    pshufb               m3, m0, m7
    pmaddubsw            m1, m8
    pmaddubsw            m0, m2, m8
    pmaddubsw            m2, m9
    pmaddubsw            m3, m9
    paddw                m1, m2
    paddw                m0, m3
    phaddw               m0, m1, m0
    pmulhrsw             m0, m4
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
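; t0/t1 (the filter-type selectors set by the FN entry points) presumably
; map to different caller-saved registers per ABI because prep_8tap takes
; fewer register arguments on Win64.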

%define PREP_8TAP_FN FN prep_8tap,

PREP_8TAP_FN sharp,          SHARP,   SHARP
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN regular,        REGULAR, REGULAR

cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r7, [prep%+SUFFIX]
    movsxd               wq, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
    tzcnt                wd, wd
    movzx                wd, word [r7+wq*2+table_offset(prep,)]
    add                  wq, r7
    lea                  r6, [strideq*3]
%if WIN64
    pop                  r7
%endif
    jmp                  wq
.h:
    test                myd, 0xf00
    jnz .hv
    vpbroadcastd         m4, [pw_8192]
    vbroadcasti128       m5, [subpel_h_shufA]
    WIN64_SPILL_XMM      10
    cmp                  wd, 4
    je .h_w4
    tzcnt                wd, wd
    vbroadcasti128       m6, [subpel_h_shufB]
    vbroadcasti128       m7, [subpel_h_shufC]
    shr                 mxd, 16
    sub                srcq, 3
    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
    add                  wq, r7
    jmp                  wq
.h_w4:
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    lea            stride3q, [strideq*3]
.h_w4_loop:
    movq                xm0, [srcq+strideq*0]
    vpbroadcastq         m2, [srcq+strideq*2]
    movq                xm1, [srcq+strideq*1]
    vpblendd             m0, m2, 0xf0
    vpbroadcastq         m2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd             m1, m2, 0xf0
    pshufb               m0, m5
    pshufb               m1, m5
    pmaddubsw            m0, m6
    pmaddubsw            m1, m6
    phaddw               m0, m1
    pmulhrsw             m0, m4
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
.h_w8:
    movu                xm0, [srcq+strideq*0]
    vinserti128          m0, [srcq+strideq*1], 1
    lea                srcq, [srcq+strideq*2]
    PREP_8TAP_H
    mova             [tmpq], m0
    add                tmpq, 32
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    movu                xm0, [srcq+strideq*0+8*0]
    vinserti128          m0, [srcq+strideq*0+8*1], 1
    PREP_8TAP_H
    mova        [tmpq+32*0], m0
    movu                xm0, [srcq+strideq*1+8*0]
    vinserti128          m0, [srcq+strideq*1+8*1], 1
    lea                srcq, [srcq+strideq*2]
    PREP_8TAP_H
    mova        [tmpq+32*1], m0
    add                tmpq, 32*2
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    xor                 r6d, r6d
    jmp .h_start
.h_w64:
    mov                  r6, -32*1
    jmp .h_start
.h_w128:
    mov                  r6, -32*3
.h_start:
    sub                srcq, r6
    mov                  r5, r6
.h_loop:
    movu                xm0, [srcq+r6+8*0]
    vinserti128          m0, [srcq+r6+8*1], 1
    PREP_8TAP_H
    mova        [tmpq+32*0], m0
    movu                xm0, [srcq+r6+8*2]
    vinserti128          m0, [srcq+r6+8*3], 1
    PREP_8TAP_H
    mova        [tmpq+32*1], m0
    add                tmpq, 32*2
    add                  r6, 32
    jle .h_loop
    add                srcq, strideq
    mov                  r6, r5
    dec                  hd
    jg .h_loop
    RET
.v:
    %assign stack_offset stack_offset - stack_size_padded
    WIN64_SPILL_XMM      16
    movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
    shr                 myd, 16  ; Note that the code is 8-tap only, having
    cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
    cmove               myd, mxd ; had a negligible effect on performance.
    ; TODO: Would a 6-tap code path be worth it?
    lea                 myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
    lea            stride3q, [strideq*3]
    sub                srcq, stride3q
    vpbroadcastd         m7, [pw_8192]
    vpbroadcastw         m8, [myq+0]
    vpbroadcastw         m9, [myq+2]
    vpbroadcastw        m10, [myq+4]
    vpbroadcastw        m11, [myq+6]
    cmp                  wd, 8
    jg .v_w16
    je .v_w8
.v_w4:
    movd                xm0, [srcq+strideq*0]
    vpbroadcastd         m1, [srcq+strideq*2]
    vpbroadcastd        xm2, [srcq+strideq*1]
    add                srcq, stride3q
    vpbroadcastd         m3, [srcq+strideq*0]
    vpblendd             m1, m0, 0x01     ; 0 2 2 _   2 _ _ _
    vpblendd             m3, m2, 0x03     ; 1 1 3 3   3 3 _ _
    vpbroadcastd         m0, [srcq+strideq*1]
    vpbroadcastd         m2, [srcq+strideq*2]
    vpblendd             m1, m0, 0x68     ; 0 2 2 4   2 4 4 _
    vpbroadcastd         m0, [srcq+stride3q ]
    vbroadcasti128       m5, [deint_shuf4]
    vpblendd             m3, m2, 0xc0     ; 1 1 3 3   3 3 5 5
    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
    vpblendd             m3, m1, 0xaa     ; 1 2 3 4   3 4 5 _
    punpcklbw            m1, m2, m3       ; 01  12    23  34
    vpblendd             m3, m0, 0x80     ; 1 2 3 4   3 4 5 6
    punpckhbw            m2, m3           ; 23  34    45  56
.v_w4_loop:
    lea                srcq, [srcq+strideq*4]
    pinsrd              xm0, [srcq+strideq*0], 1
    vpbroadcastd         m3, [srcq+strideq*1]
    vpbroadcastd         m4, [srcq+strideq*2]
    vpblendd             m3, m0, 0x03     ; 6 7 8 _   8 _ _ _
    vpbroadcastd         m0, [srcq+stride3q ]
    vpblendd             m3, m4, 0x20     ; 6 7 8 _   8 9 _ _
    vpblendd             m3, m0, 0x40     ; 6 7 8 _   8 9 a _
    pshufb               m3, m5           ; 67  78    89  9a
    pmaddubsw            m4, m1, m8
    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
    pmaddubsw            m2, m9
    paddw                m4, m2
    mova                 m2, m3
    pmaddubsw            m3, m11
    paddw                m3, m4
    pmaddubsw            m4, m1, m10
    paddw                m3, m4
    pmulhrsw             m3, m7
    mova             [tmpq], m3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movq                xm1, [srcq+strideq*0]
    vpbroadcastq         m4, [srcq+strideq*1]
    vpbroadcastq         m2, [srcq+strideq*2]
    vpbroadcastq         m5, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq         m3, [srcq+strideq*0]
    vpbroadcastq         m6, [srcq+strideq*1]
    vpbroadcastq         m0, [srcq+strideq*2]
    vpblendd             m1, m4, 0x30
    vpblendd             m4, m2, 0x30
    punpcklbw            m1, m4 ; 01 12
    vpblendd             m2, m5, 0x30
    vpblendd             m5, m3, 0x30
    punpcklbw            m2, m5 ; 23 34
    vpblendd             m3, m6, 0x30
    vpblendd             m6, m0, 0x30
    punpcklbw            m3, m6 ; 45 56
.v_w8_loop:
    vpbroadcastq         m4, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pmaddubsw            m5, m2, m9  ; a1
    pmaddubsw            m6, m2, m8  ; b0
    vpblendd             m2, m0, m4, 0x30
    vpbroadcastq         m0, [srcq+strideq*0]
    vpblendd             m4, m0, 0x30
    punpcklbw            m2, m4      ; 67 78
    pmaddubsw            m1, m8      ; a0
    pmaddubsw            m4, m3, m9  ; b1
    paddw                m5, m1
    mova                 m1, m3
    pmaddubsw            m3, m10     ; a2
    paddw                m6, m4
    paddw                m5, m3
    vpbroadcastq         m4, [srcq+strideq*1]
    vpblendd             m3, m0, m4, 0x30
    vpbroadcastq         m0, [srcq+strideq*2]
    vpblendd             m4, m0, 0x30
    punpcklbw            m3, m4      ; 89 9a
    pmaddubsw            m4, m2, m11 ; a3
    paddw                m5, m4
    pmaddubsw            m4, m2, m10 ; b2
    paddw                m6, m4
    pmaddubsw            m4, m3, m11 ; b3
    paddw                m6, m4
    pmulhrsw             m5, m7
    pmulhrsw             m6, m7
    mova        [tmpq+32*0], m5
    mova        [tmpq+32*1], m6
    add                tmpq, 32*2
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    add                  wd, wd
    mov                  r5, srcq
    mov                  r7, tmpq
    lea                 r6d, [hq+wq*8-256]
.v_w16_loop0:
    vbroadcasti128       m4, [srcq+strideq*0]
    vbroadcasti128       m5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vbroadcasti128       m0, [srcq+strideq*1]
    vbroadcasti128       m6, [srcq+strideq*0]
    lea                srcq, [srcq+strideq*2]
    vbroadcasti128       m1, [srcq+strideq*0]
    vbroadcasti128       m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vbroadcasti128       m3, [srcq+strideq*0]
    shufpd               m4, m4, m0, 0x0c
    shufpd               m5, m5, m1, 0x0c
    punpcklbw            m1, m4, m5 ; 01
    punpckhbw            m4, m5     ; 34
    shufpd               m6, m6, m2, 0x0c
    punpcklbw            m2, m5, m6 ; 12
    punpckhbw            m5, m6     ; 45
    shufpd               m0, m0, m3, 0x0c
    punpcklbw            m3, m6, m0 ; 23
    punpckhbw            m6, m0     ; 56
.v_w16_loop:
    vbroadcasti128      m12, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vbroadcasti128      m13, [srcq+strideq*0]
    pmaddubsw           m14, m1, m8  ; a0
    pmaddubsw           m15, m2, m8  ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddubsw            m3, m9      ; a1
    pmaddubsw            m4, m9      ; b1
    paddw               m14, m3
    paddw               m15, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddubsw            m5, m10     ; a2
    pmaddubsw            m6, m10     ; b2
    paddw               m14, m5
    paddw               m15, m6
    shufpd               m6, m0, m12, 0x0d
    shufpd               m0, m12, m13, 0x0c
    punpcklbw            m5, m6, m0  ; 67
    punpckhbw            m6, m0      ; 78
    pmaddubsw           m12, m5, m11 ; a3
    pmaddubsw           m13, m6, m11 ; b3
    paddw               m14, m12
    paddw               m15, m13
    pmulhrsw            m14, m7
    pmulhrsw            m15, m7
    mova        [tmpq+wq*0], m14
    mova        [tmpq+wq*1], m15
    lea                tmpq, [tmpq+wq*2]
    sub                  hd, 2
    jg .v_w16_loop
    add                  r5, 16
    add                  r7, 32
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r6d, 1<<8
    jg .v_w16_loop0
    RET
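; 2D prep: as in put_8tap's .hv path, but the vertical accumulator is
; rounded with pd_32 and shifted by 6 so the packed result stays a
; signed 16-bit intermediate.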
.hv:
    %assign stack_offset stack_offset - stack_size_padded
    %assign stack_size_padded 0
    WIN64_SPILL_XMM      16
    cmp                  wd, 4
    je .hv_w4
    shr                 mxd, 16
    sub                srcq, 3
    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
    lea            stride3q, [strideq*3]
    sub                srcq, stride3q
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    pshufd              m12, m0, q0000
    pshufd              m13, m0, q1111
    pshufd              m14, m0, q2222
    pshufd              m15, m0, q3333
    jmp .hv_w8
.hv_w4:
    movzx               mxd, mxb
    dec                srcq
    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
    lea            stride3q, [strideq*3]
    sub                srcq, stride3q
    mova                 m7, [subpel_h_shuf4]
    pmovzxbd             m9, [deint_shuf4]
    vpbroadcastd        m10, [pw_8192]
    punpcklbw            m0, m0
    psraw                m0, 8 ; sign-extend
    vpbroadcastd        m11, [pd_32]
    pshufd              m12, m0, q0000
    pshufd              m13, m0, q1111
    pshufd              m14, m0, q2222
    pshufd              m15, m0, q3333
    vpbroadcastq         m2, [srcq+strideq*0]
    vpbroadcastq         m4, [srcq+strideq*1]
    vpbroadcastq         m0, [srcq+strideq*2]
    vpbroadcastq         m5, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq         m3, [srcq+strideq*0]
    vpbroadcastq         m6, [srcq+strideq*1]
    vpbroadcastq         m1, [srcq+strideq*2]
    vpblendd             m2, m4, 0xcc ; 0 1
    vpblendd             m0, m5, 0xcc ; 2 3
    vpblendd             m3, m6, 0xcc ; 4 5
    pshufb               m2, m7 ; 00 01 10 11  02 03 12 13
    pshufb               m0, m7 ; 20 21 30 31  22 23 32 33
    pshufb               m3, m7 ; 40 41 50 51  42 43 52 53
    pshufb               m1, m7 ; 60 61 60 61  62 63 62 63
    pmaddubsw            m2, m8
    pmaddubsw            m0, m8
    pmaddubsw            m3, m8
    pmaddubsw            m1, m8
    phaddw               m2, m0 ; 0a 1a 2a 3a  0b 1b 2b 3b
    phaddw               m3, m1 ; 4a 5a 6a __  4b 5b 6b __
    pmulhrsw             m2, m10
    pmulhrsw             m3, m10
    palignr              m4, m3, m2, 4 ; 1a 2a 3a 4a  1b 2b 3b 4b
    punpcklwd            m1, m2, m4  ; 01 12
    punpckhwd            m2, m4      ; 23 34
    pshufd               m0, m3, q2121
    punpcklwd            m3, m0      ; 45 56
.hv_w4_loop:
    pmaddwd              m5, m1, m12 ; a0 b0
    pmaddwd              m6, m2, m12 ; c0 d0
    pmaddwd              m2, m13     ; a1 b1
    pmaddwd              m4, m3, m13 ; c1 d1
    mova                 m1, m3
    pmaddwd              m3, m14     ; a2 b2
    paddd                m5, m2
    vpbroadcastq         m2, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    paddd                m6, m4
    vpbroadcastq         m4, [srcq+strideq*0]
    paddd                m5, m3
    vpbroadcastq         m3, [srcq+strideq*1]
    vpblendd             m2, m4, 0xcc
    vpbroadcastq         m4, [srcq+strideq*2]
    vpblendd             m3, m4, 0xcc
    pshufb               m2, m7
    pshufb               m3, m7
    pmaddubsw            m2, m8
    pmaddubsw            m3, m8
    phaddw               m2, m3
    pmulhrsw             m2, m10
    palignr              m3, m2, m0, 12
    mova                 m0, m2
    punpcklwd            m2, m3, m0  ; 67 78
    punpckhwd            m3, m0      ; 89 9a
    pmaddwd              m4, m2, m14 ; c2 d2
    paddd                m6, m11
    paddd                m5, m11
    paddd                m6, m4
    pmaddwd              m4, m2, m15 ; a3 b3
    paddd                m5, m4
    pmaddwd              m4, m3, m15 ; c3 d3
    paddd                m6, m4
    psrad                m5, 6
    psrad                m6, 6
    packssdw             m5, m6
    vpermd               m5, m9, m5
    mova             [tmpq], m5
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    lea                 r6d, [wq*8-64]
    mov                  r5, srcq
    mov                  r7, tmpq
    lea                 r6d, [hq+r6*4]
.hv_w8_loop0:
    vbroadcasti128       m7, [subpel_h_shufA]
    movu                xm4, [srcq+strideq*0]
    vbroadcasti128       m8, [subpel_h_shufB]
    movu                xm5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vbroadcasti128       m9, [subpel_h_shufC]
    movu                xm6, [srcq+strideq*0]
    vbroadcasti128       m0, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vpblendd             m4, m0, 0xf0            ; 0 3
    vinserti128          m5, [srcq+strideq*0], 1 ; 1 4
    vinserti128          m6, [srcq+strideq*1], 1 ; 2 5
    lea                srcq, [srcq+strideq*2]
    vinserti128          m0, [srcq+strideq*0], 1 ; 3 6
    HV_H_W8              m4, m1, m2, m3, m7, m8, m9
    HV_H_W8              m5, m1, m2, m3, m7, m8, m9
    HV_H_W8              m6, m1, m2, m3, m7, m8, m9
    HV_H_W8              m0, m1, m2, m3, m7, m8, m9
    vpbroadcastd         m7, [pw_8192]
    vpermq               m4, m4, q3120
    vpermq               m5, m5, q3120
    vpermq               m6, m6, q3120
    pmulhrsw             m0, m7
    pmulhrsw             m4, m7
    pmulhrsw             m5, m7
    pmulhrsw             m6, m7
    vpermq               m7, m0, q3120
    punpcklwd            m1, m4, m5  ; 01
    punpckhwd            m4, m5      ; 34
    punpcklwd            m2, m5, m6  ; 12
    punpckhwd            m5, m6      ; 45
    punpcklwd            m3, m6, m7  ; 23
    punpckhwd            m6, m7      ; 56
.hv_w8_loop:
    vextracti128     [tmpq], m0, 1 ; not enough registers
    movu                xm0, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vinserti128          m0, [srcq+strideq*0], 1 ; 7 8
    pmaddwd              m8, m1, m12 ; a0
    pmaddwd              m9, m2, m12 ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddwd              m3, m13     ; a1
    pmaddwd              m4, m13     ; b1
    paddd                m8, m3
    paddd                m9, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m14     ; a2
    pmaddwd              m6, m14     ; b2
    paddd                m8, m5
    paddd                m9, m6
    vbroadcasti128       m6, [subpel_h_shufB]
    vbroadcasti128       m7, [subpel_h_shufC]
    vbroadcasti128       m5, [subpel_h_shufA]
    HV_H_W8              m0, m5, m6, m7, m5, m6, m7
    vpbroadcastd         m5, [pw_8192]
    vpbroadcastd         m7, [pd_32]
    vbroadcasti128       m6, [tmpq]
    pmulhrsw             m0, m5
    paddd                m8, m7
    paddd                m9, m7
    vpermq               m7, m0, q3120    ; 7 8
    shufpd               m6, m6, m7, 0x04 ; 6 7
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, m15 ; a3
    paddd                m8, m7
    pmaddwd              m7, m6, m15 ; b3
    paddd                m7, m9
    psrad                m8, 6
    psrad                m7, 6
    packssdw             m8, m7
    vpermq               m7, m8, q3120
    mova         [tmpq+wq*0], xm7
    vextracti128 [tmpq+wq*2], m7, 1
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .hv_w8_loop
    add                  r5, 8
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r6d, 1<<8
    jg .hv_w8_loop0
    RET

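; Helpers for the scaled (dx/dy stepping) MC functions below. put and prep
; share one macro body; prep_8tap_scaled takes one pointer argument fewer,
; so REMAP_REG shifts all register names by one for the duration of the
; function and MC_8TAP_SCALED_RET restores the default mapping around RET.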
%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

%macro REMAP_REG 2
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %xdefine r14_save r14
  %assign %%i 14
  %rep 14
   %assign %%j %%i-1
   REMAP_REG %%i, %%j
   %assign %%i %%i-1
  %endrep
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %rep 13
   %assign %%j %%i+1
   REMAP_REG %%i, %%j
   %assign %%i %%i+1
  %endrep
  %xdefine r14 r14_save
  %undef r14_save
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro

%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
    movq               xm%1, [srcq+ r4]
    movq               xm%2, [srcq+ r6]
    movhps             xm%1, [srcq+ r7]
    movhps             xm%2, [srcq+ r9]
    vinserti128         m%1, [srcq+r10], 1
    vinserti128         m%2, [srcq+r11], 1
    vpbroadcastq        m%5, [srcq+r13]
    vpbroadcastq        m%6, [srcq+ rX]
    add                srcq, ssq
    movq               xm%3, [srcq+ r4]
    movq               xm%4, [srcq+ r6]
    movhps             xm%3, [srcq+ r7]
    movhps             xm%4, [srcq+ r9]
    vinserti128         m%3, [srcq+r10], 1
    vinserti128         m%4, [srcq+r11], 1
    vpbroadcastq        m%7, [srcq+r13]
    vpbroadcastq        m%8, [srcq+ rX]
    add                srcq, ssq
    vpblendd            m%1, m%5, 0xc0
    vpblendd            m%2, m%6, 0xc0
    vpblendd            m%3, m%7, 0xc0
    vpblendd            m%4, m%8, 0xc0
    pmaddubsw           m%1, m15
    pmaddubsw           m%2, m10
    pmaddubsw           m%3, m15
    pmaddubsw           m%4, m10
    phaddw              m%1, m%2
    phaddw              m%3, m%4
    phaddw              m%1, m%3
    pmulhrsw            m%1, m12
%endmacro
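; MC_8TAP_SCALED_H gathers eight source columns per row from the
; per-column byte offsets precomputed in r4/r6/r7/r9/r10/r11/r13/rX,
; filters them with the per-column coefficients in m15/m10 and rounds
; with pmulhrsw against m12.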

%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isprep 0
 %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
 %else
cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
 %endif
 %xdefine base_reg r12
 %define rndshift 10
%else
 %assign isprep 1
 %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
  %xdefine tmp_stridem r14q
 %else
cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
  %define tmp_stridem qword [rsp+120]
 %endif
 %xdefine base_reg r11
 %define rndshift 6
%endif
    lea            base_reg, [%1_8tap_scaled_avx2]
%define base base_reg-%1_8tap_scaled_avx2
    tzcnt                wd, wm
    vpbroadcastd         m8, dxm
%if isprep && UNIX64
    movd               xm14, mxd
    vpbroadcastd        m14, xm14
    mov                 r5d, t0d
 DECLARE_REG_TMP 5, 7
%else
    vpbroadcastd        m14, mxm
%endif
    mov                 dyd, dym
%ifidn %1, put
 %if WIN64
    mov                 r8d, hm
  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm r5m
  %define dxm r8m
 %else
  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm r6m
 %endif
 %if required_stack_alignment > STACK_ALIGNMENT
  %define dsm [rsp+112]
  %define rX r1
  %define rXd r1d
 %else
  %define dsm dsq
  %define rX r14
  %define rXd r14d
 %endif
%else ; prep
 %if WIN64
    mov                 r7d, hm
  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm r4m
  %define dxm r7m
 %else
  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %define hm [rsp+112]
 %endif
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define rX r14
 %define rXd r14d
%endif
    vpbroadcastd        m10, [base+pd_0x3ff]
    vpbroadcastd        m12, [base+pw_8192]
%ifidn %1, put
    vpbroadcastd        m13, [base+pd_512]
%else
    vpbroadcastd        m13, [base+pd_32]
%endif
    pxor                 m9, m9
    lea                ss3q, [ssq*3]
    movzx               r7d, t1b
    shr                 t1d, 16
    cmp                  hd, 6
    cmovs               t1d, r7d
    sub                srcq, ss3q
    cmp                 dyd, 1024
    je .dy1
    cmp                 dyd, 2048
    je .dy2
    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
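; dy appears to be a fixed-point vertical step (1024 == one source row per
; output row), so the .dy1/.dy2 paths above special-case the 1:1 and 2:1
; vertical scales; everything below is the generic stepping code.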
%ifidn %1, put
.w2:
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m14, m8 ; mx+dx*[0-1]
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    movq                xm0, [srcq+ssq*0]
    movq                xm1, [srcq+ssq*2]
    movhps              xm0, [srcq+ssq*1]
    movhps              xm1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb              m14, m5
    paddb               m14, m6
    vinserti128          m0, [srcq+ssq*0], 1
    vinserti128          m1, [srcq+ssq*2], 1
    vpbroadcastq         m2, [srcq+ssq*1]
    vpbroadcastq         m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    vpblendd            m15, m7, 0xaa
    vpblendd             m0, m2, 0xc0       ; 0 1  4 5
    vpblendd             m1, m3, 0xc0       ; 2 3  6 7
    pblendvb            m15, m11, m8
    pshufb               m0, m14
    pshufb               m1, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    phaddw               m0, m1
    pmulhrsw             m0, m12            ; 0 1 2 3  4 5 6 7
    vextracti128        xm1, m0, 1          ; 4 5 6 7
    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
    punpcklwd           xm3, xm0, xm2       ; 01 12
    punpckhwd           xm0, xm2            ; 23 34
    pshufd              xm4, xm1, q0321     ; 5 6 7 _
    punpcklwd           xm2, xm1, xm4       ; 45 56
    punpckhwd           xm4, xm1, xm4       ; 67 __
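; Each output row selects its own vertical filter: the fractional part of
; myd (bits 6-9 after masking) indexes subpel_filters, and when the phase
; is zero the identity filter (single center tap of 64, built from
; 64 << 24) is kept instead.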
2863.w2_loop:
2864    and                 myd, 0x3ff
2865    mov                 r6d, 64 << 24
2866    mov                 r4d, myd
2867    shr                 r4d, 6
2868    lea                 r4d, [t1+r4]
2869    cmovnz              r6q, [base+subpel_filters+r4*8]
2870    movq               xm11, r6q
2871    punpcklbw          xm11, xm11
2872    psraw              xm11, 8
2873    pshufd              xm8, xm11, q0000
2874    pshufd              xm9, xm11, q1111
2875    pshufd             xm10, xm11, q2222
2876    pshufd             xm11, xm11, q3333
2877    pmaddwd             xm5, xm3, xm8
2878    pmaddwd             xm6, xm0, xm9
2879    pmaddwd             xm7, xm2, xm10
2880    pmaddwd             xm8, xm4, xm11
2881    paddd               xm5, xm6
2882    paddd               xm7, xm8
2883    paddd               xm5, xm13
2884    paddd               xm5, xm7
2885    psrad               xm5, 10
2886    packssdw            xm5, xm5
2887    packuswb            xm5, xm5
2888    pextrw           [dstq], xm5, 0
2889    add                dstq, dsq
2890    dec                  hd
2891    jz .ret
2892    add                 myd, dyd
2893    test                myd, ~0x3ff
2894    jz .w2_loop
2895    movq                xm5, [srcq]
2896    test                myd, 0x400
2897    jz .w2_skip_line
2898    add                srcq, ssq
2899    shufps              xm3, xm0, q1032     ; 01 12
2900    shufps              xm0, xm2, q1032     ; 23 34
2901    shufps              xm2, xm4, q1032     ; 45 56
2902    pshufb              xm5, xm14
2903    pmaddubsw           xm5, xm15
2904    phaddw              xm5, xm5
2905    pmulhrsw            xm5, xm12
2906    palignr             xm1, xm5, xm1, 12
2907    punpcklqdq          xm1, xm1            ; 6 7 6 7
2908    punpcklwd           xm4, xm1, xm5       ; 67 __
2909    jmp .w2_loop
2910.w2_skip_line:
2911    movhps              xm5, [srcq+ssq*1]
2912    lea                srcq, [srcq+ssq*2]
2913    mova                xm3, xm0            ; 01 12
2914    mova                xm0, xm2            ; 23 34
2915    pshufb              xm5, xm14
2916    pmaddubsw           xm5, xm15
2917    phaddw              xm5, xm5
2918    pmulhrsw            xm5, xm12           ; 6 7 6 7
2919    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
2920    pshufd              xm5, xm1, q0321     ; 5 6 7 _
2921    punpcklwd           xm2, xm1, xm5       ; 45 56
2922    punpckhwd           xm4, xm1, xm5       ; 67 __
2923    jmp .w2_loop
2924%endif
.w4:
    mov                 myd, mym
    vbroadcasti128       m7, [base+rescale_mul]
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    pmaddwd              m8, m7
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m0, m14, m10
    psrld                m0, 6
    paddd              xm15, xm0
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    pextrd             r11d, xm15, 2
    pextrd             r13d, xm15, 3
    movd               xm15, [base+subpel_filters+r4*8+2]
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vpbroadcastq         m6, [base+subpel_s_shuf2]
    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
    pcmpeqd              m0, m9
    psrld               m14, 10
    movu                xm7, [srcq+ssq*0]
    movu                xm9, [srcq+ssq*1]
    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
    movu                xm8, [srcq+ssq*2]
    movu               xm10, [srcq+ss3q ]
    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
    lea                srcq, [srcq+ssq*4]
    pshufb              m14, m5
    paddb               m14, m6
    vinserti128          m7, [srcq+ssq*0], 1
    vinserti128          m9, [srcq+ssq*1], 1
    vinserti128         m15, xm15, 1
    vinserti128          m8, [srcq+ssq*2], 1
    vinserti128         m10, [srcq+ss3q ], 1
    lea                srcq, [srcq+ssq*4]
    pblendvb            m15, m11, m0
    pshufb               m7, m14
    pshufb               m9, m14
    pshufb               m8, m14
    pshufb              m10, m14
    pmaddubsw            m7, m15
    pmaddubsw            m9, m15
    pmaddubsw            m8, m15
    pmaddubsw           m10, m15
    phaddw               m7, m9
    phaddw               m8, m10
    pmulhrsw             m7, m12                ; 0 1  4 5
    pmulhrsw             m8, m12                ; 2 3  6 7
    vextracti128        xm9, m7, 1              ; 4 5
    vextracti128        xm3, m8, 1              ; 6 7
    shufps              xm4, xm7, xm8, q1032    ; 1 2
    shufps              xm5, xm8, xm9, q1032    ; 3 4
    shufps              xm6, xm9, xm3, q1032    ; 5 6
    psrldq             xm11, xm3, 8             ; 7 _
    punpcklwd           xm0, xm7, xm4   ; 01
    punpckhwd           xm7, xm4        ; 12
    punpcklwd           xm1, xm8, xm5   ; 23
    punpckhwd           xm8, xm5        ; 34
    punpcklwd           xm2, xm9, xm6   ; 45
    punpckhwd           xm9, xm6        ; 56
    punpcklwd           xm3, xm11       ; 67
    mova         [rsp+0x00], xm7
    mova         [rsp+0x10], xm8
    mova         [rsp+0x20], xm9
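; Vertical pass: the 8-tap filter is reselected every output row from the
; 4-bit subpel index of my. The cmovnz keeps 64 << 24 (a lone 64 at tap 3,
; i.e. an identity filter) when that index is zero. Row pairs 12/34/56 live
; on the stack and are rotated through as my advances.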
.w4_loop:
    and                 myd, 0x3ff
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq               xm10, r6q
    punpcklbw          xm10, xm10
    psraw              xm10, 8
    pshufd              xm7, xm10, q0000
    pshufd              xm8, xm10, q1111
    pshufd              xm9, xm10, q2222
    pshufd             xm10, xm10, q3333
    pmaddwd             xm4, xm0, xm7
    pmaddwd             xm5, xm1, xm8
    pmaddwd             xm6, xm2, xm9
    pmaddwd             xm7, xm3, xm10
    paddd               xm4, xm5
    paddd               xm6, xm7
    paddd               xm4, xm13
    paddd               xm4, xm6
    psrad               xm4, rndshift
    packssdw            xm4, xm4
%ifidn %1, put
    packuswb            xm4, xm4
    movd             [dstq], xm4
    add                dstq, dsq
%else
    movq             [tmpq], xm4
    add                tmpq, 8
%endif
    dec                  hd
    jz .ret
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .w4_loop
    movu                xm4, [srcq]
    test                myd, 0x400
    jz .w4_skip_line
    mova                xm0, [rsp+0x00]
    mova         [rsp+0x00], xm1
    mova                xm1, [rsp+0x10]
    mova         [rsp+0x10], xm2
    mova                xm2, [rsp+0x20]
    mova         [rsp+0x20], xm3
    pshufb              xm4, xm14
    pmaddubsw           xm4, xm15
    phaddw              xm4, xm4
    pmulhrsw            xm4, xm12
    punpcklwd           xm3, xm11, xm4
    mova               xm11, xm4
    add                srcq, ssq
    jmp .w4_loop
.w4_skip_line:
    movu                xm5, [srcq+ssq*1]
    movu                 m6, [rsp+0x10]
    pshufb              xm4, xm14
    pshufb              xm5, xm14
    pmaddubsw           xm4, xm15
    pmaddubsw           xm5, xm15
    movu         [rsp+0x00], m6
    phaddw              xm4, xm5
    pmulhrsw            xm4, xm12
    punpcklwd           xm9, xm11, xm4
    mova         [rsp+0x20], xm9
    psrldq             xm11, xm4, 8
    mova                xm0, xm1
    mova                xm1, xm2
    mova                xm2, xm3
    punpcklwd           xm3, xm4, xm11
    lea                srcq, [srcq+ssq*2]
    jmp .w4_loop
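; w >= 8 works on 8-pixel-wide column strips: dword [rsp+48] counts the
; strips and tmp_stridem is the output row stride (prep only; put uses dsm).
; Each strip runs the full height before .hloop_prep advances to the next.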
.w8:
    mov      dword [rsp+48], 1
    movifprep   tmp_stridem, 16
    jmp .w_start
.w16:
    mov      dword [rsp+48], 2
    movifprep   tmp_stridem, 32
    jmp .w_start
.w32:
    mov      dword [rsp+48], 4
    movifprep   tmp_stridem, 64
    jmp .w_start
.w64:
    mov      dword [rsp+48], 8
    movifprep   tmp_stridem, 128
    jmp .w_start
.w128:
    mov      dword [rsp+48], 16
    movifprep   tmp_stridem, 256
.w_start:
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
    shr                 t0d, 16
    sub                srcq, 3
    pmaddwd              m8, [base+rescale_mul]
    movd               xm15, t0d
    mov            [rsp+72], t0d
    mov            [rsp+56], srcq
    mov            [rsp+64], r0q ; dstq / tmpq
%if UNIX64
    mov                  hm, hd
%endif
    shl           dword dxm, 3 ; dx*8
    vpbroadcastd        m15, xm15
    paddd               m14, m8 ; mx+dx*[0-7]
    jmp .hloop
.hloop_prep:
    dec      dword [rsp+48]
    jz .ret
    add      qword [rsp+64], 8*(isprep+1)
    mov                  hd, hm
    vpbroadcastd         m8, dxm
    vpbroadcastd        m10, [base+pd_0x3ff]
    paddd               m14, m8, [rsp+16]
    vpbroadcastd        m15, [rsp+72]
    pxor                 m9, m9
    mov                srcq, [rsp+56]
    mov                 r0q, [rsp+64] ; dstq / tmpq
.hloop:
    vpbroadcastq        m11, [base+pq_0x40000000]
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd               m15, m6
    pcmpeqd              m6, m9
    vextracti128        xm7, m15, 1
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu           [rsp+16], m14
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    mova              [rsp], xm14
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    mov                 myd, mym
    mov                 dyd, dym
    pshufb               m0, m14    ; 01a 01b
    pshufb               m1, m14    ; 23a 23b
    pshufb               m2, m14    ; 45a 45b
    pshufb               m3, m14    ; 67a 67b
    vbroadcasti128      m14, [base+wswap]
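; Per-strip vertical loop: an 8-tap filter is picked per row from my, which
; then advances by dy. Bit 0x400 of the advanced my selects between shifting
; in one new source row (fall-through) or two (.skip_line); the 2:1 maximum
; downscale keeps the step at one or two rows.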
.vloop:
    and                 myd, 0x3ff
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq               xm11, r6q
    punpcklbw          xm11, xm11
    psraw              xm11, 8
    vinserti128         m11, xm11, 1
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pshufd               m8, m11, q2222
    pshufd              m11, m11, q3333
    pmaddwd              m6, m2, m8
    pmaddwd              m7, m3, m11
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .hloop_prep
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .vloop
    test                myd, 0x400
    mov            [rsp+52], myd
    mov                 r4d, [rsp+ 0]
    mov                 r6d, [rsp+ 8]
    mov                 r7d, [rsp+ 4]
    mov                 r9d, [rsp+12]
    jz .skip_line
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    add                srcq, ssq
    mov                 myd, [rsp+52]
    mov                 dyd, dym
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, m10
    phaddw               m4, m5
    pslld                m5, m4, 16
    paddw                m4, m5
    pmulhrsw             m4, m12
    pblendw              m0, m1, 0xaa
    pblendw              m1, m2, 0xaa
    pblendw              m2, m3, 0xaa
    pblendw              m3, m4, 0xaa
    jmp .vloop
.skip_line:
    mova                 m0, m1
    mova                 m1, m2
    mova                 m2, m3
    vpbroadcastq         m7, [srcq+r13]
    vpbroadcastq         m8, [srcq+ rX]
    movq                xm3, [srcq+ r4]
    movq                xm4, [srcq+ r6]
    movhps              xm3, [srcq+ r7]
    movhps              xm4, [srcq+ r9]
    vinserti128          m3, [srcq+r10], 1
    vinserti128          m4, [srcq+r11], 1
    add                srcq, ssq
    movq                xm5, [srcq+ r4]
    movq                xm6, [srcq+ r6]
    movhps              xm5, [srcq+ r7]
    movhps              xm6, [srcq+ r9]
    vinserti128          m5, [srcq+r10], 1
    vinserti128          m6, [srcq+r11], 1
    vpbroadcastq         m9, [srcq+r13]
    vpbroadcastq        m11, [srcq+ rX]
    add                srcq, ssq
    mov                 myd, [rsp+52]
    mov                 dyd, dym
    vpblendd             m3, m7, 0xc0
    vpblendd             m4, m8, 0xc0
    vpblendd             m5, m9, 0xc0
    vpblendd             m6, m11, 0xc0
    pmaddubsw            m3, m15
    pmaddubsw            m4, m10
    pmaddubsw            m5, m15
    pmaddubsw            m6, m10
    phaddw               m3, m4
    phaddw               m5, m6
    psrld                m4, m3, 16
    pslld                m6, m5, 16
    paddw                m3, m4
    paddw                m5, m6
    pblendw              m3, m5, 0xaa
    pmulhrsw             m3, m12
    jmp .vloop
.dy1:
    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
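; dy1: vertical step is exactly 1024 (one source row per output row), so the
; vertical filter is computed once up front and stays constant; only the
; horizontal positions vary per column.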
%ifidn %1, put
.dy1_w2:
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m14, m8 ; mx+dx*[0-1]
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    movq                xm0, [srcq+ssq*0]
    movq                xm1, [srcq+ssq*2]
    movhps              xm0, [srcq+ssq*1]
    movhps              xm1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    vinserti128          m0, [srcq+ssq*0], 1
    vinserti128          m1, [srcq+ssq*2], 1
    vpbroadcastq         m2, [srcq+ssq*1]
    add                srcq, ss3q
    movq               xm10, r4q
    punpcklbw          xm10, xm10
    psraw              xm10, 8
    vpblendd            m15, m7, 0xaa
    pblendvb            m15, m11, m8
    pshufd              xm8, xm10, q0000
    pshufd              xm9, xm10, q1111
    pshufd             xm11, xm10, q3333
    pshufd             xm10, xm10, q2222
    vpblendd             m0, m2, 0xc0
    pshufb               m1, m14
    pshufb               m0, m14
    pmaddubsw            m1, m15
    pmaddubsw            m0, m15
    phaddw               m0, m1
    pmulhrsw             m0, m12
    vextracti128        xm1, m0, 1
    palignr             xm2, xm1, xm0, 4
    pshufd              xm4, xm1, q2121
    punpcklwd           xm3, xm0, xm2       ; 01 12
    punpckhwd           xm0, xm2            ; 23 34
    punpcklwd           xm2, xm1, xm4       ; 45 56
.dy1_w2_loop:
    movq                xm1, [srcq+ssq*0]
    movhps              xm1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd             xm5, xm3, xm8
    pmaddwd             xm6, xm0, xm9
    pmaddwd             xm7, xm2, xm10
    mova                xm3, xm0
    mova                xm0, xm2
    paddd               xm5, xm13
    paddd               xm6, xm7
    pshufb              xm1, xm14
    pmaddubsw           xm1, xm15
    phaddw              xm1, xm1
    pmulhrsw            xm1, xm12
    palignr             xm7, xm1, xm4, 12
    punpcklwd           xm2, xm7, xm1     ; 67 78
    pmaddwd             xm7, xm2, xm11
    mova                xm4, xm1
    paddd               xm5, xm6
    paddd               xm5, xm7
    psrad               xm5, rndshift
    packssdw            xm5, xm5
    packuswb            xm5, xm5
    pextrw     [dstq+dsq*0], xm5, 0
    pextrw     [dstq+dsq*1], xm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy1_w2_loop
    RET
%endif
.dy1_w4:
    mov                 myd, mym
    vbroadcasti128       m7, [base+rescale_mul]
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    pmaddwd              m8, m7
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    vpermq               m8, m8, q3120
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd             r11d, xm15, 1
    pextrd             r13d, xm15, 3
    movd               xm15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    movu                xm2, [srcq+ssq*0]
    movu                xm3, [srcq+ssq*2]
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vpbroadcastq         m6, [base+subpel_s_shuf2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    pinsrd             xm15, [base+subpel_filters+r11*8+2], 1
    vpblendd             m7, [base+subpel_filters+r13*8+2-20], 0x20
    vinserti128          m2, [srcq+ssq*1], 1
    vinserti128          m3, [srcq+ss3q ], 1
    lea                srcq, [srcq+ssq*4]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pshufb              m14, m5
    paddb               m14, m6
    movu                xm4, [srcq+ssq*0]
    movu                xm5, [srcq+ssq*2]
    vinserti128          m4, [srcq+ssq*1], 1
    add                srcq, ss3q
    vpblendd            m15, m7, 0x30
    punpcklqdq          m15, m15
    pblendvb            m15, m11, m8
    movq               xm10, r4q
    punpcklbw          xm10, xm10
    psraw              xm10, 8
    vinserti128         m10, xm10, 1
    pshufb               m2, m14
    pshufb               m3, m14
    pshufb               m4, m14
    pshufb              xm5, xm14
    vpermq               m2, m2, q3120
    vpermq               m3, m3, q3120
    vpermq               m4, m4, q3120
    vpermq               m5, m5, q3120
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
    pmaddubsw            m2, m15
    pmaddubsw            m3, m15
    pmaddubsw            m4, m15
    pmaddubsw            m5, m15
    phaddw               m2, m3
    phaddw               m4, m5
    pmulhrsw             m2, m12
    pmulhrsw             m4, m12
    palignr              m5, m4, m2, 4
    pshufd               m3, m4, q2121
    punpcklwd            m0, m2, m5     ; 01 12
    punpckhwd            m1, m2, m5     ; 23 34
    punpcklwd            m2, m4, m3     ; 45 56
.dy1_w4_loop:
    movu               xm11, [srcq+ssq*0]
    vinserti128         m11, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m4, m0, m7
    pmaddwd              m5, m1, m8
    pmaddwd              m6, m2, m9
    mova                 m0, m1
    mova                 m1, m2
    paddd                m4, m13
    paddd                m5, m6
    pshufb              m11, m14
    vpermq              m11, m11, q3120
    pmaddubsw           m11, m15
    phaddw              m11, m11
    pmulhrsw            m11, m12
    palignr              m6, m11, m3, 12
    punpcklwd            m2, m6, m11    ; 67 78
    mova                 m3, m11
    pmaddwd              m6, m2, m10
    paddd                m4, m5
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    pshuflw             xm4, xm4, q3120
    movd       [dstq+dsq*0], xm4
    pextrd     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
%else
    pshufd              xm4, xm4, q3120
    mova             [tmpq], xm4
    add                tmpq, 16
%endif
    sub                  hd, 2
    jg .dy1_w4_loop
    MC_8TAP_SCALED_RET
.dy1_w8:
    mov      dword [rsp+72], 1
    movifprep   tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov      dword [rsp+72], 2
    movifprep   tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov      dword [rsp+72], 4
    movifprep   tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov      dword [rsp+72], 8
    movifprep   tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov      dword [rsp+72], 16
    movifprep   tmp_stridem, 256
.dy1_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pmaddwd              m8, [base+rescale_mul]
    movd               xm15, t0d
    mov            [rsp+76], t0d
    mov            [rsp+80], srcq
    mov            [rsp+88], r0q ; dstq / tmpq
%if UNIX64
    mov                  hm, hd
%endif
    shl           dword dxm, 3 ; dx*8
    vpbroadcastd        m15, xm15
    paddd               m14, m8 ; mx+dx*[0-7]
    movq                xm0, r4q
    punpcklbw           xm0, xm0
    psraw               xm0, 8
    mova           [rsp+96], xm0
    jmp .dy1_hloop
.dy1_hloop_prep:
    dec      dword [rsp+72]
    jz .ret
    add      qword [rsp+88], 8*(isprep+1)
    mov                  hd, hm
    vpbroadcastd         m8, dxm
    vpbroadcastd        m10, [base+pd_0x3ff]
    paddd               m14, m8, [rsp+32]
    vpbroadcastd        m15, [rsp+76]
    pxor                 m9, m9
    mov                srcq, [rsp+80]
    mov                 r0q, [rsp+88] ; dstq / tmpq
.dy1_hloop:
    vpbroadcastq        m11, [base+pq_0x40000000]
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd               m15, m6
    pcmpeqd              m6, m9
    vextracti128        xm7, m15, 1
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu           [rsp+32], m14
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    movq           [rsp+64], xm14
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    movu              [rsp], m10
    vpbroadcastd         m8, [rsp+0x60]
    vpbroadcastd         m9, [rsp+0x64]
    vpbroadcastd        m10, [rsp+0x68]
    vpbroadcastd        m11, [rsp+0x6c]
    pshufb               m0, m14    ; 01a 01b
    pshufb               m1, m14    ; 23a 23b
    pshufb               m2, m14    ; 45a 45b
    pshufb               m3, m14    ; 67a 67b
    vbroadcasti128      m14, [base+wswap]
.dy1_vloop:
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pmaddwd              m6, m2, m10
    pmaddwd              m7, m3, m11
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy1_hloop_prep
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    add                srcq, ssq
    pshufb               m0, m14
    pshufb               m1, m14
    pshufb               m2, m14
    pshufb               m3, m14
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, [rsp]
    phaddw               m4, m5
    pslld                m5, m4, 16
    paddw                m4, m5
    pmulhrsw             m4, m12
    pblendw              m0, m1, 0xaa
    pblendw              m1, m2, 0xaa
    pblendw              m2, m3, 0xaa
    pblendw              m3, m4, 0xaa
    jmp .dy1_vloop
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
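; dy2: vertical step is exactly 2048, i.e. each output row consumes two
; source rows, again with a constant vertical filter.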
%ifidn %1, put
.dy2_w2:
    mov                 myd, mym
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    punpckldq            m8, m9, m8
    paddd               m14, m8 ; mx+dx*[0-1]
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vbroadcasti128       m6, [base+subpel_s_shuf2]
    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
    pcmpeqd              m8, m9
    psrld               m14, 10
    movq                xm0, [srcq+ssq*0]
    vpbroadcastq         m2, [srcq+ssq*1]
    movhps              xm0, [srcq+ssq*2]
    vpbroadcastq         m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pshufb              m14, m5
    paddb               m14, m6
    vpblendd            m15, m7, 0xaa
    pblendvb            m15, m11, m8
    movhps              xm1, [srcq+ssq*0]
    vpbroadcastq         m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    vpblendd             m0, m2, 0x30
    vpblendd             m1, m4, 0xc0
    vpblendd             m0, m3, 0xc0
    pshufb               m0, m14
    pshufb               m1, m14
    pmaddubsw            m0, m15
    pmaddubsw            m1, m15
    movq               xm11, r4q
    punpcklbw          xm11, xm11
    psraw              xm11, 8
    phaddw               m0, m1
    pmulhrsw             m0, m12            ; 0 2 _ 4  1 3 _ 5
    pshufd              xm8, xm11, q0000
    pshufd              xm9, xm11, q1111
    pshufd             xm10, xm11, q2222
    pshufd             xm11, xm11, q3333
    pshufd               m2, m0, q3110      ; 0 2 2 4  1 3 3 5
    vextracti128        xm1, m2, 1
    punpcklwd           xm3, xm2, xm1       ; 01 23
    punpckhwd           xm2, xm1            ; 23 45
.dy2_w2_loop:
    movq                xm6, [srcq+ssq*0]
    vpbroadcastq         m7, [srcq+ssq*1]
    movhps              xm6, [srcq+ssq*2]
    vpbroadcastq         m1, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd             xm4, xm3, xm8
    pmaddwd             xm5, xm2, xm9
    vpblendd             m6, m7, 0x30
    vpblendd             m6, m1, 0xc0
    pshufb               m6, m14
    pmaddubsw            m6, m15
    phaddw               m6, m6
    pmulhrsw             m6, m12
    palignr              m0, m6, m0, 8
    pshufd               m2, m0, q3221
    vextracti128        xm1, m2, 1
    punpcklwd           xm3, xm2, xm1       ; 45 67
    punpckhwd           xm2, xm1            ; 67 89
    pmaddwd             xm6, xm3, xm10
    pmaddwd             xm7, xm2, xm11
    paddd               xm4, xm5
    paddd               xm4, xm13
    paddd               xm6, xm7
    paddd               xm4, xm6
    psrad               xm4, rndshift
    packssdw            xm4, xm4
    packuswb            xm4, xm4
    pextrw     [dstq+dsq*0], xm4, 0
    pextrw     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
.dy2_w4:
    mov                 myd, mym
    vbroadcasti128       m7, [base+rescale_mul]
    movzx               t0d, t0b
    dec                srcq
    movd               xm15, t0d
    pmaddwd              m8, m7
    vpbroadcastd        m11, [base+pd_0x4000]
    vpbroadcastd       xm15, xm15
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd              xm15, xm8
    movd                r4d, xm15
    pextrd              r6d, xm15, 1
    pextrd             r11d, xm15, 2
    pextrd             r13d, xm15, 3
    movd               xm15, [base+subpel_filters+r4*8+2]
    vbroadcasti128       m5, [base+bdct_lb_dw]
    vpbroadcastq         m6, [base+subpel_s_shuf2]
    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
    pcmpeqd              m8, m9
    psrld               m14, 10
    movu                xm0, [srcq+ssq*0]
    movu                xm2, [srcq+ssq*2]
    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
    movu                xm1, [srcq+ssq*1]
    movu                xm3, [srcq+ss3q ]
    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
    lea                srcq, [srcq+ssq*4]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    vinserti128         m15, xm15, 1
    pshufb              m14, m5
    paddb               m14, m6
    vinserti128          m2, [srcq+ssq*0], 1
    vinserti128          m3, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    pblendvb            m15, m11, m8
    pshufb              xm0, xm14
    pshufb               m2, m14
    pshufb              xm1, xm14
    pshufb               m3, m14
    pmaddubsw           xm0, xm15
    pmaddubsw            m2, m15
    pmaddubsw           xm1, xm15
    pmaddubsw            m3, m15
    movq               xm11, r4q
    punpcklbw          xm11, xm11
    psraw              xm11, 8
    vinserti128         m11, xm11, 1
    phaddw               m0, m2
    phaddw               m1, m3
    pmulhrsw             m0, m12    ; 0 2  _ 4
    pmulhrsw             m1, m12    ; 1 3  _ 5
    pshufd               m8, m11, q0000
    pshufd               m9, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
    punpcklwd           xm2, xm0, xm1
    punpckhwd            m1, m0, m1     ; 23 45
    vinserti128          m0, m2, xm1, 1 ; 01 23
.dy2_w4_loop:
    movu                xm6, [srcq+ssq*0]
    movu                xm7, [srcq+ssq*1]
    vinserti128          m6, [srcq+ssq*2], 1
    vinserti128          m7, [srcq+ss3q ], 1
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pshufb               m6, m14
    pshufb               m7, m14
    pmaddubsw            m6, m15
    pmaddubsw            m7, m15
    psrld                m2, m6, 16
    pslld                m3, m7, 16
    paddw                m6, m2
    paddw                m7, m3
    pblendw              m6, m7, 0xaa   ; 67 89
    pmulhrsw             m6, m12
    paddd                m4, m5
    vpblendd             m0, m1, m6, 0x0f
    mova                 m1, m6
    vpermq               m0, m0, q1032  ; 45 67
    pmaddwd              m6, m0, m10
    pmaddwd              m7, m1, m11
    paddd                m4, m13
    paddd                m6, m7
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movd       [dstq+dsq*0], xm4
    pextrd     [dstq+dsq*1], xm4, 1
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], xm4
    add                tmpq, 16
%endif
    sub                  hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET
.dy2_w8:
    mov      dword [rsp+40], 1
    movifprep   tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov      dword [rsp+40], 2
    movifprep   tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov      dword [rsp+40], 4
    movifprep   tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov      dword [rsp+40], 8
    movifprep   tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov      dword [rsp+40], 16
    movifprep   tmp_stridem, 256
.dy2_w_start:
    mov                 myd, mym
%ifidn %1, put
    movifnidn           dsm, dsq
%endif
    shr                 t0d, 16
    sub                srcq, 3
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    pmaddwd              m8, [base+rescale_mul]
    movd               xm15, t0d
    mov            [rsp+64], t0d
    mov            [rsp+48], srcq
    mov            [rsp+56], r0q ; dstq / tmpq
%if UNIX64
    mov                  hm, hd
%endif
    shl           dword dxm, 3 ; dx*8
    vpbroadcastd        m15, xm15
    paddd               m14, m8 ; mx+dx*[0-7]
    movq                xm0, r4q
    punpcklbw           xm0, xm0
    psraw               xm0, 8
    mova         [rsp+0x50], xm0
    jmp .dy2_hloop
.dy2_hloop_prep:
    dec      dword [rsp+40]
    jz .ret
    add      qword [rsp+56], 8*(isprep+1)
    mov                  hd, hm
    vpbroadcastd         m8, dxm
    vpbroadcastd        m10, [base+pd_0x3ff]
    paddd               m14, m8, [rsp]
    vpbroadcastd        m15, [rsp+64]
    pxor                 m9, m9
    mov                srcq, [rsp+48]
    mov                 r0q, [rsp+56] ; dstq / tmpq
.dy2_hloop:
    vpbroadcastq        m11, [base+pq_0x40000000]
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd               m15, m6
    pcmpeqd              m6, m9
    vextracti128        xm7, m15, 1
    movd                r4d, xm15
    pextrd              r6d, xm15, 2
    pextrd              r7d, xm15, 1
    pextrd              r9d, xm15, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    movu              [rsp], m14
    movq               xm15, [base+subpel_filters+ r4*8]
    movq               xm10, [base+subpel_filters+ r6*8]
    movhps             xm15, [base+subpel_filters+ r7*8]
    movhps             xm10, [base+subpel_filters+ r9*8]
    vinserti128         m15, [base+subpel_filters+r10*8], 1
    vinserti128         m10, [base+subpel_filters+r11*8], 1
    vpbroadcastq         m9, [base+subpel_filters+r13*8]
    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    vextracti128        xm7, m14, 1
    movd                r4d, xm14
    pextrd              r6d, xm14, 2
    pextrd              r7d, xm14, 1
    pextrd              r9d, xm14, 3
    movd               r10d, xm7
    pextrd             r11d, xm7, 2
    pextrd             r13d, xm7, 1
    pextrd              rXd, xm7, 3
    pshufd               m5, m6, q1100
    pshufd               m6, m6, q3322
    vpblendd            m15, m9, 0xc0
    vpblendd            m10, m8, 0xc0
    pblendvb            m15, m11, m5
    pblendvb            m10, m11, m6
    vbroadcasti128      m14, [base+subpel_s_shuf8]
    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
    vpbroadcastd         m8, [rsp+0x50]
    vpbroadcastd         m9, [rsp+0x54]
    vpbroadcastd        m11, [rsp+0x58]
    vpbroadcastd         m4, [rsp+0x5c]
    pshufb               m0, m14    ; 01a 01b
    pshufb               m1, m14    ; 23a 23b
    pshufb               m2, m14    ; 45a 45b
    pshufb               m3, m14    ; 67a 67b
    SWAP                m14, m4
.dy2_vloop:
    pmaddwd              m4, m0, m8
    pmaddwd              m5, m1, m9
    pmaddwd              m6, m2, m11
    pmaddwd              m7, m3, m14
    paddd                m4, m5
    paddd                m6, m7
    paddd                m4, m13
    paddd                m4, m6
    psrad                m4, rndshift
    vextracti128        xm5, m4, 1
    packssdw            xm4, xm5
%ifidn %1, put
    packuswb            xm4, xm4
    movq             [dstq], xm4
    add                dstq, dsm
%else
    mova             [tmpq], xm4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy2_hloop_prep
    mova                 m0, m1
    mova                 m1, m2
    mova                 m2, m3
    movq                xm3, [srcq+ r4]
    movq                xm4, [srcq+ r6]
    movhps              xm3, [srcq+ r7]
    movhps              xm4, [srcq+ r9]
    vinserti128          m3, [srcq+r10], 1
    vinserti128          m4, [srcq+r11], 1
    vpbroadcastq         m5, [srcq+r13]
    vpbroadcastq         m6, [srcq+ rX]
    add                srcq, ssq
    vpblendd             m3, m5, 0xc0
    vpblendd             m4, m6, 0xc0
    pmaddubsw            m3, m15
    pmaddubsw            m4, m10
    phaddw               m3, m4
    movq                xm4, [srcq+ r4]
    movq                xm5, [srcq+ r6]
    movhps              xm4, [srcq+ r7]
    movhps              xm5, [srcq+ r9]
    vinserti128          m4, [srcq+r10], 1
    vinserti128          m5, [srcq+r11], 1
    vpbroadcastq         m6, [srcq+r13]
    vpbroadcastq         m7, [srcq+ rX]
    add                srcq, ssq
    vpblendd             m4, m6, 0xc0
    vpblendd             m5, m7, 0xc0
    pmaddubsw            m4, m15
    pmaddubsw            m5, m10
    phaddw               m4, m5
    psrld                m5, m3, 16
    pslld                m6, m4, 16
    paddw                m3, m5
    paddw                m4, m6
    pblendw              m3, m4, 0xaa
    pmulhrsw             m3, m12
    jmp .dy2_vloop
.ret:
    MC_8TAP_SCALED_RET 0
%undef isprep
%endmacro

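; Scaled bilinear rides on the scaled 8-tap code: t0d packs the
; subpel_filters row offset of the bilinear coefficients (5*15) into both
; its low byte (used by the w <= 4 paths) and high word (w >= 8 paths), and
; t1d carries the same offset for the vertical lookups.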
%macro BILIN_SCALED_FN 1
cglobal %1_bilin_scaled
    mov                 t0d, (5*15 << 16) | 5*15
    mov                 t1d, t0d
    jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
%endmacro

%if WIN64
DECLARE_REG_TMP 6, 5
%else
DECLARE_REG_TMP 6, 8
%endif

%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,

BILIN_SCALED_FN put
PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%else
DECLARE_REG_TMP 6, 7
%endif

BILIN_SCALED_FN prep
PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP
PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

%macro WARP_V 5 ; dst, 02, 46, 13, 57
    ; Can be done using gathers, but that's terribly slow on many CPUs
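    ; Instead the eight per-column filter indices are formed two at a time:
    ; my and a scratch register leapfrog (my+delta*4 addresses the column
    ; four positions ahead), each index is its accumulator >> 10, and
    ; vinserti128 pairs columns i and i+4 per ymm (a e / b f / c g / d h).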
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                xm8, [filterq+myq  *8]
    vinserti128          m8, [filterq+tmp1q*8], 1 ; a e
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+deltaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1 ; b f
    lea               tmp1d, [myq+deltaq*4]
    lea               tmp2d, [myq+deltaq*1]
    shr                 myd, 10
    shr               tmp1d, 10
    movq                xm9, [filterq+myq  *8]
    vinserti128          m9, [filterq+tmp1q*8], 1 ; c g
    lea               tmp1d, [tmp2q+deltaq*4]
    lea                 myd, [tmp2q+gammaq]       ; my += gamma
    shr               tmp2d, 10
    shr               tmp1d, 10
    punpcklwd            m8, m0
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1 ; d h
    punpcklwd            m0, m9, m0
    punpckldq            m9, m8, m0
    punpckhdq            m0, m8, m0
    punpcklbw            m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
    punpckhbw            m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
    pmaddwd             m%2, m8
    pmaddwd              m9, m%3
    punpcklbw            m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
    punpckhbw            m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
    pmaddwd              m8, m%4
    pmaddwd              m0, m%5
    paddd               m%2, m9
    paddd                m0, m8
    paddd               m%1, m0, m%2
%endmacro

cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
%if WIN64
    sub                 rsp, 0xa0
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
.loop:
    psrad                m7, 13
    psrad                m0, 13
    packssdw             m7, m0
    pmulhrsw             m7, m14 ; (x + (1 << 6)) >> 7
    vpermq               m7, m7, q3120
    mova         [tmpq+tsq*0], xm7
    vextracti128 [tmpq+tsq*2], m7, 1
    dec                 r4d
    jz   mangle(private_prefix %+ _warp_affine_8x8_avx2).end
    call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
    lea                tmpq, [tmpq+tsq*4]
    jmp .loop

cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
                                   beta, filter, tmp1, delta, my, gamma
%if WIN64
    sub                 rsp, 0xa0
    %assign xmm_regs_used 16
    %assign stack_size_padded 0xa0
    %assign stack_offset stack_offset+stack_size_padded
%endif
    call .main
    jmp .start
.loop:
    call .main2
    lea                dstq, [dstq+dsq*2]
.start:
    psrad                m7, 18
    psrad                m0, 18
    packusdw             m7, m0
    pavgw                m7, m11 ; (x + (1 << 10)) >> 11
    vextracti128        xm0, m7, 1
    packuswb            xm7, xm0
    pshufd              xm7, xm7, q3120
    movq       [dstq+dsq*0], xm7
    movhps     [dstq+dsq*1], xm7
    dec                 r4d
    jg .loop
.end:
    RET
ALIGN function_align
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov               abcdq, r5m
    mov                 mxd, r6m
    movaps [rsp+stack_offset+0x10], xmm6
    movaps [rsp+stack_offset+0x20], xmm7
    movaps       [rsp+0x28], xmm8
    movaps       [rsp+0x38], xmm9
    movaps       [rsp+0x48], xmm10
    movaps       [rsp+0x58], xmm11
    movaps       [rsp+0x68], xmm12
    movaps       [rsp+0x78], xmm13
    movaps       [rsp+0x88], xmm14
    movaps       [rsp+0x98], xmm15
%endif
    movsx            alphad, word [abcdq+2*0]
    movsx             betad, word [abcdq+2*1]
    mova                m12, [warp_8x8_shufA]
    mova                m13, [warp_8x8_shufB]
    vpbroadcastd        m14, [pw_8192]
    vpbroadcastd        m15, [pd_32768]
    pxor                m11, m11
    lea             filterq, [mc_warp_filter]
    lea               tmp1q, [ssq*3+3]
    add                 mxd, 512+(64<<10)
    lea               tmp2d, [alphaq*3]
    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
    sub               betad, tmp2d    ; beta -= alpha*3
    mov                 myd, r7m
    call .h
    psrld                m1, m0, 16
    call .h
    psrld                m4, m0, 16
    call .h
    pblendw              m1, m0, 0xaa ; 02
    call .h
    pblendw              m4, m0, 0xaa ; 13
    call .h
    psrld                m2, m1, 16
    pblendw              m2, m0, 0xaa ; 24
    call .h
    psrld                m5, m4, 16
    pblendw              m5, m0, 0xaa ; 35
    call .h
    psrld                m3, m2, 16
    pblendw              m3, m0, 0xaa ; 46
    movsx            deltad, word [abcdq+2*2]
    movsx            gammad, word [abcdq+2*3]
    add                 myd, 512+(64<<10)
    mov                 r4d, 4
    lea               tmp1d, [deltaq*3]
    sub              gammad, tmp1d    ; gamma -= delta*3
.main2:
    call .h
    psrld                m6, m5, 16
    pblendw              m6, m0, 0xaa ; 57
    WARP_V                7, 1, 3, 4, 6
    call .h
    mova                 m1, m2
    mova                 m2, m3
    psrld                m3, 16
    pblendw              m3, m0, 0xaa ; 68
    WARP_V                0, 4, 6, 1, 3
    mova                 m4, m5
    mova                 m5, m6
    ret
ALIGN function_align
.h:
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    vbroadcasti128      m10, [srcq]
    shr                 mxd, 10
    shr               tmp1d, 10
    movq                xm8, [filterq+mxq  *8]
    vinserti128          m8, [filterq+tmp1q*8], 1
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+alphaq*1]
    shr               tmp2d, 10
    shr               tmp1d, 10
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1
    lea               tmp1d, [mxq+alphaq*4]
    lea               tmp2d, [mxq+alphaq*1]
    shr                 mxd, 10
    shr               tmp1d, 10
    movq                xm9, [filterq+mxq  *8]
    vinserti128          m9, [filterq+tmp1q*8], 1
    lea               tmp1d, [tmp2q+alphaq*4]
    lea                 mxd, [tmp2q+betaq] ; mx += beta
    shr               tmp2d, 10
    shr               tmp1d, 10
    punpcklqdq           m8, m0  ; 0 1   4 5
    movq                xm0, [filterq+tmp2q*8]
    vinserti128          m0, [filterq+tmp1q*8], 1
    punpcklqdq           m9, m0  ; 2 3   6 7
    pshufb               m0, m10, m12
    pmaddubsw            m0, m8
    pshufb              m10, m13
    pmaddubsw           m10, m9
    add                srcq, ssq
    phaddw               m0, m10
    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
    paddd                m0, m15 ; rounded 14-bit result in upper 16 bits of dword
    ret

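; Shared two-source store skeleton: each %1 (AVG/W_AVG/MASK) invocation
; consumes two 32-byte blocks of 16-bit intermediates per tmp pointer and
; leaves 32 packed pixels in m0; %1_INC_PTR advances the pointers and wq
; holds the per-width jump target. The q3120 vpermq undoes the in-lane
; ordering of packuswb on ymm registers.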
%macro BIDIR_FN 1 ; op
    %1                    0
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 4
    je .ret
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    cmp                  hd, 8
    je .ret
    %1                    2
    lea                dstq, [dstq+strideq*4]
    vextracti128        xm1, m0, 1
    movd   [dstq          ], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq          ], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
.ret:
    RET
.w8_loop:
    %1_INC_PTR            2
    %1                    0
    lea                dstq, [dstq+strideq*4]
.w8:
    vextracti128        xm1, m0, 1
    movq   [dstq          ], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    %1_INC_PTR            4
    %1                    0
    lea                dstq, [dstq+strideq*4]
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq          ], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    %1                    2
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    %1_INC_PTR            4
    %1                    0
    lea                dstq, [dstq+strideq*2]
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    %1                    2
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*1], m0
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    %1_INC_PTR            4
    %1                    0
    add                dstq, strideq
.w64:
    vpermq               m0, m0, q3120
    mova             [dstq], m0
    %1                    2
    vpermq               m0, m0, q3120
    mova          [dstq+32], m0
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    %1                    0
    add                dstq, strideq
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+0*32], m0
    %1                    2
    vpermq               m0, m0, q3120
    mova        [dstq+1*32], m0
    %1_INC_PTR            8
    %1                   -4
    vpermq               m0, m0, q3120
    mova        [dstq+2*32], m0
    %1                   -2
    vpermq               m0, m0, q3120
    mova        [dstq+3*32], m0
    dec                  hd
    jg .w128_loop
    RET
%endmacro

%macro AVG 1 ; src_offset
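    ; Sum of the two 16-bit intermediates; pmulhrsw by pw_1024 then computes
    ; (x + 16) >> 5, i.e. avg = (tmp1 + tmp2 + 16) >> 5 for 8 bpc.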
    mova                 m0, [tmp1q+(%1+0)*32]
    paddw                m0, [tmp2q+(%1+0)*32]
    mova                 m1, [tmp1q+(%1+1)*32]
    paddw                m1, [tmp2q+(%1+1)*32]
    pmulhrsw             m0, m2
    pmulhrsw             m1, m2
    packuswb             m0, m1
%endmacro

%macro AVG_INC_PTR 1
    add               tmp1q, %1*32
    add               tmp2q, %1*32
%endmacro

cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg %+ SUFFIX %+ _table
    lea                  r6, [avg %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m2, [base+pw_1024]
    add                  wq, r6
    BIDIR_FN            AVG

%macro W_AVG 1 ; src_offset
    ; (a * weight + b * (16 - weight) + 128) >> 8
    ; = ((a - b) * weight + (b << 4) + 128) >> 8
    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
    mova                 m0,     [tmp1q+(%1+0)*32]
    psubw                m2, m0, [tmp2q+(%1+0)*32]
    mova                 m1,     [tmp1q+(%1+1)*32]
    psubw                m3, m1, [tmp2q+(%1+1)*32]
    pmulhw               m2, m4
    pmulhw               m3, m4
    paddw                m0, m2
    paddw                m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%define W_AVG_INC_PTR AVG_INC_PTR

cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg %+ SUFFIX %+ _table
    lea                  r6, [w_avg %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m4, r6m ; weight
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
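    ; For weight >= 8, (weight-16) << 12 stays within int16; smaller weights
    ; would underflow, so the swap below exchanges the two sources and
    ; negates the weight ((-7) << 12 = -28672 still fits).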
    add                  wq, r6
    cmp           dword r6m, 7
    jg .weight_gt7
    mov                  r6, tmp1q
    pxor                 m0, m0
    mov               tmp1q, tmp2q
    psubw                m4, m0, m4 ; -weight
    mov               tmp2q, r6
.weight_gt7:
    BIDIR_FN          W_AVG

%macro MASK 1 ; src_offset
    ; (a * m + b * (64 - m) + 512) >> 10
    ; = ((a - b) * m + (b << 6) + 512) >> 10
    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
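    ; Both factors are doubled below ((b-a)*2 and -m << 9) so the pmulhw
    ; product equals ((b - a) * -m) >> 6 without needing -m << 10, which
    ; would not fit in int16 at m == 64.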
    vpermq               m3,     [maskq+%1*16], q3120
    mova                 m0,     [tmp2q+(%1+0)*32]
    psubw                m1, m0, [tmp1q+(%1+0)*32]
    psubb                m3, m4, m3
    paddw                m1, m1     ; (b - a) << 1
    paddb                m3, m3
    punpcklbw            m2, m4, m3 ; -m << 9
    pmulhw               m1, m2
    paddw                m0, m1
    mova                 m1,     [tmp2q+(%1+1)*32]
    psubw                m2, m1, [tmp1q+(%1+1)*32]
    paddw                m2, m2
    punpckhbw            m3, m4, m3
    pmulhw               m2, m3
    paddw                m1, m2
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
%endmacro

%macro MASK_INC_PTR 1
    add               maskq, %1*16
    add               tmp2q, %1*32
    add               tmp1q, %1*32
%endmacro

cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask %+ SUFFIX %+ _table
    lea                  r7, [mask %+ SUFFIX %+ _table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m5, [base+pw_2048]
    pxor                 m4, m4
    add                  wq, r7
    BIDIR_FN           MASK

%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
4486    mova                m%1, [tmp1q+32*%3]
4487    mova                 m1, [tmp2q+32*%3]
4488    psubw                m1, m%1
4489    pabsw               m%2, m1
4490    psubusw             m%2, m6, m%2
4491    psrlw               m%2, 8 ; 64 - m
4492    psllw                m2, m%2, 10
4493    pmulhw               m1, m2
4494    paddw               m%1, m1
4495    mova                 m1, [tmp1q+32*%4]
4496    mova                 m2, [tmp2q+32*%4]
4497    psubw                m2, m1
4498    pabsw                m3, m2
4499    psubusw              m3, m6, m3
4500    psrlw                m3, 8
4501%if %5
4502    packuswb            m%2, m3
4503    psubb               m%2, m5, m%2
4504    vpermq              m%2, m%2, q3120
4505%else
4506    phaddw              m%2, m3
4507%endif
4508    psllw                m3, 10
4509    pmulhw               m2, m3
4510    paddw                m1, m2
4511    pmulhrsw            m%1, m7
4512    pmulhrsw             m1, m7
4513    packuswb            m%1, m1
4514%endmacro
4515
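; W_MASK derives the blend weight from the difference of the two inputs.
; with m6 = pw_6903 = ((64-38)<<8)+255-8, the psubusw/psrlw pair yields
; 64-m = sat_sub(6903, |tmp1-tmp2|) >> 8, which (by our reading of the
; constant) is equivalent to m = min(38 + ((|tmp1-tmp2| + 8) >> 8), 64).
; the %5 (4:4:4) path packs 64-m to bytes and flips it back to m via a
; psubb from pb_64; otherwise phaddw pre-sums horizontal pairs of 64-m
; values for the 420/422 subsampling in w_mask_420/w_mask_422 further down
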
cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx2_table
    lea                  r6, [blend_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movifnidn         maskq, maskmp
    movsxd               wq, dword [r6+wq*4]
    vpbroadcastd         m4, [base+pb_64]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r6
    lea                  r6, [dsq*3]
    jmp                  wq
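    ; per pixel, each width path below computes (a sketch of our reading):
    ;   dst[x] = (dst[x]*(64-m) + tmp[x]*m + 32) >> 6
    ; punpcklbw interleaves dst/tmp bytes and (64-m)/m bytes so a single
    ; pmaddubsw forms the weighted sum; pmulhrsw with pw_512 is (x+32)>>6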
.w4:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    vpbroadcastd        xm1, [dstq+dsq*2]
    pinsrd              xm1, [dstq+r6   ], 3
    mova                xm6, [maskq]
    psubb               xm3, xm4, xm6
    punpcklbw           xm2, xm3, xm6
    punpckhbw           xm3, xm6
    mova                xm6, [tmpq]
    add               maskq, 4*4
    add                tmpq, 4*4
    punpcklbw           xm0, xm6
    punpckhbw           xm1, xm6
    pmaddubsw           xm0, xm2
    pmaddubsw           xm1, xm3
    pmulhrsw            xm0, xm5
    pmulhrsw            xm1, xm5
    packuswb            xm0, xm1
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    pextrd     [dstq+dsq*2], xm0, 2
    pextrd     [dstq+r6   ], xm0, 3
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    movq                xm1, [dstq+dsq*0]
    movhps              xm1, [dstq+dsq*1]
    vpbroadcastq         m2, [dstq+dsq*2]
    vpbroadcastq         m3, [dstq+r6   ]
    mova                 m0, [maskq]
    mova                 m6, [tmpq]
    add               maskq, 8*4
    add                tmpq, 8*4
    vpblendd             m1, m2, 0x30
    vpblendd             m1, m3, 0xc0
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    vextracti128        xm1, m0, 1
    movq       [dstq+dsq*0], xm0
    movhps     [dstq+dsq*1], xm0
    movq       [dstq+dsq*2], xm1
    movhps     [dstq+r6   ], xm1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    mova                 m0, [maskq]
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    mova                 m6, [tmpq]
    add               maskq, 16*2
    add                tmpq, 16*2
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16
    RET
ALIGN function_align
.w32:
    mova                 m0, [maskq]
    mova                 m1, [dstq]
    mova                 m6, [tmpq]
    add               maskq, 32
    add                tmpq, 32
    psubb                m3, m4, m0
    punpcklbw            m2, m3, m0
    punpckhbw            m3, m0
    punpcklbw            m0, m1, m6
    punpckhbw            m1, m6
    pmaddubsw            m0, m2
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .w32
    RET

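; blend_v: the mask depends only on the column. the interleaved obmc_masks
; table provides (64-m, m) byte pairs, so a single pmaddubsw computes
; dst*(64-m) + tmp*m directly (same rounding as in blend above)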
cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_v_avx2_table
    lea                  r5, [blend_v_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    add               maskq, obmc_masks-blend_v_avx2_table
    jmp                  wq
.w2:
    vpbroadcastd        xm2, [maskq+2*2]
.w2_s0_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrw              xm0, [dstq+dsq*1], 1
    movd                xm1, [tmpq]
    add                tmpq, 2*2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_s0_loop
    RET
ALIGN function_align
.w4:
    vpbroadcastq        xm2, [maskq+4*2]
.w4_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    movq                xm1, [tmpq]
    add                tmpq, 4*2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128       m4, [maskq+8*2]
.w8_loop:
    vpbroadcastq         m2, [dstq+dsq*0]
    movq                xm0, [dstq+dsq*1]
    vpblendd             m0, m2, 0x30
    movq                xm1, [tmpq+8*1]
    vinserti128          m1, [tmpq+8*0], 1
    add                tmpq, 8*2
    punpcklbw            m0, m1
    pmaddubsw            m0, m4
    pmulhrsw             m0, m5
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movhps     [dstq+dsq*0], xm0
    movq       [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128       m3, [maskq+16*2]
    vbroadcasti128       m4, [maskq+16*3]
.w16_loop:
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    mova                 m2, [tmpq]
    add                tmpq, 16*2
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    mova                xm3, [maskq+16*4]
    vinserti128          m3, [maskq+16*6], 1
    mova                xm4, [maskq+16*5]
    vinserti128          m4, [maskq+16*7], 1
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m4
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .w32_loop
    RET

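; blend_h: the mask depends only on the row, and only the top h*3/4 rows
; are blended. maskq is biased so that the negative row counter hq indexes
; the interleaved (64-m, m) pair for the current row directly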
cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
%define base r5-blend_h_avx2_table
    lea                  r5, [blend_h_avx2_table]
    mov                 r6d, wd
    tzcnt                wd, wd
    mov                  hd, hm
    movsxd               wq, dword [r5+wq*4]
    vpbroadcastd         m5, [base+pw_512]
    add                  wq, r5
    lea               maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]
    neg                  hq
    jmp                  wq
.w2:
    movd                xm0, [dstq+dsq*0]
    pinsrw              xm0, [dstq+dsq*1], 1
    movd                xm2, [maskq+hq*2]
    movd                xm1, [tmpq]
    add                tmpq, 2*2
    punpcklwd           xm2, xm2
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    pextrw     [dstq+dsq*0], xm0, 0
    pextrw     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
ALIGN function_align
.w4:
    mova                xm3, [blend_shuf]
.w4_loop:
    movd                xm0, [dstq+dsq*0]
    pinsrd              xm0, [dstq+dsq*1], 1
    movd                xm2, [maskq+hq*2]
    movq                xm1, [tmpq]
    add                tmpq, 4*2
    pshufb              xm2, xm3
    punpcklbw           xm0, xm1
    pmaddubsw           xm0, xm2
    pmulhrsw            xm0, xm5
    packuswb            xm0, xm0
    movd       [dstq+dsq*0], xm0
    pextrd     [dstq+dsq*1], xm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x03
.w8_loop:
    vpbroadcastq         m1, [dstq+dsq*0]
    movq                xm0, [dstq+dsq*1]
    vpblendd             m0, m1, 0x30
    vpbroadcastd         m3, [maskq+hq*2]
    movq                xm1, [tmpq+8*1]
    vinserti128          m1, [tmpq+8*0], 1
    add                tmpq, 8*2
    pshufb               m3, m4
    punpcklbw            m0, m1
    pmaddubsw            m0, m3
    pmulhrsw             m0, m5
    vextracti128        xm1, m0, 1
    packuswb            xm0, xm1
    movhps     [dstq+dsq*0], xm0
    movq       [dstq+dsq*1], xm0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
    RET
ALIGN function_align
.w16:
    vbroadcasti128       m4, [blend_shuf]
    shufpd               m4, m4, 0x0c
.w16_loop:
    mova                xm1, [dstq+dsq*0]
    vinserti128          m1, [dstq+dsq*1], 1
    vpbroadcastd         m3, [maskq+hq*2]
    mova                 m2, [tmpq]
    add                tmpq, 16*2
    pshufb               m3, m4
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova         [dstq+dsq*0], xm0
    vextracti128 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w16_loop
    RET
ALIGN function_align
.w32: ; w32/w64/w128
    sub                 dsq, r6
.w32_loop0:
    vpbroadcastw         m3, [maskq+hq*2]
    mov                  wd, r6d
.w32_loop:
    mova                 m1, [dstq]
    mova                 m2, [tmpq]
    add                tmpq, 32
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    pmaddubsw            m0, m3
    pmaddubsw            m1, m3
    pmulhrsw             m0, m5
    pmulhrsw             m1, m5
    packuswb             m0, m1
    mova             [dstq], m0
    add                dstq, 32
    sub                  wd, 32
    jg .w32_loop
    add                dstq, dsq
    inc                  hq
    jl .w32_loop0
    RET

cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
                             bottomext, rightext
    ; we assume that the buffer (stride) is larger than the width, so we can
    ; safely overwrite a few bytes past the end of each row
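    ; rough scalar equivalent of the whole function (a sketch, not taken
    ; from the C reference):
    ;   for (dy = 0; dy < bh; dy++)
    ;       for (dx = 0; dx < bw; dx++)
    ;           dst[dy*dstride+dx] = src[iclip(y+dy, 0, ih-1)*sstride +
    ;                                    iclip(x+dx, 0, iw-1)];
    ; done as a clamped copy of the valid center plus edge replication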

    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor                r12d, r12d
    lea                 r10, [ihq-1]
    cmp                  yq, ihq
    cmovs               r10, yq
    test                 yq, yq
    cmovs               r10, r12
    imul                r10, sstrideq
    add                srcq, r10

    ; ref += iclip(x, 0, iw - 1)
    lea                 r10, [iwq-1]
    cmp                  xq, iwq
    cmovs               r10, xq
    test                 xq, xq
    cmovs               r10, r12
    add                srcq, r10

    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
    lea          bottomextq, [yq+bhq]
    sub          bottomextq, ihq
    lea                  r3, [bhq-1]
    cmovs        bottomextq, r12

    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, r12
    cmp          bottomextq, bhq
    cmovns       bottomextq, r3
    cmp             topextq, bhq
    cmovg           topextq, r3

    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    lea           rightextq, [xq+bwq]
    sub           rightextq, iwq
    lea                  r2, [bwq-1]
    cmovs         rightextq, r12

    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
                bottomext, rightext

    ; left_ext = iclip(-x, 0, bw - 1)
    neg            leftextq
    cmovs          leftextq, r12
    cmp           rightextq, bwq
    cmovns        rightextq, r2
    cmp            leftextq, bwq
    cmovns         leftextq, r2

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
                dst, dstride, src, sstride, bottomext, rightext

    ; center_h = bh - top_ext - bottom_ext
    lea                  r3, [bottomextq+topextq]
    sub            centerhq, r3

    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov                  r2, topextq
    imul                 r2, dstrideq
    add                dstq, r2
    mov                 r9m, dstq

    ; center_w = bw - left_ext - right_ext
    mov            centerwq, bwq
    lea                  r3, [rightextq+leftextq]
    sub            centerwq, r3

%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
.v_loop_%3:
%if %1
    ; left extension
    xor                  r3, r3
    vpbroadcastb         m0, [srcq]
.left_loop_%3:
    mova          [dstq+r3], m0
    add                  r3, 32
    cmp                  r3, leftextq
    jl .left_loop_%3

    ; body
    lea                 r12, [dstq+leftextq]
%endif
    xor                  r3, r3
.body_loop_%3:
    movu                 m0, [srcq+r3]
%if %1
    movu           [r12+r3], m0
%else
    movu          [dstq+r3], m0
%endif
    add                  r3, 32
    cmp                  r3, centerwq
    jl .body_loop_%3

%if %2
    ; right extension
%if %1
    add                 r12, centerwq
%else
    lea                 r12, [dstq+centerwq]
%endif
    xor                  r3, r3
    vpbroadcastb         m0, [srcq+centerwq-1]
.right_loop_%3:
    movu           [r12+r3], m0
    add                  r3, 32
    cmp                  r3, rightextq
    jl .right_loop_%3

%endif
    add                dstq, dstrideq
    add                srcq, sstrideq
    dec            centerhq
    jg .v_loop_%3
%endmacro

    test           leftextq, leftextq
    jnz .need_left_ext
    test          rightextq, rightextq
    jnz .need_right_ext
    v_loop                0, 0, 0
    jmp .body_done

.need_left_ext:
    test          rightextq, rightextq
    jnz .need_left_right_ext
    v_loop                1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop                1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop                0, 1, 3

.body_done:
    ; bottom edge extension
    test         bottomextq, bottomextq
    jz .top
    mov                srcq, dstq
    sub                srcq, dstrideq
    xor                  r1, r1
.bottom_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, bottomextq
.bottom_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .bottom_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    test            topextq, topextq
    jz .end
    mov                srcq, r9m
    mov                dstq, dstm
    xor                  r1, r1
.top_x_loop:
    mova                 m0, [srcq+r1]
    lea                  r3, [dstq+r1]
    mov                  r4, topextq
.top_y_loop:
    mova               [r3], m0
    add                  r3, dstrideq
    dec                  r4
    jg .top_y_loop
    add                  r1, 32
    cmp                  r1, bwq
    jl .top_x_loop

.end:
    RET

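; resize: per output pixel (a scalar sketch under our reading of the setup
; below; mx advances by dx for each output pixel):
;   sx     = iclip(mx >> 14, 0, src_w - 8)
;   f      = resize_filter[(mx >> 8) & 63]    ; 8-tap filter
;   dst[x] = pixel_clip((sum(f[k]*src[sx+k], k=0..7) + 64) >> 7)
; out-of-range taps are remapped to edge pixels via resize_shuf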
cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
                           dst_w, h, src_w, dx, mx0
    sub          dword mx0m, 4<<14
    sub        dword src_wm, 8
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm

    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
    LEA                  r7, $$
%define base r7-$$

    vpbroadcastd         m3, [base+pw_m256]
    vpbroadcastd         m7, [base+pd_63]
    vbroadcasti128      m15, [base+pb_8x0_8x8]
    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
    pslld                m5, 3                      ; dx*8
    pslld                m6, 14
    paddd                m8, m2                     ; mx+[0..7]*dx
    pxor                 m2, m2

    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8

.loop_y:
    xor                  xd, xd
    mova                 m4, m8                     ; per-line working version of mx

.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8                  ; filter offset (unmasked)
    pminsd               m0, m6                     ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0                 ; pshufb offset
    psrad                m0, 14                     ; clipped src_x offset
    psrad                m1, 14                     ; pshufb edge_emu offset
    pand                 m9, m7                     ; filter offset (masked)

    ; load source pixels - this ugly code is vpgatherdq emulation since
    ; directly using vpgatherdq on Haswell is quite a bit slower :(
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vextracti128        xm0, m0, 1
    movq               xm12, [srcq+r8]
    movq               xm13, [srcq+r10]
    movhps             xm12, [srcq+r9]
    movhps             xm13, [srcq+r11]
    movd                r8d, xm0
    pextrd              r9d, xm0, 1
    pextrd             r10d, xm0, 2
    pextrd             r11d, xm0, 3
    vinserti128         m12, [srcq+r8], 1
    vinserti128         m13, [srcq+r10], 1
    vpbroadcastq        m10, [srcq+r9]
    vpbroadcastq        m11, [srcq+r11]
    vpblendd            m12, m10, 11000000b
    vpblendd            m13, m11, 11000000b

    ; if no emulation is required, we don't need to shuffle or emulate edges
    ; this also saves 2 quasi-vpgatherdqs
    vptest               m1, m1
    jz .filter

    movd                r8d, xm1
    pextrd              r9d, xm1, 1
    pextrd             r10d, xm1, 2
    pextrd             r11d, xm1, 3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd              r10, r10d
    movsxd              r11, r11d
    vextracti128        xm1, m1, 1
    movq               xm14, [base+resize_shuf+4+r8]
    movq                xm0, [base+resize_shuf+4+r10]
    movhps             xm14, [base+resize_shuf+4+r9]
    movhps              xm0, [base+resize_shuf+4+r11]
    movd                r8d, xm1
    pextrd              r9d, xm1, 1
    pextrd             r10d, xm1, 2
    pextrd             r11d, xm1, 3
    movsxd               r8, r8d
    movsxd               r9, r9d
    movsxd              r10, r10d
    movsxd              r11, r11d
    vinserti128         m14, [base+resize_shuf+4+r8], 1
    vinserti128          m0, [base+resize_shuf+4+r10], 1
    vpbroadcastq        m10, [base+resize_shuf+4+r9]
    vpbroadcastq        m11, [base+resize_shuf+4+r11]
    vpblendd            m14, m10, 11000000b
    vpblendd             m0, m11, 11000000b

    paddb               m14, m15
    paddb                m0, m15
    pshufb              m12, m14
    pshufb              m13, m0

.filter:
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vextracti128        xm9, m9, 1
    movq               xm10, [base+resize_filter+r8*8]
    movq               xm11, [base+resize_filter+r10*8]
    movhps             xm10, [base+resize_filter+r9*8]
    movhps             xm11, [base+resize_filter+r11*8]
    movd                r8d, xm9
    pextrd              r9d, xm9, 1
    pextrd             r10d, xm9, 2
    pextrd             r11d, xm9, 3
    vinserti128         m10, [base+resize_filter+r8*8], 1
    vinserti128         m11, [base+resize_filter+r10*8], 1
    vpbroadcastq        m14, [base+resize_filter+r9*8]
    vpbroadcastq         m1, [base+resize_filter+r11*8]
    vpblendd            m10, m14, 11000000b
    vpblendd            m11, m1, 11000000b

    pmaddubsw           m12, m10
    pmaddubsw           m13, m11
    phaddw              m12, m13
    vextracti128       xm13, m12, 1
    phaddsw            xm12, xm13
    pmulhrsw           xm12, xm3                    ; x=(x+64)>>7
    packuswb           xm12, xm12
    movq          [dstq+xq], xm12

    paddd                m4, m5
    add                  xd, 8
    cmp                  xd, dst_wd
    jl .loop_x

    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET

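; 420: the four per-pixel weights of each 2x2 block are summed and rounded:
;   mask[x] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2
; W_MASK leaves 64-m values, so wm_420_sign holds 258 - sign
; (= 4*64 + 2 - sign) and the psubw chains below recover the sum of m's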
cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx2_table
    lea                  r7, [w_mask_420_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd             m9, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_420_sign+r6*4] ; 258 - sign
    add                  wq, r7
    W_MASK                0, 4, 0, 1
    mov               maskq, maskmp
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_h16
.w4_end:
    vextracti128        xm0, m4, 1
    vpblendd            xm1, xm4, xm0, 0x05
    vpblendd            xm4, xm0, 0x0a
    pshufd              xm1, xm1, q2301
    psubw               xm4, xm8, xm4
    psubw               xm4, xm1
    psrlw               xm4, 2
    packuswb            xm4, xm4
    movq            [maskq], xm4
    RET
.w4_h16:
    W_MASK                0, 5, 2, 3
    lea                dstq, [dstq+strideq*4]
    phaddd               m4, m5
    vextracti128        xm1, m0, 1
    psubw                m4, m8, m4
    psrlw                m4, 2
    vpermd               m4, m9, m4
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova            [maskq], xm4
    RET
.w8_loop:
    add               tmp1q, 2*32
    add               tmp2q, 2*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 8
.w8:
    vextracti128        xm2, m4, 1
    vextracti128        xm1, m0, 1
    psubw               xm4, xm8, xm4
    psubw               xm4, xm2
    psrlw               xm4, 2
    packuswb            xm4, xm4
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    movq            [maskq], xm4
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    W_MASK                0, 5, 2, 3
    punpckhqdq           m1, m4, m5
    punpcklqdq           m4, m5
    psubw                m1, m8, m1
    psubw                m1, m4
    psrlw                m1, 2
    vpermq               m0, m0, q3120
    packuswb             m1, m1
    vpermd               m1, m9, m1
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    mova            [maskq], xm1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    W_MASK                0, 5, 2, 3
    psubw                m4, m8, m4
    psubw                m4, m5
    psrlw                m4, 2
    vpermq               m0, m0, q3120
    packuswb             m4, m4
    vpermd               m4, m9, m4
    mova   [dstq+strideq*1], m0
    mova            [maskq], xm4
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop_even:
    psubw               m10, m8, m4
    psubw               m11, m8, m5
    dec                  hd
.w64_loop:
    add               tmp1q, 4*32
    add               tmp2q, 4*32
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    test                 hd, 1
    jz .w64_loop_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova            [maskq], m4
    add               maskq, 32
    dec                  hd
    jg .w64_loop
    RET
.w128_loop_even:
    psubw               m12, m8, m4
    psubw               m13, m8, m5
    dec                  hd
.w128_loop:
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    add               tmp1q, 8*32
    add               tmp2q, 8*32
    test                 hd, 1
    jz .w128_even
    psubw                m4, m10, m4
    psubw                m5, m11, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova       [maskq+32*0], m4
    jmp .w128_odd
.w128_even:
    psubw               m10, m8, m4
    psubw               m11, m8, m5
.w128_odd:
    W_MASK                0, 4, -4, -3
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    W_MASK                0, 5, -2, -1
    vpermq               m0, m0, q3120
    mova        [dstq+32*3], m0
    test                 hd, 1
    jz .w128_loop_even
    psubw                m4, m12, m4
    psubw                m5, m13, m5
    psrlw                m4, 2
    psrlw                m5, 2
    packuswb             m4, m5
    vpermd               m4, m9, m4
    mova       [maskq+32*1], m4
    add               maskq, 64
    dec                  hd
    jg .w128_loop
    RET

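; 422: horizontal pairs of weights are combined:
;   mask[x] = (m0 + m1 + 1 - sign) >> 1
; built from the packed 64-m pair sums using wm_422_sign (128 - sign) and
; pavgb with zero for the (+1) >> 1 rounding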
cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
    lea                  r7, [w_mask_422_avx2_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; sign
    movifnidn            hd, hm
    pxor                 m9, m9
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m7, [base+pw_2048]
    pmovzxbd            m10, [base+deint_shuf4]
    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
    add                  wq, r7
    mov               maskq, maskmp
    W_MASK                0, 4, 0, 1
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    jg .w4_h16
.w4_end:
    vextracti128        xm5, m4, 1
    packuswb            xm4, xm5
    psubb               xm5, xm8, xm4
    pavgb               xm5, xm9
    pshufd              xm5, xm5, q3120
    mova            [maskq], xm5
    RET
.w4_h16:
    W_MASK                0, 5, 2, 3
    lea                dstq, [dstq+strideq*4]
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermd               m5, m10, m5
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova            [maskq], m5
    RET
.w8_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w8:
    vextracti128        xm5, m4, 1
    vextracti128        xm1, m0, 1
    packuswb            xm4, xm5
    psubb               xm5, xm8, xm4
    pavgb               xm5, xm9
    pshufd              xm5, xm5, q3120
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    mova            [maskq], xm5
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova         [dstq+strideq*2], xm0
    vextracti128 [dstq+stride3q ], m0, 1
    mova            [maskq], m5
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova   [dstq+strideq*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova   [dstq+strideq*1], m0
    mova            [maskq], m5
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
    add               maskq, 32
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*1], m0
    mova            [maskq], m5
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    W_MASK                0, 4, 0, 1
    add                dstq, strideq
    add               maskq, 32*2
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    W_MASK                0, 5, 2, 3
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*1], m0
    mova       [maskq+32*0], m5
    W_MASK                0, 4, 4, 5
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    W_MASK                0, 5, 6, 7
    packuswb             m4, m5
    psubb                m5, m8, m4
    pavgb                m5, m9
    vpermq               m0, m0, q3120
    vpermd               m5, m10, m5
    mova        [dstq+32*3], m0
    mova       [maskq+32*1], m5
    dec                  hd
    jg .w128_loop
    RET

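; 444: the mask is kept at full resolution. the 4:4:4 path of W_MASK
; already converts 64-m back to m (psubb from pb_64, loaded into m5 below),
; so each step just stores m4 next to the blended pixels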
cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx2_table
    lea                  r7, [w_mask_444_avx2_table]
    tzcnt                wd, wm
    movifnidn            hd, hm
    mov               maskq, maskmp
    movsxd               wq, dword [r7+wq*4]
    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
    vpbroadcastd         m5, [base+pb_64]
    vpbroadcastd         m7, [base+pw_2048]
    add                  wq, r7
    W_MASK                0, 4, 0, 1, 1
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    mova       [maskq+32*0], m4
    cmp                  hd, 8
    jl .w4_end
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    je .w4_end
    W_MASK                0, 4, 2, 3, 1
    lea                dstq, [dstq+strideq*4]
    vextracti128        xm1, m0, 1
    movd   [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    movd   [dstq+strideq*2], xm1
    pextrd [dstq+stride3q ], xm1, 1
    lea                dstq, [dstq+strideq*4]
    pextrd [dstq+strideq*0], xm0, 2
    pextrd [dstq+strideq*1], xm0, 3
    pextrd [dstq+strideq*2], xm1, 2
    pextrd [dstq+stride3q ], xm1, 3
    mova       [maskq+32*1], m4
.w4_end:
    RET
.w8_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w8:
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    mova            [maskq], m4
    sub                  hd, 4
    jg .w8_loop
    RET
.w16_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w16:
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova            [maskq], m4
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    add               tmp1q, 32*2
    add               tmp2q, 32*2
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32
.w32:
    vpermq               m0, m0, q3120
    mova             [dstq], m0
    mova            [maskq], m4
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    add               tmp1q, 32*4
    add               tmp2q, 32*4
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32*2
.w64:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    mova       [maskq+32*0], m4
    W_MASK                0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    mova       [maskq+32*1], m4
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    add               tmp1q, 32*8
    add               tmp2q, 32*8
    W_MASK                0, 4, 0, 1, 1
    add                dstq, strideq
    add               maskq, 32*4
.w128:
    vpermq               m0, m0, q3120
    mova        [dstq+32*0], m0
    mova       [maskq+32*0], m4
    W_MASK                0, 4, 2, 3, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*1], m0
    mova       [maskq+32*1], m4
    W_MASK                0, 4, 4, 5, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*2], m0
    mova       [maskq+32*2], m4
    W_MASK                0, 4, 6, 7, 1
    vpermq               m0, m0, q3120
    mova        [dstq+32*3], m0
    mova       [maskq+32*3], m4
    dec                  hd
    jg .w128_loop
    RET

%endif ; ARCH_X86_64