1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29SECTION_RODATA
30
; pshufb masks and broadcast word constants used by the kernels below.
filter_shuf:   db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15

pb_0_1:  times 4 db 0, 1 ; pshufb mask: broadcast word 0 of a vector
pb_2_3:  times 4 db 2, 3 ; pshufb mask: broadcast word 1 of a vector
pw_1:    times 4 dw 1
pw_2:    times 4 dw 2
pw_4:    times 4 dw 4
pw_512:  times 4 dw 512  ; dc_128 mid-grey for 10 bpc (1 << 9)
pw_2048: times 4 dw 2048 ; dc_128 mid-grey for 12 bpc (1 << 11)
41
; JMP_TABLE name, suffix, label1[, label2, ...]
; Emits a table of 32-bit offsets to local labels of the mangled function
; name_suffix.  Both the exported table symbol and the stored offsets are
; biased by -2*4 bytes so callers can index directly with tzcnt(w) or
; tzcnt(h) (minimum block size 4 => minimum index 2) without adjusting.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro
51
; The ipred_dc table holds three runs of entries (h*, w*, then two sets of
; store loops); the splat/128 aliases below are just offset views into it,
; and ipred_cfl similarly exposes its store loops via an offset alias.
; The s4-10*4 style entries compensate for the alias bias so the stored
; offsets still resolve to the right labels.
%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4)
%define ipred_dc_128_16bpc_ssse3_table   (ipred_dc_16bpc_ssse3_table + 15*4)
%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4)

JMP_TABLE ipred_dc_left_16bpc,    ssse3, h4, h8, h16, h32, h64
JMP_TABLE ipred_dc_16bpc,         ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                         s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \
                                         s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4
JMP_TABLE ipred_h_16bpc,          ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl_16bpc,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                         s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   ssse3, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc,         ssse3, w4, w8, w16, w32, w64
66
; Weight/tap tables defined outside this file (shared with the C code).
cextern smooth_weights_1d_16bpc
cextern smooth_weights_2d_16bpc
cextern filter_intra_taps
70
71SECTION .text
72
INIT_XMM ssse3
; dc_top: predict the whole block as the rounded average of the w top pixels.
; Reuses ipred_dc_left's .h* summation branches (a top row sums the same way
; as a left column) and ipred_dc_128's store loops via the shared jump tables.
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
    movd                 m4, wm         ; rounding bias (becomes w/2 below)
    tzcnt                wd, wm         ; log2(w) = jump-table index
    add                 tlq, 2          ; skip topleft; point at the top row
    movifnidn            hd, hm
    pxor                 m3, m3
    pavgw                m4, m3         ; m4 = (w+1)>>1 = w/2 (w is even)
    movd                 m5, wd         ; m5 = log2(w), final shift count
    movu                 m0, [tlq]
    movsxd               r6, [r5+wq*4]  ; r6 -> .h* summation branch
    add                  r6, r5
    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
    movsxd               wq, [r5+wq*4]  ; wq -> store loop for this width
    add                  wq, r5
    jmp                  r6
90
; dc_left: predict the whole block as the rounded average of the h left
; pixels.  The .h* branches below are also jumped into by ipred_dc_top
; (tlq then points at the top row instead); the final store is done by
; ipred_dc_128's .s* loops via the jump table loaded into wq.
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_left_16bpc_ssse3_table
    mov                  hd, hm
    movd                 m4, hm          ; rounding bias (becomes h/2 below)
    tzcnt               r6d, hd          ; log2(h) = summation-table index
    sub                 tlq, hq
    tzcnt                wd, wm
    pxor                 m3, m3
    sub                 tlq, hq          ; tlq -= 2*h: start of the left column
    pavgw                m4, m3          ; m4 = (h+1)>>1 = h/2 (h is even)
    movd                 m5, r6d         ; m5 = log2(h), final shift count
    movu                 m0, [tlq]
    movsxd               r6, [r5+r6*4]   ; r6 -> .h* branch for this height
    add                  r6, r5
    add                  r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table
    movsxd               wq, [r5+wq*4]   ; wq -> store loop for this width
    add                  wq, r5
    jmp                  r6
.h64:
    movu                 m2, [tlq+112]
    movu                 m1, [tlq+ 96]
    paddw                m0, m2
    movu                 m2, [tlq+ 80]
    paddw                m1, m2
    movu                 m2, [tlq+ 64]
    paddw                m0, m2
    paddw                m0, m1
.h32:
    movu                 m1, [tlq+ 48]
    movu                 m2, [tlq+ 32]
    paddw                m1, m2
    paddw                m0, m1
.h16:
    movu                 m1, [tlq+ 16]
    paddw                m0, m1
.h8:
    movhlps              m1, m0
    paddw                m0, m1
.h4:
    ; Horizontal reduction of m0 (with the rounding bias from m4 added in
    ; as dwords), then divide by the pixel count via the shift in m5.
    punpcklwd            m0, m3
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    lea            stride3q, [strideq*3]
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0          ; broadcast dc to all 8 words
    jmp                  wq
141
; dc: predict the whole block as the rounded average of the w top plus the
; h left pixels.  r6 jumps to the .h* branch (sum the left column into m0),
; which then jumps via wq to the .w* branch (add the top row, reduce, and
; divide).  For square blocks w+h is a power of two and a plain shift (m5)
; suffices; for rectangular blocks the remaining factor of 3 or 5 is
; divided out with a fixed-point reciprocal:
;   0xAAAB ~= (1<<17)/3   0x6667 ~= (1<<17)/5
; applied as pmulhuw (high 16 bits of the product) plus shifts.
; The .s* store loops are shared with dc_top/dc_left/dc_128/v via the
; splat table aliases.
cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                 m4, r5d        ; rounding bias (becomes (w+h)/2)
    tzcnt               r5d, r5d        ; tzcnt(w+h) = shift for the divide
    movd                 m5, r5d
    LEA                  r5, ipred_dc_16bpc_ssse3_table
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]
    movsxd               wq, [r5+wq*4+5*4] ; +5*4: skip the five .h* entries
    pxor                 m3, m3
    psrlw                m4, 1              ; m4 = (w+h)/2
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
.h4:
    movq                 m0, [tlq-8]    ; m0 = 4 left pixels
    jmp                  wq
.w4:
    movq                 m1, [tlq+2]    ; 4 top pixels
    paddw                m1, m0
    punpckhwd            m0, m3
    punpcklwd            m1, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    cmp                  hd, 4
    jg .w4_mul
    psrlw                m0, 3          ; square: sum/8
    jmp .w4_end
.w4_mul:
    ; 4xh, h > 4: divide by 4 then by the reciprocal of 3 (h=8) or 5 (h=16)
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 16
    cmove               r2d, r3d
    psrld                m0, 2
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w4_end:
    pshuflw              m0, m0, q0000
.s4:
    movq   [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m0
    movq   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
.h8:
    mova                 m0, [tlq-16]   ; m0 = 8 left pixels
    jmp                  wq
.w8:
    movu                 m1, [tlq+2]
    paddw                m0, m1
    punpcklwd            m1, m0, m3
    punpckhwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 8
    je .w8_end
    ; rectangular: divide out the factor of 3 (h=4/16) or 5 (h=32)
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 32
    cmove               r2d, r3d
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s8:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
.h16:
    mova                 m0, [tlq-32]   ; m0 = sum of 16 left pixels
    paddw                m0, [tlq-16]
    jmp                  wq
.w16:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    paddw                m0, m1
    punpckhwd            m1, m0, m3
    punpcklwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 16
    je .w16_end
    ; h=4/64 -> factor 5 (0x6667), h=8/32 -> factor 3 (0xAAAB)
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    test                 hd, 8|32
    cmovz               r2d, r3d
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16c:
    mova                 m1, m0         ; cfl entry point expects m0/m1 pair
.s16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    mova [dstq+strideq*2+16*0], m0
    mova [dstq+strideq*2+16*1], m1
    mova [dstq+stride3q +16*0], m0
    mova [dstq+stride3q +16*1], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
.h32:
    mova                 m0, [tlq-64]   ; m0 = sum of 32 left pixels
    paddw                m0, [tlq-48]
    paddw                m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w32:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    movu                 m2, [tlq+34]
    paddw                m0, m2
    movu                 m2, [tlq+50]
    paddw                m1, m2
    paddw                m0, m1
    punpcklwd            m1, m0, m3
    punpckhwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 32
    je .w32_end
    ; h=8 -> factor 5 (0x6667), h=16/64 -> factor 3 (0xAAAB)
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 8
    cmove               r2d, r3d
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32c:
    mova                 m1, m0         ; cfl entry point expects m0-m3
    mova                 m2, m0
    mova                 m3, m0
.s32:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    mova [dstq+strideq*0+16*2], m2
    mova [dstq+strideq*0+16*3], m3
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*3], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s32
    RET
.h64:
    mova                 m0, [tlq-128]  ; m0 = sum of 64 left pixels
    mova                 m1, [tlq-112]
    paddw                m0, [tlq- 96]
    paddw                m1, [tlq- 80]
    paddw                m0, [tlq- 64]
    paddw                m1, [tlq- 48]
    paddw                m0, [tlq- 32]
    paddw                m1, [tlq- 16]
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+  2]
    movu                 m2, [tlq+ 18]
    paddw                m1, m2
    movu                 m2, [tlq+ 34]
    paddw                m0, m2
    movu                 m2, [tlq+ 50]
    paddw                m1, m2
    movu                 m2, [tlq+ 66]
    paddw                m0, m2
    movu                 m2, [tlq+ 82]
    paddw                m1, m2
    movu                 m2, [tlq+ 98]
    paddw                m0, m2
    movu                 m2, [tlq+114]
    paddw                m1, m2
    paddw                m0, m1
    punpcklwd            m1, m0, m3
    punpckhwd            m0, m3
    paddd                m0, m1
    paddd                m4, m0
    punpckhqdq           m0, m0
    paddd                m0, m4
    pshuflw              m1, m0, q1032
    paddd                m0, m1
    psrld                m0, m5
    cmp                  hd, 64
    je .w64_end
    ; h=16 -> factor 5 (0x6667), h=32 -> factor 3 (0xAAAB)
    mov                 r2d, 0xAAAB
    mov                 r3d, 0x6667
    cmp                  hd, 16
    cmove               r2d, r3d
    movd                 m1, r2d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w64_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s64:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m0
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m0
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m0
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m0
    add                dstq, strideq
    dec                  hd
    jg .s64
    RET
393
; dc_128: predict with the mid-grey value 1 << (bitdepth-1) without reading
; any edge pixels.  Its .s* store loops live in ipred_dc and are shared by
; the other dc variants via the jump-table aliases.
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov                 r6d, r8m        ; bitdepth_max (0x3ff or 0xfff)
    LEA                  r5, ipred_dc_128_16bpc_ssse3_table
    tzcnt                wd, wm
    shr                 r6d, 11         ; 0 for 10 bpc, 1 for 12 bpc
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    movddup              m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] ; 512 or 2048
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
405
; v: replicate the top row of pixels into every output row.  Widths up to
; 32 fit in m0-m3 and reuse the dc splat store loops; w64 needs all eight
; vectors and gets its own loop.
cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3
    LEA                  r5, ipred_dc_splat_16bpc_ssse3_table
    movifnidn            hd, hm
    movu                 m0, [tlq+  2]  ; top row (over-reads are harmless
    movu                 m1, [tlq+ 18]  ; for narrower widths; only the
    movu                 m2, [tlq+ 34]  ; registers the store loop uses
    movu                 m3, [tlq+ 50]  ; are written out)
    cmp                  wd, 64
    je .w64
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
.w64:
    WIN64_SPILL_XMM 8
    movu                 m4, [tlq+ 66]
    movu                 m5, [tlq+ 82]
    movu                 m6, [tlq+ 98]
    movu                 m7, [tlq+114]
.w64_loop:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    mova        [dstq+16*4], m4
    mova        [dstq+16*5], m5
    mova        [dstq+16*6], m6
    mova        [dstq+16*7], m7
    add                dstq, strideq
    dec                  hd
    jg .w64_loop
    RET
439
; h: fill each output row with its corresponding left-edge pixel.  The left
; pixels sit below tlq, so tlq is walked downwards while dst advances.
; w4/w8 broadcast words with pshuflw/pshufd; w16+ use the pb_0_1/pb_2_3
; pshufb masks (broadcast word 0 / word 1 of the loaded pair).
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
%define base r5-ipred_h_16bpc_ssse3_table
    tzcnt                wd, wm
    LEA                  r5, ipred_h_16bpc_ssse3_table
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    movddup              m2, [base+pb_0_1]
    movddup              m3, [base+pb_2_3]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    sub                 tlq, 8          ; 4 left pixels per iteration
    movq                 m3, [tlq]
    pshuflw              m0, m3, q3333
    pshuflw              m1, m3, q2222
    pshuflw              m2, m3, q1111
    pshuflw              m3, m3, q0000
    movq   [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m1
    movq   [dstq+strideq*2], m2
    movq   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    sub                 tlq, 8
    movq                 m3, [tlq]
    punpcklwd            m3, m3
    pshufd               m0, m3, q3333
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16:
    sub                 tlq, 4          ; 2 left pixels per iteration
    movd                 m1, [tlq]
    pshufb               m0, m1, m3     ; broadcast the later pixel
    pshufb               m1, m2         ; broadcast the earlier pixel
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m0
    mova [dstq+strideq*1+16*0], m1
    mova [dstq+strideq*1+16*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w16
    RET
.w32:
    sub                 tlq, 4
    movd                 m1, [tlq]
    pshufb               m0, m1, m3
    pshufb               m1, m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m0
    mova [dstq+strideq*0+16*2], m0
    mova [dstq+strideq*0+16*3], m0
    mova [dstq+strideq*1+16*0], m1
    mova [dstq+strideq*1+16*1], m1
    mova [dstq+strideq*1+16*2], m1
    mova [dstq+strideq*1+16*3], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w32
    RET
.w64:
    sub                 tlq, 2          ; 1 left pixel per iteration
    movd                 m0, [tlq]
    pshufb               m0, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m0
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m0
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m0
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m0
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET
528
; paeth: for each pixel pick top, left or topleft, whichever is closest to
; left + top - topleft.  Register roles inside the loops:
;   m1 = left (broadcast)  m4 = topleft (broadcast)  m5 = top
;   m6 = top - topleft     m7 = |top - topleft| (i.e. the "left" distance)
; hd is doubled up front so it doubles as a byte offset into the left edge.
cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left
%define base r5-ipred_paeth_16bpc_ssse3_table
    movifnidn            hd, hm
    pshuflw              m4, [tlq], q0000
    mov               leftq, tlq
    add                  hd, hd         ; hd = 2*h (bytes)
    punpcklqdq           m4, m4      ; topleft
    sub               leftq, hq        ; leftq -> bottom of the left edge
    and                  wd, ~7        ; 0 for w4, non-zero for w >= 8
    jnz .w8
    movddup              m5, [tlq+2] ; top
    psubw                m6, m5, m4
    pabsw                m7, m6
.w4_loop:
    movd                 m1, [leftq+hq-4] ; two left pixels
    punpcklwd            m1, m1
    punpckldq            m1, m1      ; left
; PAETH: compute the paeth prediction into m0 from m1/m4-m7 (clobbers m2/m3).
%macro PAETH 0
    paddw                m0, m6, m1
    psubw                m2, m4, m0  ; tldiff
    psubw                m0, m5      ; tdiff
    pabsw                m2, m2
    pabsw                m0, m0
    pminsw               m2, m0
    pcmpeqw              m0, m2
    pand                 m3, m5, m0  ; select top where tdiff is smallest
    pandn                m0, m4      ; otherwise topleft
    por                  m0, m3
    pcmpgtw              m3, m7, m2
    pand                 m0, m3      ; keep top/topleft only if it beats left
    pandn                m3, m1
    por                  m0, m3
%endmacro
    PAETH
    movhps [dstq+strideq*0], m0      ; high half = upper of the two rows
    movq   [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2*2
    jg .w4_loop
    RET
.w8:
    ; Process the block in 8-pixel-wide column strips, each strip walking
    ; the full height before moving right by 16 bytes.
%if ARCH_X86_32
    PUSH                 r6
    %define             r7d  hm
    %assign regs_used     7
%elif WIN64
    movaps              r4m, m8        ; xmm8 is callee-saved on Win64
    PUSH                 r7
    %assign regs_used     8
%endif
%if ARCH_X86_64
    movddup              m8, [pb_0_1]  ; pshufb word-broadcast mask
%endif
    lea                 tlq, [tlq+wq*2+2]
    neg                  wq
    mov                 r7d, hd        ; remember height for each strip
.w8_loop0:
    movu                 m5, [tlq+wq*2] ; top pixels for this strip
    mov                  r6, dstq
    add                dstq, 16
    psubw                m6, m5, m4
    pabsw                m7, m6
.w8_loop:
    movd                 m1, [leftq+hq-2] ; broadcast one left pixel
%if ARCH_X86_64
    pshufb               m1, m8
%else
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
%endif
    PAETH
    mova               [r6], m0
    add                  r6, strideq
    sub                  hd, 1*2
    jg .w8_loop
    mov                  hd, r7d
    add                  wq, 8
    jl .w8_loop0
%if WIN64
    movaps               m8, r4m
%endif
    RET
611
; t0 = r7 on x86-64, r4 on x86-32 for the 1D smooth functions below.
%if ARCH_X86_64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 4
%endif
617
; smooth_v: vertical blend between the top row and the bottom-left pixel,
; per-row weights from smooth_weights_1d_16bpc.  Each output is
; bottom + pmulhrsw(top - bottom, weight), i.e. the weights are pre-scaled
; for pmulhrsw's (a*b + 0x4000) >> 15.
; hq is negated so [weightsq+hq*...] walks the weight table forwards while
; hq counts up towards zero.
cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights
    LEA            weightsq, smooth_weights_1d_16bpc
    mov                  hd, hm
    lea            weightsq, [weightsq+hq*4]
    neg                  hq
    movd                 m5, [tlq+hq*2] ; bottom
    pshuflw              m5, m5, q0000
    punpcklqdq           m5, m5
    cmp                  wd, 4
    jne .w8
    movddup              m4, [tlq+2]    ; top
    lea                  r3, [strideq*3]
    psubw                m4, m5         ; top - bottom
.w4_loop:
    movq                 m1, [weightsq+hq*2] ; 4 row weights
    punpcklwd            m1, m1
    pshufd               m0, m1, q1100
    punpckhdq            m1, m1
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq   [dstq+strideq*2], m1
    movhps [dstq+r3       ], m1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
    RET
.w8:
    ; Wider blocks: 8-pixel column strips, 4 rows per inner iteration.
%if ARCH_X86_32
    PUSH                 r6
    %assign regs_used     7
    mov                  hm, hq         ; spill h; x86-32 is short on regs
    %define              hq  hm
%elif WIN64
    PUSH                 r7
    %assign regs_used     8
%endif
.w8_loop0:
    mov                  t0, hq         ; t0 = -h, per-strip row counter
    movu                 m4, [tlq+2]
    add                 tlq, 16         ; next strip of top pixels
    mov                  r6, dstq
    add                dstq, 16
    psubw                m4, m5         ; top - bottom
.w8_loop:
    movq                 m3, [weightsq+t0*2]
    punpcklwd            m3, m3
    pshufd               m0, m3, q0000  ; one weight broadcast per row
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX   {paddw    x, m5}, m0, m1, m2, m3
    mova     [r6+strideq*0], m0
    mova     [r6+strideq*1], m1
    lea                  r6, [r6+strideq*2]
    mova     [r6+strideq*0], m2
    mova     [r6+strideq*1], m3
    lea                  r6, [r6+strideq*2]
    add                  t0, 4
    jl .w8_loop
    sub                  wd, 8
    jg .w8_loop0
    RET
685
; smooth_h: horizontal blend between the left column and the top-right
; pixel, per-column weights from smooth_weights_1d_16bpc.  Each output is
; right + pmulhrsw(left - right, weight) (weights pre-scaled for pmulhrsw).
; hd is doubled so it can index the left edge in bytes; tlq is rebased to
; point below the left column.
cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights
    LEA            weightsq, smooth_weights_1d_16bpc
    mov                  wd, wm
    movifnidn            hd, hm
    movd                 m5, [tlq+wq*2] ; right
    sub                 tlq, 8
    add                  hd, hd         ; hd = 2*h (bytes)
    pshuflw              m5, m5, q0000
    sub                 tlq, hq
    punpcklqdq           m5, m5
    cmp                  wd, 4
    jne .w8
    movddup              m4, [weightsq+4*2] ; the 4 column weights
    lea                  r3, [strideq*3]
.w4_loop:
    movq                 m1, [tlq+hq]   ; left
    punpcklwd            m1, m1
    psubw                m1, m5         ; left - right
    pshufd               m0, m1, q3322
    punpckldq            m1, m1
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    movhps [dstq+strideq*0], m0        ; rows come out top-first in the
    movq   [dstq+strideq*1], m0        ; high halves
    movhps [dstq+strideq*2], m1
    movq   [dstq+r3       ], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4*2
    jg .w4_loop
    RET
.w8:
    ; Wider blocks: 8-pixel column strips; each strip uses one vector of
    ; column weights and walks the full height.
    lea            weightsq, [weightsq+wq*4]
    neg                  wq
%if ARCH_X86_32
    PUSH                 r6
    %assign regs_used     7
    %define              hd  hm         ; x86-32: keep h in memory
%elif WIN64
    PUSH                 r7
    %assign regs_used     8
%endif
.w8_loop0:
    mov                 t0d, hd         ; per-strip row counter
    mova                 m4, [weightsq+wq*2]
    mov                  r6, dstq
    add                dstq, 16
.w8_loop:
    ; x86-32 indexes bytes directly (t0 = 2*h); x86-64 indexes words
    movq                 m3, [tlq+t0*(1+ARCH_X86_32)]
    punpcklwd            m3, m3
    psubw                m3, m5         ; left - right, 4 rows
    pshufd               m0, m3, q3333  ; topmost row first
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX   {paddw    x, m5}, m0, m1, m2, m3
    mova     [r6+strideq*0], m0
    mova     [r6+strideq*1], m1
    lea                  r6, [r6+strideq*2]
    mova     [r6+strideq*0], m2
    mova     [r6+strideq*1], m3
    lea                  r6, [r6+strideq*2]
    sub                 t0d, 4*(1+ARCH_X86_64)
    jg .w8_loop
    add                  wq, 8
    jl .w8_loop0
    RET
755
; t0 = r10 on x86-64, r4 on x86-32 for ipred_smooth below.
%if ARCH_X86_64
DECLARE_REG_TMP 10
%else
DECLARE_REG_TMP 3
%endif
761
; smooth: 2D blend combining a vertical (top,bottom) and a horizontal
; (left,right) interpolation using the 2D weight table.  Each output is
;   (v_weight . (top,bottom) + h_weight . (left,right))  via pmaddwd,
; then >> 8 and a final pavgw-with-0 for a rounded extra >> 1 (>> 9 total).
cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \
                                     h_weights, v_weights, top
    LEA          h_weightsq, smooth_weights_2d_16bpc
    mov                  wd, wm
    mov                  hd, hm
    movd                 m7, [tlq+wq*2] ; right
    lea          v_weightsq, [h_weightsq+hq*8]
    neg                  hq
    movd                 m6, [tlq+hq*2] ; bottom
    pshuflw              m7, m7, q0000
    pshuflw              m6, m6, q0000
    cmp                  wd, 4
    jne .w8
    movq                 m4, [tlq+2]    ; top
    mova                 m5, [h_weightsq+4*4] ; 4-wide horizontal weights
    punpcklwd            m4, m6         ; top, bottom
    pxor                 m6, m6
.w4_loop:
    movq                 m1, [v_weightsq+hq*4] ; 2 rows of vertical weights
    sub                 tlq, 4
    movd                 m3, [tlq]      ; left
    pshufd               m0, m1, q0000
    pshufd               m1, m1, q1111
    pmaddwd              m0, m4         ; vertical contribution, row 0
    punpcklwd            m3, m7         ; left, right
    pmaddwd              m1, m4         ; vertical contribution, row 1
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    pmaddwd              m2, m5         ; horizontal contribution, row 0
    pmaddwd              m3, m5         ; horizontal contribution, row 1
    paddd                m0, m2
    paddd                m1, m3
    psrld                m0, 8
    psrld                m1, 8
    packssdw             m0, m1
    pavgw                m0, m6         ; rounded final >> 1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w4_loop
    RET
.w8:
    ; Wider blocks: 8-pixel column strips.  x86-64 keeps the per-strip
    ; horizontal weights in m8/m9 and saves tlq/hq in r8/r9; x86-32 has no
    ; spare registers, so those live in stack arg slots (r0m-r2m) instead.
%if ARCH_X86_32
    lea          h_weightsq, [h_weightsq+wq*4]
    mov                  t0, tlq
    mov                 r1m, tlq        ; spill: strip-invariant tlq
    mov                 r2m, hq         ; spill: strip-invariant -h
    %define              m8  [h_weightsq+16*0]
    %define              m9  [h_weightsq+16*1]
%else
%if WIN64
    movaps              r4m, m8         ; xmm8/xmm9 are callee-saved on Win64
    movaps              r6m, m9
    PUSH                 r7
    PUSH                 r8
%endif
    PUSH                 r9
    PUSH                r10
    %assign       regs_used  11
    lea          h_weightsq, [h_weightsq+wq*8]
    lea                topq, [tlq+wq*2]
    neg                  wq
    mov                  r8, tlq
    mov                  r9, hq
%endif
    punpcklqdq           m6, m6         ; broadcast bottom
.w8_loop0:
%if ARCH_X86_32
    movu                 m5, [t0+2]
    add                  t0, 16
    mov                 r0m, t0
%else
    movu                 m5, [topq+wq*2+2]      ; top pixels for this strip
    mova                 m8, [h_weightsq+wq*4+16*0]
    mova                 m9, [h_weightsq+wq*4+16*1]
%endif
    mov                  t0, dstq
    add                dstq, 16
    punpcklwd            m4, m5, m6     ; (top, bottom) pairs, low half
    punpckhwd            m5, m6         ; (top, bottom) pairs, high half
.w8_loop:
    movd                 m1, [v_weightsq+hq*4]
    sub                 tlq, 2
    movd                 m3, [tlq]      ; left
    pshufd               m1, m1, q0000
    pmaddwd              m0, m4, m1
    pshuflw              m3, m3, q0000
    pmaddwd              m1, m5
    punpcklwd            m3, m7         ; left, right
    pmaddwd              m2, m8, m3
    pmaddwd              m3, m9
    paddd                m0, m2
    paddd                m1, m3
    psrld                m0, 8
    psrld                m1, 8
    packssdw             m0, m1
    pxor                 m1, m1
    pavgw                m0, m1         ; rounded final >> 1
    mova               [t0], m0
    add                  t0, strideq
    inc                  hq
    jl .w8_loop
%if ARCH_X86_32
    mov                  t0, r0m
    mov                 tlq, r1m        ; restore strip-invariant state
    add          h_weightsq, 16*2
    mov                  hq, r2m
    sub            dword wm, 8
    jg .w8_loop0
%else
    mov                 tlq, r8
    mov                  hq, r9
    add                  wq, 8
    jl .w8_loop0
%endif
%if WIN64
    movaps               m8, r4m
    movaps               m9, r6m
%endif
    RET
883
;-----------------------------------------------------------------------------
; void ipred_filter_16bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                         int w, int h, int filter_idx, ..., int bitdepth_max)
; Recursive filter intra prediction (FILTER_PRED), SSSE3, 16 bpc.
; The prediction is built in 4x2 tiles: the leftmost column strip uses the
; top-left/top/left neighbor pixels, and each subsequent strip reuses the
; already-predicted pixels written to dst on its left.
; On x86-32 the 16 coefficient vectors m8-m15 live on the stack.
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter
%else
cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter
%define  m8 [esp+16*0]
%define  m9 [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%endif
%define base r6-$$
    movifnidn            hd, hm
    movd                 m6, r8m     ; bitdepth_max
%ifidn filterd, filterm
    movzx           filterd, filterb
%else
    movzx           filterd, byte filterm
%endif
    LEA                  r6, $$
    shl             filterd, 6      ; each filter set is 4*16 bytes of taps
    movu                 m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3
    mova                 m1, [base+filter_intra_taps+filterq+16*0]
    mova                 m2, [base+filter_intra_taps+filterq+16*1]
    mova                 m3, [base+filter_intra_taps+filterq+16*2]
    mova                 m4, [base+filter_intra_taps+filterq+16*3]
    pxor                 m5, m5
%if ARCH_X86_64
    punpcklbw            m8, m5, m1  ; place 8-bit coefficients in the upper
    punpckhbw            m9, m5, m1  ; half of each 16-bit word to avoid
    punpcklbw           m10, m5, m2  ; having to perform sign-extension.
    punpckhbw           m11, m5, m2
    punpcklbw           m12, m5, m3
    punpckhbw           m13, m5, m3
    punpcklbw           m14, m5, m4
    punpckhbw           m15, m5, m4
%else
    ; same coefficient expansion as above, but spilled to the stack slots
    ; aliased by the m8-m15 defines since only 8 xmm regs are available
    punpcklbw            m7, m5, m1
    mova                 m8, m7
    punpckhbw            m7, m5, m1
    mova                 m9, m7
    punpcklbw            m7, m5, m2
    mova                m10, m7
    punpckhbw            m7, m5, m2
    mova                m11, m7
    punpcklbw            m7, m5, m3
    mova                m12, m7
    punpckhbw            m7, m5, m3
    mova                m13, m7
    punpcklbw            m7, m5, m4
    mova                m14, m7
    punpckhbw            m7, m5, m4
    mova                m15, m7
%endif
    mova                 m7, [base+filter_shuf]
    add                  hd, hd      ; h *= 2 (bytes per pixel column)
    mov                  r5, dstq
    pshuflw              m6, m6, q0000
    mov                  r6, tlq
    punpcklqdq           m6, m6      ; broadcast bitdepth_max for clamping
    sub                 tlq, hq
    ; first (leftmost) 4-wide column: neighbors come from tl/left edge
.left_loop:
    pshufb               m0, m7      ; tl t0 t1 t2 t3 l0 l1 __
    pshufd               m1, m0, q0000
    pmaddwd              m2, m8, m1
    pmaddwd              m1, m9
    pshufd               m4, m0, q1111
    pmaddwd              m3, m10, m4
    pmaddwd              m4, m11
    paddd                m2, m3
    paddd                m1, m4
    pshufd               m4, m0, q2222
    pmaddwd              m3, m12, m4
    pmaddwd              m4, m13
    paddd                m2, m3
    paddd                m1, m4
    pshufd               m3, m0, q3333
    pmaddwd              m0, m14, m3
    pmaddwd              m3, m15
    paddd                m0, m2
    paddd                m1, m3
    psrad                m0, 11     ; x >> 3
    psrad                m1, 11
    packssdw             m0, m1
    pmaxsw               m0, m5
    pavgw                m0, m5     ; (x + 8) >> 4
    pminsw               m0, m6
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movlps               m0, [tlq+hq-10]
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2*2
    jg .left_loop
    sub                  wd, 4
    jz .end
    sub                 tld, r6d     ; -h*2
    sub                  r6, r5      ; tl-dst
    ; remaining 4-wide columns: top neighbors plus pixels just written to
    ; the column on the left (read back from dst inside the loop)
.right_loop0:
    add                  r5, 8
    mov                  hd, tld     ; hd = -h*2, counts up towards zero
    movu                 m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __
    mov                dstq, r5
.right_loop:
    pshufd               m2, m0, q0000
    pmaddwd              m1, m8, m2
    pmaddwd              m2, m9
    pshufd               m4, m0, q1111
    pmaddwd              m3, m10, m4
    pmaddwd              m4, m11
    pinsrw               m0, [dstq+strideq*0-2], 5
    paddd                m1, m3
    paddd                m2, m4
    pshufd               m0, m0, q2222
    movddup              m4, [dstq+strideq*1-8]
    pmaddwd              m3, m12, m0
    pmaddwd              m0, m13
    paddd                m1, m3
    paddd                m0, m2
    pshuflw              m2, m4, q3333
    punpcklwd            m2, m5
    pmaddwd              m3, m14, m2
    pmaddwd              m2, m15
    paddd                m1, m3
    paddd                m0, m2
    psrad                m1, 11
    psrad                m0, 11
    packssdw             m0, m1
    pmaxsw               m0, m5
    pavgw                m0, m5
    pminsw               m0, m6
    movhps [dstq+strideq*0], m0
    movq   [dstq+strideq*1], m0
    palignr              m0, m4, 14
    lea                dstq, [dstq+strideq*2]
    add                  hd, 2*2
    jl .right_loop
    sub                  wd, 4
    jg .right_loop0
.end:
    RET
1026
; Pick a scratch register (t0) for the following cfl functions that does not
; clash with the host ABI's argument registers.
%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif
1032
;-----------------------------------------------------------------------------
; ipred_cfl_top: DC from the top edge only. Sets m4 = w (rounding bias
; source), m5 = log2(w) (shift), points tlq at the top row, then tail-jumps
; into ipred_cfl_left's shared .start with the left-table entry selected by w.
;-----------------------------------------------------------------------------
cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    movd                 m4, wd
    tzcnt                wd, wd
    movifnidn            hd, hm
    add                 tlq, 2      ; skip the top-left pixel
    movsxd               r6, [t0+wq*4]
    movd                 m5, wd     ; shift = log2(w)
    jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start)
1042
;-----------------------------------------------------------------------------
; ipred_cfl_left: DC from the left edge only. The left pixels are stored
; contiguously below tl, so tlq is rewound by h*2 bytes and the .h* summing
; code below is shared with ipred_cfl_top (which enters at .start with the
; top row instead). After computing dc, jumps to the per-width splat/apply
; code selected from ipred_cfl_splat_16bpc_ssse3_table.
;-----------------------------------------------------------------------------
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    LEA                  t0, ipred_cfl_left_16bpc_ssse3_table
    tzcnt                wd, wm
    lea                 r6d, [hq*2]
    movd                 m4, hd     ; sz (divisor), also the rounding bias
    sub                 tlq, r6     ; tlq -= h*2: start of the left column
    tzcnt               r6d, hd
    movd                 m5, r6d    ; shift = log2(h)
    movsxd               r6, [t0+r6*4]
.start:
    movd                 m7, r7m    ; bitdepth_max
    movu                 m0, [tlq]
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table
    movsxd               wq, [t0+wq*4]
    pxor                 m6, m6
    pshuflw              m7, m7, q0000
    pcmpeqw              m3, m3     ; all -1 words: pmaddwd by m3 = -(pair sum)
    add                  wq, t0
    movifnidn           acq, acmp
    pavgw                m4, m6     ; m4 = (sz + 1) >> 1 = sz/2 rounding bias
    punpcklqdq           m7, m7
    jmp                  r6
.h32:
    movu                 m1, [tlq+48]
    movu                 m2, [tlq+32]
    paddw                m0, m1
    paddw                m0, m2
.h16:
    movu                 m1, [tlq+16]
    paddw                m0, m1
.h8:
    pshufd               m1, m0, q1032
    paddw                m0, m1
.h4:
    pmaddwd              m0, m3     ; m0 = -(horizontal pair sums)
    psubd                m4, m0    ; bias - (-sum) = sum + sz/2
    pshuflw              m0, m4, q1032
    paddd                m0, m4
    psrld                m0, m5    ; dc = (sum + sz/2) >> log2(sz)
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0    ; broadcast dc to all 8 words
    jmp                  wq
1087
; m%1 = clip(dc + ((ac * alpha + 32) >> 6)), rounding toward zero:
; pmulhrsw on |ac| * (|alpha| << 9) computes (|ac|*|alpha| + 32) >> 6, and
; the psignw pair re-applies the combined sign of ac and alpha afterwards
; (m1 = alpha broadcast, m2 = |alpha| << 9, m0 = dc broadcast,
;  m6 = 0, m7 = bitdepth_max broadcast).
%macro IPRED_CFL 2 ; dst, src
    pabsw               m%1, m%2
    pmulhrsw            m%1, m2
    psignw              m%2, m1
    psignw              m%1, m%2
    paddw               m%1, m0
    pmaxsw              m%1, m6
    pminsw              m%1, m7
%endmacro
1097
;-----------------------------------------------------------------------------
; ipred_cfl: DC from both the top and left edges (w + h samples total).
; r6 dispatches on h to the .h* edge-summing code, wq on w to the .w*
; finalization + .s* apply loops. When w+h is not a power of two the divide
; is done as a power-of-two shift followed by a fixed-point reciprocal
; multiply: 0xAAAB ~= 2^17/3 and 0x6667 ~= 2^17/5 (pmulhuw + psrlw 1).
;-----------------------------------------------------------------------------
cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]
    movd                 m4, t0d    ; sz = w + h
    tzcnt               t0d, t0d
    movd                 m5, t0d    ; shift = ctz(w + h)
    LEA                  t0, ipred_cfl_16bpc_ssse3_table
    tzcnt                wd, wd
    movd                 m7, r7m    ; bitdepth_max
    movsxd               r6, [t0+r6*4]
    movsxd               wq, [t0+wq*4+4*4]
    psrlw                m4, 1     ; rounding bias = (w + h) / 2
    pxor                 m6, m6
    pshuflw              m7, m7, q0000
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    pcmpeqw              m3, m3    ; all -1 words for the pmaddwd sum trick
    punpcklqdq           m7, m7
    jmp                  r6
.h4:
    movq                 m0, [tlq-8]
    jmp                  wq
.w4:
    movq                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3    ; -(pair sums)
    psubd                m4, m0   ; bias + sum
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    cmp                  hd, 4
    jg .w4_mul
    psrld                m0, 3    ; w == h == 4: plain power-of-two divide
    jmp .w4_end
.w4_mul:
    ; w+h in {12, 20, 36}: shift by 2 then multiply by 1/3 or 1/5
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 16
    cmove               r6d, r2d
    movd                 m1, r6d
    psrld                m0, 2
    pmulhuw              m0, m1
    psrlw                m0, 1
.w4_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0   ; broadcast dc
.s4:
    movd                 m1, alpham
    lea                  r6, [strideq*3]
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9    ; |alpha| << 9, see IPRED_CFL
.s4_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    movq   [dstq+strideq*0], m3
    movhps [dstq+strideq*1], m3
    movq   [dstq+strideq*2], m4
    movhps [dstq+r6       ], m4
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4_loop
    RET
.h8:
    mova                 m0, [tlq-16]
    jmp                  wq
.w8:
    movu                 m1, [tlq+2]
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 8
    je .w8_end
    ; w+h not a power of two: reciprocal multiply (h==32 -> /5, else /3)
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 32
    cmove               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w8_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s8:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova   [dstq+strideq*0], m3
    mova   [dstq+strideq*1], m4
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s8_loop
    RET
.h16:
    mova                 m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w16:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 16
    je .w16_end
    ; h in {8,32} -> w+h divisible by 3; h == 4 -> w+h == 20 -> /5
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hd, 8|32
    cmovz               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w16_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s16:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    add                 acq, 16*2
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    add                dstq, strideq
    dec                  hd
    jg .s16_loop
    RET
.h32:
    mova                 m0, [tlq-64]
    paddw                m0, [tlq-48]
    paddw                m0, [tlq-32]
    paddw                m0, [tlq-16]
    jmp                  wq
.w32:
    movu                 m1, [tlq+ 2]
    movu                 m2, [tlq+18]
    paddw                m1, m2
    movu                 m2, [tlq+34]
    paddw                m1, m2
    movu                 m2, [tlq+50]
    paddw                m1, m2
    paddw                m0, m1
    pmaddwd              m0, m3
    psubd                m4, m0
    pshufd               m0, m4, q1032
    paddd                m0, m4
    pshuflw              m4, m0, q1032
    paddd                m0, m4
    psrld                m0, m5
    cmp                  hd, 32
    je .w32_end
    ; h == 8 -> w+h == 40 -> /5; h == 16 -> w+h == 48 -> /3
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    cmp                  hd, 8
    cmove               r6d, r2d
    movd                 m1, r6d
    pmulhuw              m0, m1
    psrlw                m0, 1
.w32_end:
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
.s32:
    movd                 m1, alpham
    pshuflw              m1, m1, q0000
    punpcklqdq           m1, m1
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    mova                 m4, [acq+16*0]
    mova                 m5, [acq+16*1]
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*0], m3
    mova        [dstq+16*1], m4
    mova                 m4, [acq+16*2]
    mova                 m5, [acq+16*3]
    add                 acq, 16*4
    IPRED_CFL             3, 4
    IPRED_CFL             4, 5
    mova        [dstq+16*2], m3
    mova        [dstq+16*3], m4
    add                dstq, strideq
    dec                  hd
    jg .s32_loop
    RET
1315
;-----------------------------------------------------------------------------
; ipred_cfl_128: fixed dc = half the pixel range, no edge pixels needed.
; r6d = bitdepth_max >> 11 selects between pw_512 (10-bit) and pw_2048
; (12-bit), which are adjacent 8-byte entries in rodata. Then jumps to the
; shared per-width splat code (.s4/.s8/.s16/.s32 above).
;-----------------------------------------------------------------------------
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac
    tzcnt                wd, wm
    LEA                  t0, ipred_cfl_splat_16bpc_ssse3_table
    mov                 r6d, r7m    ; bitdepth_max
    movifnidn            hd, hm
    shr                 r6d, 11    ; 0 for 10-bit, 1 for 12-bit
    movd                 m7, r7m
    movsxd               wq, [t0+wq*4]
    movddup              m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8]
    pshuflw              m7, m7, q0000
    pxor                 m6, m6
    add                  wq, t0
    movifnidn           acq, acmp
    punpcklqdq           m7, m7
    jmp                  wq
1331
;-----------------------------------------------------------------------------
; ipred_cfl_ac_420: 4:2:0 luma downsampling for CfL. Each output word is
; pmaddwd(row, 2) summed over two rows, i.e. (2x2 luma sum) * 2 = avg << 3.
; wpad/hpad request replication of the last valid column/row; m4 accumulates
; the total in dwords so the mean can be removed in .dc (shared by the
; 422/444 variants via jmp).
;-----------------------------------------------------------------------------
cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; build pw_2 without a PIC-relative data load: -1 -> |.| = 1 -> *2 = 2
    pcmpeqw              m5, m5
    pabsw                m5, m5
    paddw                m5, m5
%else
    movddup              m5, [pw_2]
%endif
    mov                  hd, hm
    shl               hpadd, 2
    pxor                 m4, m4     ; running dword sum of all ac values
    sub                  hd, hpadd
    cmp            dword wm, 8
    mov                  r5, acq    ; remember start of ac buffer for .dc
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    pmaddwd              m2, m5, [ypxq+strideq*2]
    pmaddwd              m3, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m0, m1
    paddd                m2, m3
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .dc
    punpckhqdq           m0, m0     ; replicate the last row for padding
    pslld                m2, 2
.w4_hpad:
    mova         [acq+16*0], m0
    paddd                m4, m2
    mova         [acq+16*1], m0
    add                 acq, 16*2
    sub               hpadd, 4
    jg .w4_hpad
    jmp .dc
.w8:
%if ARCH_X86_32
    cmp         dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m1, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m2
    paddd                m1, m3
    paddd                m2, m0, m1
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz .dc
    pslld                m2, 2
    mova                 m1, m0
    jmp .hpad
.w8_wpad1:
    ; right half padded: replicate the last valid 2x2 sum across the pad
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1
    pshufd               m1, m0, q3333
    paddd                m2, m0, m1
    packssdw             m0, m1
    paddd                m4, m2
    mova              [acq], m0
    add                 acq, 16
    dec                  hd
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM       7
.w16_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*0]
    paddd                m0, m6
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*1]
    paddd                m3, m6
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+strideq*0+16*2]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*2]
    paddd                m1, m6
    ; parity-flag trick: flags are still from 'cmp wpadd, 2' above;
    ; wpad==1 -> low byte 0xFF (even parity, PF=1) -> taken,
    ; wpad==0 -> low byte 0xFE (odd parity, PF=0)  -> fall through
    jp .w16_wpad1
    pmaddwd              m2, m5, [ypxq+strideq*0+16*3]
    pmaddwd              m6, m5, [ypxq+strideq*1+16*3]
    paddd                m2, m6
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    paddd                m6, m0, m3
    packssdw             m0, m3
    paddd                m6, m1
    mova         [acq+16*0], m0
    packssdw             m1, m2
    paddd                m2, m6
    mova         [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd
    jz .dc
    paddd                m2, m2
.hpad:
    ; replicate the last output row (m0:m1) hpad times, keeping m4 updated
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    paddd                m4, m2
    mova         [acq+16*2], m0
    mova         [acq+16*3], m1
    add                 acq, 16*4
    sub               hpadd, 4
    jg .hpad
.dc:
    ; horizontally reduce the running sum in m4, compute the rounded mean
    ; of all w*h ac values, then subtract it from every value in-place
    sub                  r5, acq ; -w*h*2
    pshufd               m2, m4, q1032
    tzcnt               r1d, r5d
    paddd                m2, m4
    sub                 r1d, 2
    pshufd               m4, m2, q2301
    movd                 m0, r1d
    paddd                m2, m4
    psrld                m2, m0
    pxor                 m0, m0
    pavgw                m2, m0     ; final rounding divide by 2
    packssdw             m2, m2
.dc_loop:
    mova                 m0, [acq+r5+16*0]
    mova                 m1, [acq+r5+16*1]
    psubw                m0, m2
    psubw                m1, m2
    mova      [acq+r5+16*0], m0
    mova      [acq+r5+16*1], m1
    add                  r5, 16*2
    jl .dc_loop
    RET
1500
;-----------------------------------------------------------------------------
; ipred_cfl_ac_422: 4:2:2 luma downsampling for CfL. Only horizontal
; subsampling: each output word is pmaddwd(row, 4), i.e. (horizontal pair
; sum) * 4 = avg << 3. Padding and mean removal reuse the .hpad/.dc code
; of the 420 variant via mangled jumps.
;-----------------------------------------------------------------------------
cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
%if ARCH_X86_32 && PIC
    ; build pw_4 without a PIC-relative data load: -1 -> |.| = 1 -> <<2 = 4
    pcmpeqw              m5, m5
    pabsw                m5, m5
    psllw                m5, 2
%else
    movddup              m5, [pw_4]
%endif
    mov                  hd, hm
    shl               hpadd, 2
    pxor                 m4, m4     ; running dword sum of all ac values
    sub                  hd, hpadd
    cmp            dword wm, 8
    mov                  r5, acq    ; remember start of ac buffer for .dc
    jg .w16
    je .w8
    lea                  r3, [strideq*3]
.w4_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m3, m5, [ypxq+strideq*1]
    pmaddwd              m1, m5, [ypxq+strideq*2]
    pmaddwd              m2, m5, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    paddd                m4, m0
    packssdw             m0, m3
    paddd                m3, m1
    packssdw             m1, m2
    paddd                m4, m2
    paddd                m4, m3
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq           m1, m1     ; replicate the last row for padding
    pslld                m2, 3
    mova         [acq+16*0], m1
    mova         [acq+16*1], m1
    paddd                m4, m2
    mova         [acq+16*2], m1
    mova         [acq+16*3], m1
    add                 acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
%if ARCH_X86_32
    cmp         dword wpadm, 0
%else
    test              wpadd, wpadd
%endif
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+16*0]
    pmaddwd              m2, m5, [ypxq+strideq*0+16*1]
    pmaddwd              m1, m5, [ypxq+strideq*1+16*0]
    pmaddwd              m3, m5, [ypxq+strideq*1+16*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    mova         [acq+16*0], m0
    paddd                m2, m1, m3
    packssdw             m1, m3
    paddd                m4, m2
    mova         [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                m2, 2
    mova                 m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w8_wpad1:
    ; right half padded: replicate the last valid pair sum across the pad
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    pshufd               m2, m0, q3333
    pshufd               m3, m1, q3333
    paddd                m4, m0
    packssdw             m0, m2
    paddd                m4, m2
    paddd                m2, m1, m3
    packssdw             m1, m3
    paddd                m4, m2
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    add                 acq, 16*2
    sub                  hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad3:
    pshufd               m3, m0, q3333
    mova                 m1, m3
    mova                 m2, m3
    jmp .w16_wpad_end
.w16_wpad2:
    pshufd               m1, m3, q3333
    mova                 m2, m1
    jmp .w16_wpad_end
.w16_wpad1:
    pshufd               m2, m1, q3333
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    WIN64_SPILL_XMM       7
.w16_loop:
    pmaddwd              m0, m5, [ypxq+16*0]
    cmp               wpadd, 2
    jg .w16_wpad3
    pmaddwd              m3, m5, [ypxq+16*1]
    je .w16_wpad2
    pmaddwd              m1, m5, [ypxq+16*2]
    ; parity-flag trick on 'cmp wpadd, 2': PF=1 iff wpad==1 (see 420)
    jp .w16_wpad1
    pmaddwd              m2, m5, [ypxq+16*3]
.w16_wpad_end:
    add                ypxq, strideq
    paddd                m6, m0, m3
    packssdw             m0, m3
    mova         [acq+16*0], m0
    paddd                m6, m1
    packssdw             m1, m2
    paddd                m2, m6
    mova         [acq+16*1], m1
    add                 acq, 16*2
    paddd                m4, m2
    dec                  hd
    jg .w16_loop
    WIN64_RESTORE_XMM
    add               hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
1637
;-----------------------------------------------------------------------------
; ipred_cfl_ac_444: no subsampling; every luma pixel is just shifted left
; by 3 (psllw 3), and pmaddwd with pw_1 accumulates the sum for the mean
; removal done by the 420 variant's shared .dc code.
;-----------------------------------------------------------------------------
cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table
    LEA                  r6, ipred_cfl_ac_444_16bpc_ssse3_table
    tzcnt                wd, wm
    movifnidn         hpadd, hpadm
    pxor                 m4, m4     ; running dword sum of all ac values
    movsxd               wq, [r6+wq*4]
    movddup              m5, [base+pw_1]
    add                  wq, r6
    mov                  hd, hm
    shl               hpadd, 2
    sub                  hd, hpadd
    jmp                  wq
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq    ; remember start of ac buffer for .dc
.w4_loop:
    movq                 m0, [ypxq+strideq*0]
    movhps               m0, [ypxq+strideq*1]
    movq                 m1, [ypxq+strideq*2]
    movhps               m1, [ypxq+r3       ]
    lea                ypxq, [ypxq+strideq*4]
    psllw                m0, 3
    psllw                m1, 3
    mova         [acq+16*0], m0
    pmaddwd              m0, m5
    mova         [acq+16*1], m1
    pmaddwd              m2, m5, m1
    add                 acq, 16*2
    paddd                m4, m0
    paddd                m4, m2
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    punpckhqdq           m1, m1     ; replicate the last row for padding
    mova         [acq+16*0], m1
    pslld                m2, 2
    mova         [acq+16*1], m1
    punpckhqdq           m2, m2
    mova         [acq+16*2], m1
    paddd                m4, m2
    mova         [acq+16*3], m1
    add                 acq, 16*4
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w8:
    mov                  r5, acq
.w8_loop:
    mova                 m0, [ypxq+strideq*0]
    mova                 m1, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    psllw                m0, 3
    psllw                m1, 3
    mova         [acq+16*0], m0
    pmaddwd              m0, m5
    mova         [acq+16*1], m1
    pmaddwd              m2, m5, m1
    add                 acq, 16*2
    paddd                m4, m0
    paddd                m4, m2
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    pslld                m2, 2
    mova                 m0, m1
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w16_wpad2:
    ; replicate the last valid pixel across the padded right half
    pshufhw              m3, m2, q3333
    pshufhw              m1, m0, q3333
    punpckhqdq           m3, m3
    punpckhqdq           m1, m1
    jmp .w16_wpad_end
.w16:
    movifnidn         wpadd, wpadm
    mov                  r5, acq
.w16_loop:
    mova                 m2, [ypxq+strideq*0+16*0]
    mova                 m0, [ypxq+strideq*1+16*0]
    psllw                m2, 3
    psllw                m0, 3
    test              wpadd, wpadd
    jnz .w16_wpad2
    mova                 m3, [ypxq+strideq*0+16*1]
    mova                 m1, [ypxq+strideq*1+16*1]
    psllw                m3, 3
    psllw                m1, 3
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    mova         [acq+16*0], m2
    pmaddwd              m2, m5
    mova         [acq+16*1], m3
    pmaddwd              m3, m5
    paddd                m4, m2
    pmaddwd              m2, m5, m0
    mova         [acq+16*2], m0
    paddd                m4, m3
    pmaddwd              m3, m5, m1
    mova         [acq+16*3], m1
    add                 acq, 16*4
    paddd                m2, m3
    paddd                m4, m2
    sub                  hd, 2
    jg .w16_loop
    add               hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
    paddd                m2, m2
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad
.w32_wpad6:
    pshufhw              m1, m0, q3333
    punpckhqdq           m1, m1
    mova                 m2, m1
    mova                 m3, m1
    jmp .w32_wpad_end
.w32_wpad4:
    pshufhw              m2, m1, q3333
    punpckhqdq           m2, m2
    mova                 m3, m2
    jmp .w32_wpad_end
.w32_wpad2:
    pshufhw              m3, m2, q3333
    punpckhqdq           m3, m3
    jmp .w32_wpad_end
.w32:
    movifnidn         wpadd, wpadm
    mov                  r5, acq
    WIN64_SPILL_XMM       8
.w32_loop:
    mova                 m0, [ypxq+16*0]
    psllw                m0, 3
    cmp               wpadd, 4
    jg .w32_wpad6
    mova                 m1, [ypxq+16*1]
    psllw                m1, 3
    je .w32_wpad4
    mova                 m2, [ypxq+16*2]
    psllw                m2, 3
    ; parity-flag trick on 'cmp wpadd, 4': remaining wpad is 0 or 2;
    ; wpad==2 -> low byte 0xFE (odd parity, PF=0) -> jnp taken,
    ; wpad==0 -> low byte 0xFC (even parity, PF=1) -> fall through
    jnp .w32_wpad2
    mova                 m3, [ypxq+16*3]
    psllw                m3, 3
.w32_wpad_end:
    add                ypxq, strideq
    pmaddwd              m6, m5, m0
    mova         [acq+16*0], m0
    pmaddwd              m7, m5, m1
    mova         [acq+16*1], m1
    paddd                m6, m7
    pmaddwd              m7, m5, m2
    mova         [acq+16*2], m2
    paddd                m6, m7
    pmaddwd              m7, m5, m3
    mova         [acq+16*3], m3
    add                 acq, 16*4
    paddd                m6, m7
    paddd                m4, m6
    dec                  hd
    jg .w32_loop
%if WIN64
    ; the row sum in m6 (xmm6, callee-saved on Win64) is still needed for
    ; hpad; stash it in m5, restore xmm6/7, then SWAP so "m6" names xmm5
    mova                 m5, m6
    WIN64_RESTORE_XMM
    SWAP                  5, 6
%endif
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
.w32_hpad_loop:
    mova         [acq+16*0], m0
    mova         [acq+16*1], m1
    paddd                m4, m6
    mova         [acq+16*2], m2
    mova         [acq+16*3], m3
    add                 acq, 16*4
    dec               hpadd
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
1813
;---------------------------------------------------------------------
; void pal_pred_16bpc(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                     const uint8_t *idx, int w, int h)
; Expand palette indices into 16-bit pixels.  pal holds 8 entries;
; each byte of idx is fed straight to pshufb as a selector, so it is
; expected to hold one palette index (0..7) per pixel -- TODO confirm
; against the caller's packing.
;
; Trick: pal_pred_shuf splits the palette into byte planes,
;   m3 = { lo0..lo7, hi0..hi7 },  m4 = m3 halves swapped
;      = { hi0..hi7, lo0..lo7 },
; so one pshufb per plane fetches the low/high bytes of the selected
; entries and punpck{l,h}bw re-interleaves them into 16-bit pixels.
;---------------------------------------------------------------------
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
    %define              hd  r2d     ; reuse r2 for h once the table
%endif                               ; base is no longer needed
    mova                 m3, [palq]  ; 8 16-bit palette entries
    LEA                  r2, pal_pred_16bpc_ssse3_table
    tzcnt                wd, wm      ; log2(w) indexes the jump table
    pshufb               m3, [base+pal_pred_shuf]
    movsxd               wq, [r2+wq*4]
    pshufd               m4, m3, q1032 ; swap 8-byte halves of m3
    add                  wq, r2
    movifnidn            hd, hm
    jmp                  wq
.w4:
    ; 16 idx bytes -> 16 pixels = four 4-pixel rows per iteration.
    mova                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m3, m0  ; low bytes of selected entries
    pshufb               m2, m4, m0  ; high bytes of selected entries
    punpcklbw            m0, m1, m2  ; interleave -> 16-bit pixels 0-7
    punpckhbw            m1, m2      ; pixels 8-15
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 4
    jg .w4
    RET
.w8:
    ; 16 idx bytes -> two 8-pixel rows per iteration.
    mova                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m3, m0
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8
    RET
.w16:
    ; 16 idx bytes -> one full 16-pixel row per iteration.
    mova                 m0, [idxq]
    add                idxq, 16
    pshufb               m1, m3, m0
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w16
    RET
.w32:
    ; Two 16-byte idx loads -> one 32-pixel row per iteration.
    mova                 m0, [idxq+16*0]
    pshufb               m1, m3, m0
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova                 m2, [idxq+16*1]
    add                idxq, 16*2
    mova        [dstq+16*0], m0
    pshufb               m0, m3, m2
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m2
    punpcklbw            m2, m0, m1
    punpckhbw            m0, m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m0
    add                dstq, strideq
    dec                  hd
    jg .w32
    RET
.w64:
    ; Four 16-byte idx loads -> one 64-pixel row per iteration; stores
    ; are interleaved with the next group's shuffles for scheduling.
    mova                 m0, [idxq+16*0]
    pshufb               m1, m3, m0
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova                 m2, [idxq+16*1]
    mova        [dstq+16*0], m0
    pshufb               m0, m3, m2
    mova        [dstq+16*1], m1
    pshufb               m1, m4, m2
    punpcklbw            m2, m0, m1
    punpckhbw            m0, m1
    mova                 m1, [idxq+16*2]
    mova        [dstq+16*2], m2
    pshufb               m2, m3, m1
    mova        [dstq+16*3], m0
    pshufb               m0, m4, m1
    punpcklbw            m1, m2, m0
    punpckhbw            m2, m0
    mova                 m0, [idxq+16*3]
    add                idxq, 16*4
    mova        [dstq+16*4], m1
    pshufb               m1, m3, m0
    mova        [dstq+16*5], m2
    pshufb               m2, m4, m0
    punpcklbw            m0, m1, m2
    punpckhbw            m1, m2
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    add                dstq, strideq
    dec                  hd
    jg .w64
    RET
1924