;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8

pb_2to15_14_15: db 2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15
pb_4_5_8to13_8x0: db 4,5,8,9,10,11,12,13,0,0,0,0,0,0,0,0
pb_0to7_67x4: db 0,1,2,3,4,5,6,7,6,7,6,7,6,7,6,7

cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32
cextern pd_65535

; FIXME most top-only functions (ddl, vl, v, dc_top) can be modified to take
; only 3 registers on x86-32, which would make it one cycle faster, but that
; would make the code quite a bit uglier...

SECTION .text

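; The SCRATCH/UNSCRATCH/PRELOAD macros below paper over the difference in
; XMM register count between x86-64 (16 registers) and x86-32 (8): on
; x86-64, SCRATCH merely renames (SWAP) a register and UNSCRATCH renames it
; back, while on x86-32 they spill to and reload from the given stack slot.
; PRELOAD likewise keeps a constant in a register on x86-64 but leaves it
; in memory on x86-32. The optional last argument defines a reg_<name>
; alias that resolves to whichever of the two is in use, so callers can
; stay oblivious. A rough usage sketch (names here are illustrative only):
;
;   SCRATCH     3, 8, rsp+0*mmsize, max  ; m3 -> m8, or -> [rsp] on x86-32
;   pminsw     m0, reg_max               ; works in both cases
;   UNSCRATCH   3, 8, rsp+0*mmsize, max  ; move it back, drop the alias
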
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova              [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP                %1, %2
%else
    mova               m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

%macro PRELOAD 2-3
%if ARCH_X86_64
    mova               m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

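; Vertical prediction simply repeats the above row for every output row;
; as a rough C model (cf. the C versions in vp9dsp_template.c):
;   for (y = 0; y < size; y++)
;       memcpy(dst + y * stride, above, size * sizeof(pixel));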
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]
    mova                    m1, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]
    mova                    m1, [aq+mmsize*1]
    mova                    m2, [aq+mmsize*2]
    mova                    m3, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m2
    mova   [dstq+strideq*1+48], m3
    lea                   dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop
    RET

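; Horizontal prediction broadcasts one left pixel per output row; the left
; edge buffer is laid out bottom-to-top here, so roughly
; dst[y][x] = l[size - 1 - y], which is why the shuffles below walk the
; left buffer backwards.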
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
    mova                    m3, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                    m2, [lq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    punpckhwd               m3, m2, m2
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m3, q1111
    pshufd                  m1, m3, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m1
    lea                   dstq, [dstq+strideq*4]
    punpcklwd               m2, m2
    pshufd                  m0, m2, q3333
    pshufd                  m1, m2, q2222
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufd                  m0, m2, q1111
    pshufd                  m1, m2, q0000
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m1
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 3
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
    mov                   cntd, 7
    lea               stride3q, [strideq*3]
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*1+32], m1
    mova   [dstq+strideq*1+48], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+strideq*2+32], m2
    mova   [dstq+strideq*2+48], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    mova   [dstq+stride3q +32], m3
    mova   [dstq+stride3q +48], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

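; DC prediction fills the block with the rounded average of both edges;
; for 4x4, roughly:
;   dc = (l[0] + l[1] + l[2] + l[3] + a[0] + a[1] + a[2] + a[3] + 4) >> 3;
; pmaddwd against pw_1 performs the horizontal word sums, and the
; pd_4/psrad 3 pair implements the '+4 >> 3' rounding (pd_8/psrad 4 for
; 8x8, and so on).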
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_4]
    paddd                   m0, m1
    psrad                   m0, 3
    pshufw                  m0, m0, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq]
    paddw                   m0, [lq+mmsize]
    paddw                   m0, [aq]
    paddw                   m0, [aq+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [lq+mmsize*0]
    paddw                   m0, [lq+mmsize*1]
    paddw                   m0, [lq+mmsize*2]
    paddw                   m0, [lq+mmsize*3]
    paddw                   m0, [aq+mmsize*0]
    paddw                   m0, [aq+mmsize*1]
    paddw                   m0, [aq+mmsize*2]
    paddw                   m0, [aq+mmsize*3]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_32]
    paddd                   m0, m1
    psrad                   m0, 6
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

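; The dc_top/dc_left variants average a single edge, e.g. for 4x4 roughly
; dc = (sum of the 4 edge pixels + 2) >> 2; the macro below is therefore
; instantiated twice, once reading the above pointer and once the left one.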
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufw                  m1, m0, q3232
    paddd                   m0, [pd_2]
    paddd                   m0, m1
    psrad                   m0, 2
    pshufw                  m0, m0, q0000
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_4]
    paddd                   m0, m1
    psrad                   m0, 3
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2]
    paddw                   m0, [%2+mmsize]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_8]
    paddd                   m0, m1
    psrad                   m0, 4
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
    mova                    m0, [%2+mmsize*0]
    paddw                   m0, [%2+mmsize*1]
    paddw                   m0, [%2+mmsize*2]
    paddw                   m0, [%2+mmsize*3]
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 16
    pmaddwd                 m0, [pw_1]
    pshufd                  m1, m0, q3232
    paddd                   m0, m1
    pshufd                  m1, m0, q1111
    paddd                   m0, [pd_16]
    paddd                   m0, m1
    psrad                   m0, 5
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*0+32], m0
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*1+32], m0
    mova   [dstq+strideq*1+48], m0
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET
%endmacro

DC_1D_FNS top,  aq
DC_1D_FNS left, lq

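; TrueMotion ('tm') prediction computes, per pixel, left + above minus the
; top-left corner, clamped to the valid pixel range; the range differs per
; bitdepth, hence the separate _10/_12 entry points sharing one .body.
; A rough C model (a[-1] is the top-left pixel, loaded via [aq-4] below):
;   for (y = 0; y < size; y++)
;       for (x = 0; x < size; x++)
;           dst[y * stride + x] =
;               av_clip(l[size - 1 - y] + a[x] - a[-1], 0, (1 << bpp) - 1);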
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_1023]
.body:
    mova                    m4, [aq]
    mova                    m3, [lq]
    movd                    m0, [aq-4]
    pshufw                  m0, m0, q1111
    psubw                   m4, m0
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pshufw                  m0, m3, q3333
    pshufw                  m1, m3, q2222
    pshufw                  m2, m3, q1111
    pshufw                  m3, m3, q0000
    paddw                   m0, m4
    paddw                   m1, m4
    paddw                   m2, m4
    paddw                   m3, m4
    pxor                    m4, m4
    pmaxsw                  m0, m4
    pmaxsw                  m1, m4
    pmaxsw                  m2, m4
    pmaxsw                  m3, m4
    pminsw                  m0, m5
    pminsw                  m1, m5
    pminsw                  m2, m5
    pminsw                  m3, m5
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_1023]
.body:
    pxor                    m6, m6
    mova                    m5, [aq]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m5, m0
    DEFINE_ARGS dst, stride, l, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 1
.loop:
    movh                    m3, [lq+cntq*8]
    punpcklwd               m3, m3
    pshufd                  m0, m3, q3333
    pshufd                  m1, m3, q2222
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m5
    paddw                   m1, m5
    paddw                   m2, m5
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m1, m6
    pmaxsw                  m2, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m4
    pminsw                  m1, m4
    pminsw                  m2, m4
    pminsw                  m3, m4
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
    mova                    m4, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_1023]
.body:
    pxor                    m6, m6
    mova                    m4, [aq]
    mova                    m5, [aq+mmsize]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m4, m0
    psubw                   m5, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 7
.loop:
    movd                    m3, [lq+cntq*4]
    punpcklwd               m3, m3
    pshufd                  m2, m3, q1111
    pshufd                  m3, m3, q0000
    paddw                   m0, m2, m4
    paddw                   m2, m5
    paddw                   m1, m3, m4
    paddw                   m3, m5
    pmaxsw                  m0, m6
    pmaxsw                  m2, m6
    pmaxsw                  m1, m6
    pmaxsw                  m3, m6
    pminsw                  m0, m7
    pminsw                  m2, m7
    pminsw                  m1, m7
    pminsw                  m3, m7
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m2
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m3
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
    mova                    m7, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body

INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_1023]
.body:
    pxor                    m1, m1
%if ARCH_X86_64
    SWAP                     0, 8
    SWAP                     1, 9
%define reg_min m9
%define reg_max m8
%else
    mova              [rsp+ 0], m0
    mova              [rsp+16], m1
%define reg_min [rsp+16]
%define reg_max [rsp+ 0]
%endif

    mova                    m4, [aq+mmsize*0]
    mova                    m5, [aq+mmsize*1]
    mova                    m6, [aq+mmsize*2]
    mova                    m7, [aq+mmsize*3]
    movd                    m0, [aq-4]
    pshuflw                 m0, m0, q1111
    punpcklqdq              m0, m0
    psubw                   m4, m0
    psubw                   m5, m0
    psubw                   m6, m0
    psubw                   m7, m0
    DEFINE_ARGS dst, stride, l, cnt
    mov                   cntd, 31
.loop:
    pinsrw                  m3, [lq+cntq*2], 0
    punpcklwd               m3, m3
    pshufd                  m3, m3, q0000
    paddw                   m0, m3, m4
    paddw                   m1, m3, m5
    paddw                   m2, m3, m6
    paddw                   m3, m7
    pmaxsw                  m0, reg_min
    pmaxsw                  m1, reg_min
    pmaxsw                  m2, reg_min
    pmaxsw                  m3, reg_min
    pminsw                  m0, reg_max
    pminsw                  m1, reg_max
    pminsw                  m2, reg_max
    pminsw                  m3, reg_max
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
    add                   dstq, strideq
    dec                   cntd
    jge .loop
    RET

cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * -ARCH_X86_32, dst, stride, l, a
    mova                    m0, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body

; Directional intra prediction functions
;
; In the functions below, 'abcdefgh' refers to above data (sometimes simply
; abbreviated as a[N-M]). 'stuvwxyz' refers to left data (sometimes simply
; abbreviated as l[N-M]). * is top-left data. ABCDEFG or A[N-M] is filtered
; above data, STUVWXYZ or L[N-M] is filtered left data, and # is filtered
; top-left data.

; left=(left+2*center+right+2)>>2
%macro LOWPASS 3 ; left [dst], center, right
    paddw                  m%1, m%3
    psraw                  m%1, 1
    pavgw                  m%1, m%2
%endmacro
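
; Rounding check for the pavgw trick: write left + right = 2k + b, where b
; is the low bit dropped by psraw. The macro then computes
; (k + center + 1) >> 1, i.e. (2k + 2*center + 2) >> 2. Since
; 2k + 2*center + 2 is even, adding b (0 or 1) back under the >> 2 cannot
; move the floor, so the result exactly equals (left+2*center+right+2)>>2.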

; abcdefgh (src) -> bcdefghh (dst)
; dst/src can be the same register
%macro SHIFT_RIGHT 2-3 [pb_2to15_14_15] ; dst, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb                  %1, %2, %3              ; abcdefgh -> bcdefghh
%else
    psrldq                  %1, %2, 2               ; abcdefgh -> bcdefgh.
    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
%endif
%endmacro

; abcdefgh (src) -> bcdefghh (dst1) and cdefghhh (dst2)
%macro SHIFT_RIGHTx2 3-4 [pb_2to15_14_15] ; dst1, dst2, src, [ssse3_shift_reg]
%if cpuflag(ssse3)
    pshufb                  %1, %3, %4              ; abcdefgh -> bcdefghh
    pshufb                  %2, %1, %4              ; bcdefghh -> cdefghhh
%else
    psrldq                  %1, %3, 2               ; abcdefgh -> bcdefgh.
    psrldq                  %2, %3, 4               ; abcdefgh -> cdefgh..
    pshufhw                 %1, %1, q2210           ; bcdefgh. -> bcdefghh
    pshufhw                 %2, %2, q1110           ; cdefgh.. -> cdefghhh
%endif
%endmacro

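; Diagonal down-left ('dl') prediction: each down-left diagonal of the
; block is a single filtered above pixel. As a rough C model of the 8x8
; and larger cases (the 4x4 version has 8 above pixels available and
; filters across all of them; AVG3 is the LOWPASS filter above):
;   #define AVG3(l, c, r) (((l) + 2 * (c) + (r) + 2) >> 2)
;   pixel v[size - 1];
;   for (i = 0; i < size - 2; i++)
;       v[i] = AVG3(a[i], a[i + 1], a[i + 2]);
;   v[size - 2] = (a[size - 2] + 3 * a[size - 1] + 2) >> 2;
;   for (y = 0; y < size; y++)
;       for (x = 0; x < size; x++)
;           dst[y * stride + x] = x + y < size - 1 ? v[x + y]
;                                                  : a[size - 1];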
%macro DL_FUNCS 0
cglobal vp9_ipred_dl_4x4_16, 2, 4, 3, dst, stride, l, a
    movifnidn               aq, amp
    movu                    m1, [aq]                ; abcdefgh
    pshufhw                 m0, m1, q3310           ; abcdefhh
    SHIFT_RIGHT             m1, m1                  ; bcdefghh
    psrldq                  m2, m1, 2               ; cdefghh.
    LOWPASS                  0,  1,  2              ; BCDEFGh.
    pshufd                  m1, m0, q3321           ; DEFGh...
    movh      [dstq+strideq*0], m0
    movh      [dstq+strideq*2], m1
    add                   dstq, strideq
    psrldq                  m0, 2                   ; CDEFGh..
    psrldq                  m1, 2                   ; EFGh....
    movh      [dstq+strideq*0], m0
    movh      [dstq+strideq*2], m1
    RET

cglobal vp9_ipred_dl_8x8_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                ; abcdefgh
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m2, m0, m4          ; bcdefghh/cdefghhh
    LOWPASS                  0,  1,  2              ; BCDEFGHh
    shufps                  m1, m0, m2, q3332       ; FGHhhhhh
    shufps                  m3, m0, m1, q2121       ; DEFGHhhh
    DEFINE_ARGS dst, stride, stride5
    lea               stride5q, [strideq*5]

    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*4], m1
    SHIFT_RIGHT             m0, m0, m4              ; CDEFGHhh
    pshuflw                 m1, m1, q3321           ; GHhhhhhh
    pshufd                  m2, m0, q3321           ; EFGHhhhh
    mova      [dstq+strideq*1], m0
    mova      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    pshuflw                 m1, m1, q3321           ; Hhhhhhhh
    mova      [dstq+strideq*0], m3
    mova      [dstq+strideq*4], m1
    pshuflw                 m1, m1, q3321           ; hhhhhhhh
    mova      [dstq+strideq*1], m2
    mova      [dstq+stride5q ], m1
    RET

cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                ; abcdefgh
    mova                    m3, [aq+mmsize]         ; ijklmnop
    PALIGNR                 m1, m3, m0, 2, m4       ; bcdefghi
    PALIGNR                 m2, m3, m0, 4, m4       ; cdefghij
    LOWPASS                  0,  1,  2              ; BCDEFGHI
%if cpuflag(ssse3)
    mova                    m4, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m2, m1, m3, m4          ; jklmnopp/klmnoppp
    LOWPASS                  1,  2,  3              ; JKLMNOPp
    pshufd                  m2, m2, q3333           ; pppppppp
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 8

.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*8+ 0], m1
    mova   [dstq+strideq*8+16], m2
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 2
%else
    PALIGNR                 m3, m1, m0, 2, m4
    mova                    m0, m3
%endif
    SHIFT_RIGHT             m1, m1, m4
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32_16, 2, 5, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]       ; abcdefgh
    mova                    m1, [aq+mmsize*1]       ; ijklmnop
    mova                    m2, [aq+mmsize*2]       ; qrstuvwx
    mova                    m3, [aq+mmsize*3]       ; yz012345
    PALIGNR                 m4, m1, m0, 2, m6
    PALIGNR                 m5, m1, m0, 4, m6
    LOWPASS                  0,  4,  5              ; BCDEFGHI
    PALIGNR                 m4, m2, m1, 2, m6
    PALIGNR                 m5, m2, m1, 4, m6
    LOWPASS                  1,  4,  5              ; JKLMNOPQ
    PALIGNR                 m4, m3, m2, 2, m6
    PALIGNR                 m5, m3, m2, 4, m6
    LOWPASS                  2,  4,  5              ; RSTUVWXY
%if cpuflag(ssse3)
    mova                    m6, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m4, m5, m3, m6
    LOWPASS                  3,  4,  5              ; Z0123455
    pshufd                  m4, m4, q3333           ; 55555555
    DEFINE_ARGS dst, stride, stride8, stride24, cnt
    mov                   cntd, 8
    lea               stride8q, [strideq*8]
    lea              stride24q, [stride8q*3]

.loop:
    mova  [dstq+stride8q*0+ 0], m0
    mova  [dstq+stride8q*0+16], m1
    mova  [dstq+stride8q*0+32], m2
    mova  [dstq+stride8q*0+48], m3
    mova  [dstq+stride8q*1+ 0], m1
    mova  [dstq+stride8q*1+16], m2
    mova  [dstq+stride8q*1+32], m3
    mova  [dstq+stride8q*1+48], m4
    mova  [dstq+stride8q*2+ 0], m2
    mova  [dstq+stride8q*2+16], m3
    mova  [dstq+stride8q*2+32], m4
    mova  [dstq+stride8q*2+48], m4
    mova  [dstq+stride24q + 0], m3
    mova  [dstq+stride24q +16], m4
    mova  [dstq+stride24q +32], m4
    mova  [dstq+stride24q +48], m4
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 2
    vpalignr                m1, m2, m1, 2
    vpalignr                m2, m3, m2, 2
%else
    PALIGNR                 m5, m1, m0, 2, m6
    mova                    m0, m5
    PALIGNR                 m5, m2, m1, 2, m6
    mova                    m1, m5
    PALIGNR                 m5, m3, m2, 2, m6
    mova                    m2, m5
%endif
    SHIFT_RIGHT             m3, m3, m6
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DL_FUNCS
INIT_XMM ssse3
DL_FUNCS
INIT_XMM avx
DL_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq]                   ; abcdefghijklmnop
    vpbroadcastw           xm1, [aq+30]                ; pppppppp
    vperm2i128              m2, m0, m1, q0201          ; ijklmnoppppppppp
    vpalignr                m3, m2, m0, 2              ; bcdefghijklmnopp
    vpalignr                m4, m2, m0, 4              ; cdefghijklmnoppp
    LOWPASS                  0,  3,  4                 ; BCDEFGHIJKLMNOPp
    vperm2i128              m2, m0, m1, q0201          ; JKLMNOPppppppppp
    DEFINE_ARGS dst, stride, stride3, cnt
    mov                   cntd, 2
    lea               stride3q, [strideq*3]

.loop:
    mova      [dstq+strideq*0], m0
    vpalignr                m3, m2, m0, 2
    vpalignr                m4, m2, m0, 4
    mova      [dstq+strideq*1], m3
    mova      [dstq+strideq*2], m4
    vpalignr                m3, m2, m0, 6
    vpalignr                m4, m2, m0, 8
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m4
    vpalignr                m3, m2, m0, 10
    vpalignr                m4, m2, m0, 12
    mova      [dstq+strideq*1], m3
    mova      [dstq+strideq*2], m4
    vpalignr                m3, m2, m0, 14
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    mova                    m0, m2
    vperm2i128              m2, m2, m2, q0101          ; pppppppppppppppp
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0+ 0]       ; abcdefghijklmnop
    mova                    m1, [aq+mmsize*1+ 0]       ; qrstuvwxyz012345
    vpbroadcastw           xm4, [aq+mmsize*1+30]       ; 55555555
    vperm2i128              m5, m0, m1, q0201          ; ijklmnopqrstuvwx
    vpalignr                m2, m5, m0, 2              ; bcdefghijklmnopq
    vpalignr                m3, m5, m0, 4              ; cdefghijklmnopqr
    LOWPASS                  0,  2,  3                 ; BCDEFGHIJKLMNOPQ
    vperm2i128              m5, m1, m4, q0201          ; yz01234555555555
    vpalignr                m2, m5, m1, 2              ; rstuvwxyz0123455
    vpalignr                m3, m5, m1, 4              ; stuvwxyz01234555
    LOWPASS                  1,  2,  3                 ; RSTUVWXYZ......5
    vperm2i128              m2, m1, m4, q0201          ; Z......555555555
    vperm2i128              m5, m0, m1, q0201          ; JKLMNOPQRSTUVWXY
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4

.loop:
    mova   [dstq+strideq*0 + 0], m0
    mova   [dstq+strideq*0 +32], m1
    vpalignr                 m3, m5, m0, 2
    vpalignr                 m4, m2, m1, 2
    mova   [dstq+strideq*1 + 0], m3
    mova   [dstq+strideq*1 +32], m4
    vpalignr                 m3, m5, m0, 4
    vpalignr                 m4, m2, m1, 4
    mova   [dstq+strideq*2 + 0], m3
    mova   [dstq+strideq*2 +32], m4
    vpalignr                 m3, m5, m0, 6
    vpalignr                 m4, m2, m1, 6
    mova   [dstq+stride3q*1+ 0], m3
    mova   [dstq+stride3q*1+32], m4
    lea                    dstq, [dstq+strideq*4]
    vpalignr                 m3, m5, m0, 8
    vpalignr                 m4, m2, m1, 8
    mova   [dstq+strideq*0 + 0], m3
    mova   [dstq+strideq*0 +32], m4
    vpalignr                 m3, m5, m0, 10
    vpalignr                 m4, m2, m1, 10
    mova   [dstq+strideq*1 + 0], m3
    mova   [dstq+strideq*1 +32], m4
    vpalignr                 m3, m5, m0, 12
    vpalignr                 m4, m2, m1, 12
    mova   [dstq+strideq*2+ 0], m3
    mova   [dstq+strideq*2+32], m4
    vpalignr                 m3, m5, m0, 14
    vpalignr                 m4, m2, m1, 14
    mova   [dstq+stride3q+  0], m3
    mova   [dstq+stride3q+ 32], m4
    vpalignr                 m3, m5, m0, 16
    vpalignr                 m4, m2, m1, 16
    vperm2i128               m5, m3, m4, q0201
    vperm2i128               m2, m4, m4, q0101
    mova                     m0, m3
    mova                     m1, m4
    lea                    dstq, [dstq+strideq*4]
    dec                    cntd
    jg .loop
    RET
%endif

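; Diagonal down-right ('dr') prediction: each down-right diagonal is one
; pixel of the filtered edge running from the bottom-left, through the
; top-left corner, to the top-right. A rough C model (left edge stored
; bottom-to-top; s[] is the unfiltered edge):
;   #define AVG3(l, c, r) (((l) + 2 * (c) + (r) + 2) >> 2)
;   pixel s[2 * size + 1], e[2 * size - 1];
;   /* s = l[0..size-1], then the top-left pixel, then a[0..size-1] */
;   for (i = 0; i < 2 * size - 1; i++)
;       e[i] = AVG3(s[i], s[i + 1], s[i + 2]);
;   for (y = 0; y < size; y++)
;       for (x = 0; x < size; x++)
;           dst[y * stride + x] = e[x + size - 1 - y];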
%macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
    movh                    m0, [lq]                ; wxyz....
    movhps                  m0, [aq-2]              ; wxyz*abc
    movd                    m1, [aq+6]              ; d.......
    PALIGNR                 m1, m0, 2, m2           ; xyz*abcd
    psrldq                  m2, m1, 2               ; yz*abcd.
    LOWPASS                  0, 1, 2                ; XYZ#ABC.
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+stride3q ], m0
    psrldq                  m0, 2                   ; YZ#ABC..
    movh      [dstq+strideq*2], m0
    psrldq                  m0, 2                   ; Z#ABC...
    movh      [dstq+strideq*1], m0
    psrldq                  m0, 2                   ; #ABC....
    movh      [dstq+strideq*0], m0
    RET

cglobal vp9_ipred_dr_8x8_16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]                ; stuvwxyz
    movu                    m1, [aq-2]              ; *abcdefg
    mova                    m2, [aq]                ; abcdefgh
    psrldq                  m3, m2, 2               ; bcdefgh.
    LOWPASS                  3,  2, 1               ; ABCDEFG.
    PALIGNR                 m1, m0, 2, m4           ; tuvwxyz*
    PALIGNR                 m2, m1, 2, m4           ; uvwxyz*a
    LOWPASS                  2,  1, 0               ; TUVWXYZ#
    DEFINE_ARGS dst, stride, dst4, stride3
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]

    movhps [dstq +stride3q +0], m2
    movh   [dstq+ stride3q +8], m3
    mova   [dst4q+stride3q +0], m2
    PALIGNR                 m1, m3, m2, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*2+0], m1
    movh   [dstq+ strideq*2+8], m3
    mova   [dst4q+strideq*2+0], m1
    PALIGNR                 m2, m3, m1, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*1+0], m2
    movh   [dstq+ strideq*1+8], m3
    mova   [dst4q+strideq*1+0], m2
    PALIGNR                 m1, m3, m2, 2, m0
    psrldq                  m3, 2
    movhps [dstq +strideq*0+0], m1
    movh   [dstq+ strideq*0+8], m3
    mova   [dst4q+strideq*0+0], m1
    RET

cglobal vp9_ipred_dr_16x16_16, 4, 4, 7, dst, stride, l, a
    mova                    m0, [lq]                ; klmnopqr
    mova                    m1, [lq+mmsize]         ; stuvwxyz
    movu                    m2, [aq-2]              ; *abcdefg
    movu                    m3, [aq+mmsize-2]       ; hijklmno
    mova                    m4, [aq]                ; abcdefgh
    mova                    m5, [aq+mmsize]         ; ijklmnop
    psrldq                  m6, m5, 2               ; jklmnop.
    LOWPASS                  6,  5, 3               ; IJKLMNO.
    PALIGNR                 m5, m4, 2, m3           ; bcdefghi
    LOWPASS                  5,  4, 2               ; ABCDEFGH
    PALIGNR                 m2, m1, 2, m3           ; tuvwxyz*
    PALIGNR                 m4, m2, 2, m3           ; uvwxyz*a
    LOWPASS                  4,  2, 1               ; TUVWXYZ#
    PALIGNR                 m1, m0, 2, m3           ; lmnopqrs
    PALIGNR                 m2, m1, 2, m3           ; mnopqrst
    LOWPASS                  2, 1, 0                ; LMNOPQRS
    DEFINE_ARGS dst, stride, dst8, cnt
    lea                  dst8q, [dstq+strideq*8]
    mov                   cntd, 8

.loop:
    sub                  dst8q, strideq
    mova  [dst8q+strideq*0+ 0], m4
    mova  [dst8q+strideq*0+16], m5
    mova  [dst8q+strideq*8+ 0], m2
    mova  [dst8q+strideq*8+16], m4
%if cpuflag(avx)
    vpalignr                m2, m4, m2, 2
    vpalignr                m4, m5, m4, 2
    vpalignr                m5, m6, m5, 2
%else
    PALIGNR                 m0, m4, m2, 2, m1
    mova                    m2, m0
    PALIGNR                 m0, m5, m4, 2, m1
    mova                    m4, m0
    PALIGNR                 m0, m6, m5, 2, m1
    mova                    m5, m0
%endif
    psrldq                  m6, 2
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_dr_32x32_16, 4, 5, 10 + notcpuflag(ssse3), \
                               %1 * ARCH_X86_32 * -mmsize, dst, stride, l, a
    mova                    m0, [aq+mmsize*3]       ; a[24-31]
    movu                    m1, [aq+mmsize*3-2]     ; a[23-30]
    psrldq                  m2, m0, 2               ; a[25-31].
    LOWPASS                  2,  0, 1               ; A[24-30].
    mova                    m1, [aq+mmsize*2]       ; a[16-23]
    movu                    m3, [aq+mmsize*2-2]     ; a[15-22]
    PALIGNR                 m0, m1, 2, m4           ; a[17-24]
    LOWPASS                  0,  1, 3               ; A[16-23]
    mova                    m3, [aq+mmsize*1]       ; a[8-15]
    movu                    m4, [aq+mmsize*1-2]     ; a[7-14]
    PALIGNR                 m1, m3, 2, m5           ; a[9-16]
    LOWPASS                  1,  3, 4               ; A[8-15]
    mova                    m4, [aq+mmsize*0]       ; a[0-7]
    movu                    m5, [aq+mmsize*0-2]     ; *a[0-6]
    PALIGNR                 m3, m4, 2, m6           ; a[1-8]
    LOWPASS                  3,  4, 5               ; A[0-7]
    SCRATCH                  1,  8, rsp+0*mmsize
    SCRATCH                  3,  9, rsp+1*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  0, 10, rsp+2*mmsize
%endif
    mova                    m6, [lq+mmsize*3]       ; l[24-31]
    PALIGNR                 m5, m6, 2, m0           ; l[25-31]*
    PALIGNR                 m4, m5, 2, m0           ; l[26-31]*a
    LOWPASS                  4,  5, 6               ; L[25-31]#
    mova                    m7, [lq+mmsize*2]       ; l[16-23]
    PALIGNR                 m6, m7, 2, m0           ; l[17-24]
    PALIGNR                 m5, m6, 2, m0           ; l[18-25]
    LOWPASS                  5,  6, 7               ; L[17-24]
    mova                    m1, [lq+mmsize*1]       ; l[8-15]
    PALIGNR                 m7, m1, 2, m0           ; l[9-16]
    PALIGNR                 m6, m7, 2, m0           ; l[10-17]
    LOWPASS                  6,  7, 1               ; L[9-16]
    mova                    m3, [lq+mmsize*0]       ; l[0-7]
    PALIGNR                 m1, m3, 2, m0           ; l[1-8]
    PALIGNR                 m7, m1, 2, m0           ; l[2-9]
    LOWPASS                  7,  1, 3               ; L[1-8]
%if cpuflag(ssse3)
%if cpuflag(avx)
    UNSCRATCH                1,  8, rsp+0*mmsize
%endif
    UNSCRATCH                3,  9, rsp+1*mmsize
%else
    UNSCRATCH                0, 10, rsp+2*mmsize
%endif
    DEFINE_ARGS dst8, stride, stride8, stride24, cnt
    lea               stride8q, [strideq*8]
    lea              stride24q, [stride8q*3]
    lea                  dst8q, [dst8q+strideq*8]
    mov                   cntd, 8

.loop:
    sub                  dst8q, strideq
%if notcpuflag(avx)
    UNSCRATCH                1,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                3,  9, rsp+1*mmsize
%endif
%endif
    mova [dst8q+stride8q*0+ 0], m4
    mova [dst8q+stride8q*0+16], m3
    mova [dst8q+stride8q*0+32], m1
    mova [dst8q+stride8q*0+48], m0
    mova [dst8q+stride8q*1+ 0], m5
    mova [dst8q+stride8q*1+16], m4
    mova [dst8q+stride8q*1+32], m3
    mova [dst8q+stride8q*1+48], m1
    mova [dst8q+stride8q*2+ 0], m6
    mova [dst8q+stride8q*2+16], m5
    mova [dst8q+stride8q*2+32], m4
    mova [dst8q+stride8q*2+48], m3
    mova [dst8q+stride24q + 0], m7
    mova [dst8q+stride24q +16], m6
    mova [dst8q+stride24q +32], m5
    mova [dst8q+stride24q +48], m4
%if cpuflag(avx)
    vpalignr                m7, m6, m7, 2
    vpalignr                m6, m5, m6, 2
    vpalignr                m5, m4, m5, 2
    vpalignr                m4, m3, m4, 2
    vpalignr                m3, m1, m3, 2
    vpalignr                m1, m0, m1, 2
    vpalignr                m0, m2, m0, 2
%else
    SCRATCH                  2,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  0,  9, rsp+1*mmsize
%endif
    PALIGNR                 m2, m6, m7, 2, m0
    mova                    m7, m2
    PALIGNR                 m2, m5, m6, 2, m0
    mova                    m6, m2
    PALIGNR                 m2, m4, m5, 2, m0
    mova                    m5, m2
    PALIGNR                 m2, m3, m4, 2, m0
    mova                    m4, m2
    PALIGNR                 m2, m1, m3, 2, m0
    mova                    m3, m2
%if notcpuflag(ssse3)
    UNSCRATCH                0,  9, rsp+1*mmsize
    SCRATCH                  3,  9, rsp+1*mmsize
%endif
    PALIGNR                 m2, m0, m1, 2, m3
    mova                    m1, m2
    UNSCRATCH                2,  8, rsp+0*mmsize
    SCRATCH                  1,  8, rsp+0*mmsize
    PALIGNR                 m1, m2, m0, 2, m3
    mova                    m0, m1
%endif
    psrldq                  m2, 2
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DR_FUNCS 3
INIT_XMM ssse3
DR_FUNCS 2
INIT_XMM avx
DR_FUNCS 2

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
    mova                    m0, [lq]                   ; klmnopqrstuvwxyz
    movu                    m1, [aq-2]                 ; *abcdefghijklmno
    mova                    m2, [aq]                   ; abcdefghijklmnop
    vperm2i128              m4, m2, m2, q2001          ; ijklmnop........
    vpalignr                m5, m4, m2, 2              ; bcdefghijklmnop.
    vperm2i128              m3, m0, m1, q0201          ; stuvwxyz*abcdefg
    LOWPASS                  1,  2,  5                 ; ABCDEFGHIJKLMNO.
    vpalignr                m4, m3, m0, 2              ; lmnopqrstuvwxyz*
    vpalignr                m5, m3, m0, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  0,  4,  5                 ; LMNOPQRSTUVWXYZ#
    vperm2i128              m5, m0, m1, q0201          ; TUVWXYZ#ABCDEFGH
    DEFINE_ARGS dst, stride, stride3, stride5, dst3
    lea                  dst3q, [dstq+strideq*4]
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]

    vpalignr                m3, m5, m0, 2
    vpalignr                m4, m1, m5, 2
    mova    [dst3q+stride5q*2], m3                     ; 14
    mova    [ dstq+stride3q*2], m4                     ; 6
    vpalignr                m3, m5, m0, 4
    vpalignr                m4, m1, m5, 4
    sub                  dst3q, strideq
    mova    [dst3q+stride5q*2], m3                     ; 13
    mova    [dst3q+strideq*2 ], m4                     ; 5
    mova    [dst3q+stride3q*4], m0                     ; 15
    vpalignr                m3, m5, m0, 6
    vpalignr                m4, m1, m5, 6
    mova     [dstq+stride3q*4], m3                     ; 12
    mova     [dst3q+strideq*1], m4                     ; 4
    vpalignr                m3, m5, m0, 8
    vpalignr                m4, m1, m5, 8
    mova     [dst3q+strideq*8], m3                     ; 11
    mova     [dst3q+strideq*0], m4                     ; 3
    vpalignr                m3, m5, m0, 10
    vpalignr                m4, m1, m5, 10
    mova     [dstq+stride5q*2], m3                     ; 10
    mova     [dstq+strideq*2 ], m4                     ; 2
    vpalignr                m3, m5, m0, 12
    vpalignr                m4, m1, m5, 12
    mova    [dst3q+stride3q*2], m3                     ; 9
    mova     [dstq+strideq*1 ], m4                     ; 1
    vpalignr                m3, m5, m0, 14
    vpalignr                m4, m1, m5, 14
    mova      [dstq+strideq*8], m3                     ; 8
    mova      [dstq+strideq*0], m4                     ; 0
    mova     [dst3q+strideq*4], m5                     ; 7
    RET

%if ARCH_X86_64
cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
    mova                    m0, [lq+mmsize*0+0]        ; l[0-15]
    mova                    m1, [lq+mmsize*1+0]        ; l[16-31]
    movu                    m2, [aq+mmsize*0-2]        ; *abcdefghijklmno
    mova                    m3, [aq+mmsize*0+0]        ; abcdefghijklmnop
    mova                    m4, [aq+mmsize*1+0]        ; qrstuvwxyz012345
    vperm2i128              m5, m0, m1, q0201          ; lmnopqrstuvwxyz0
    vpalignr                m6, m5, m0, 2              ; mnopqrstuvwxyz01
    vpalignr                m7, m5, m0, 4              ; nopqrstuvwxyz012
    LOWPASS                  0,  6,  7                 ; L[0-15]
    vperm2i128              m7, m1, m2, q0201          ; stuvwxyz*abcdefg
    vpalignr                m5, m7, m1, 2              ; lmnopqrstuvwxyz*
    vpalignr                m6, m7, m1, 4              ; mnopqrstuvwxyz*a
    LOWPASS                  1,  5,  6                 ; L[16-31]#
    vperm2i128              m5, m3, m4, q0201          ; ijklmnopqrstuvwx
    vpalignr                m6, m5, m3, 2              ; bcdefghijklmnopq
    LOWPASS                  2,  3,  6                 ; A[0-15]
    movu                    m3, [aq+mmsize*1-2]        ; pqrstuvwxyz01234
    vperm2i128              m6, m4, m4, q2001          ; yz012345........
    vpalignr                m7, m6, m4, 2              ; rstuvwxyz012345.
    LOWPASS                  3,  4,  7                 ; A[16-31].
    vperm2i128              m4, m1, m2, q0201          ; TUVWXYZ#ABCDEFGH
    vperm2i128              m5, m0, m1, q0201          ; L[7-15]L[16-23]
    vperm2i128              m8, m2, m3, q0201          ; IJKLMNOPQRSTUVWX
    DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
    lea               stride3q, [strideq*3]
    lea               stride5q, [stride3q+strideq*2]
    lea               stride7q, [strideq*4+stride3q]
    lea                 dst24q, [dst8q+stride3q*8]
    lea                  dst8q, [dst8q+strideq*8]
    mov                   cntd, 2

.loop:
    mova  [dst24q+stride7q+0 ], m0                     ; 31 23 15 7
    mova  [dst24q+stride7q+32], m1
    mova    [dst8q+stride7q+0], m1
    mova   [dst8q+stride7q+32], m2
    vpalignr                m6, m4, m1, 2
    vpalignr                m7, m5, m0, 2
    vpalignr                m9, m8, m2, 2
    mova [dst24q+stride3q*2+0], m7                     ; 30 22 14 6
    mova [dst24q+stride3q*2+32], m6
    mova  [dst8q+stride3q*2+0], m6
    mova [dst8q+stride3q*2+32], m9
    vpalignr                m6, m4, m1, 4
    vpalignr                m7, m5, m0, 4
    vpalignr                m9, m8, m2, 4
    mova   [dst24q+stride5q+0], m7                     ; 29 21 13 5
    mova  [dst24q+stride5q+32], m6
    mova    [dst8q+stride5q+0], m6
    mova   [dst8q+stride5q+32], m9
    vpalignr                m6, m4, m1, 6
    vpalignr                m7, m5, m0, 6
    vpalignr                m9, m8, m2, 6
    mova [dst24q+strideq*4+0 ], m7                     ; 28 20 12 4
    mova [dst24q+strideq*4+32], m6
    mova   [dst8q+strideq*4+0], m6
    mova  [dst8q+strideq*4+32], m9
    vpalignr                m6, m4, m1, 8
    vpalignr                m7, m5, m0, 8
    vpalignr                m9, m8, m2, 8
    mova  [dst24q+stride3q+0 ], m7                     ; 27 19 11 3
    mova  [dst24q+stride3q+32], m6
    mova    [dst8q+stride3q+0], m6
    mova   [dst8q+stride3q+32], m9
    vpalignr                m6, m4, m1, 10
    vpalignr                m7, m5, m0, 10
    vpalignr                m9, m8, m2, 10
    mova [dst24q+strideq*2+0 ], m7                     ; 26 18 10 2
    mova [dst24q+strideq*2+32], m6
    mova   [dst8q+strideq*2+0], m6
    mova  [dst8q+strideq*2+32], m9
    vpalignr                m6, m4, m1, 12
    vpalignr                m7, m5, m0, 12
    vpalignr                m9, m8, m2, 12
    mova   [dst24q+strideq+0 ], m7                     ; 25 17 9 1
    mova   [dst24q+strideq+32], m6
    mova     [dst8q+strideq+0], m6
    mova    [dst8q+strideq+32], m9
    vpalignr                m6, m4, m1, 14
    vpalignr                m7, m5, m0, 14
    vpalignr                m9, m8, m2, 14
    mova [dst24q+strideq*0+0 ], m7                     ; 24 16 8 0
    mova [dst24q+strideq*0+32], m6
    mova   [dst8q+strideq*0+0], m6
    mova  [dst8q+strideq*0+32], m9
    mova                    m0, m5
    mova                    m5, m1
    mova                    m1, m4
    mova                    m4, m2
    mova                    m2, m8
    mova                    m8, m3
    sub                 dst24q, stride7q
    sub                 dst24q, strideq
    sub                  dst8q, stride7q
    sub                  dst8q, strideq
    dec                   cntd
    jg .loop
    RET
%endif
%endif

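; Vertical-left ('vl') prediction: even rows are the 2-tap average of the
; above row shifted by y/2 positions, odd rows the 3-tap LOWPASS of the
; same position; the last above pixel is repeated past the edge for the
; 8x8 and larger sizes. A rough C model:
;   #define AVG2(a, b)    (((a) + (b) + 1) >> 1)
;   #define AVG3(l, c, r) (((l) + 2 * (c) + (r) + 2) >> 2)
;   for (y = 0; y < size; y++)
;       for (x = 0; x < size; x++) {
;           int i = x + (y >> 1);
;           dst[y * stride + x] = (y & 1) ? AVG3(a[i], a[i + 1], a[i + 2])
;                                         : AVG2(a[i], a[i + 1]);
;       }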
1328%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
1329cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
1330    movifnidn               aq, amp
1331    movu                    m0, [aq]                ; abcdefgh
1332    psrldq                  m1, m0, 2               ; bcdefgh.
1333    psrldq                  m2, m0, 4               ; cdefgh..
1334    LOWPASS                  2,  1, 0               ; BCDEFGH.
1335    pavgw                   m1, m0                  ; ABCDEFG.
1336    DEFINE_ARGS dst, stride, stride3
1337    lea               stride3q, [strideq*3]
1338
1339    movh      [dstq+strideq*0], m1
1340    movh      [dstq+strideq*1], m2
1341    psrldq                  m1, 2
1342    psrldq                  m2, 2
1343    movh      [dstq+strideq*2], m1
1344    movh      [dstq+stride3q ], m2
1345    RET
1346
1347cglobal vp9_ipred_vl_8x8_16, 2, 4, 4, dst, stride, l, a
1348    movifnidn               aq, amp
1349    mova                    m0, [aq]                ; abcdefgh
1350%if cpuflag(ssse3)
1351    mova                    m3, [pb_2to15_14_15]
1352%endif
1353    SHIFT_RIGHTx2           m1, m2, m0, m3          ; bcdefghh/cdefghhh
1354    LOWPASS                  2,  1, 0               ; BCDEFGHh
1355    pavgw                   m1, m0                  ; ABCDEFGh
1356    DEFINE_ARGS dst, stride, stride3
1357    lea               stride3q, [strideq*3]
1358
1359    mova      [dstq+strideq*0], m1
1360    mova      [dstq+strideq*1], m2
1361    SHIFT_RIGHT             m1, m1, m3
1362    SHIFT_RIGHT             m2, m2, m3
1363    mova      [dstq+strideq*2], m1
1364    mova      [dstq+stride3q ], m2
1365    lea                   dstq, [dstq+strideq*4]
1366    SHIFT_RIGHT             m1, m1, m3
1367    SHIFT_RIGHT             m2, m2, m3
1368    mova      [dstq+strideq*0], m1
1369    mova      [dstq+strideq*1], m2
1370    SHIFT_RIGHT             m1, m1, m3
1371    SHIFT_RIGHT             m2, m2, m3
1372    mova      [dstq+strideq*2], m1
1373    mova      [dstq+stride3q ], m2
1374    RET
1375
1376cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
1377    movifnidn               aq, amp
1378    mova                    m0, [aq]
1379    mova                    m1, [aq+mmsize]
1380    PALIGNR                 m2, m1, m0, 2, m3
1381    PALIGNR                 m3, m1, m0, 4, m4
1382    LOWPASS                  3,  2,  0
1383    pavgw                   m2, m0
1384%if cpuflag(ssse3)
1385    mova                    m4, [pb_2to15_14_15]
1386%endif
1387    SHIFT_RIGHTx2           m5, m0, m1, m4
1388    LOWPASS                  0,  5,  1
1389    pavgw                   m1, m5
1390    DEFINE_ARGS dst, stride, cnt
1391    mov                   cntd, 8
1392
.loop:
    mova   [dstq+strideq*0+ 0], m2
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m3
    mova   [dstq+strideq*1+16], m0
    lea                   dstq, [dstq+strideq*2]
%if cpuflag(avx)
    vpalignr                m2, m1, m2, 2
    vpalignr                m3, m0, m3, 2
%else
    PALIGNR                 m5, m1, m2, 2, m4
    mova                    m2, m5
    PALIGNR                 m5, m0, m3, 2, m4
    mova                    m3, m5
%endif
    SHIFT_RIGHT             m1, m1, m4
    SHIFT_RIGHT             m0, m0, m4
    dec                   cntd
    jg .loop
    RET

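; For 32x32, eight registers hold the averaged and filtered versions of the
; full 64-pixel top edge (with stack spills on x86-32). Row r uses the edge
; advanced by r>>1 pixels, so rows r+16/r+17 are rows r/r+1 advanced by one
; 8-pixel register; stride16/stride17 exploit that to store four rows per
; iteration, covering all 32 rows in 8 iterations. The last 16 columns of
; the bottom-right quadrant degenerate to the replicated last pixel (m9 on
; x86-64, the trailing %rep block on x86-32).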
cglobal vp9_ipred_vl_32x32_16, 2, 5, 11, %1 * mmsize * ARCH_X86_32, dst, stride, l, a
    movifnidn               aq, amp
    mova                    m0, [aq+mmsize*0]
    mova                    m1, [aq+mmsize*1]
    mova                    m2, [aq+mmsize*2]
    PALIGNR                 m6, m1, m0, 2, m5
    PALIGNR                 m7, m1, m0, 4, m5
    LOWPASS                  7,  6,  0
    pavgw                   m6, m0
    SCRATCH                  6,  8, rsp+0*mmsize
    PALIGNR                 m4, m2, m1, 2, m0
    PALIGNR                 m5, m2, m1, 4, m0
    LOWPASS                  5,  4,  1
    pavgw                   m4, m1
    mova                    m0, [aq+mmsize*3]
    PALIGNR                 m1, m0, m2, 2, m6
    PALIGNR                 m3, m0, m2, 4, m6
    LOWPASS                  3,  1,  2
    pavgw                   m2, m1
%if cpuflag(ssse3)
    PRELOAD                 10, pb_2to15_14_15, shuf
%endif
    SHIFT_RIGHTx2           m6, m1, m0, reg_shuf
    LOWPASS                  1,  6,  0
    pavgw                   m0, m6
%if ARCH_X86_64
    pshufd                  m9, m6, q3333
%endif
%if cpuflag(avx)
    UNSCRATCH                6,  8, rsp+0*mmsize
%endif
    DEFINE_ARGS dst, stride, cnt, stride16, stride17
    mov              stride16q, strideq
    mov                   cntd, 8
    shl              stride16q, 4
    lea              stride17q, [stride16q+strideq]

    ; FIXME m8 is unused for avx, so we could save one register here for win64
.loop:
%if notcpuflag(avx)
    UNSCRATCH                6,  8, rsp+0*mmsize
%endif
    mova   [dstq+strideq*0+ 0], m6
    mova   [dstq+strideq*0+16], m4
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+ 0], m7
    mova   [dstq+strideq*1+16], m5
    mova   [dstq+strideq*1+32], m3
    mova   [dstq+strideq*1+48], m1
    mova   [dstq+stride16q+ 0], m4
    mova   [dstq+stride16q+16], m2
    mova   [dstq+stride16q+32], m0
%if ARCH_X86_64
    mova   [dstq+stride16q+48], m9
%endif
    mova   [dstq+stride17q+ 0], m5
    mova   [dstq+stride17q+16], m3
    mova   [dstq+stride17q+32], m1
%if ARCH_X86_64
    mova   [dstq+stride17q+48], m9
%endif
    lea                   dstq, [dstq+strideq*2]
%if cpuflag(avx)
    vpalignr                m6, m4, m6, 2
    vpalignr                m4, m2, m4, 2
    vpalignr                m2, m0, m2, 2
    vpalignr                m7, m5, m7, 2
    vpalignr                m5, m3, m5, 2
    vpalignr                m3, m1, m3, 2
%else
    SCRATCH                  3,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  1, 10, rsp+1*mmsize
%endif
    PALIGNR                 m3, m4, m6, 2, m1
    mova                    m6, m3
    PALIGNR                 m3, m2, m4, 2, m1
    mova                    m4, m3
    PALIGNR                 m3, m0, m2, 2, m1
    mova                    m2, m3
    PALIGNR                 m3, m5, m7, 2, m1
    mova                    m7, m3
    UNSCRATCH                3,  8, rsp+0*mmsize
    SCRATCH                  6,  8, rsp+0*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                1, 10, rsp+1*mmsize
    SCRATCH                  7, 10, rsp+1*mmsize
%endif
    PALIGNR                 m6, m3, m5, 2, m7
    mova                    m5, m6
    PALIGNR                 m6, m1, m3, 2, m7
    mova                    m3, m6
%if notcpuflag(ssse3)
    UNSCRATCH                7, 10, rsp+1*mmsize
%endif
%endif
    SHIFT_RIGHT             m1, m1, reg_shuf
    SHIFT_RIGHT             m0, m0, reg_shuf
    dec                   cntd
    jg .loop

%if ARCH_X86_32
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
%assign %%n 0
%rep 4
    mova   [dstq+strideq*0+48], m0
    mova   [dstq+strideq*1+48], m0
    mova   [dstq+strideq*2+48], m0
    mova   [dstq+stride3q +48], m0
%if %%n < 3
    lea                   dstq, [dstq+strideq*4]
%endif
%assign %%n (%%n+1)
%endrep
%endif
    RET
%endmacro

INIT_XMM sse2
VL_FUNCS 2
INIT_XMM ssse3
VL_FUNCS 1
INIT_XMM avx
VL_FUNCS 1

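; vr ("vertical-right") uses both edges: row 0 is the averaged top edge and
; row 1 the filtered top edge; every following row pair repeats the pair two
; rows above shifted one pixel to the right, with filtered left-edge pixels
; entering on the left.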
%macro VR_FUNCS 0
cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
    movu                    m0, [aq-2]
    movhps                  m1, [lq]
    PALIGNR                 m0, m1, 10, m2          ; xyz*abcd
    pslldq                  m1, m0, 2               ; .xyz*abc
    pslldq                  m2, m0, 4               ; ..xyz*ab
    LOWPASS                  2,  1, 0               ; ..YZ#ABC
    pavgw                   m1, m0                  ; ....#ABC
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movhps    [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m2
    shufps                  m0, m2, m1, q3210
%if cpuflag(ssse3)
    pshufb                  m2, [pb_4_5_8to13_8x0]
%else
    pshuflw                 m2, m2, q2222
    psrldq                  m2, 6
%endif
    psrldq                  m0, 6
    movh      [dstq+strideq*2], m0
    movh      [dstq+stride3q ], m2
    RET

cglobal vp9_ipred_vr_8x8_16, 4, 4, 5, dst, stride, l, a
    movu                    m1, [aq-2]              ; *abcdefg
    movu                    m2, [lq]                ; stuvwxyz
    mova                    m0, [aq]                ; abcdefgh
    PALIGNR                 m3, m1, m2, 14, m4      ; z*abcdef
    LOWPASS                  3,  1,  0
    pavgw                   m0, m1
    PALIGNR                 m1, m2,  2, m4          ; tuvwxyz*
    pslldq                  m4, m2,  2              ; .stuvwxy
    LOWPASS                  4,  2,  1
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR                 m0, m4, 14, m1
    pslldq                  m4, 2
    PALIGNR                 m3, m4, 14, m1
    pslldq                  m4, 2
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    PALIGNR                 m0, m4, 14, m1
    pslldq                  m4, 2
    PALIGNR                 m3, m4, 14, m1
    pslldq                  m4, 2
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    PALIGNR                 m0, m4, 14, m1
    pslldq                  m4, 2
    PALIGNR                 m3, m4, 14, m4
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m3
    RET

cglobal vp9_ipred_vr_16x16_16, 4, 4, 8, dst, stride, l, a
    movu                    m1, [aq-2]              ; *abcdefg
    movu                    m2, [aq+mmsize-2]       ; hijklmno
    mova                    m3, [aq]                ; abcdefgh
    mova                    m4, [aq+mmsize]         ; ijklmnop
    mova                    m5, [lq+mmsize]         ; stuvwxyz
    PALIGNR                 m0, m1, m5, 14, m6      ; z*abcdef
    movu                    m6, [aq+mmsize-4]       ; ghijklmn
    LOWPASS                  6,  2,  4
    pavgw                   m2, m4
    LOWPASS                  0,  1,  3
    pavgw                   m3, m1
    PALIGNR                 m1, m5,  2, m7          ; tuvwxyz*
    movu                    m7, [lq+mmsize-2]       ; rstuvwxy
    LOWPASS                  1,  5,  7
    movu                    m5, [lq+2]              ; lmnopqrs
    pslldq                  m4, m5,  2              ; .lmnopqr
    pslldq                  m7, m5,  4              ; ..lmnopq
    LOWPASS                  5,  4,  7
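    ; De-interleave the filtered left edge into its odd and even pixels
    ; (psrld/pand plus packssdw, which cannot saturate here since samples
    ; are at most 12 bits) so that each of the two row chains can consume
    ; one left pixel per iteration via PALIGNR/pslldq in the loop below.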
    psrld                   m4, m1, 16
    psrld                   m7, m5, 16
    pand                    m1, [pd_65535]
    pand                    m5, [pd_65535]
    packssdw                m7, m4
    packssdw                m5, m1
    DEFINE_ARGS dst, stride, cnt
    mov                   cntd, 8

.loop:
    mova   [dstq+strideq*0+ 0], m3
    mova   [dstq+strideq*0+16], m2
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m6
    lea                   dstq, [dstq+strideq*2]
    PALIGNR                 m2, m3, 14, m4
    PALIGNR                 m3, m7, 14, m4
    pslldq                  m7, 2
    PALIGNR                 m6, m0, 14, m4
    PALIGNR                 m0, m5, 14, m4
    pslldq                  m5, 2
    dec                   cntd
    jg .loop
    RET

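; The 32x32 version carries 14 registers' worth of state; the six that don't
; fit on x86-32 go to the stack (hence the 6 * mmsize allocation there), and
; the rows they feed are stored by the separate .loop2 pass instead.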
cglobal vp9_ipred_vr_32x32_16, 4, 5, 14, 6 * mmsize * ARCH_X86_32, dst, stride, l, a
    movu                    m0, [aq+mmsize*0-2]     ; *a[0-6]
    movu                    m1, [aq+mmsize*1-2]     ; a[7-14]
    movu                    m2, [aq+mmsize*2-2]     ; a[15-22]
    movu                    m3, [aq+mmsize*3-2]     ; a[23-30]
    mova                    m4, [aq+mmsize*3+0]     ; a[24-31]
    movu                    m5, [aq+mmsize*3-4]     ; a[22-29]
    LOWPASS                  5,  3,  4              ; A[23-30]
    SCRATCH                  5,  8, rsp+0*mmsize
    pavgw                   m3, m4
    mova                    m4, [aq+mmsize*2+0]     ; a[16-23]
    movu                    m6, [aq+mmsize*2-4]     ; a[14-21]
    LOWPASS                  6,  2,  4              ; A[15-22]
    SCRATCH                  6,  9, rsp+1*mmsize
    pavgw                   m2, m4
    mova                    m4, [aq+mmsize*1+0]     ; a[8-15]
    movu                    m7, [aq+mmsize*1-4]     ; a[6-13]
    LOWPASS                  7,  1,  4              ; A[7-14]
    SCRATCH                  7, 10, rsp+2*mmsize
    pavgw                   m1, m4
    mova                    m4, [aq+mmsize*0+0]     ; a[0-7]
    mova                    m5, [lq+mmsize*3+0]     ; l[24-31]
    PALIGNR                 m6, m0, m5, 14, m7      ; l[31]*a[0-5]
    LOWPASS                  6,  0,  4              ; #A[0-6]
    SCRATCH                  6, 11, rsp+3*mmsize
    pavgw                   m4, m0
    PALIGNR                 m0, m5,  2, m7          ; l[25-31]*
    movu                    m7, [lq+mmsize*3-2]     ; l[23-30]
    LOWPASS                  0,  5,  7              ; L[24-31]
    movu                    m5, [lq+mmsize*2-2]     ; l[15-22]
    mova                    m7, [lq+mmsize*2+0]     ; l[16-23]
    movu                    m6, [lq+mmsize*2+2]     ; l[17-24]
    LOWPASS                  5,  7,  6              ; L[16-23]
    psrld                   m7, m0, 16
    psrld                   m6, m5, 16
    pand                    m0, [pd_65535]
    pand                    m5, [pd_65535]
    packssdw                m6, m7
    packssdw                m5, m0
    SCRATCH                  5, 12, rsp+4*mmsize
    SCRATCH                  6, 13, rsp+5*mmsize
    movu                    m6, [lq+mmsize*1-2]     ; l[7-14]
    mova                    m0, [lq+mmsize*1+0]     ; l[8-15]
    movu                    m5, [lq+mmsize*1+2]     ; l[9-16]
    LOWPASS                  6,  0,  5              ; L[8-15]
    movu                    m0, [lq+mmsize*0+2]     ; l[1-8]
    pslldq                  m5, m0,  2              ; .l[1-7]
    pslldq                  m7, m0,  4              ; ..l[1-6]
    LOWPASS                  0,  5,  7
    psrld                   m5, m6, 16
    psrld                   m7, m0, 16
    pand                    m6, [pd_65535]
    pand                    m0, [pd_65535]
    packssdw                m7, m5
    packssdw                m0, m6
    UNSCRATCH                6, 13, rsp+5*mmsize
    DEFINE_ARGS dst, stride, stride16, cnt, stride17
    mov              stride16q, strideq
    mov                   cntd, 8
    shl              stride16q, 4
%if ARCH_X86_64
    lea              stride17q, [stride16q+strideq]
%endif

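    ; On x86-64 the odd rows are stored from m8-m12 inside this loop; on
    ; x86-32 those stores are deferred to .loop2 below, which replays them
    ; from the stack slots.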
.loop:
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m3
%if ARCH_X86_64
    mova   [dstq+strideq*1+ 0], m11
    mova   [dstq+strideq*1+16], m10
    mova   [dstq+strideq*1+32], m9
    mova   [dstq+strideq*1+48], m8
%endif
    mova   [dstq+stride16q+ 0], m6
    mova   [dstq+stride16q+16], m4
    mova   [dstq+stride16q+32], m1
    mova   [dstq+stride16q+48], m2
%if ARCH_X86_64
    mova   [dstq+stride17q+ 0], m12
    mova   [dstq+stride17q+16], m11
    mova   [dstq+stride17q+32], m10
    mova   [dstq+stride17q+48], m9
%endif
    lea                   dstq, [dstq+strideq*2]
    PALIGNR                 m3, m2,  14, m5
    PALIGNR                 m2, m1,  14, m5
    PALIGNR                 m1, m4,  14, m5
    PALIGNR                 m4, m6,  14, m5
    PALIGNR                 m6, m7,  14, m5
    pslldq                  m7, 2
%if ARCH_X86_64
    PALIGNR                 m8, m9,  14, m5
    PALIGNR                 m9, m10, 14, m5
    PALIGNR                m10, m11, 14, m5
    PALIGNR                m11, m12, 14, m5
    PALIGNR                m12, m0,  14, m5
    pslldq                  m0, 2
%endif
    dec                   cntd
    jg .loop

%if ARCH_X86_32
    UNSCRATCH                5, 12, rsp+4*mmsize
    UNSCRATCH                4, 11, rsp+3*mmsize
    UNSCRATCH                3, 10, rsp+2*mmsize
    UNSCRATCH                2,  9, rsp+1*mmsize
    UNSCRATCH                1,  8, rsp+0*mmsize
    mov                   dstq, dstm
    mov                   cntd, 8
    add                   dstq, strideq
.loop2:
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m3
    mova   [dstq+strideq*0+32], m2
    mova   [dstq+strideq*0+48], m1
    mova   [dstq+stride16q+ 0], m5
    mova   [dstq+stride16q+16], m4
    mova   [dstq+stride16q+32], m3
    mova   [dstq+stride16q+48], m2
    lea                   dstq, [dstq+strideq*2]
    PALIGNR                 m1, m2,  14, m6
    PALIGNR                 m2, m3,  14, m6
    PALIGNR                 m3, m4,  14, m6
    PALIGNR                 m4, m5,  14, m6
    PALIGNR                 m5, m0,  14, m6
    pslldq                  m0, 2
    dec                   cntd
    jg .loop2
%endif
    RET
%endmacro

INIT_XMM sse2
VR_FUNCS
INIT_XMM ssse3
VR_FUNCS
INIT_XMM avx
VR_FUNCS

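; hu ("horizontal-up") predicts from the left edge only: the averaged and
; filtered left pixels are interleaved (SBUTTERFLY wd), each output row
; starts one interleaved pair further along, and everything beyond the last
; left pixel is plain replication of that pixel.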
%macro HU_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_hu_4x4_16, 3, 3, 3, dst, stride, l, a
    movh                    m0, [lq]                ; abcd
%if cpuflag(ssse3)
    pshufb                  m0, [pb_0to7_67x4]      ; abcddddd
%else
    punpcklqdq              m0, m0
    pshufhw                 m0, m0, q3333           ; abcddddd
%endif
    psrldq                  m1, m0,  2              ; bcddddd.
    psrldq                  m2, m0,  4              ; cddddd..
    LOWPASS                  2,  1,  0              ; BCDddd..
    pavgw                   m1, m0                  ; abcddddd
    SBUTTERFLY          wd,  1,  2,  0              ; aBbCcDdd, dddddddd
    PALIGNR                 m2, m1,  4, m0          ; bCcDdddd
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+strideq*0], m1                  ; aBbC
    movh      [dstq+strideq*1], m2                  ; bCcD
    movhps    [dstq+strideq*2], m1                  ; cDdd
    movhps    [dstq+stride3q ], m2                  ; dddd
    RET

cglobal vp9_ipred_hu_8x8_16, 3, 3, 4, dst, stride, l, a
    mova                    m0, [lq]
%if cpuflag(ssse3)
    mova                    m3, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m2, m0, m3
    LOWPASS                  2,  1,  0
    pavgw                   m1, m0
    SBUTTERFLY          wd,  1,  2,  0
    shufps                  m0, m1, m2, q1032
    pshufd                  m3, m2, q3332
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    mova     [dstq+strideq *0], m1
    mova     [dstq+strideq *2], m0
    mova     [dstq+strideq *4], m2
    mova     [dstq+stride3q*2], m3
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m1, m2, m1, 4
%else
    PALIGNR                 m0, m2, m1, 4, m3
    mova                    m1, m0
%endif
    pshufd                  m2, m2, q3321
    shufps                  m0, m1, m2, q1032
    pshufd                  m3, m2, q3332
    mova     [dstq+strideq *0], m1
    mova     [dstq+strideq *2], m0
    mova     [dstq+strideq *4], m2
    mova     [dstq+stride3q*2], m3
    RET

cglobal vp9_ipred_hu_16x16_16, 3, 4, 6 + notcpuflag(ssse3), dst, stride, l, a
    mova                    m0, [lq]
    mova                    m3, [lq+mmsize]
    movu                    m1, [lq+2]
    movu                    m2, [lq+4]
    LOWPASS                  2,  1,  0
    pavgw                   m1, m0
    SBUTTERFLY           wd, 1,  2,  0
%if cpuflag(ssse3)
    mova                    m5, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m0, m4, m3, m5
    LOWPASS                  4,  0,  3
    pavgw                   m3, m0
    SBUTTERFLY           wd, 3,  4,  5
    pshufd                  m0, m0, q3333
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4

.loop:
    mova  [dstq+strideq *0+ 0], m1
    mova  [dstq+strideq *0+16], m2
    mova  [dstq+strideq *4+ 0], m2
    mova  [dstq+strideq *4+16], m3
    mova  [dstq+strideq *8+ 0], m3
    mova  [dstq+strideq *8+16], m4
    mova  [dstq+stride3q*4+ 0], m4
    mova  [dstq+stride3q*4+16], m0
    add                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m1, m2, m1, 4
    vpalignr                m2, m3, m2, 4
    vpalignr                m3, m4, m3, 4
    vpalignr                m4, m0, m4, 4
%else
    PALIGNR                 m5, m2, m1, 4, m6
    mova                    m1, m5
    PALIGNR                 m5, m3, m2, 4, m6
    mova                    m2, m5
    PALIGNR                 m5, m4, m3, 4, m6
    mova                    m3, m5
    PALIGNR                 m5, m0, m4, 4, m6
    mova                    m4, m5
%endif
    dec                   cntd
    jg .loop
    RET

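; For 32x32 there are nine 8-pixel chunks of result but not enough registers
; even on x86-64, so m1 and m8 trade places halfway through each iteration
; (plain stack moves on x86-32). stride20q/stride28q precompute 20*stride
; and 28*stride so that one iteration can store to rows 0..28 in steps of 4
; without touching dstq.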
cglobal vp9_ipred_hu_32x32_16, 3, 7, 10 + notcpuflag(ssse3), \
                               %1 * -mmsize * ARCH_X86_32, dst, stride, l, a
    mova                    m2, [lq+mmsize*0+0]
    movu                    m1, [lq+mmsize*0+2]
    movu                    m0, [lq+mmsize*0+4]
    LOWPASS                  0,  1,  2
    pavgw                   m1, m2
    SBUTTERFLY           wd, 1,  0,  2
    SCRATCH                  1,  8, rsp+0*mmsize
    mova                    m4, [lq+mmsize*1+0]
    movu                    m3, [lq+mmsize*1+2]
    movu                    m2, [lq+mmsize*1+4]
    LOWPASS                  2,  3,  4
    pavgw                   m3, m4
    SBUTTERFLY           wd, 3,  2,  4
    mova                    m6, [lq+mmsize*2+0]
    movu                    m5, [lq+mmsize*2+2]
    movu                    m4, [lq+mmsize*2+4]
    LOWPASS                  4,  5,  6
    pavgw                   m5, m6
    SBUTTERFLY           wd, 5,  4,  6
    mova                    m7, [lq+mmsize*3+0]
    SCRATCH                  0,  9, rsp+1*mmsize
%if cpuflag(ssse3)
    mova                    m0, [pb_2to15_14_15]
%endif
    SHIFT_RIGHTx2           m1, m6, m7, m0
    LOWPASS                  6,  1,  7
    pavgw                   m7, m1
    SBUTTERFLY           wd, 7,  6,  0
    pshufd                  m1, m1, q3333
    UNSCRATCH                0,  9, rsp+1*mmsize
    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
    lea               stride3q, [strideq*3]
    lea               stride4q, [strideq*4]
    lea              stride28q, [stride4q*8]
    lea              stride20q, [stride4q*5]
    sub              stride28q, stride4q
    mov                   cntd, 4

.loop:
%if ARCH_X86_64
    SWAP                     1,  8
%else
    mova        [rsp+1*mmsize], m1
    mova                    m1, [rsp+0*mmsize]
%endif
    mova  [dstq+strideq *0+ 0], m1
    mova  [dstq+strideq *0+16], m0
    mova  [dstq+strideq *0+32], m3
    mova  [dstq+strideq *0+48], m2
    mova  [dstq+stride4q*1+ 0], m0
    mova  [dstq+stride4q*1+16], m3
    mova  [dstq+stride4q*1+32], m2
    mova  [dstq+stride4q*1+48], m5
    mova  [dstq+stride4q*2+ 0], m3
    mova  [dstq+stride4q*2+16], m2
    mova  [dstq+stride4q*2+32], m5
    mova  [dstq+stride4q*2+48], m4
%if cpuflag(avx)
    vpalignr                m1, m0, m1, 4
    vpalignr                m0, m3, m0, 4
    vpalignr                m3, m2, m3, 4
%else
    SCRATCH                  6,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    SCRATCH                  7, 10, rsp+3*mmsize
%endif
    PALIGNR                 m6, m0, m1, 4, m7
    mova                    m1, m6
    PALIGNR                 m6, m3, m0, 4, m7
    mova                    m0, m6
    PALIGNR                 m6, m2, m3, 4, m7
    mova                    m3, m6
    UNSCRATCH                6,  9, rsp+2*mmsize
    SCRATCH                  0,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                7, 10, rsp+3*mmsize
    SCRATCH                  3, 10, rsp+3*mmsize
%endif
%endif
%if ARCH_X86_64
    SWAP                     1,  8
%else
    mova        [rsp+0*mmsize], m1
    mova                    m1, [rsp+1*mmsize]
%endif
    mova  [dstq+stride3q*4+ 0], m2
    mova  [dstq+stride3q*4+16], m5
    mova  [dstq+stride3q*4+32], m4
    mova  [dstq+stride3q*4+48], m7
    mova  [dstq+stride4q*4+ 0], m5
    mova  [dstq+stride4q*4+16], m4
    mova  [dstq+stride4q*4+32], m7
    mova  [dstq+stride4q*4+48], m6
    mova  [dstq+stride20q + 0], m4
    mova  [dstq+stride20q +16], m7
    mova  [dstq+stride20q +32], m6
    mova  [dstq+stride20q +48], m1
    mova  [dstq+stride3q*8+ 0], m7
    mova  [dstq+stride3q*8+16], m6
    mova  [dstq+stride3q*8+32], m1
    mova  [dstq+stride3q*8+48], m1
    mova  [dstq+stride28q + 0], m6
    mova  [dstq+stride28q +16], m1
    mova  [dstq+stride28q +32], m1
    mova  [dstq+stride28q +48], m1
%if cpuflag(avx)
    vpalignr                m2, m5, m2, 4
    vpalignr                m5, m4, m5, 4
    vpalignr                m4, m7, m4, 4
    vpalignr                m7, m6, m7, 4
    vpalignr                m6, m1, m6, 4
%else
    PALIGNR                 m0, m5, m2, 4, m3
    mova                    m2, m0
    PALIGNR                 m0, m4, m5, 4, m3
    mova                    m5, m0
    PALIGNR                 m0, m7, m4, 4, m3
    mova                    m4, m0
    PALIGNR                 m0, m6, m7, 4, m3
    mova                    m7, m0
    PALIGNR                 m0, m1, m6, 4, m3
    mova                    m6, m0
    UNSCRATCH                0,  9, rsp+2*mmsize
%if notcpuflag(ssse3)
    UNSCRATCH                3, 10, rsp+3*mmsize
%endif
%endif
    add                   dstq, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
HU_FUNCS 4
INIT_XMM ssse3
HU_FUNCS 3
INIT_XMM avx
HU_FUNCS 2

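; hd ("horizontal-down") mirrors vr: the interleaved avg/filtered left edge
; fills the lower-left triangle while filtered top-edge pixels shift in from
; the right, one per row, which is why the store loops walk the destination
; bottom-up.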
%macro HD_FUNCS 0
cglobal vp9_ipred_hd_4x4_16, 4, 4, 4, dst, stride, l, a
    movh                    m0, [lq]
    movhps                  m0, [aq-2]
    psrldq                  m1, m0, 2
    psrldq                  m2, m0, 4
    LOWPASS                  2,  1,  0
    pavgw                   m1, m0
    punpcklwd               m1, m2
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]

    movh      [dstq+stride3q ], m1
    movhps    [dstq+strideq*1], m1
    movhlps                 m2, m2
    PALIGNR                 m2, m1, 4, m0
    movh      [dstq+strideq*2], m2
    movhps    [dstq+strideq*0], m2
    RET

cglobal vp9_ipred_hd_8x8_16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]
    movu                    m1, [aq-2]
    PALIGNR                 m2, m1, m0, 2, m3
    PALIGNR                 m3, m1, m0, 4, m4
    LOWPASS                  3,  2,  0
    pavgw                   m2, m0
    SBUTTERFLY           wd, 2,  3,  0
    psrldq                  m0, m1,  2
    psrldq                  m4, m1,  4
    LOWPASS                  1,  0,  4
    DEFINE_ARGS dst8, mstride, cnt
    lea                  dst8q, [dst8q+mstrideq*8]
    neg               mstrideq
    mov                   cntd, 4

.loop:
    add                  dst8q, mstrideq
    mova    [dst8q+mstrideq*0], m2
    mova    [dst8q+mstrideq*4], m3
%if cpuflag(avx)
    vpalignr                m2, m3, m2, 4
    vpalignr                m3, m1, m3, 4
%else
    PALIGNR                 m0, m3, m2, 4, m4
    mova                    m2, m0
    PALIGNR                 m0, m1, m3, 4, m4
    mova                    m3, m0
%endif
    psrldq                  m1, 4
    dec                   cntd
    jg .loop
    RET

cglobal vp9_ipred_hd_16x16_16, 4, 4, 8, dst, stride, l, a
    mova                    m2, [lq]
    movu                    m1, [lq+2]
    movu                    m0, [lq+4]
    LOWPASS                  0,  1,  2
    pavgw                   m1, m2
    mova                    m4, [lq+mmsize]
    movu                    m5, [aq-2]
    PALIGNR                 m3, m5, m4, 2, m6
    PALIGNR                 m2, m5, m4, 4, m6
    LOWPASS                  2,  3,  4
    pavgw                   m3, m4
    SBUTTERFLY           wd, 1,  0,  4
    SBUTTERFLY           wd, 3,  2,  4
    mova                    m6, [aq]
    movu                    m4, [aq+2]
    LOWPASS                  4,  6,  5
    movu                    m5, [aq+mmsize-2]
    psrldq                  m6, m5,  2
    psrldq                  m7, m5,  4
    LOWPASS                  5,  6,  7
    DEFINE_ARGS dst, mstride, mstride3, cnt
    lea                   dstq, [dstq+mstrideq*8]
    lea                   dstq, [dstq+mstrideq*8]
    neg               mstrideq
    lea              mstride3q, [mstrideq*3]
    mov                   cntd, 4

.loop:
    add                   dstq, mstrideq
    mova [dstq+mstride3q*4+ 0], m2
    mova [dstq+mstride3q*4+16], m4
    mova [dstq+mstrideq *8+ 0], m3
    mova [dstq+mstrideq *8+16], m2
    mova [dstq+mstrideq *4+ 0], m0
    mova [dstq+mstrideq *4+16], m3
    mova [dstq+mstrideq *0+ 0], m1
    mova [dstq+mstrideq *0+16], m0
%if cpuflag(avx)
    vpalignr                m1, m0, m1, 4
    vpalignr                m0, m3, m0, 4
    vpalignr                m3, m2, m3, 4
    vpalignr                m2, m4, m2, 4
    vpalignr                m4, m5, m4, 4
%else
    PALIGNR                 m6, m0, m1, 4, m7
    mova                    m1, m6
    PALIGNR                 m6, m3, m0, 4, m7
    mova                    m0, m6
    PALIGNR                 m6, m2, m3, 4, m7
    mova                    m3, m6
    PALIGNR                 m6, m4, m2, 4, m7
    mova                    m2, m6
    PALIGNR                 m6, m5, m4, 4, m7
    mova                    m4, m6
%endif
    psrldq                  m5, 4
    dec                   cntd
    jg .loop
    RET

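; The 32x32 case uses ten stack slots on x86-32: slots 0-3 back up the
; interleaved left-edge rows, slots 4-7 the m4-m7 working set for the second
; pass, and slots 8-9 serve as in-loop spill space for the PALIGNR chain.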
cglobal vp9_ipred_hd_32x32_16, 4, 4 + 3 * ARCH_X86_64, 14, \
                               10 * -mmsize * ARCH_X86_32, dst, stride, l, a
    mova                    m2, [lq+mmsize*0+0]
    movu                    m1, [lq+mmsize*0+2]
    movu                    m0, [lq+mmsize*0+4]
    LOWPASS                  0,  1,  2
    pavgw                   m1, m2
    SBUTTERFLY           wd, 1,  0,  2
    mova                    m4, [lq+mmsize*1+0]
    movu                    m3, [lq+mmsize*1+2]
    movu                    m2, [lq+mmsize*1+4]
    LOWPASS                  2,  3,  4
    pavgw                   m3, m4
    SBUTTERFLY           wd, 3,  2,  4
    SCRATCH                  0,  8, rsp+0*mmsize
    SCRATCH                  1,  9, rsp+1*mmsize
    SCRATCH                  2, 10, rsp+2*mmsize
    SCRATCH                  3, 11, rsp+3*mmsize
    mova                    m6, [lq+mmsize*2+0]
    movu                    m5, [lq+mmsize*2+2]
    movu                    m4, [lq+mmsize*2+4]
    LOWPASS                  4,  5,  6
    pavgw                   m5, m6
    SBUTTERFLY           wd, 5,  4,  6
    mova                    m0, [lq+mmsize*3+0]
    movu                    m1, [aq+mmsize*0-2]
    PALIGNR                 m7, m1, m0, 2, m2
    PALIGNR                 m6, m1, m0, 4, m2
    LOWPASS                  6,  7,  0
    pavgw                   m7, m0
    SBUTTERFLY           wd, 7,  6,  0
    mova                    m2, [aq+mmsize*0+0]
    movu                    m0, [aq+mmsize*0+2]
    LOWPASS                  0,  2,  1
    movu                    m1, [aq+mmsize*1-2]
    mova                    m2, [aq+mmsize*1+0]
    movu                    m3, [aq+mmsize*1+2]
    LOWPASS                  1,  2,  3
    SCRATCH                  6, 12, rsp+6*mmsize
    SCRATCH                  7, 13, rsp+7*mmsize
    movu                    m2, [aq+mmsize*2-2]
    mova                    m3, [aq+mmsize*2+0]
    movu                    m6, [aq+mmsize*2+2]
    LOWPASS                  2,  3,  6
    movu                    m3, [aq+mmsize*3-2]
    psrldq                  m6, m3,  2
    psrldq                  m7, m3,  4
    LOWPASS                  3,  6,  7
    UNSCRATCH                6, 12, rsp+6*mmsize
    UNSCRATCH                7, 13, rsp+7*mmsize
%if ARCH_X86_32
    mova        [rsp+4*mmsize], m4
    mova        [rsp+5*mmsize], m5
    ; m6/m7 were already backed up by the SCRATCH calls above, so we don't
    ; need to spill them again here
%endif
    DEFINE_ARGS dst, stride, cnt, stride3, stride4, stride20, stride28
    mov                   cntd, 4
    lea               stride3q, [strideq*3]
%if ARCH_X86_64
    lea               stride4q, [strideq*4]
    lea              stride28q, [stride4q*8]
    lea              stride20q, [stride4q*5]
    sub              stride28q, stride4q
%endif
    add                   dstq, stride3q

    ; x86-32 doesn't have enough registers, so on that platform we split the
    ; loop in two; otherwise most of each iteration would be spent (un)scratching
.loop:
%if ARCH_X86_64
    mova  [dstq+stride28q + 0], m9
    mova  [dstq+stride28q +16], m8
    mova  [dstq+stride28q +32], m11
    mova  [dstq+stride28q +48], m10
    mova  [dstq+stride3q*8+ 0], m8
    mova  [dstq+stride3q*8+16], m11
    mova  [dstq+stride3q*8+32], m10
    mova  [dstq+stride3q*8+48], m5
    mova  [dstq+stride20q + 0], m11
    mova  [dstq+stride20q +16], m10
    mova  [dstq+stride20q +32], m5
    mova  [dstq+stride20q +48], m4
    mova  [dstq+stride4q*4+ 0], m10
    mova  [dstq+stride4q*4+16], m5
    mova  [dstq+stride4q*4+32], m4
    mova  [dstq+stride4q*4+48], m7
%endif
    mova  [dstq+stride3q*4+ 0], m5
    mova  [dstq+stride3q*4+16], m4
    mova  [dstq+stride3q*4+32], m7
    mova  [dstq+stride3q*4+48], m6
    mova  [dstq+strideq* 8+ 0], m4
    mova  [dstq+strideq* 8+16], m7
    mova  [dstq+strideq* 8+32], m6
    mova  [dstq+strideq* 8+48], m0
    mova  [dstq+strideq* 4+ 0], m7
    mova  [dstq+strideq* 4+16], m6
    mova  [dstq+strideq* 4+32], m0
    mova  [dstq+strideq* 4+48], m1
    mova  [dstq+strideq* 0+ 0], m6
    mova  [dstq+strideq* 0+16], m0
    mova  [dstq+strideq* 0+32], m1
    mova  [dstq+strideq* 0+48], m2
    sub                   dstq, strideq
%if cpuflag(avx)
%if ARCH_X86_64
    vpalignr                m9, m8,  m9,  4
    vpalignr                m8, m11, m8,  4
    vpalignr               m11, m10, m11, 4
    vpalignr               m10, m5,  m10, 4
%endif
    vpalignr                m5, m4,  m5,  4
    vpalignr                m4, m7,  m4,  4
    vpalignr                m7, m6,  m7,  4
    vpalignr                m6, m0,  m6,  4
    vpalignr                m0, m1,  m0,  4
    vpalignr                m1, m2,  m1,  4
    vpalignr                m2, m3,  m2,  4
%else
%if ARCH_X86_64
    PALIGNR                m12, m8,  m9,  4, m13
    mova                    m9, m12
    PALIGNR                m12, m11, m8,  4, m13
    mova                    m8, m12
    PALIGNR                m12, m10, m11, 4, m13
    mova                   m11, m12
    PALIGNR                m12, m5,  m10, 4, m13
    mova                   m10, m12
%endif
    SCRATCH                  3, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
    SCRATCH                  2, 13, rsp+9*mmsize
%endif
    PALIGNR                 m3, m4,  m5,  4, m2
    mova                    m5, m3
    PALIGNR                 m3, m7,  m4,  4, m2
    mova                    m4, m3
    PALIGNR                 m3, m6,  m7,  4, m2
    mova                    m7, m3
    PALIGNR                 m3, m0,  m6,  4, m2
    mova                    m6, m3
    PALIGNR                 m3, m1,  m0,  4, m2
    mova                    m0, m3
%if notcpuflag(ssse3)
    UNSCRATCH                2, 13, rsp+9*mmsize
    SCRATCH                  0, 13, rsp+9*mmsize
%endif
    PALIGNR                 m3, m2,  m1,  4, m0
    mova                    m1, m3
    PALIGNR                 m3, reg_sh,  m2,  4, m0
    mova                    m2, m3
%if notcpuflag(ssse3)
    UNSCRATCH                0, 13, rsp+9*mmsize
%endif
    UNSCRATCH                3, 12, rsp+8*mmsize, sh
%endif
    psrldq                  m3, 4
    dec                   cntd
    jg .loop

%if ARCH_X86_32
    UNSCRATCH                0,  8, rsp+0*mmsize
    UNSCRATCH                1,  9, rsp+1*mmsize
    UNSCRATCH                2, 10, rsp+2*mmsize
    UNSCRATCH                3, 11, rsp+3*mmsize
    mova                    m4, [rsp+4*mmsize]
    mova                    m5, [rsp+5*mmsize]
    mova                    m6, [rsp+6*mmsize]
    mova                    m7, [rsp+7*mmsize]
    DEFINE_ARGS dst, stride, stride5, stride3
    lea               stride5q, [strideq*5]
    lea                   dstq, [dstq+stride5q*4]
    DEFINE_ARGS dst, stride, cnt, stride3
    mov                   cntd, 4
.loop_2:
    mova  [dstq+stride3q*4+ 0], m1
    mova  [dstq+stride3q*4+16], m0
    mova  [dstq+stride3q*4+32], m3
    mova  [dstq+stride3q*4+48], m2
    mova  [dstq+strideq* 8+ 0], m0
    mova  [dstq+strideq* 8+16], m3
    mova  [dstq+strideq* 8+32], m2
    mova  [dstq+strideq* 8+48], m5
    mova  [dstq+strideq* 4+ 0], m3
    mova  [dstq+strideq* 4+16], m2
    mova  [dstq+strideq* 4+32], m5
    mova  [dstq+strideq* 4+48], m4
    mova  [dstq+strideq* 0+ 0], m2
    mova  [dstq+strideq* 0+16], m5
    mova  [dstq+strideq* 0+32], m4
    mova  [dstq+strideq* 0+48], m7
    sub                   dstq, strideq
%if cpuflag(avx)
    vpalignr                m1, m0,  m1,  4
    vpalignr                m0, m3,  m0,  4
    vpalignr                m3, m2,  m3,  4
    vpalignr                m2, m5,  m2,  4
    vpalignr                m5, m4,  m5,  4
    vpalignr                m4, m7,  m4,  4
    vpalignr                m7, m6,  m7,  4
%else
    SCRATCH                  6, 12, rsp+8*mmsize, sh
%if notcpuflag(ssse3)
    SCRATCH                  7, 13, rsp+9*mmsize
%endif
    PALIGNR                 m6, m0,  m1,  4, m7
    mova                    m1, m6
    PALIGNR                 m6, m3,  m0,  4, m7
    mova                    m0, m6
    PALIGNR                 m6, m2,  m3,  4, m7
    mova                    m3, m6
    PALIGNR                 m6, m5,  m2,  4, m7
    mova                    m2, m6
    PALIGNR                 m6, m4,  m5,  4, m7
    mova                    m5, m6
%if notcpuflag(ssse3)
    UNSCRATCH                7, 13, rsp+9*mmsize
    SCRATCH                  5, 13, rsp+9*mmsize
%endif
    PALIGNR                 m6, m7,  m4,  4, m5
    mova                    m4, m6
    PALIGNR                 m6, reg_sh,  m7,  4, m5
    mova                    m7, m6
%if notcpuflag(ssse3)
    UNSCRATCH                5, 13, rsp+9*mmsize
%endif
    UNSCRATCH                6, 12, rsp+8*mmsize, sh
%endif
    psrldq                  m6, 4
    dec                   cntd
    jg .loop_2
%endif
    RET
%endmacro

INIT_XMM sse2
HD_FUNCS
INIT_XMM ssse3
HD_FUNCS
INIT_XMM avx
HD_FUNCS
