1;******************************************************************************
2;* VP9 Intra prediction SIMD optimizations
3;*
4;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
5;*
6;* Parts based on:
7;* H.264 intra prediction asm optimizations
8;* Copyright (c) 2010 Fiona Glaser
9;* Copyright (c) 2010 Holger Lubitz
10;* Copyright (c) 2010 Loren Merritt
11;* Copyright (c) 2010 Ronald S. Bultje
12;*
13;* This file is part of FFmpeg.
14;*
15;* FFmpeg is free software; you can redistribute it and/or
16;* modify it under the terms of the GNU Lesser General Public
17;* License as published by the Free Software Foundation; either
18;* version 2.1 of the License, or (at your option) any later version.
19;*
20;* FFmpeg is distributed in the hope that it will be useful,
21;* but WITHOUT ANY WARRANTY; without even the implied warranty of
22;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23;* Lesser General Public License for more details.
24;*
25;* You should have received a copy of the GNU Lesser General Public
26;* License along with FFmpeg; if not, write to the Free Software
27;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28;******************************************************************************
29
30%include "libavutil/x86/x86util.asm"
31
32SECTION_RODATA 32
33
; pshufb control words (little-endian byte pairs):
;   -256 = {0x00, 0xff}: every word becomes source byte 0, zero-extended
;   -255 = {0x01, 0xff}: every word becomes source byte 1, zero-extended
pw_m256: times 16 dw -256
pw_m255: times 16 dw -255
; pmulhrsw factor: (x * 4096 + (1 << 14)) >> 15 == (x + 4) >> 3
pw_4096: times 8 dw 4096

; Byte tables. Names read pb_<count>x<value> / pb_<first>to<last>; "m1" is -1
; (0xff), which zeroes the lane as a pshufb index and keeps it as a pand mask.
pb_4x3_4x2_4x1_4x0: db 3, 3, 3, 3, 2, 2, 2, 2
                    db 1, 1, 1, 1, 0, 0, 0, 0
pb_8x1_8x0:   db 1, 1, 1, 1, 1, 1, 1, 1
              db 0, 0, 0, 0, 0, 0, 0, 0
pb_8x3_8x2:   db 3, 3, 3, 3, 3, 3, 3, 3
              db 2, 2, 2, 2, 2, 2, 2, 2
pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
              db -1, -1, -1, -1, -1, -1, -1, -1
pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6, 7
              db 7, 7, 7, 7, 7, 7, 7, 7
pb_1to6_10x7: db 1, 2, 3, 4, 5, 6, 7, 7
              db 7, 7, 7, 7, 7, 7, 7, 7
; pb_2to6_3x7 is the 8-byte prefix of pb_2to6_11x7; both labels share one address
pb_2to6_3x7:
pb_2to6_11x7: db 2, 3, 4, 5, 6, 7, 7, 7
              db 7, 7, 7, 7, 7, 7, 7, 7
pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8
              db 9, 10, 11, 12, 13, 14, 15, 15
pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9
              db 10, 11, 12, 13, 14, 15, 15, 15
pb_13456_3xm1: db 1, 3, 4, 5, 6, -1, -1, -1
pb_6012_4xm1: db 6, 0, 1, 2, -1, -1, -1, -1
pb_6xm1_246_8toE: db -1, -1, -1, -1, -1, -1, 2, 4
                  db 6, 8, 9, 10, 11, 12, 13, 14
pb_6xm1_BDF_0to6: db -1, -1, -1, -1, -1, -1, 11, 13
                  db 15, 0, 1, 2, 3, 4, 5, 6
pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14
                      db 1, 3, 5, 7, 9, 11, 13, 15

pb_15x0_1xm1: db 0, 0, 0, 0, 0, 0, 0, 0
              db 0, 0, 0, 0, 0, 0, 0, -1
pb_0to2_5x3: db 0, 1, 2, 3, 3, 3, 3, 3
pb_6xm1_2x0: db -1, -1, -1, -1, -1, -1, 0, 0
pb_6x0_2xm1: db 0, 0, 0, 0, 0, 0, -1, -1
75
76cextern pb_1
77cextern pb_2
78cextern pb_3
79cextern pb_15
80cextern pw_2
81cextern pw_4
82cextern pw_8
83cextern pw_16
84cextern pw_32
85cextern pw_255
86cextern pw_512
87cextern pw_1024
88cextern pw_2048
89cextern pw_8192
90
91SECTION .text
92
93; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
94
; Emits vp9_ipred_dc_4x4 and vp9_ipred_dc_8x8 for the active MMX instruction set.
; Both fill the block with the rounded mean of the left (l) and above (a) edges.
%macro DC_4to8_FUNCS 0
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
    pxor                    m1, m1                  ; zero: SAD reference and byte-0 shuffle mask
    movd                    m0, [lq]                ; 4 left pixels
    punpckldq               m0, [aq]                ; followed by 4 above pixels
    psadbw                  m0, m1                  ; m0 = sum of the 8 edge pixels
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_4096]           ; (sum + 4) >> 3 in one rounding multiply
    pshufb                  m0, m1                  ; replicate the DC byte
%else
    paddw                   m0, [pw_4]              ; rounding bias
    psraw                   m0, 3                   ; divide by 8
    punpcklbw               m0, m0                  ; DC byte doubled into a word
    pshufw                  m0, m0, q0000           ; replicate that word
%endif
    movd      [dstq+strideq*0], m0                  ; rows 0-1
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0                  ; rows 2-3
    movd      [dstq+strideq*1], m0
    RET

cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    pxor                    m2, m2                  ; zero: SAD reference and byte-0 shuffle mask
    movq                    m0, [lq]                ; 8 left pixels
    movq                    m1, [aq]                ; 8 above pixels
    DEFINE_ARGS dst, stride, stride3                ; l/a are dead from here on
    psadbw                  m0, m2                  ; sum(left)
    psadbw                  m1, m2                  ; sum(above)
    paddw                   m0, m1                  ; sum of all 16 edge pixels
    lea               stride3q, [strideq*3]
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_2048]           ; (sum + 8) >> 4
    pshufb                  m0, m2                  ; replicate the DC byte
%else
    paddw                   m0, [pw_8]              ; rounding bias
    psraw                   m0, 4                   ; divide by 16
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movq      [dstq+strideq*0], m0                  ; rows 0-3
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0                  ; rows 4-7
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET
%endmacro

INIT_MMX mmxext
DC_4to8_FUNCS
INIT_MMX ssse3
DC_4to8_FUNCS
151
; Emits vp9_ipred_dc_16x16 and vp9_ipred_dc_32x32 for the active XMM instruction
; set. Both fill the block with the rounded mean of the left and above edges.
%macro DC_16to32_FUNCS 0
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    pxor                    m2, m2                  ; zero: SAD reference and byte-0 shuffle mask
    mova                    m0, [lq]                ; 16 left pixels
    mova                    m1, [aq]                ; 16 above pixels
    DEFINE_ARGS dst, stride, stride3, cnt           ; l/a registers are reused below
    psadbw                  m0, m2                  ; two partial sums per register
    psadbw                  m1, m2
    lea               stride3q, [strideq*3]
    mov                   cntd, 4                   ; 4 iterations x 4 rows
    paddw                   m0, m1
    movhlps                 m1, m0                  ; bring the high partial sum down
    paddw                   m0, m1                  ; low word = sum of all 32 edge pixels
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_1024]           ; (sum + 16) >> 5
    pshufb                  m0, m2                  ; replicate the DC byte
%else
    paddw                   m0, [pw_16]             ; rounding bias
    psraw                   m0, 5                   ; divide by 32
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
.fill_rows:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .fill_rows
    RET

cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    pxor                    m4, m4                  ; zero: SAD reference and byte-0 shuffle mask
    mova                    m0, [lq]                ; left[0..15]
    mova                    m1, [lq+16]             ; left[16..31]
    mova                    m2, [aq]                ; above[0..15]
    mova                    m3, [aq+16]             ; above[16..31]
    DEFINE_ARGS dst, stride, stride3, cnt           ; l/a registers are reused below
    psadbw                  m0, m4
    psadbw                  m1, m4
    psadbw                  m2, m4
    psadbw                  m3, m4
    lea               stride3q, [strideq*3]
    mov                   cntd, 8                   ; 8 iterations x 4 rows
    paddw                   m0, m1
    paddw                   m2, m3
    paddw                   m0, m2
    movhlps                 m1, m0                  ; bring the high partial sum down
    paddw                   m0, m1                  ; low word = sum of all 64 edge pixels
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_512]            ; (sum + 32) >> 6
    pshufb                  m0, m4                  ; replicate the DC byte
%else
    paddw                   m0, [pw_32]             ; rounding bias
    psraw                   m0, 6                   ; divide by 64
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
.fill_rows:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .fill_rows
    RET
%endmacro

INIT_XMM sse2
DC_16to32_FUNCS
INIT_XMM ssse3
DC_16to32_FUNCS
232
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; 32x32 DC with one 32-byte register per edge and per output row.
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    pxor                    m2, m2                  ; zero SAD reference
    mova                    m0, [lq]                ; 32 left pixels
    mova                    m1, [aq]                ; 32 above pixels
    DEFINE_ARGS dst, stride, stride3, cnt           ; l/a registers are reused below
    psadbw                  m0, m2                  ; four partial sums per register
    psadbw                  m1, m2
    lea               stride3q, [strideq*3]
    mov                   cntd, 4                   ; 4 iterations x 8 rows
    paddw                   m0, m1
    vextracti128           xm1, m0, 1               ; fold the upper 128-bit lane
    paddw                  xm0, xm1
    movhlps                xm1, xm0                 ; fold the upper 64 bits
    paddw                  xm0, xm1                 ; low word = sum of all 64 edge pixels
    pmulhrsw               xm0, [pw_512]            ; (sum + 32) >> 6
    vpbroadcastb            m0, xm0                 ; replicate the DC byte across 32 lanes
.fill_rows:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .fill_rows
    RET
%endif
266
267; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
268
; Single-edge DC for 4x4 and 8x8 blocks.
;   %1 = name suffix (top or left), %2 = argument holding that edge (a or l)
%macro DC_1D_4to8_FUNCS 2 ; dir (top or left), arg (a or l)
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    pxor                    m1, m1                  ; zero: SAD reference and byte-0 shuffle mask
    movd                    m0, [%2q]               ; the 4 edge pixels
    psadbw                  m0, m1                  ; their sum
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_8192]           ; (sum + 2) >> 2
    pshufb                  m0, m1                  ; replicate the DC byte
%else
    paddw                   m0, [pw_2]              ; rounding bias
    psraw                   m0, 2                   ; divide by 4
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movd      [dstq+strideq*0], m0                  ; rows 0-1
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0                  ; rows 2-3
    movd      [dstq+strideq*1], m0
    RET

cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [%2q]               ; the 8 edge pixels (read before l is renamed)
    DEFINE_ARGS dst, stride, stride3
    pxor                    m1, m1                  ; zero: SAD reference and byte-0 shuffle mask
    lea               stride3q, [strideq*3]
    psadbw                  m0, m1                  ; their sum
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_4096]           ; (sum + 4) >> 3
    pshufb                  m0, m1                  ; replicate the DC byte
%else
    paddw                   m0, [pw_4]              ; rounding bias
    psraw                   m0, 3                   ; divide by 8
    punpcklbw               m0, m0
    pshufw                  m0, m0, q0000
%endif
    movq      [dstq+strideq*0], m0                  ; rows 0-3
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0                  ; rows 4-7
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET
%endmacro

INIT_MMX mmxext
DC_1D_4to8_FUNCS top,  a
DC_1D_4to8_FUNCS left, l
INIT_MMX ssse3
DC_1D_4to8_FUNCS top,  a
DC_1D_4to8_FUNCS left, l
323
; Single-edge DC for 16x16 and 32x32 blocks.
;   %1 = name suffix (top or left), %2 = argument holding that edge (a or l)
%macro DC_1D_16to32_FUNCS 2; dir (top or left), arg (a or l)
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]               ; the 16 edge pixels
    DEFINE_ARGS dst, stride, stride3, cnt           ; l/a registers are reused below
    pxor                    m2, m2                  ; zero: SAD reference and byte-0 shuffle mask
    mov                   cntd, 4                   ; 4 iterations x 4 rows
    lea               stride3q, [strideq*3]
    psadbw                  m0, m2                  ; two partial sums
    movhlps                 m1, m0
    paddw                   m0, m1                  ; low word = sum of the 16 pixels
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_2048]           ; (sum + 8) >> 4
    pshufb                  m0, m2                  ; replicate the DC byte
%else
    paddw                   m0, [pw_8]              ; rounding bias
    psraw                   m0, 4                   ; divide by 16
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
.fill_rows:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .fill_rows
    RET

cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]               ; edge[0..15]
    mova                    m1, [%2q+16]            ; edge[16..31]
    DEFINE_ARGS dst, stride, stride3, cnt           ; l/a registers are reused below
    pxor                    m2, m2                  ; zero: SAD reference and byte-0 shuffle mask
    mov                   cntd, 8                   ; 8 iterations x 4 rows
    lea               stride3q, [strideq*3]
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0
    paddw                   m0, m1                  ; low word = sum of the 32 pixels
%if cpuflag(ssse3)
    pmulhrsw                m0, [pw_1024]           ; (sum + 16) >> 5
    pshufb                  m0, m2                  ; replicate the DC byte
%else
    paddw                   m0, [pw_16]             ; rounding bias
    psraw                   m0, 5                   ; divide by 32
    punpcklbw               m0, m0
    pshuflw                 m0, m0, q0000
    punpcklqdq              m0, m0
%endif
.fill_rows:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .fill_rows
    RET
%endmacro

INIT_XMM sse2
DC_1D_16to32_FUNCS top,  a
DC_1D_16to32_FUNCS left, l
INIT_XMM ssse3
DC_1D_16to32_FUNCS top,  a
DC_1D_16to32_FUNCS left, l
397
; Single-edge 32x32 DC using 32-byte registers; expands to nothing when AVX2
; support is not compiled in.
;   %1 = name suffix (top or left), %2 = argument holding that edge (a or l)
%macro DC_1D_AVX2_FUNCS 2 ; dir (top or left), arg (a or l)
%if HAVE_AVX2_EXTERNAL
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]               ; the 32 edge pixels
    DEFINE_ARGS dst, stride, stride3, cnt           ; l/a registers are reused below
    pxor                    m2, m2                  ; zero SAD reference
    mov                   cntd, 4                   ; 4 iterations x 8 rows
    lea               stride3q, [strideq*3]
    psadbw                  m0, m2                  ; four partial sums
    vextracti128           xm1, m0, 1               ; fold the upper 128-bit lane
    paddw                  xm0, xm1
    movhlps                xm1, xm0                 ; fold the upper 64 bits
    paddw                  xm0, xm1                 ; low word = sum of the 32 pixels
    pmulhrsw               xm0, [pw_1024]           ; (sum + 16) >> 5
    vpbroadcastb            m0, xm0                 ; replicate the DC byte across 32 lanes
.fill_rows:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .fill_rows
    RET
%endif
%endmacro

INIT_YMM avx2
DC_1D_AVX2_FUNCS top,  a
DC_1D_AVX2_FUNCS left, l
433
434; v
435
INIT_MMX mmx
; Vertical prediction, 8x8: every output row is a copy of the 8 above pixels.
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]                ; the row to replicate
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    movq      [dstq+strideq*0], m0                  ; rows 0-3
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0                  ; rows 4-7
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET
451
INIT_XMM sse
; Vertical prediction, 16x16: every output row is a copy of the 16 above pixels.
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]                ; the row to replicate
    DEFINE_ARGS dst, stride, stride3, cnt           ; a's register becomes the counter
    mov                   cntd, 4                   ; 4 iterations x 4 rows
    lea               stride3q, [strideq*3]
.copy_rows:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .copy_rows
    RET
467
INIT_XMM sse
; Vertical prediction, 32x32: every output row is a copy of the 32 above pixels,
; held as two 16-byte halves.
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    mova                    m0, [aq]                ; above[0..15]
    mova                    m1, [aq+16]             ; above[16..31]
    DEFINE_ARGS dst, stride, stride3, cnt           ; a's register becomes the counter
    mov                   cntd, 8                   ; 8 iterations x 4 rows
    lea               stride3q, [strideq*3]
.copy_rows:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .copy_rows
    RET
488
INIT_YMM avx
; Vertical prediction, 32x32: one 32-byte register holds the whole above row.
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]                ; the row to replicate
    DEFINE_ARGS dst, stride, stride3, cnt           ; a's register becomes the counter
    mov                   cntd, 4                   ; 4 iterations x 8 rows
    lea               stride3q, [strideq*3]
.copy_rows:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .copy_rows
    RET
509
510; h
511
; Horizontal prediction: each output row is one left pixel replicated across
; the row. The left array is consumed from its highest index down, so output
; row 0 takes l[N-1] and the last row takes l[0].
;   %1 = XMM register count for the 8x8 function
;   %2 = XMM register count for the 16x16 and 32x32 functions
%macro H_XMM_FUNCS 2
%if notcpuflag(avx)
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
    lea               stride3q, [strideq*3]
    movd                    m0, [lq]                ; l[0..3]
%if cpuflag(ssse3)
    pshufb                  m0, [pb_4x3_4x2_4x1_4x0] ; dwords: 4x l[3], 4x l[2], 4x l[1], 4x l[0]
%else
    punpcklbw               m0, m0                  ; each byte doubled into a word
    pshuflw                 m0, m0, q0123           ; reverse the four words
    punpcklwd               m0, m0                  ; each word doubled into a dword
%endif
    movd      [dstq+strideq*0], m0                  ; row 0 = l[3]
    psrldq                  m0, 4
    movd      [dstq+strideq*1], m0                  ; row 1 = l[2]
    psrldq                  m0, 4
    movd      [dstq+strideq*2], m0                  ; row 2 = l[1]
    psrldq                  m0, 4
    movd      [dstq+stride3q ], m0                  ; row 3 = l[0]
    RET
%endif

cglobal vp9_ipred_h_8x8, 3, 5, %1, dst, stride, l, stride3, cnt
%if cpuflag(ssse3)
    mova                    m2, [pb_8x1_8x0]        ; mask: 8x byte 1 | 8x byte 0
    mova                    m3, [pb_8x3_8x2]        ; mask: 8x byte 3 | 8x byte 2
%endif
    mov                   cntq, 1                   ; group index 1, 0: four left pixels each
    lea               stride3q, [strideq*3]
.next_group:
    movd                    m0, [lq+cntq*4]         ; four left pixels of this group
%if cpuflag(ssse3)
    pshufb                  m1, m0, m3              ; m1 = rows for bytes 3 and 2
    pshufb                  m0, m2                  ; m0 = rows for bytes 1 and 0
%else
    punpcklbw               m0, m0
    punpcklwd               m0, m0                  ; each pixel now fills one dword
    pshufd                  m1, m0, q2233           ; m1 = rows for bytes 3 and 2
    pshufd                  m0, m0, q0011           ; m0 = rows for bytes 1 and 0
%endif
    movq      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    movq      [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .next_group
    RET

cglobal vp9_ipred_h_16x16, 3, 5, %2, dst, stride, l, stride3, cnt
%if cpuflag(ssse3)
    pxor                    m4, m4                  ; mask: replicate byte 0
    mova                    m5, [pb_1]              ; mask: replicate byte 1
    mova                    m6, [pb_2]              ; mask: replicate byte 2
    mova                    m7, [pb_3]              ; mask: replicate byte 3
%endif
    mov                   cntq, 3                   ; group index 3..0: four left pixels each
    lea               stride3q, [strideq*3]
.next_group:
    movd                    m3, [lq+cntq*4]         ; four left pixels of this group
    ; build all four rows first, then store them
%if cpuflag(ssse3)
    pshufb                  m0, m3, m7              ; row 0 = byte 3
    pshufb                  m1, m3, m6              ; row 1 = byte 2
    pshufb                  m2, m3, m5              ; row 2 = byte 1
    pshufb                  m3, m4                  ; row 3 = byte 0
%else
    punpcklbw               m3, m3
    punpcklwd               m3, m3                  ; each pixel now fills one dword
    pshufd                  m0, m3, q3333           ; row 0 = byte 3
    pshufd                  m1, m3, q2222           ; row 1 = byte 2
    pshufd                  m2, m3, q1111           ; row 2 = byte 1
    pshufd                  m3, m3, q0000           ; row 3 = byte 0
%endif
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .next_group
    RET

cglobal vp9_ipred_h_32x32, 3, 5, %2, dst, stride, l, stride3, cnt
%if cpuflag(ssse3)
    pxor                    m4, m4                  ; mask: replicate byte 0
    mova                    m5, [pb_1]              ; mask: replicate byte 1
    mova                    m6, [pb_2]              ; mask: replicate byte 2
    mova                    m7, [pb_3]              ; mask: replicate byte 3
%endif
    mov                   cntq, 7                   ; group index 7..0: four left pixels each
    lea               stride3q, [strideq*3]
.next_group:
    movd                    m3, [lq+cntq*4]         ; four left pixels of this group
    ; build all four rows first, then store each one as two 16-byte halves
%if cpuflag(ssse3)
    pshufb                  m0, m3, m7              ; row 0 = byte 3
    pshufb                  m1, m3, m6              ; row 1 = byte 2
    pshufb                  m2, m3, m5              ; row 2 = byte 1
    pshufb                  m3, m4                  ; row 3 = byte 0
%else
    punpcklbw               m3, m3
    punpcklwd               m3, m3                  ; each pixel now fills one dword
    pshufd                  m0, m3, q3333           ; row 0 = byte 3
    pshufd                  m1, m3, q2222           ; row 1 = byte 2
    pshufd                  m2, m3, q1111           ; row 2 = byte 1
    pshufd                  m3, m3, q0000           ; row 3 = byte 0
%endif
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .next_group
    RET
%endmacro

INIT_XMM sse2
H_XMM_FUNCS 2, 4
INIT_XMM ssse3
H_XMM_FUNCS 4, 8
INIT_XMM avx
H_XMM_FUNCS 4, 8
644
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; Horizontal prediction, 32x32: four left pixels per iteration, one 32-byte
; store per output row. l[] is consumed from its highest index down.
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    pxor                    m4, m4                  ; mask: replicate byte 0
    mova                    m5, [pb_1]              ; mask: replicate byte 1
    mova                    m6, [pb_2]              ; mask: replicate byte 2
    mova                    m7, [pb_3]              ; mask: replicate byte 3
    mov                   cntq, 7                   ; group index 7..0
    lea               stride3q, [strideq*3]
.next_group:
    movd                   xm3, [lq+cntq*4]         ; four left pixels of this group
    vinserti128             m3, m3, xm3, 1          ; copy to the upper lane; pshufb is per-lane
    pshufb                  m0, m3, m7              ; row 0 = byte 3
    pshufb                  m1, m3, m6              ; row 1 = byte 2
    pshufb                  m2, m3, m5              ; row 2 = byte 1
    pshufb                  m3, m4                  ; row 3 = byte 0
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .next_group
    RET
%endif
670
671; tm
672
; TM prediction, 4x4: pred[y][x] = clip_u8(l[y'] + a[x] - a[-1]).
; (a[x] - a[-1]) is computed once as words; each loop pass adds two left
; pixels to it and packs with unsigned saturation. l[] is consumed from its
; highest index down, two pixels per pass.
%macro TM_MMX_FUNCS 0
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
    pinsrw                  m2, [aq-1], 0           ; word 0 = {a[-1], a[0]}
    movd                    m0, [aq]                ; a[0..3]
    pxor                    m1, m1
    punpcklbw               m0, m1                  ; a[0..3] widened to words
    DEFINE_ARGS dst, stride, l, cnt                 ; a's register becomes the counter
    mov                   cntq, 1                   ; pair index 1, 0
%if cpuflag(ssse3)
    mova                    m3, [pw_m256]           ; shuffle: replicate byte 0 as words
    mova                    m1, [pw_m255]           ; shuffle: replicate byte 1 as words
    pshufb                  m2, m3                  ; m2 = a[-1] in every word
%else
    punpcklbw               m2, m1                  ; m1 is still zero on this path
    pshufw                  m2, m2, q0000           ; m2 = a[-1] in every word
%endif
    psubw                   m0, m2                  ; m0 = a[x] - a[-1]
.next_pair:
    pinsrw                  m2, [lq+cntq*2], 0      ; word 0 = {l[2*cnt], l[2*cnt+1]}
%if cpuflag(ssse3)
    pshufb                  m4, m2, m1              ; m4 = l[2*cnt+1] in every word
    pshufb                  m2, m3                  ; m2 = l[2*cnt]   in every word
%else
    punpcklbw               m2, m1
    pshufw                  m4, m2, q1111           ; m4 = l[2*cnt+1] in every word
    pshufw                  m2, m2, q0000           ; m2 = l[2*cnt]   in every word
%endif
    paddw                   m4, m0
    paddw                   m2, m0
    packuswb                m4, m4                  ; saturate to 0..255
    packuswb                m2, m2
    movd      [dstq+strideq*0], m4
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .next_pair
    RET
%endmacro

INIT_MMX mmxext
TM_MMX_FUNCS
INIT_MMX ssse3
TM_MMX_FUNCS
716
; TM prediction for 8x8, 16x16 and 32x32 blocks:
;   pred[y][x] = clip_u8(l[y'] + a[x] - a[-1])
; (a[x] - a[-1]) is computed once as words; each loop pass adds two left
; pixels to it and packs with unsigned saturation. l[] is consumed from its
; highest index down, two pixels (two output rows) per pass.
%macro TM_XMM_FUNCS 0
cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
    pinsrw                  m2, [aq-1], 0           ; word 0 = {a[-1], a[0]}
    movh                    m0, [aq]                ; a[0..7]
    pxor                    m1, m1
    punpcklbw               m0, m1                  ; a[0..7] widened to words
    DEFINE_ARGS dst, stride, l, cnt                 ; a's register becomes the counter
    mov                   cntq, 3                   ; pair index 3..0
%if cpuflag(ssse3)
    mova                    m3, [pw_m256]           ; shuffle: replicate byte 0 as words
    mova                    m1, [pw_m255]           ; shuffle: replicate byte 1 as words
    pshufb                  m2, m3                  ; m2 = a[-1] in every word
%else
    punpcklbw               m2, m1                  ; m1 is still zero on this path
    punpcklwd               m2, m2
    pshufd                  m2, m2, q0000           ; m2 = a[-1] in every word
%endif
    psubw                   m0, m2                  ; m0 = a[x] - a[-1]
.next_pair:
    pinsrw                  m2, [lq+cntq*2], 0      ; word 0 = {l[2*cnt], l[2*cnt+1]}
%if cpuflag(ssse3)
    pshufb                  m4, m2, m1              ; m4 = l[2*cnt+1] in every word
    pshufb                  m2, m3                  ; m2 = l[2*cnt]   in every word
%else
    punpcklbw               m2, m1
    punpcklwd               m2, m2
    pshufd                  m4, m2, q1111           ; m4 = l[2*cnt+1] in every word
    pshufd                  m2, m2, q0000           ; m2 = l[2*cnt]   in every word
%endif
    paddw                   m4, m0
    paddw                   m2, m0
    packuswb                m4, m2                  ; low half = first row, high half = second row
    movh      [dstq+strideq*0], m4
    movhps    [dstq+strideq*1], m4
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .next_pair
    RET

cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
    pinsrw                  m2, [aq-1], 0           ; word 0 = {a[-1], a[0]}
    mova                    m0, [aq]                ; a[0..15]
    pxor                    m3, m3
    punpckhbw               m1, m0, m3              ; a[8..15] as words
    punpcklbw               m0, m3                  ; a[0..7]  as words
    DEFINE_ARGS dst, stride, l, cnt                 ; a's register becomes the counter
    mov                   cntq, 7                   ; pair index 7..0
%if cpuflag(ssse3)
    mova                    m4, [pw_m256]           ; shuffle: replicate byte 0 as words
    mova                    m3, [pw_m255]           ; shuffle: replicate byte 1 as words
    pshufb                  m2, m4                  ; m2 = a[-1] in every word
%else
    punpcklbw               m2, m3                  ; m3 is still zero on this path
    punpcklwd               m2, m2
    pshufd                  m2, m2, q0000           ; m2 = a[-1] in every word
%endif
    psubw                   m0, m2                  ; m0 = a[0..7]  - a[-1]
    psubw                   m1, m2                  ; m1 = a[8..15] - a[-1]
.next_pair:
    pinsrw                  m7, [lq+cntq*2], 0      ; word 0 = {l[2*cnt], l[2*cnt+1]}
%if cpuflag(ssse3)
    pshufb                  m5, m7, m3              ; m5 = l[2*cnt+1] in every word
    pshufb                  m7, m4                  ; m7 = l[2*cnt]   in every word
%else
    punpcklbw               m7, m3
    punpcklwd               m7, m7
    pshufd                  m5, m7, q1111           ; m5 = l[2*cnt+1] in every word
    pshufd                  m7, m7, q0000           ; m7 = l[2*cnt]   in every word
%endif
    paddw                   m2, m5, m0              ; first row, columns 0..7
    paddw                   m6, m7, m0              ; second row, columns 0..7
    paddw                   m5, m1                  ; first row, columns 8..15
    paddw                   m7, m1                  ; second row, columns 8..15
    packuswb                m2, m5
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .next_pair
    RET

; 32x32 needs four difference vectors: x86-64 keeps them in m8-m11, 32-bit
; builds spill them to 64 bytes of stack requested through "mem".
%if ARCH_X86_64
%define mem 0
%else
%define mem 64
%endif
cglobal vp9_ipred_tm_32x32, 4, 4, 14, mem, dst, stride, l, a
    pxor                    m5, m5
    pinsrw                  m4, [aq-1], 0           ; word 0 = {a[-1], a[0]}
    mova                    m0, [aq]                ; a[0..15]
    mova                    m2, [aq+16]             ; a[16..31]
    DEFINE_ARGS dst, stride, l, cnt                 ; a's register becomes the counter
%if cpuflag(ssse3)
%if ARCH_X86_64
    mova                   m12, [pw_m256]           ; shuffle masks stay in registers
    mova                   m13, [pw_m255]
%define pw_m256_reg m12
%define pw_m255_reg m13
%else
%define pw_m256_reg [pw_m256]
%define pw_m255_reg [pw_m255]
%endif
    pshufb                  m4, pw_m256_reg         ; m4 = a[-1] in every word
%else
    punpcklbw               m4, m5
    punpcklwd               m4, m4
    pshufd                  m4, m4, q0000           ; m4 = a[-1] in every word
%endif
    punpckhbw               m1, m0,  m5             ; a[8..15]  as words
    punpckhbw               m3, m2,  m5             ; a[24..31] as words
    punpcklbw               m0, m5                  ; a[0..7]   as words
    punpcklbw               m2, m5                  ; a[16..23] as words
    psubw                   m0, m4                  ; the four (a[x] - a[-1]) vectors
    psubw                   m1, m4
    psubw                   m2, m4
    psubw                   m3, m4
%if ARCH_X86_64
    ; rename so the differences live in m8-m11 and m0-m3 are free scratch
    SWAP                     0, 8
    SWAP                     1, 9
    SWAP                     2, 10
    SWAP                     3, 11
%else
    ; not enough registers: keep the differences on the stack
    mova            [rsp+0*16], m0
    mova            [rsp+1*16], m1
    mova            [rsp+2*16], m2
    mova            [rsp+3*16], m3
%endif
    mov                   cntq, 15                  ; pair index 15..0
.next_pair:
    pinsrw                  m3, [lq+cntq*2], 0      ; word 0 = {l[2*cnt], l[2*cnt+1]}
%if cpuflag(ssse3)
    pshufb                  m7, m3, pw_m255_reg     ; m7 = l[2*cnt+1] in every word
    pshufb                  m3, pw_m256_reg         ; m3 = l[2*cnt]   in every word
%else
    pxor                    m7, m7
    punpcklbw               m3, m7
    punpcklwd               m3, m3
    pshufd                  m7, m3, q1111           ; m7 = l[2*cnt+1] in every word
    pshufd                  m3, m3, q0000           ; m3 = l[2*cnt]   in every word
%endif
    ; m4-m7 = first row, m0-m3 = second row, four 8-pixel groups each
%if ARCH_X86_64
    paddw                   m4, m7, m8
    paddw                   m5, m7, m9
    paddw                   m6, m7, m10
    paddw                   m7, m11
    paddw                   m0, m3, m8
    paddw                   m1, m3, m9
    paddw                   m2, m3, m10
    paddw                   m3, m11
%else
    paddw                   m4, m7, [rsp+0*16]
    paddw                   m5, m7, [rsp+1*16]
    paddw                   m6, m7, [rsp+2*16]
    paddw                   m7, [rsp+3*16]
    paddw                   m0, m3, [rsp+0*16]
    paddw                   m1, m3, [rsp+1*16]
    paddw                   m2, m3, [rsp+2*16]
    paddw                   m3, [rsp+3*16]
%endif
    packuswb                m4, m5                  ; first row, columns 0..15
    packuswb                m6, m7                  ; first row, columns 16..31
    packuswb                m0, m1                  ; second row, columns 0..15
    packuswb                m2, m3                  ; second row, columns 16..31
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m6
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .next_pair
    RET
%undef pw_m256_reg
%undef pw_m255_reg
%undef mem
%endmacro

INIT_XMM sse2
TM_XMM_FUNCS
INIT_XMM ssse3
TM_XMM_FUNCS
INIT_XMM avx
TM_XMM_FUNCS
900
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; TM prediction, 32x32: pred[y][x] = clip_u8(l[y'] + a[x] - a[-1]).
; The unpack and pack steps both work per 128-bit lane, so the lane-interleaved
; word layout produced by punpck{l,h}bw is undone again by packuswb.
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
    pinsrw                 xm2, [aq-1], 0           ; word 0 = {a[-1], a[0]}
    vinserti128             m2, m2, xm2, 1          ; same word in the upper lane
    mova                    m0, [aq]                ; a[0..31]
    pxor                    m3, m3
    DEFINE_ARGS dst, stride, l, cnt                 ; a's register becomes the counter
    mov                   cntq, 15                  ; pair index 15..0
    mova                    m4, [pw_m256]           ; shuffle: replicate byte 0 as words
    mova                    m5, [pw_m255]           ; shuffle: replicate byte 1 as words
    pshufb                  m2, m4                  ; m2 = a[-1] in every word
    punpckhbw               m1, m0, m3              ; high 8 bytes of each lane as words
    punpcklbw               m0, m3                  ; low 8 bytes of each lane as words
    psubw                   m0, m2                  ; (a[x] - a[-1]), low halves
    psubw                   m1, m2                  ; (a[x] - a[-1]), high halves
.next_pair:
    pinsrw                 xm7, [lq+cntq*2], 0      ; word 0 = {l[2*cnt], l[2*cnt+1]}
    vinserti128             m7, m7, xm7, 1          ; same word in the upper lane
    pshufb                  m3, m7, m5              ; m3 = l[2*cnt+1] in every word
    pshufb                  m7, m4                  ; m7 = l[2*cnt]   in every word
    paddw                   m2, m3, m0              ; first row, low halves
    paddw                   m6, m7, m0              ; second row, low halves
    paddw                   m3, m1                  ; first row, high halves
    paddw                   m7, m1                  ; second row, high halves
    packuswb                m2, m3
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .next_pair
    RET
%endif
935
936; dl
937
; 3-tap smoothing filter on bytes: m%1 = (m%1 + 2 * m%2 + m%3 + 2) >> 2.
;   %1 = left tap, overwritten with the result
;   %2 = centre tap
;   %3 = right tap
;   %4 = scratch register, clobbered
; pavgb rounds up, so the low bit of (left ^ right) is subtracted to turn the
; first average into floor((left + right) / 2) before averaging with the centre.
%macro LOWPASS 4 ; left [dst], center, right, tmp
    pxor    m%4, m%1, m%3               ; tmp = left ^ right
    pand    m%4, [pb_1]                 ; tmp = 1 where left + right is odd
    pavgb   m%1, m%3                    ; (left + right + 1) >> 1
    psubusb m%1, m%4                    ; correct to (left + right) >> 1
    pavgb   m%1, m%2                    ; average with the centre tap, rounding up
%endmacro
945
; Down-left prediction, 4x4: the above row a[0..7] is low-pass filtered once
; (with a[7] repeated past the end), then row y is the filtered vector
; starting at byte y.
%macro DL_MMX_FUNCS 0
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq]                ; 01234567
%if cpuflag(ssse3)
    pshufb                  m0, m1, [pb_0to5_2x7]   ; 01234577 (left tap)
    pshufb                  m2, m1, [pb_2to6_3x7]   ; 23456777 (right tap)
%else
    punpckhbw               m3, m1, m1              ; 44556677
    pand                    m0, m1, [pb_6xm1_2x0]   ; 012345__
    pand                    m3, [pb_6x0_2xm1]       ; ______77
    psrlq                   m2, m1, 16              ; 234567__
    por                     m0, m3                  ; 01234577 (left tap)
    por                     m2, m3                  ; 23456777 (right tap)
%endif
    psrlq                   m1, 8                   ; 1234567_ (centre tap)
    LOWPASS                  0, 1, 2, 3             ; m0 = filtered edge, m3 clobbered

    pshufw                  m1, m0, q3321           ; m1 = m0 shifted down by two bytes
    movd      [dstq+strideq*0], m0                  ; row 0: filtered bytes 0..3
    movd      [dstq+strideq*2], m1                  ; row 2: filtered bytes 2..5
    psrlq                   m0, 8
    psrlq                   m1, 8
    add                   dstq, strideq             ; switch to the odd rows
    movd      [dstq+strideq*0], m0                  ; row 1: filtered bytes 1..4
    movd      [dstq+strideq*2], m1                  ; row 3: filtered bytes 3..6
    RET
%endmacro

INIT_MMX mmxext
DL_MMX_FUNCS
INIT_MMX ssse3
DL_MMX_FUNCS
978
; DL_XMM_FUNCS: emits the XMM "dl" predictors (8x8, 16x16, 32x32) for the
; instruction set chosen by the preceding INIT_XMM.
979%macro DL_XMM_FUNCS 0
; vp9_ipred_dl_8x8(dst, stride, <unused>, a)
;   Reads 8 bytes at a, extends them with copies of the last byte, smooths the
;   result with LOWPASS and stores eight 8-byte rows; row r holds filtered
;   bytes r..r+7.  The third argument's register is not read as a pointer: it
;   is named stride5 and used as scratch for 5*stride.
980cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
981    movq                    m0, [aq]                ; 01234567 in the low half, high half zero
982    lea               stride5q, [strideq*5]
983%if cpuflag(ssse3)
984    pshufb                  m1, m0, [pb_1to6_10x7]  ; 123456 followed by ten copies of 7
985%else
986    punpcklbw               m1, m0, m0              ; 0011223344556677
987    punpckhwd               m1, m1                  ; 4x4,4x5,4x6,4x7
988%endif
989    shufps                  m0, m1, q3310           ; m0 = 01234567 followed by eight copies of 7
990%if notcpuflag(ssse3)
    ; without pshufb, derive the "advanced by one" view from m0 itself
991    psrldq                  m1, m0, 1
992    shufps                  m1, m0, q3210
993%endif
994    psrldq                  m2, m1, 1               ; view advanced by two bytes
995    LOWPASS                  0, 1, 2, 3             ; m0 = filtered bytes
996
997    pshufd                  m1, m0, q3321           ; dwords 1,2,3,3: m0 advanced by 4 bytes
998    movq      [dstq+strideq*0], m0                  ; row 0
999    movq      [dstq+strideq*4], m1                  ; row 4
1000    psrldq                  m0, 1
1001    psrldq                  m1, 1
1002    movq      [dstq+strideq*1], m0                  ; row 1
1003    movq      [dstq+stride5q ], m1                  ; row 5
1004    lea                   dstq, [dstq+strideq*2]    ; rebase to row 2
1005    psrldq                  m0, 1
1006    psrldq                  m1, 1
1007    movq      [dstq+strideq*0], m0                  ; row 2
1008    movq      [dstq+strideq*4], m1                  ; row 6
1009    psrldq                  m0, 1
1010    psrldq                  m1, 1
1011    movq      [dstq+strideq*1], m0                  ; row 3
1012    movq      [dstq+stride5q ], m1                  ; row 7
1013    RET
1014
; vp9_ipred_dl_16x16(dst, stride, l, a)
;   Reads 16 bytes at a, forms the views advanced by one and two bytes with
;   the last byte repeated, smooths them with LOWPASS and stores sixteen
;   16-byte rows.  Each loop pass writes rows r, r+8, r+1 and r+9; after every
;   row the filtered vector is advanced by one byte with byte F repeated.
;   l is not read; its register and a's are renamed to cnt/stride9 by
;   DEFINE_ARGS once a has been consumed.
1015cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
1016    mova                    m0, [aq]
1017%if cpuflag(ssse3)
1018    mova                    m5, [pb_1toE_2xF]           ; shuffle mask: advance one byte, repeat last
1019    pshufb                  m1, m0, m5                  ; 123456789ABCDEFF
1020    pshufb                  m2, m1, m5                  ; 23456789ABCDEFFF
    ; pb_15 is defined outside this section; presumably index 15 in every
    ; byte, making m4 a broadcast of byte F.
1021    pshufb                  m4, m0, [pb_15]
1022%else
1023    pand                    m5, m0, [pb_15x0_1xm1]      ; _______________F
1024    psrldq                  m1, m0, 1                   ; 123456789ABCDEF_
1025    por                     m1, m5                      ; 123456789ABCDEFF
1026    psrldq                  m2, m1, 1                   ; 23456789ABCDEFF_
1027    por                     m2, m5                      ; 23456789ABCDEFFF
1028    pshufhw                 m4, m1, q3333               ; xxxxxxxxFFFFFFFF
1029%endif
1030    LOWPASS                  0, 1, 2, 3                 ; m0 = filtered bytes
1031    DEFINE_ARGS dst, stride, cnt, stride9
1032    lea               stride9q, [strideq+strideq*8]
1033    mov                   cntd, 4                       ; 4 passes x (2 rows + 2 rows eight below)
1034
1035.loop:
1036    movhlps                 m4, m0                      ; m4 = high half of m0 | F fill
1037    mova      [dstq+strideq*0], m0                      ; row r
1038%if cpuflag(ssse3)
1039    pshufb                  m0, m5
1040%else
1041    psrldq                  m0, 1
1042    por                     m0, m5
1043%endif
1044    mova      [dstq+strideq*8], m4                      ; row r+8
1045    movhlps                 m4, m0
1046    mova      [dstq+strideq*1], m0                      ; row r+1
1047%if cpuflag(ssse3)
1048    pshufb                  m0, m5
1049%else
1050    psrldq                  m0, 1
1051    por                     m0, m5
1052%endif
1053    mova      [dstq+stride9q ], m4                      ; row r+9
1054    lea                   dstq, [dstq+strideq*2]
1055    dec                   cntd
1056    jg .loop
1057    RET
1058
; vp9_ipred_dl_32x32(dst, stride, <unused>, a)
;   Reads 32 bytes at a, smooths them with LOWPASS in two 16-byte halves
;   (m0 = bytes 0..15, m1 = bytes 16..31, last byte repeated past the end) and
;   stores thirty-two 32-byte rows.  Each loop pass writes rows r, r+8, r+16
;   and r+24 from the same registers, then advances the 32-byte vector by one
;   byte.  The third argument's register is used as the loop counter and a
;   fifth GPR (dst16) addresses the lower half of the block.
1059cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
1060    mova                    m0, [aq]
1061    mova                    m1, [aq+16]
1062    PALIGNR                 m2, m1, m0, 1, m4           ; bytes 1..16
1063    PALIGNR                 m3, m1, m0, 2, m4           ; bytes 2..17
1064    LOWPASS                  0, 2, 3, 4                 ; m0 = filtered bytes 0..15
1065%if cpuflag(ssse3)
1066    mova                    m5, [pb_1toE_2xF]           ; shuffle mask: advance one byte, repeat last
1067    pshufb                  m2, m1, m5
1068    pshufb                  m3, m2, m5
    ; pb_15 is defined outside this section; presumably it broadcasts byte F.
1069    pshufb                  m6, m1, [pb_15]
1070    mova                    m7, m6
1071%else
1072    pand                    m5, m1, [pb_15x0_1xm1]      ; _______________F
1073    psrldq                  m2, m1, 1                   ; 123456789ABCDEF_
1074    por                     m2, m5                      ; 123456789ABCDEFF
1075    psrldq                  m3, m2, 1                   ; 23456789ABCDEFF_
1076    por                     m3, m5                      ; 23456789ABCDEFFF
1077    pshufhw                 m7, m2, q3333               ; xxxxxxxxFFFFFFFF
1078    pshufd                  m6, m7, q3333
1079%endif
1080    LOWPASS                  1, 2, 3, 4                 ; m1 = filtered bytes 16..31
    ; dst16 = dst + 16*stride, built in two steps of 8*stride
1081    lea                 dst16q, [dstq  +strideq*8]
1082    mov                   cntd, 8
1083    lea                 dst16q, [dst16q+strideq*8]
1084.loop:
1085    movhlps                 m7, m1                      ; m7 = high half of m1 | fill
    ; row r: the current 32 bytes
1086    mova [dstq  +strideq*0+ 0], m0
1087    mova [dstq  +strideq*0+16], m1
    ; row r+8: the same data advanced by 8 bytes
1088    movhps [dstq+strideq*8+ 0], m0
1089    movq [dstq  +strideq*8+ 8], m1
1090    mova [dstq  +strideq*8+16], m7
    ; row r+16: advanced by 16 bytes, right half is pure fill
1091    mova [dst16q+strideq*0+ 0], m1
1092    mova [dst16q+strideq*0+16], m6
    ; row r+24: advanced by 24 bytes
1093    mova [dst16q+strideq*8+ 0], m7
1094    mova [dst16q+strideq*8+16], m6
    ; advance m0:m1 by one byte, repeating the last byte of m1
1095%if cpuflag(avx)
1096    vpalignr                m0, m1, m0, 1
1097    pshufb                  m1, m5
1098%elif cpuflag(ssse3)
1099    palignr                 m2, m1, m0, 1
1100    pshufb                  m1, m5
1101    mova                    m0, m2
1102%else
1103    mova                    m4, m1
1104    psrldq                  m0, 1
1105    pslldq                  m4, 15                      ; lowest byte of m1 moved to the top position
1106    psrldq                  m1, 1
1107    por                     m0, m4
1108    por                     m1, m5
1109%endif
1110    add                   dstq, strideq
1111    add                 dst16q, strideq
1112    dec                   cntd
1113    jg .loop
1114    RET
1115%endmacro
1116
; Instantiate the XMM "dl" predictors once per supported instruction set.
1117INIT_XMM sse2
1118DL_XMM_FUNCS
1119INIT_XMM ssse3
1120DL_XMM_FUNCS
1121INIT_XMM avx
1122DL_XMM_FUNCS
1123
1124; dr
1125
; DR_MMX_FUNCS: emits vp9_ipred_dr_4x4 for the instruction set chosen by the
; preceding INIT_MMX (instantiated below for mmxext and for ssse3).
;
; vp9_ipred_dr_4x4(dst, stride, l, a)
;   Builds one edge vector from 4 bytes at l followed by the bytes starting at
;   a-1, smooths it with LOWPASS and stores four 4-byte rows.  Row 3 holds
;   filtered bytes 0..3 and each row above it starts one byte later.
1126%macro DR_MMX_FUNCS 0
1127cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
1128    movd                    m0, [lq]                ; l[0..3]
1129    punpckldq               m0, [aq-1]              ; m0 = l[0..3], a[-1..2]
1130    movd                    m1, [aq+3]
    ; l and a are no longer needed; reuse a register for 3*stride
1131    DEFINE_ARGS dst, stride, stride3
1132    lea               stride3q, [strideq*3]
1133    PALIGNR                 m1, m0, 1, m3           ; edge vector advanced by one byte
1134    psrlq                   m2, m1, 8               ; edge vector advanced by two bytes
1135    LOWPASS                  0, 1, 2, 3             ; m0 = filtered bytes
1136
1137    movd      [dstq+stride3q ], m0                  ; row 3
1138    psrlq                   m0, 8
1139    movd      [dstq+strideq*2], m0                  ; row 2
1140    psrlq                   m0, 8
1141    movd      [dstq+strideq*1], m0                  ; row 1
1142    psrlq                   m0, 8
1143    movd      [dstq+strideq*0], m0                  ; row 0
1144    RET
1145%endmacro
1146
; Instantiate the 4x4 routine once per supported instruction set.
1147INIT_MMX mmxext
1148DR_MMX_FUNCS
1149INIT_MMX ssse3
1150DR_MMX_FUNCS
1151
; DR_XMM_FUNCS: emits the XMM "dr" predictors (8x8, 16x16, 32x32) for the
; instruction set chosen by the preceding INIT_XMM.
1152%macro DR_XMM_FUNCS 0
; vp9_ipred_dr_8x8(dst, stride, l, a)
;   Builds a 16-byte edge vector (8 bytes at l, then 8 bytes starting at a-1),
;   smooths it with LOWPASS using its one-byte-delayed and one-byte-advanced
;   copies, and stores eight 8-byte rows.  Row 0 is the high half of the
;   filtered vector; each following row is taken after shifting the vector up
;   by one more byte.
1153cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
1154    movq                    m1, [lq]
1155    movhps                  m1, [aq-1]              ; m1 = l[0..7], a[-1..6]
1156    movd                    m2, [aq+7]
    ; l and a are no longer needed; reuse a register for 3*stride
1157    DEFINE_ARGS dst, stride, stride3
1158    lea               stride3q, [strideq*3]
1159    pslldq                  m0, m1, 1               ; edge delayed by one byte (zero shifted in)
1160    PALIGNR                 m2, m1, 1, m3           ; edge advanced by one byte
1161    LOWPASS                  0, 1, 2, 3             ; m0 = filtered bytes
1162
1163    movhps    [dstq+strideq*0], m0                  ; row 0
1164    pslldq                  m0, 1
1165    movhps    [dstq+strideq*1], m0                  ; row 1
1166    pslldq                  m0, 1
1167    movhps    [dstq+strideq*2], m0                  ; row 2
1168    pslldq                  m0, 1
1169    movhps    [dstq+stride3q ], m0                  ; row 3
1170    pslldq                  m0, 1
1171    lea                   dstq, [dstq+strideq*4]    ; rebase to row 4
1172    movhps    [dstq+strideq*0], m0                  ; row 4
1173    pslldq                  m0, 1
1174    movhps    [dstq+strideq*1], m0                  ; row 5
1175    pslldq                  m0, 1
1176    movhps    [dstq+strideq*2], m0                  ; row 6
1177    pslldq                  m0, 1
1178    movhps    [dstq+stride3q ], m0                  ; row 7
1179    RET
1180
; vp9_ipred_dr_16x16(dst, stride, l, a)
;   Smooths the 32-byte edge formed by 16 bytes at l followed by the bytes
;   starting at a-1.  m0 receives the filtered l part, m3 the filtered a part.
;   Each loop pass stores rows r and r+8 and then rows r+1 and r+9, shifting
;   the m3:m0 pair up by one byte after every row.
1181cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
1182    mova                    m1, [lq]
1183    movu                    m2, [aq-1]
1184    movd                    m4, [aq+15]
    ; l and a have been consumed; their registers become stride9 and cnt
1185    DEFINE_ARGS dst, stride, stride9, cnt
1186    lea               stride9q, [strideq *3]
1187    mov                   cntd, 4
1188    lea               stride9q, [stride9q*3]        ; 3*3 = 9 * stride
1189    PALIGNR                 m4, m2, 1, m5           ; a[0..15]
1190    PALIGNR                 m3, m2, m1, 15, m5      ; l[15], a[-1..13]
1191    LOWPASS                  3,  2, 4, 5            ; m3 = filtered, centered on a[-1..14]
1192    pslldq                  m0, m1, 1               ; 0, l[0..14]
1193    PALIGNR                 m2, m1, 1, m4           ; l[1..15], a[-1]
1194    LOWPASS                  0,  1, 2, 4            ; m0 = filtered, centered on l[0..15]
1195
1196.loop:
1197    mova    [dstq+strideq*0  ], m3                  ; row r
1198    movhps  [dstq+strideq*8+0], m0                  ; row r+8: high half of m0 ...
1199    movq    [dstq+strideq*8+8], m3                  ; ... followed by low half of m3
1200    PALIGNR                 m3, m0, 15, m1          ; shift m3:m0 up by one byte
1201    pslldq                  m0, 1
1202    mova    [dstq+strideq*1  ], m3                  ; row r+1
1203    movhps  [dstq+stride9q +0], m0                  ; row r+9
1204    movq    [dstq+stride9q +8], m3
1205    PALIGNR                 m3, m0, 15, m1
1206    pslldq                  m0, 1
1207    lea                   dstq, [dstq+strideq*2]
1208    dec                   cntd
1209    jg .loop
1210    RET
1211
; vp9_ipred_dr_32x32(dst, stride, l, a)
;   Smooths the 64-byte edge formed by 32 bytes at l followed by the bytes
;   starting at a-1, 16 bytes at a time, leaving the filtered result in
;   m2 (lowest) .. m5 (highest).  Each of the 16 loop passes stores row r from
;   m4|m5 and row r+16 from m3|m4, then shifts the whole m5:m4:m3:m2 chain up
;   by one byte.
1212cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
1213    mova                    m1, [lq]
1214    mova                    m2, [lq+16]
1215    movu                    m3, [aq-1]
1216    movu                    m4, [aq+15]
1217    movd                    m5, [aq+31]
    ; l and a have been consumed; their registers become stride8 and cnt
1218    DEFINE_ARGS dst, stride, stride8, cnt
1219    lea               stride8q, [strideq*8]
    ; For each 16-byte piece: build the advanced-by-one and delayed-by-one
    ; neighbours with PALIGNR, then filter.  Pieces are processed from the
    ; highest to the lowest so that unfiltered inputs are still available.
1220    PALIGNR                 m5, m4, 1, m7
1221    PALIGNR                 m6, m4, m3, 15, m7
1222    LOWPASS                  5,  4,  6,  7          ; m5 centered on a[15..30]
1223    PALIGNR                 m4, m3, 1, m7
1224    PALIGNR                 m6, m3, m2, 15, m7
1225    LOWPASS                  4,  3,  6,  7          ; m4 centered on a[-1..14]
1226    PALIGNR                 m3, m2, 1, m7
1227    PALIGNR                 m6, m2, m1, 15, m7
1228    LOWPASS                  3,  2,  6,  7          ; m3 centered on l[16..31]
1229    PALIGNR                 m2, m1, 1, m6
1230    pslldq                  m0, m1, 1
1231    LOWPASS                  2,  1,  0,  6          ; m2 centered on l[0..15]
1232    mov                   cntd, 16
1233
1234    ; out=m2/m3/m4/m5
1235.loop:
1236    mova  [dstq+stride8q*0+ 0], m4                  ; row r
1237    mova  [dstq+stride8q*0+16], m5
1238    mova  [dstq+stride8q*2+ 0], m3                  ; row r+16 (2 * 8 * stride)
1239    mova  [dstq+stride8q*2+16], m4
1240    PALIGNR                 m5, m4, 15, m6          ; shift the chain up by one byte
1241    PALIGNR                 m4, m3, 15, m6
1242    PALIGNR                 m3, m2, 15, m6
1243    pslldq                  m2, 1
1244    add                   dstq, strideq
1245    dec                   cntd
1246    jg .loop
1247    RET
1248%endmacro
1249
; Instantiate the XMM "dr" predictors once per supported instruction set.
1250INIT_XMM sse2
1251DR_XMM_FUNCS
1252INIT_XMM ssse3
1253DR_XMM_FUNCS
1254INIT_XMM avx
1255DR_XMM_FUNCS
1256
1257; vl
1258
; vp9_ipred_vl_4x4(dst, stride, l, a)   (mmxext only)
;   Reads 8 bytes at a and derives two vectors from them:
;     m1 = pavgb of bytes i and i+1             (2-tap rounded average)
;     m2 = LOWPASS of bytes i, i+1 and i+2      (3-tap filter)
;   Rows 0 and 2 come from m1, rows 1 and 3 from m2; rows 2/3 start one byte
;   later than rows 0/1.  The l argument is not read.
1259INIT_MMX mmxext
1260cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
1261    movq                    m0, [aq]
1262    psrlq                   m1, m0, 8               ; bytes advanced by one
1263    psrlq                   m2, m1, 8               ; bytes advanced by two
1264    LOWPASS                  2,  1, 0, 3
1265    pavgb                   m1, m0
1266    movd      [dstq+strideq*0], m1                  ; row 0
1267    movd      [dstq+strideq*1], m2                  ; row 1
1268    lea                   dstq, [dstq+strideq*2]
1269    psrlq                   m1, 8
1270    psrlq                   m2, 8
1271    movd      [dstq+strideq*0], m1                  ; row 2
1272    movd      [dstq+strideq*1], m2                  ; row 3
1273    RET
1274
; VL_XMM_FUNCS: emits the XMM "vl" predictors (8x8, 16x16, 32x32) for the
; instruction set chosen by the preceding INIT_XMM.
1275%macro VL_XMM_FUNCS 0
; vp9_ipred_vl_8x8(dst, stride, l, a)
;   Reads 8 bytes at a and extends them to 16 by repeating the last byte.
;   m1 becomes the 2-tap rounded average of bytes i and i+1, m2 the 3-tap
;   LOWPASS of bytes i, i+1, i+2.  Even rows are stored from m1 and odd rows
;   from m2; both are advanced by one byte after every pair of rows.
;   The l argument is not read.
1276cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
1277    movq                    m0, [aq]
1278%if cpuflag(ssse3)
1279    pshufb                  m0, [pb_0to6_9x7]       ; 0123456 followed by nine copies of 7
1280%else
    ; same extension without pshufb: take the top dword of 4444555566667777
1281    punpcklbw               m1, m0, m0
1282    punpckhwd               m1, m1
1283    shufps                  m0, m1, q3310
1284%endif
    ; a has been consumed; reuse a register for 3*stride
1285    DEFINE_ARGS dst, stride, stride3
1286    lea               stride3q, [strideq*3]
1287    psrldq                  m1, m0, 1
1288    psrldq                  m2, m0, 2
1289    LOWPASS                  2,  1,  0,  3
1290    pavgb                   m1, m0
1291
1292    movq      [dstq+strideq*0], m1                  ; row 0
1293    movq      [dstq+strideq*1], m2                  ; row 1
1294    psrldq                  m1, 1
1295    psrldq                  m2, 1
1296    movq      [dstq+strideq*2], m1                  ; row 2
1297    movq      [dstq+stride3q ], m2                  ; row 3
1298    lea                   dstq, [dstq+strideq*4]    ; rebase to row 4
1299    psrldq                  m1, 1
1300    psrldq                  m2, 1
1301    movq      [dstq+strideq*0], m1                  ; row 4
1302    movq      [dstq+strideq*1], m2                  ; row 5
1303    psrldq                  m1, 1
1304    psrldq                  m2, 1
1305    movq      [dstq+strideq*2], m1                  ; row 6
1306    movq      [dstq+stride3q ], m2                  ; row 7
1307    RET
1308
; vp9_ipred_vl_16x16(dst, stride, l, a)
;   Reads 16 bytes at a.  m1 becomes the 2-tap rounded average of bytes i and
;   i+1, m2 the 3-tap LOWPASS of bytes i, i+1, i+2 (last byte repeated past
;   the end).  Each loop pass stores four rows (m1, m2, then both advanced by
;   one byte) and advances both vectors again for the next pass.
;   The l argument is not read.
1309cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
1310    mova                    m0, [aq]
    ; a has been consumed; l's and a's registers become stride3 and cnt
1311    DEFINE_ARGS dst, stride, stride3, cnt
1312    lea               stride3q, [strideq*3]
1313%if cpuflag(ssse3)
1314    mova                    m4, [pb_1toE_2xF]       ; shuffle mask: advance one byte, repeat last
1315    pshufb                  m1, m0, m4
1316    pshufb                  m2, m1, m4
1317%else
1318    pand                    m4, m0, [pb_15x0_1xm1]  ; _______________F
1319    psrldq                  m1, m0, 1               ; 123456789ABCDEF_
1320    por                     m1, m4                  ; 123456789ABCDEFF
1321    psrldq                  m2, m1, 1               ; 23456789ABCDEFF_
1322    por                     m2, m4                  ; 23456789ABCDEFFF
1323%endif
1324    LOWPASS                  2,  1,  0, 3
1325    pavgb                   m1, m0
1326    mov                   cntd, 4                   ; 4 passes x 4 rows
1327.loop:
1328    mova      [dstq+strideq*0], m1
1329    mova      [dstq+strideq*1], m2
    ; advance both vectors by one byte, keeping byte F as fill
1330%if cpuflag(ssse3)
1331    pshufb                  m1, m4
1332    pshufb                  m2, m4
1333%else
1334    psrldq                  m1, 1
1335    psrldq                  m2, 1
1336    por                     m1, m4
1337    por                     m2, m4
1338%endif
1339    mova      [dstq+strideq*2], m1
1340    mova      [dstq+stride3q ], m2
1341%if cpuflag(ssse3)
1342    pshufb                  m1, m4
1343    pshufb                  m2, m4
1344%else
1345    psrldq                  m1, 1
1346    psrldq                  m2, 1
1347    por                     m1, m4
1348    por                     m2, m4
1349%endif
1350    lea                   dstq, [dstq+strideq*4]
1351    dec                   cntd
1352    jg .loop
1353    RET
1354
; vp9_ipred_vl_32x32(dst, stride, l, a)
;   Reads 32 bytes at a and produces, for each 16-byte half, a 2-tap rounded
;   average (m2 low half, m0 high half) and a 3-tap LOWPASS (m3 low half, m1
;   high half); the last input byte is repeated past the end.  m5 ends up as
;   the fill vector.  Each loop pass stores rows r and r+1 in the upper half
;   and, through dst16, rows r+16 and r+17 built from the same registers
;   advanced by 8 bytes.  The l argument is not read.
1355cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
1356    mova                    m0, [aq]
1357    mova                    m5, [aq+16]
    ; a has been consumed; l's and a's registers become dst16 and cnt
1358    DEFINE_ARGS dst, stride, dst16, cnt
1359    PALIGNR                 m2, m5, m0, 1, m4       ; bytes 1..16
1360    PALIGNR                 m3, m5, m0, 2, m4       ; bytes 2..17
1361    lea                 dst16q, [dstq  +strideq*8]  ; first half of dst + 16*stride
1362    LOWPASS                  3,  2,  0, 6           ; m3 = 3-tap, low half
1363    pavgb                   m2, m0                  ; m2 = 2-tap, low half
1364%if cpuflag(ssse3)
1365    mova                    m4, [pb_1toE_2xF]       ; shuffle mask: advance one byte, repeat last
1366    pshufb                  m0, m5, m4
1367    pshufb                  m1, m0, m4
1368%else
1369    pand                    m4, m5, [pb_15x0_1xm1]  ; _______________F
1370    psrldq                  m0, m5, 1               ; 123456789ABCDEF_
1371    por                     m0, m4                  ; 123456789ABCDEFF
1372    psrldq                  m1, m0, 1               ; 23456789ABCDEFF_
1373    por                     m1, m4                  ; 23456789ABCDEFFF
1374%endif
1375    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16*stride
1376    LOWPASS                  1,  0,  5, 6           ; m1 = 3-tap, high half
1377    pavgb                   m0, m5                  ; m0 = 2-tap, high half
    ; turn m5 into the fill vector (last input byte in every position);
    ; pb_15 is defined outside this section, presumably index 15 everywhere
1378%if cpuflag(ssse3)
1379    pshufb                  m5, [pb_15]
1380%else
1381    punpckhbw               m5, m4, m4
1382    pshufhw                 m5, m5, q3333
1383    punpckhqdq              m5, m5
1384%endif
1385    mov                   cntd, 8                   ; 8 passes x (2 rows + 2 rows sixteen below)
1386
1387.loop:
; %%write row-offset-suffix, lo, hi
;   Stores lo|hi as one 32-byte row at dst+stride<suffix>, stores the same
;   data advanced by 8 bytes (padded with the low half of m5) at
;   dst16+stride<suffix>, then advances lo:hi by one byte using m4.
;   m6 is scratch on the non-AVX paths.
1388%macro %%write 3
1389    mova    [dstq+stride%1+ 0], %2
1390    mova    [dstq+stride%1+16], %3
1391    movhps  [dst16q+stride%1 ], %2
1392    movu  [dst16q+stride%1+ 8], %3
1393    movq  [dst16q+stride%1+24], m5
1394%if cpuflag(avx)
1395    palignr                 %2, %3, %2, 1
1396    pshufb                  %3, m4
1397%elif cpuflag(ssse3)
1398    palignr                 m6, %3, %2, 1
1399    pshufb                  %3, m4
1400    mova                    %2, m6
1401%else
1402    pslldq                  m6, %3, 15
1403    psrldq                  %3, 1
1404    psrldq                  %2, 1
1405    por                     %3, m4
1406    por                     %2, m6
1407%endif
1408%endmacro
1409
1410    %%write                q*0, m2, m0              ; even row: 2-tap averages
1411    %%write                q*1, m3, m1              ; odd row: 3-tap filter
1412    lea                   dstq, [dstq  +strideq*2]
1413    lea                 dst16q, [dst16q+strideq*2]
1414    dec                   cntd
1415    jg .loop
1416    RET
1417%endmacro
1418
; Instantiate the XMM "vl" predictors once per supported instruction set.
1419INIT_XMM sse2
1420VL_XMM_FUNCS
1421INIT_XMM ssse3
1422VL_XMM_FUNCS
1423INIT_XMM avx
1424VL_XMM_FUNCS
1425
1426; vr
1427
; VR_MMX_FUNCS: emits vp9_ipred_vr_4x4 for the instruction set chosen by the
; preceding INIT_MMX (instantiated below for mmxext and for ssse3).
;
; vp9_ipred_vr_4x4(dst, stride, l, a)
;   m0 = 2-tap rounded average of a[i-1] and a[i]  (row 0).
;   m2 = 3-tap LOWPASS over the edge vector made of l[1..3] followed by the
;        bytes starting at a-1; it supplies row 1 and the bytes shifted into
;        the left side of rows 2 and 3 (see the letter diagram below).
1428%macro VR_MMX_FUNCS 0
1429cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
1430    movq                    m1, [aq-1]
    ; only the upper dword of m2 is meaningful after this instruction; the
    ; lower dword is whatever m2 held before and is dropped by the PALIGNR
1431    punpckldq               m2, [lq]
1432    movd                    m0, [aq]
    ; l and a have been consumed; reuse a register for 3*stride
1433    DEFINE_ARGS dst, stride, stride3
1434    lea               stride3q, [strideq*3]
1435    pavgb                   m0, m1
1436    PALIGNR                 m1, m2, 5, m3           ; l[1..3], a[-1..3]
1437    psrlq                   m2, m1, 8               ; advanced by one byte
1438    psllq                   m3, m1, 8               ; delayed by one byte
1439    LOWPASS                  2,  1, 3, 4
1440
1441    ; ABCD <- for the following predictor:
1442    ; EFGH
1443    ; IABC  | m0 contains ABCDxxxx
1444    ; JEFG  | m2 contains xJIEFGHx
1445
1446%if cpuflag(ssse3)
    ; the two shuffle masks are defined outside this section
1447    punpckldq               m0, m2
1448    pshufb                  m2, [pb_13456_3xm1]
1449    movd      [dstq+strideq*0], m0
1450    pshufb                  m0, [pb_6012_4xm1]
1451    movd      [dstq+stride3q ], m2
1452    psrlq                   m2, 8
1453    movd      [dstq+strideq*2], m0
1454    movd      [dstq+strideq*1], m2
1455%else
    ; without pshufb: split m2 into the left-edge bytes (m1, moved to the
    ; top of the register) and the EFGH row, then shift one left-edge byte
    ; into each of rows 2 and 3 with PALIGNR
1456    psllq                   m1, m2, 40
1457    psrlq                   m2, 24
1458    movd      [dstq+strideq*0], m0
1459    movd      [dstq+strideq*1], m2
1460    PALIGNR                 m0, m1, 7, m3
1461    psllq                   m1, 8
1462    PALIGNR                 m2, m1, 7, m3
1463    movd      [dstq+strideq*2], m0
1464    movd      [dstq+stride3q ], m2
1465%endif
1466    RET
1467%endmacro
1468
; Instantiate the 4x4 routine once per supported instruction set.
1469INIT_MMX mmxext
1470VR_MMX_FUNCS
1471INIT_MMX ssse3
1472VR_MMX_FUNCS
1473
; VR_XMM_FUNCS n: emits the XMM "vr" predictors (8x8, 16x16, 32x32) for the
; instruction set chosen by the preceding INIT_XMM.  The single parameter is
; the XMM register count declared for the 16x16 routine.
1474%macro VR_XMM_FUNCS 1 ; n_xmm_regs for 16x16
; vp9_ipred_vr_8x8(dst, stride, l, a)
;   m0 = 2-tap rounded average of a[i-1] and a[i]  (row 0).
;   m1 = 3-tap LOWPASS over the edge vector made of l[1..7] followed by the
;        bytes starting at a-1; its high half is row 1 and its low half holds
;        the left-edge bytes that are shifted into the later rows (see the
;        letter diagram below).
;   Rows 2k and 2k+1 are rows 0 and 1 moved right by k pixels.
1475cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
1476    movu                    m1, [aq-1]
    ; only the high half of m2 is meaningful after this load; the low half
    ; is dropped by the PALIGNR below
1477    movhps                  m2, [lq]
1478    movq                    m0, [aq]
    ; l and a have been consumed; reuse a register for 3*stride
1479    DEFINE_ARGS dst, stride, stride3
1480    lea               stride3q, [strideq*3]
1481    pavgb                   m0, m1
1482    PALIGNR                 m1, m2, 9, m3           ; l[1..7], a[-1..7]
1483    pslldq                  m2, m1, 1               ; delayed by one byte
1484    pslldq                  m3, m1, 2               ; delayed by two bytes
1485    LOWPASS                  1,  2, 3, 4
1486
1487    ; ABCDEFGH <- for the following predictor:
1488    ; IJKLMNOP
1489    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
1490    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
1491    ; SQABCDEF
1492    ; TRIJKLMN
1493    ; USQABCDE
1494    ; VTRIJKLM
1495
1496%if cpuflag(ssse3)
1497    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
1498%endif
1499    movq      [dstq+strideq*0], m0                  ; row 0
1500    movhps    [dstq+strideq*1], m1                  ; row 1
    ; split the left-edge bytes: every second one goes in front of the
    ; ABCDEFGH row, the others in front of the IJKLMNOP row
1501%if cpuflag(ssse3)
1502    pshufb                  m0, [pb_6xm1_BDF_0to6]  ; xxxxxxUSQABCDEFG
1503    pshufb                  m1, [pb_6xm1_246_8toE]  ; xxxxxxVTRIJKLMNO
1504%else
1505    psrlw                   m2, m1, 8               ; x_U_S_Q_xxxxxxxx
1506    pand                    m3, m1, [pw_255]        ; x_V_T_R_xxxxxxxx
1507    packuswb                m3, m2                  ; xVTRxxxxxUSQxxxx
1508    pslldq                  m3, 4                   ; xxxxxVTRxxxxxUSQ
1509    PALIGNR                 m0, m3, 7, m4           ; xxxxxxUSQABCDEFG
1510    psrldq                  m1, 8
1511    pslldq                  m3, 8
1512    PALIGNR                 m1, m3, 7, m4           ; xxxxxxVTRIJKLMNO
1513%endif
1514    movhps    [dstq+strideq*2], m0                  ; row 2
1515    movhps    [dstq+stride3q ], m1                  ; row 3
1516    lea                   dstq, [dstq+strideq*4]    ; rebase to row 4
1517    pslldq                  m0, 1
1518    pslldq                  m1, 1
1519    movhps    [dstq+strideq*0], m0                  ; row 4
1520    movhps    [dstq+strideq*1], m1                  ; row 5
1521    pslldq                  m0, 1
1522    pslldq                  m1, 1
1523    movhps    [dstq+strideq*2], m0                  ; row 6
1524    movhps    [dstq+stride3q ], m1                  ; row 7
1525    RET
1526
; vp9_ipred_vr_16x16(dst, stride, l, a)
;   XMM register count comes from the enclosing macro's parameter (%1): the
;   instantiations in this file pass 7 for sse2 and 6 for ssse3/avx.
;   m0 = 2-tap rounded average of a[i-1] and a[i]            (even rows)
;   m3 = 3-tap LOWPASS of a[i-2], a[i-1], a[i]               (odd rows)
;   m1 = 3-tap LOWPASS of the left edge, regrouped so that even-indexed bytes
;        sit in the low half and odd-indexed bytes in the high half.
;   Each loop pass stores four rows: m0, m3, then both moved right by one
;   pixel with one filtered left-edge byte shifted in; m0/m3 are then moved
;   right by two pixels for the next pass.
1527cglobal vp9_ipred_vr_16x16, 4, 4, %1, dst, stride, l, a
1528    mova                    m0, [aq]
1529    movu                    m1, [aq-1]
1530    mova                    m2, [lq]
    ; l and a have been consumed; their registers become stride3 and cnt
1531    DEFINE_ARGS dst, stride, stride3, cnt
1532    lea               stride3q, [strideq*3]
1533    PALIGNR                 m3, m1, m2, 15, m6      ; l[15], a[-1..13]
1534    LOWPASS                  3,  1,  0,  4
1535    pavgb                   m0, m1
1536    PALIGNR                 m1, m2,  1, m6          ; l[1..15], a[-1]
1537    pslldq                  m4, m2,  1              ; 0, l[0..14]
1538    LOWPASS                  1,  2,  4,  5          ; filtered left edge
    ; regroup the filtered left edge: even bytes low, odd bytes high
1539%if cpuflag(ssse3)
1540    pshufb                  m1, [pb_02468ACE_13579BDF]
1541%else
1542    psrlw                   m5, m1, 8
1543    pand                    m1, [pw_255]
1544    packuswb                m1, m5
1545%endif
1546    mov                   cntd, 4                   ; 4 passes x 4 rows
1547
1548.loop:
1549    movlhps                 m2, m1                  ; high half of m2 = low half of m1
1550    mova      [dstq+strideq*0], m0
1551    mova      [dstq+strideq*1], m3
1552    PALIGNR                 m4, m0, m1, 15, m6      ; top byte of m1 in front of m0
1553    PALIGNR                 m5, m3, m2, 15, m6      ; top byte of m2 in front of m3
1554    mova      [dstq+strideq*2], m4
1555    mova      [dstq+stride3q ], m5
1556    lea                   dstq, [dstq+strideq*4]
1557    PALIGNR                 m0, m1, 14, m6          ; same shift by two bytes for the next pass
1558    PALIGNR                 m3, m2, 14, m6
1559    pslldq                  m1, 2                   ; consume two left-edge bytes per half
1560    dec                   cntd
1561    jg .loop
1562    RET
1563
; vp9_ipred_vr_32x32(dst, stride, l, a)
;   32-pixel-wide counterpart of the 16x16 routine in this macro.  After the
;   setup (see the comment at .loop):
;     even rows come from m0|m3 (2-tap averages of the top edge),
;     odd rows come from m5|m4 (3-tap LOWPASS of the top edge),
;     m1/m2 hold the filtered left-edge bytes shifted in from the left.
;   Nine XMM registers are declared; on x86-32 the ninth is replaced by a
;   temporary spill into the first 16 bytes of dst.
1564cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
1565    mova                    m0, [aq]
1566    mova                    m2, [aq+16]
1567    movu                    m1, [aq-1]
1568    PALIGNR                 m3, m2, m0, 15, m6      ; a[15..30]
1569    PALIGNR                 m4, m2, m0, 14, m6      ; a[14..29]
1570    LOWPASS                  4,  3,  2,  5          ; 3-tap, right half
1571    pavgb                   m3, m2                  ; 2-tap, right half
1572    mova                    m2, [lq+16]
1573    PALIGNR                 m5, m1, m2, 15, m6      ; l[31], a[-1..13]
1574    LOWPASS                  5,  1,  0,  6          ; 3-tap, left half
1575    pavgb                   m0, m1                  ; 2-tap, left half
1576    mova                    m6, [lq]
    ; m0 must survive the left-edge filtering below, which needs a scratch
    ; register.  x86-64: rename it to m8 (SWAP exchanges the register names).
    ; x86-32: park it in dst memory; the loop overwrites that location later.
1577%if ARCH_X86_64
1578    SWAP                     0, 8
1579%else
1580    mova                [dstq], m0
1581%endif
1582    PALIGNR                 m1, m2,  1, m0
1583    PALIGNR                 m7, m2, m6, 15, m0
1584    LOWPASS                  1,  2,  7,  0          ; filtered l[16..31]
1585    PALIGNR                 m2, m6,  1, m0
1586    pslldq                  m7, m6,  1
1587    LOWPASS                  2,  6,  7,  0          ; filtered l[0..15]
    ; regroup each filtered left-edge vector: even bytes low, odd bytes high
1588%if cpuflag(ssse3)
1589    pshufb                  m1, [pb_02468ACE_13579BDF]
1590    pshufb                  m2, [pb_02468ACE_13579BDF]
1591%else
1592    psrlw                   m0, m1, 8
1593    psrlw                   m6, m2, 8
1594    pand                    m1, [pw_255]
1595    pand                    m2, [pw_255]
1596    packuswb                m1, m0
1597    packuswb                m2, m6
1598%endif
    ; l and a have been consumed; their registers become dst16 and cnt
1599    DEFINE_ARGS dst, stride, dst16, cnt
1600    lea                 dst16q, [dstq  +strideq*8]
1601    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16*stride
    ; x86util macro; expected to leave the combined low halves in m2 and the
    ; combined high halves in m1, matching the register roles noted at .loop
1602    SBUTTERFLY             qdq,  2,  1,  6
    ; bring the parked 2-tap left half back into m0
1603%if ARCH_X86_64
1604    SWAP                     0, 8
1605%else
1606    mova                    m0, [dstq]
1607%endif
1608    mov                   cntd, 8                   ; 8 passes x (2 rows + 2 rows sixteen below)
1609
1610.loop:
1611    ; even lines (0, 2, 4, ...): m1 | m0, m3
1612    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
; %%write row-offset-suffix, left, mid, right
;   Stores mid|right as one 32-byte row at dst+stride<suffix>, stores
;   left(high half)|mid|right(low half) as the row at dst16+stride<suffix>,
;   then shifts the left:mid:right chain up by one byte (m6 is scratch).
1613%macro %%write 4
1614    mova    [dstq+stride%1+ 0], %3
1615    mova    [dstq+stride%1+16], %4
1616    movhps  [dst16q+stride%1 ], %2
1617    movu  [dst16q+stride%1+ 8], %3
1618    movq  [dst16q+stride%1+24], %4
1619    PALIGNR                 %4, %3, 15, m6
1620    PALIGNR                 %3, %2, 15, m6
1621    pslldq                  %2,  1
1622%endmacro
1623
1624    %%write                q*0, m1, m0, m3
1625    %%write                q*1, m2, m5, m4
1626    lea                   dstq, [dstq  +strideq*2]
1627    lea                 dst16q, [dst16q+strideq*2]
1628    dec                   cntd
1629    jg .loop
1630    RET
1631%endmacro
1632
; Instantiate the XMM "vr" predictors; the argument is the XMM register count
; for the 16x16 routine (the sse2 build needs one more than ssse3/avx).
1633INIT_XMM sse2
1634VR_XMM_FUNCS 7
1635INIT_XMM ssse3
1636VR_XMM_FUNCS 6
1637INIT_XMM avx
1638VR_XMM_FUNCS 6
1639
1640; hd
1641
; vp9_ipred_hd_4x4(dst, stride, l, a)   (mmxext only)
;   Builds the edge vector l[0..3], a[-1..2] and derives
;     m1 = 2-tap rounded average of edge bytes i and i+1,
;     m2 = 3-tap LOWPASS of edge bytes i, i+1, i+2.
;   The two are byte-interleaved and the rows are stored bottom-up, each row
;   starting two bytes further into the interleaved data (see the letter
;   diagram below).
1642INIT_MMX mmxext
1643cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
1644    movd                    m0, [lq]
1645    punpckldq               m0, [aq-1]              ; m0 = l[0..3], a[-1..2]
    ; l and a have been consumed; reuse a register for 3*stride
1646    DEFINE_ARGS dst, stride, stride3
1647    lea               stride3q, [strideq*3]
1648    psrlq                   m1, m0, 8               ; edge advanced by one byte
1649    psrlq                   m2, m1, 8               ; edge advanced by two bytes
1650    LOWPASS                  2,  1, 0,  3
1651    pavgb                   m1, m0
1652
1653    ; DHIJ <- for the following predictor:
1654    ; CGDH
1655    ; BFCG  | m1 contains ABCDxxxx
1656    ; AEBF  | m2 contains EFGHIJxx
1657
1658    punpcklbw               m1, m2
1659    punpckhdq               m0, m1, m2
1660
1661    ; m1 contains AEBFCGDH
1662    ; m0 contains CGDHIJxx
1663
1664    movd      [dstq+stride3q ], m1                  ; row 3: AEBF
1665    movd      [dstq+strideq*1], m0                  ; row 1: CGDH
1666    psrlq                   m1, 16
1667    psrlq                   m0, 16
1668    movd      [dstq+strideq*2], m1                  ; row 2: BFCG
1669    movd      [dstq+strideq*0], m0                  ; row 0: DHIJ
1670    RET
1671
; HD_XMM_FUNCS: emits the XMM "hd" predictors (8x8, 16x16, 32x32) for the
; instruction set chosen by the preceding INIT_XMM.
1672%macro HD_XMM_FUNCS 0
; vp9_ipred_hd_8x8(dst, stride, l, a)
;   Builds the edge vector l[0..7], a[-1..6] and derives
;     m1 = 2-tap rounded average of edge bytes i and i+1,
;     m2 = 3-tap LOWPASS of edge bytes i, i+1, i+2.
;   The low halves are byte-interleaved into m1; rows are stored bottom-up,
;   each row starting two bytes further into the m2:m1 pair (see the letter
;   diagram below).  dst4 addresses rows 4..7.
1673cglobal vp9_ipred_hd_8x8, 4, 4, 5, dst, stride, l, a
1674    movq                    m0, [lq]
1675    movhps                  m0, [aq-1]              ; m0 = l[0..7], a[-1..6]
    ; l and a have been consumed; their registers become stride3 and dst4
1676    DEFINE_ARGS dst, stride, stride3, dst4
1677    lea               stride3q, [strideq*3]
1678    lea                  dst4q, [dstq+strideq*4]
1679    psrldq                  m1, m0, 1               ; edge advanced by one byte
1680    psrldq                  m2, m1, 1               ; edge advanced by two bytes
1681    LOWPASS                  2,  1,  0,  3
1682    pavgb                   m1, m0
1683
1684    ; HPQRSTUV <- for the following predictor
1685    ; GOHPQRST
1686    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
1687    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
1688    ; DLEMFNGO
1689    ; CKDLEMFN
1690    ; BJCKDLEM
1691    ; AIBJCKDL
1692
1693    punpcklbw               m1, m2
1694    movhlps                 m2, m2
1695
1696    ; m1 contains AIBJCKDLEMFNGOHP
1697    ; m2 contains QRSTUVxxxxxxxxxx
1698
1699    movhps   [dstq +stride3q ], m1                  ; row 3
1700    movq     [dst4q+stride3q ], m1                  ; row 7
1701    PALIGNR                 m3, m2, m1, 2, m4
1702    movhps   [dstq +strideq*2], m3                  ; row 2
1703    movq     [dst4q+strideq*2], m3                  ; row 6
1704    PALIGNR                 m3, m2, m1, 4, m4
1705    movhps   [dstq +strideq*1], m3                  ; row 1
1706    movq     [dst4q+strideq*1], m3                  ; row 5
1707    PALIGNR                 m2, m1, 6, m4
1708    movhps   [dstq +strideq*0], m2                  ; row 0
1709    movq     [dst4q+strideq*0], m2                  ; row 4
1710    RET
1711
; vp9_ipred_hd_16x16(dst, stride, l, a)
;   m5 = 3-tap LOWPASS of the bytes starting at a-1 (top part).
;   m1/m2 = 2-tap average and 3-tap LOWPASS of the l..a edge, byte-interleaved
;   by SBUTTERFLY (x86util macro; expected to leave the low interleave in m1
;   and the high interleave in m2).
;   The loop has no separate counter: stride4 starts at 4*stride and is
;   reduced by stride at the top of each pass, so it serves both as the row
;   offset (3, 2, 1, 0 rows) within each group of four rows and as the loop
;   condition.  Each pass stores one row in each of the four groups
;   (dst, dst4, dst8, dst12) and then shifts the m5:m2:m1 chain down by two
;   bytes.  Needs a positive stride for the jg test to terminate as intended.
1712cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
1713    mova                    m0, [lq]
1714    movu                    m3, [aq-1]
    ; l and a have been consumed; six GPRs are now dst, stride, stride4 and
    ; the three extra row-group pointers
1715    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
1716    lea               stride4q, [strideq*4]
1717    lea                  dst4q, [dstq +stride4q]
1718    lea                  dst8q, [dst4q+stride4q]
1719    lea                 dst12q, [dst8q+stride4q]
1720    psrldq                  m4, m3,  1
1721    psrldq                  m5, m3,  2
1722    LOWPASS                  5,  4,  3,  6          ; top part, 3-tap
1723    PALIGNR                 m1, m3, m0,  1, m6      ; edge advanced by one byte
1724    PALIGNR                 m2, m3, m0,  2, m6      ; edge advanced by two bytes
1725    LOWPASS                  2,  1,  0,  6          ; left part, 3-tap
1726    pavgb                   m1, m0                  ; left part, 2-tap
1727    SBUTTERFLY              bw,  1,  2,  6
1728
1729    ; I PROBABLY INVERTED L0 and L16 here
1730    ; m1, m2, m5
1731.loop:
    ; sets the flags consumed by the jg at the bottom; none of the SIMD
    ; instructions in between modify EFLAGS
1732    sub               stride4q, strideq
1733    movhps [dstq +stride4q +0], m2
1734    movq   [dstq +stride4q +8], m5
1735    mova   [dst4q+stride4q   ], m2
1736    movhps [dst8q+stride4q +0], m1
1737    movq   [dst8q+stride4q +8], m2
1738    mova  [dst12q+stride4q   ], m1
    ; shift the m5:m2:m1 chain down by two bytes
1739%if cpuflag(avx)
1740    palignr                 m1, m2, m1, 2
1741    palignr                 m2, m5, m2, 2
1742%elif cpuflag(ssse3)
1743    palignr                 m3, m2, m1, 2
1744    palignr                 m0, m5, m2, 2
1745    mova                    m1, m3
1746    mova                    m2, m0
1747%else
1748    ; slightly modified version of PALIGNR
1749    mova                    m6, m2
1750    mova                    m4, m5
1751    pslldq                  m6, 14
1752    pslldq                  m4, 14
1753    psrldq                  m1, 2
1754    psrldq                  m2, 2
1755    por                     m1, m6
1756    por                     m2, m4
1757%endif
1758    psrldq                  m5, 2
1759    jg .loop                                        ; flags still come from the sub above
1760    RET
1761
; vp9_ipred_hd_32x32(dst, stride, l, a)
;   32-row counterpart of the 16x16 routine in this macro.
;   m4, m5 = 3-tap LOWPASS of the top edge (bytes starting at a-1).
;   m0..m3 = 2-tap averages and 3-tap LOWPASS of the left edge, byte
;   interleaved by the two SBUTTERFLY calls (x86util macro).
;   stride8 starts at 8*stride and is reduced by stride at the top of each
;   pass, serving as both row offset within each 8-row group and loop
;   condition.  Each pass stores one 32-byte row in each of the four groups
;   (dst, dst8, dst16, dst24) and shifts the m5:m4:m3:m2:m1:m0 chain down by
;   two bytes.  Needs a positive stride for the jg test to terminate as
;   intended.
1762cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
1763    mova                    m0, [lq]
1764    mova                    m1, [lq+16]
1765    movu                    m2, [aq-1]
1766    movu                    m3, [aq+15]
    ; l and a have been consumed; six GPRs are now dst, stride, stride8 and
    ; the three extra row-group pointers
1767    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
1768    lea               stride8q, [strideq*8]
1769    lea                  dst8q, [dstq  +stride8q]
1770    lea                 dst16q, [dst8q +stride8q]
1771    lea                 dst24q, [dst16q+stride8q]
    ; filter from the top of the edge downwards so that each unfiltered
    ; neighbour is still available when it is needed
1772    psrldq                  m4, m3,  1
1773    psrldq                  m5, m3,  2
1774    LOWPASS                  5,  4,  3,  6
1775    PALIGNR                 m4, m3, m2,  2, m6
1776    PALIGNR                 m3, m2,  1, m6
1777    LOWPASS                  4,  3,  2,  6
1778    PALIGNR                 m3, m2, m1,  2, m6
1779    PALIGNR                 m2, m1,  1, m6
1780    LOWPASS                  3,  2,  1,  6          ; 3-tap, upper left half
1781    pavgb                   m2, m1                  ; 2-tap, upper left half
1782    PALIGNR                 m6, m1, m0,  1, m7
1783    PALIGNR                 m1, m0,  2, m7
1784    LOWPASS                  1,  6,  0,  7          ; 3-tap, lower left half
1785    pavgb                   m0, m6                  ; 2-tap, lower left half
1786    SBUTTERFLY              bw,  2,  3,  6
1787    SBUTTERFLY              bw,  0,  1,  6
1788
1789    ; m0, m1, m2, m3, m4, m5
1790.loop:
    ; sets the flags consumed by the jg at the bottom; none of the SIMD
    ; instructions in between modify EFLAGS
1791    sub               stride8q, strideq
1792    mova  [dstq  +stride8q+ 0], m3
1793    mova  [dstq  +stride8q+16], m4
1794    mova  [dst8q +stride8q+ 0], m2
1795    mova  [dst8q +stride8q+16], m3
1796    mova  [dst16q+stride8q+ 0], m1
1797    mova  [dst16q+stride8q+16], m2
1798    mova  [dst24q+stride8q+ 0], m0
1799    mova  [dst24q+stride8q+16], m1
    ; shift the six-register chain down by two bytes
1800%if cpuflag(avx)
1801    palignr                 m0, m1, m0, 2
1802    palignr                 m1, m2, m1, 2
1803    palignr                 m2, m3, m2, 2
1804    palignr                 m3, m4, m3, 2
1805    palignr                 m4, m5, m4, 2
1806    psrldq                  m5, 2
1807%elif cpuflag(ssse3)
    ; two-operand palignr overwrites its high input, so each result lands
    ; one register too high and is moved down afterwards
1808    psrldq                  m6, m5, 2
1809    palignr                 m5, m4, 2
1810    palignr                 m4, m3, 2
1811    palignr                 m3, m2, 2
1812    palignr                 m2, m1, 2
1813    palignr                 m1, m0, 2
1814    mova                    m0, m1
1815    mova                    m1, m2
1816    mova                    m2, m3
1817    mova                    m3, m4
1818    mova                    m4, m5
1819    mova                    m5, m6
1820%else
1821    ; sort of a half-integrated version of PALIGNR
1822    pslldq                  m7, m4, 14
1823    pslldq                  m6, m5, 14
1824    psrldq                  m4, 2
1825    psrldq                  m5, 2
1826    por                     m4, m6
1827    pslldq                  m6, m3, 14
1828    psrldq                  m3, 2
1829    por                     m3, m7
1830    pslldq                  m7, m2, 14
1831    psrldq                  m2, 2
1832    por                     m2, m6
1833    pslldq                  m6, m1, 14
1834    psrldq                  m1, 2
1835    por                     m1, m7
1836    psrldq                  m0, 2
1837    por                     m0, m6
1838%endif
1839    jg .loop                                        ; flags still come from the sub above
1840    RET
1841%endmacro
1842
; Instantiate the XMM "hd" predictors once per supported instruction set.
1843INIT_XMM sse2
1844HD_XMM_FUNCS
1845INIT_XMM ssse3
1846HD_XMM_FUNCS
1847INIT_XMM avx
1848HD_XMM_FUNCS
1849
;------------------------------------------------------------------------------
; HU_MMX_FUNCS: defines vp9_ipred_hu_4x4 for the instruction set that is
; active when the macro is expanded.
;
; vp9_ipred_hu_4x4(dst, stride, l)
;   cglobal declares 3 arguments, 3 general purpose registers and no XMM
;   registers.  The function reads 4 bytes from l and writes four rows of
;   4 bytes each to dst, dst+stride, dst+stride*2 and dst+stride*3.
;------------------------------------------------------------------------------
1850%macro HU_MMX_FUNCS 0
1851cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
1852    movd                    m0, [lq]
    ; Pad the 4 loaded bytes by repeating the last one, so that the shifted
    ; copies made below keep seeing that byte beyond the end of the input.
1853%if cpuflag(ssse3)
    ; NOTE(review): pb_0to2_5x3 is defined outside this chunk; presumably it
    ; produces the same 01233333 layout as the fallback path - confirm.
1854    pshufb                  m0, [pb_0to2_5x3]
1855%else
1856    punpcklbw               m1, m0, m0          ; 00112233
1857    pshufw                  m1, m1, q3333       ; 33333333
1858    punpckldq               m0, m1              ; 01233333
1859%endif
    ; m1 = input advanced by one byte, m2 = input advanced by two bytes.
1860    psrlq                   m1, m0, 8
1861    psrlq                   m2, m1, 8
    ; NOTE(review): LOWPASS is defined outside this chunk; presumably it
    ; leaves the 3-tap filter (a + 2*b + c + 2) >> 2 of m2/m1/m0 in m2 and
    ; uses m3 as scratch - confirm against its definition.
1862    LOWPASS                  2,  1, 0, 3
    ; m1 = rounded average of every input byte and its successor.
1863    pavgb                   m1, m0
    ; l is not read again: its register is renamed to stride3.
1864    DEFINE_ARGS dst, stride, stride3
1865    lea               stride3q, [strideq*3]
    ; Byte-interleave the averaged (m1) and filtered (m2) values.  With the
    ; x86util SBUTTERFLY macro, m1 ends up with the low half of the
    ; interleaved sequence and m2 with the high half; m0 is its scratch.
1866    SBUTTERFLY              bw,  1, 2, 0
    ; m2 = interleaved sequence advanced by 2 bytes (m0 is the scratch
    ; register handed to the PALIGNR macro).
1867    PALIGNR                 m2, m1, 2, m0
    ; Rows 0 and 1 are the low dwords of m1 and m2 ...
1868    movd      [dstq+strideq*0], m1
1869    movd      [dstq+strideq*1], m2
    ; ... rows 2 and 3 are their high dwords, i.e. 4 bytes further along the
    ; interleaved sequence.
1870    punpckhdq               m1, m1
1871    punpckhdq               m2, m2
1872    movd      [dstq+strideq*2], m1
1873    movd      [dstq+stride3q ], m2
1874    RET
1875%endmacro
1876
; Expand HU_MMX_FUNCS twice: under mmxext the macro's cpuflag(ssse3) check
; fails and the unpack/pshufw padding path is assembled, under ssse3 the
; pshufb path is assembled.  Both expansions are set up through INIT_MMX.
1877INIT_MMX mmxext
1878HU_MMX_FUNCS
1879INIT_MMX ssse3
1880HU_MMX_FUNCS
1881
1882%macro HU_XMM_FUNCS 1 ; n_xmm_regs in hu_32x32
;------------------------------------------------------------------------------
; vp9_ipred_hu_8x8(dst, stride, l)
;   cglobal declares 3 arguments, 4 general purpose registers and 4 XMM
;   registers.  The function reads 8 bytes from l and writes eight rows of
;   8 bytes each, starting at dst and spaced stride bytes apart.
;------------------------------------------------------------------------------
1883cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
1884    movq                    m0, [lq]
    ; Extend the 8 loaded bytes to 16 by repeating the last one.
1885%if cpuflag(ssse3)
    ; pb_0to6_9x7 selects bytes 0..6 followed by nine copies of byte 7.
1886    pshufb                  m0, [pb_0to6_9x7]
1887%else
1888    punpcklbw               m1, m0, m0          ; 0011223344556677
1889    punpckhwd               m1, m1              ; 4444555566667777
1890    shufps                  m0, m1, q3310       ; 0123456777777777
1891%endif
    ; m1 = input advanced by one byte, m2 = input advanced by two bytes.
1892    psrldq                  m1, m0, 1
1893    psrldq                  m2, m1, 1
    ; NOTE(review): LOWPASS is defined outside this chunk; presumably it
    ; leaves the 3-tap filter (a + 2*b + c + 2) >> 2 of m2/m1/m0 in m2 and
    ; uses m3 as scratch - confirm against its definition.
1894    LOWPASS                  2,  1, 0, 3
    ; m1 = rounded average of every input byte and its successor.
1895    pavgb                   m1, m0
    ; l is not read again: its register becomes stride3, and the fourth
    ; register is named dst4 (pointer to row 4).
1896    DEFINE_ARGS dst, stride, stride3, dst4
1897    lea               stride3q, [strideq*3]
1898    lea                  dst4q, [dstq+strideq*4]
    ; Byte-interleave the averaged (m1) and filtered (m2) values.  With the
    ; x86util SBUTTERFLY macro, m1 ends up with the low 16 bytes of the
    ; interleaved sequence and m2 with the high 16 bytes; m0 is its scratch.
1899    SBUTTERFLY              bw,  1, 2, 0
    ; Row k is the interleaved sequence advanced by 2*k bytes.  The low qword
    ; of each shifted register is row k, the high qword (8 bytes further
    ; along) is row k+4, stored through dst4.
1900    movq     [dstq +strideq*0], m1
1901    movhps   [dst4q+strideq*0], m1
    ; m0 = (m2:m1) shifted right by 2 bytes; m3 is the scratch register
    ; handed to the PALIGNR macro.
1902    PALIGNR                 m0, m2, m1, 2, m3
1903    movq     [dstq +strideq*1], m0
1904    movhps   [dst4q+strideq*1], m0
1905    PALIGNR                 m0, m2, m1, 4, m3
1906    movq     [dstq +strideq*2], m0
1907    movhps   [dst4q+strideq*2], m0
    ; Last shift may overwrite m2 in place, it is not needed afterwards.
1908    PALIGNR                 m2, m1, 6, m3
1909    movq     [dstq +stride3q ], m2
1910    movhps   [dst4q+stride3q ], m2
1911    RET
1912
;------------------------------------------------------------------------------
; vp9_ipred_hu_16x16(dst, stride, l)
;   cglobal declares 3 arguments, 4 general purpose registers and 5 XMM
;   registers.  The function reads 16 bytes from l and writes sixteen rows of
;   16 bytes each, starting at dst and spaced stride bytes apart.
;   The load from l and all stores use mova (the aligned move), so l and
;   every destination row are expected to be 16-byte aligned.
;------------------------------------------------------------------------------
1913cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
1914    mova                    m0, [lq]
    ; Build m1 = input advanced by one byte and m2 = input advanced by two
    ; bytes, with the vacated top positions refilled with the last input byte.
1915%if cpuflag(ssse3)
    ; pb_1toE_2xF selects bytes 1..15 followed by byte 15 again.
    ; NOTE(review): pb_2toE_3xF is defined outside this chunk; presumably it
    ; selects bytes 2..14 followed by three copies of byte 15 - confirm.
    ; m3 keeps that mask because the loop reuses it.
1916    mova                    m3, [pb_2toE_3xF]
1917    pshufb                  m1, m0, [pb_1toE_2xF]
1918    pshufb                  m2, m0, m3
1919%else
    ; NOTE(review): pb_15x0_1xm1 is defined outside this chunk; presumably
    ; 15 zero bytes followed by 0xff, so m3 keeps only the last input byte.
1920    pand                    m3, m0, [pb_15x0_1xm1]
1921    psrldq                  m1, m0, 1
1922    por                     m1, m3
    ; Duplicate that byte into the top two positions; m3 stays live and is
    ; reused by the loop as the refill value.
1923    punpckhbw               m3, m3
1924    psrldq                  m2, m0, 2
1925    por                     m2, m3
1926%endif
    ; NOTE(review): LOWPASS is defined outside this chunk; presumably it
    ; leaves the 3-tap filter (a + 2*b + c + 2) >> 2 of m2/m1/m0 in m2 and
    ; uses m4 as scratch - confirm against its definition.
1927    LOWPASS                  2,  1,  0,  4
    ; m1 = rounded average of every input byte and its successor.
1928    pavgb                   m1, m0
    ; l is not read again: its register becomes stride9, the fourth register
    ; is the loop counter.
1929    DEFINE_ARGS dst, stride, stride9, cnt
1930    lea                stride9q, [strideq*8+strideq]
    ; 4 iterations, each writing rows r, r+1, r+8 and r+9.
1931    mov                   cntd,  4
    ; Byte-interleave the averaged (m1) and filtered (m2) values.  With the
    ; x86util SBUTTERFLY macro, m1 ends up with the low 16 bytes of the
    ; interleaved sequence and m2 with the high 16 bytes; m0 is its scratch.
1932    SBUTTERFLY              bw,  1,  2,  0
1933
1934.loop:
    ; m1 is the current row of the upper half, m2 the row 8 lines below it
    ; (16 bytes further along the interleaved sequence).
1935    mova      [dstq+strideq*0], m1
1936    mova      [dstq+strideq*8], m2
    ; m0 = (m2:m1) shifted right by 2 bytes = next row of the upper half
    ; (m4 is the scratch register handed to the PALIGNR macro).
1937    PALIGNR                 m0, m2, m1, 2, m4
    ; Advance m2 by 2 bytes as well, refilling its top two bytes from m3.
1938%if cpuflag(ssse3)
1939    pshufb                  m2, m3
1940%else
1941    psrldq                  m2, 2
1942    por                     m2, m3
1943%endif
1944    mova      [dstq+strideq*1], m0
1945    mova      [dstq+stride9q ], m2
    ; Same step again to prepare m1/m2 for the next iteration.
1946    PALIGNR                 m1, m2, m0, 2, m4
1947%if cpuflag(ssse3)
1948    pshufb                  m2, m3
1949%else
1950    psrldq                  m2, 2
1951    por                     m2, m3
1952%endif
    ; lea does not modify flags; the jg below tests the result of dec.
1953    lea                   dstq, [dstq+strideq*2]
1954    dec                   cntd
1955    jg .loop
1956    RET
1957
;------------------------------------------------------------------------------
; vp9_ipred_hu_32x32(dst, stride, l)
;   cglobal declares 3 arguments and 7 general purpose registers; the XMM
;   register count is the enclosing macro's parameter %1 (the non-SSSE3 loop
;   path below additionally uses m7, the other paths stay within m0-m6).
;   The function reads 32 bytes from l and writes thirty-two rows of 32 bytes
;   each, starting at dst and spaced stride bytes apart.
;   The loads from l and all stores use mova (the aligned move), so l and
;   every destination row are expected to be 16-byte aligned.
;------------------------------------------------------------------------------
1958cglobal vp9_ipred_hu_32x32, 3, 7, %1, dst, stride, l
    ; m1 = l[0..15], m0 = l[16..31].
1959    mova                    m1, [lq]
1960    mova                    m0, [lq+16]
    ; First half: m2 = l[1..16], m3 = l[2..17], taken across the m0:m1 pair
    ; (m5 is the scratch register handed to the PALIGNR macro).
1961    PALIGNR                 m2, m0, m1,  1, m5
1962    PALIGNR                 m3, m0, m1,  2, m5
    ; NOTE(review): LOWPASS is defined outside this chunk; presumably it
    ; leaves the 3-tap filter (a + 2*b + c + 2) >> 2 of m3/m2/m1 in m3 and
    ; uses m5 as scratch - confirm against its definition.
1963    LOWPASS                  3,  2,  1,  5
    ; m2 = rounded average of l[i] and l[i+1] for the first 16 bytes.
1964    pavgb                   m2, m1
    ; Second half: there is no data past l[31], so the copies advanced by one
    ; byte (m5) and by two bytes (m1) are refilled with the last input byte.
1965%if cpuflag(ssse3)
    ; pb_1toE_2xF selects bytes 1..15 followed by byte 15 again.
    ; NOTE(review): pb_2toE_3xF is defined outside this chunk; presumably it
    ; selects bytes 2..14 followed by three copies of byte 15 - confirm.
    ; m4 keeps that mask because the loop reuses it.
1966    mova                    m4, [pb_2toE_3xF]
1967    pshufb                  m5, m0, [pb_1toE_2xF]
1968    pshufb                  m1, m0, m4
1969%else
    ; NOTE(review): pb_15x0_1xm1 is defined outside this chunk; presumably
    ; 15 zero bytes followed by 0xff, so m4 keeps only the last input byte.
1970    pand                    m4, m0, [pb_15x0_1xm1]
1971    psrldq                  m5, m0, 1
1972    por                     m5, m4
    ; Duplicate that byte into the top two positions; m4 stays live and is
    ; reused by the loop as the refill value.
1973    punpckhbw               m4, m4
1974    psrldq                  m1, m0, 2
1975    por                     m1, m4
1976%endif
    ; Filtered values of the second half in m1 (m6 scratch, same caveat about
    ; LOWPASS as above), averaged values of the second half in m0.
1977    LOWPASS                  1,  5,  0,  6
1978    pavgb                   m0, m5
    ; l is not read again: its register becomes the loop counter.  stride0 is
    ; a running row offset that starts at 0; dst8/dst16/dst24 point at rows
    ; 8, 16 and 24.
1979    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
    ; 8 iterations, each writing one row in each of the four 8-row bands.
1980    mov                   cntd,  8
1981    xor               stride0q, stride0q
1982    lea                  dst8q, [dstq  +strideq*8]
1983    lea                 dst16q, [dst8q +strideq*8]
1984    lea                 dst24q, [dst16q+strideq*8]
    ; Byte-interleave averaged and filtered values of each half (x86util
    ; SBUTTERFLY, m5 scratch).  Afterwards the 64-byte interleaved sequence
    ; is held, in order, by m2, m3, m0, m1.
1985    SBUTTERFLY              bw,  0,  1,  5
1986    SBUTTERFLY              bw,  2,  3,  5
    ; m6 = constant right half of the rows in the last band.
1987%if cpuflag(ssse3)
    ; NOTE(review): pb_15 is defined outside this chunk; presumably a mask of
    ; sixteen 15s, which broadcasts the last byte of m1 - confirm.
1988    pshufb                  m6, m1, [pb_15]
1989%else
    ; Broadcast the top word of m4 (the duplicated last input byte) to the
    ; whole register.
1990    pshufhw                 m6, m4, q3333
1991    punpckhqdq              m6, m6
1992%endif
1993
1994.loop:
    ; Each 32-byte row is two consecutive 16-byte pieces of the sequence;
    ; a row 8 lines further down starts one register (16 bytes) later.
1995    mova  [dstq  +stride0q+ 0], m2
1996    mova  [dstq  +stride0q+16], m3
1997    mova  [dst8q +stride0q+ 0], m3
1998    mova  [dst8q +stride0q+16], m0
1999    mova  [dst16q+stride0q+ 0], m0
2000    mova  [dst16q+stride0q+16], m1
2001    mova  [dst24q+stride0q+ 0], m1
2002    mova  [dst24q+stride0q+16], m6
    ; Advance the whole m2:m3:m0:m1 sequence by 2 bytes for the next row,
    ; refilling the top of m1 through m4.
2003%if cpuflag(avx)
    ; 3-operand palignr: each register pulls its new top 2 bytes from the
    ; not yet updated successor.
2004    palignr                 m2, m3, m2, 2
2005    palignr                 m3, m0, m3, 2
2006    palignr                 m0, m1, m0, 2
2007    pshufb                  m1, m4
2008%elif cpuflag(ssse3)
    ; 2-operand palignr overwrites its first operand, so the shifts run from
    ; the top of the sequence downwards (m5 receives the new m1) and the
    ; results are then moved back into their usual registers.
2009    pshufb                  m5, m1, m4
2010    palignr                 m1, m0, 2
2011    palignr                 m0, m3, 2
2012    palignr                 m3, m2, 2
2013    mova                    m2, m3
2014    mova                    m3, m0
2015    mova                    m0, m1
2016    mova                    m1, m5
2017%else
2018    ; half-integrated version of PALIGNR
    ; pslldq by 14 moves the low 2 bytes of a register to its top, ready to
    ; be OR-ed into the predecessor after that one is shifted right by 2.
    ; m5 and m7 are the temporaries.
2019    pslldq                  m5, m1, 14
2020    pslldq                  m7, m0, 14
2021    psrldq                  m1, 2
2022    psrldq                  m0, 2
2023    por                     m1, m4
2024    por                     m0, m5
2025    pslldq                  m5, m3, 14
2026    psrldq                  m3, 2
2027    por                     m3, m7
2028    psrldq                  m2, 2
2029    por                     m2, m5
2030%endif
    ; Step the row offset; the jg below tests the result of dec.
2031    add               stride0q, strideq
2032    dec                   cntd
2033    jg .loop
2034    RET
2035%endmacro
2036
; Expand HU_XMM_FUNCS once per instruction set.  The argument is the XMM
; register count passed to cglobal for vp9_ipred_hu_32x32: the sse2 expansion
; requests 8 because its non-SSSE3 loop path uses m7 as an extra temporary,
; while the ssse3 and avx paths stay within m0-m6.
2037INIT_XMM sse2
2038HU_XMM_FUNCS 8
2039INIT_XMM ssse3
2040HU_XMM_FUNCS 7
2041INIT_XMM avx
2042HU_XMM_FUNCS 7
2043
2044; FIXME 127, 128, 129 ?
2045