1;******************************************************************************
2;* VP9 Intra prediction SIMD optimizations
3;*
4;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
5;*
6;* Parts based on:
7;* H.264 intra prediction asm optimizations
8;* Copyright (c) 2010 Fiona Glaser
9;* Copyright (c) 2010 Holger Lubitz
10;* Copyright (c) 2010 Loren Merritt
11;* Copyright (c) 2010 Ronald S. Bultje
12;*
13;* This file is part of FFmpeg.
14;*
15;* FFmpeg is free software; you can redistribute it and/or
16;* modify it under the terms of the GNU Lesser General Public
17;* License as published by the Free Software Foundation; either
18;* version 2.1 of the License, or (at your option) any later version.
19;*
20;* FFmpeg is distributed in the hope that it will be useful,
21;* but WITHOUT ANY WARRANTY; without even the implied warranty of
22;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23;* Lesser General Public License for more details.
24;*
25;* You should have received a copy of the GNU Lesser General Public
26;* License along with FFmpeg; if not, write to the Free Software
27;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28;******************************************************************************
29
30%include "libavutil/x86/x86util.asm"
31
SECTION_RODATA 32

; pmulhrsw computes (x * const + 0x4000) >> 15, so each pw_N constant
; implements a rounded right shift.  pw_m256/pw_m255 double as pshufb
; masks for the tm predictors (each word's high byte is 0xFF, which
; zeroes that lane; the low byte selects source byte 0 resp. 1).
pw_m256: times 16 dw -256
pw_m255: times 16 dw -255
pw_4096: times 8 dw 4096

; pshufb masks; each name spells out the byte pattern it selects
; (a -1 lane has bit 7 set, which makes pshufb write 0 to that byte).
pb_4x3_4x2_4x1_4x0: times 4 db 3
                    times 4 db 2
                    times 4 db 1
                    times 4 db 0
pb_8x1_8x0:   times 8 db 1
              times 8 db 0
pb_8x3_8x2:   times 8 db 3
              times 8 db 2
pb_0to5_2x7:  db 0, 1, 2, 3, 4, 5, 7, 7
              times 8 db -1
pb_0to6_9x7:  db 0, 1, 2, 3, 4, 5, 6
              times 9 db 7
pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
              times 10 db 7
; the two labels below alias the same bytes: the 3x7 (MMX) variant only
; uses the first 8 bytes of the 16-byte 11x7 pattern
pb_2to6_3x7:
pb_2to6_11x7: db 2, 3, 4, 5, 6
              times 11 db 7
pb_1toE_2xF:  db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
pb_2toE_3xF:  db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
pb_13456_3xm1: db 1, 3, 4, 5, 6
               times 3 db -1
pb_6012_4xm1: db 6, 0, 1, 2
              times 4 db -1
pb_6xm1_246_8toE: times 6 db -1
                  db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
pb_6xm1_BDF_0to6: times 6 db -1
                  db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
; pb_7to1_9x0 deliberately falls through into pb_3to1_5x0's data:
; 7,6,5,4 continues with 3,2,1 and the shared run of zeros, so both
; names read a descending sequence padded with zeros
pb_7to1_9x0:  db 7, 6, 5, 4
pb_3to1_5x0:  db 3, 2, 1
              times 9 db 0
pb_Fto0:      db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

pb_2:  times 32 db 2
pb_15: times 16 db 15

; shared constants provided by libavutil/x86/constants
cextern pb_1
cextern pb_3
cextern pw_512
cextern pw_1024
cextern pw_2048
cextern pw_8192
80
81SECTION .text
82
83; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
84
; dc_4x4: fill the block with the DC average of the 4 left and 4 top
; neighbour pixels, i.e. (sum + 4) >> 3 replicated to every pixel.
INIT_MMX ssse3
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]                ; 4 left pixels
    punpckldq               m0, [aq]                ; append 4 top pixels
    pxor                    m1, m1
    psadbw                  m0, m1                  ; horizontal sum of the 8 bytes
    pmulhrsw                m0, [pw_4096]           ; (sum * 4096 + 0x4000) >> 15 == (sum + 4) >> 3
    pshufb                  m0, m1                  ; broadcast DC byte to all lanes
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    RET
99
; dc_8x8: DC prediction from 8 left + 8 top pixels: (sum + 8) >> 4.
INIT_MMX ssse3
cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [lq]                ; 8 left pixels
    movq                    m1, [aq]                ; 8 top pixels
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2                  ; sum of left edge
    psadbw                  m1, m2                  ; sum of top edge
    paddw                   m0, m1
    pmulhrsw                m0, [pw_2048]           ; (sum + 8) >> 4
    pshufb                  m0, m2                  ; broadcast DC byte
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET
122
; dc_16x16: DC prediction from 16 left + 16 top pixels: (sum + 16) >> 5.
INIT_XMM ssse3
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2                  ; psadbw leaves one partial sum per qword
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0                  ; fold high qword's partial sum...
    paddw                   m0, m1                  ; ...into word 0
    pmulhrsw                m0, [pw_1024]           ; (sum + 16) >> 5
    pshufb                  m0, m2                  ; broadcast DC byte
    mov                   cntd, 4
.loop:                                              ; 4 rows per iteration
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
147
; dc_32x32: DC prediction from 32 left + 32 top pixels: (sum + 32) >> 6.
INIT_XMM ssse3
cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [lq+16]
    mova                    m2, [aq]
    mova                    m3, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m4, m4
    psadbw                  m0, m4                  ; partial sums, one per qword
    psadbw                  m1, m4
    psadbw                  m2, m4
    psadbw                  m3, m4
    paddw                   m0, m1
    paddw                   m2, m3
    paddw                   m0, m2
    movhlps                 m1, m0                  ; fold all partial sums into word 0
    paddw                   m0, m1
    pmulhrsw                m0, [pw_512]            ; (sum + 32) >> 6
    pshufb                  m0, m4                  ; broadcast DC byte
    mov                   cntd, 8
.loop:                                              ; 4 rows x 32 bytes per iteration
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
182
%if HAVE_AVX2_EXTERNAL
; AVX2 variant: each 32-pixel edge fits one ymm register.
INIT_YMM avx2
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [lq]
    mova                    m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2                  ; four partial sums per register
    psadbw                  m1, m2
    paddw                   m0, m1
    vextracti128           xm1, m0, 1               ; fold upper lane...
    paddw                  xm0, xm1
    movhlps                xm1, xm0                 ; ...and upper qword into word 0
    paddw                  xm0, xm1
    pmulhrsw               xm0, [pw_512]            ; (sum + 32) >> 6
    vpbroadcastb            m0, xm0                 ; broadcast DC byte to all 32 lanes
    mov                   cntd, 4
.loop:                                              ; 8 rows per iteration
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
216
217; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
218
; DC_1D_FUNCS: dc_top_NxN / dc_left_NxN predictors that average only a
; single edge (%2 selects the a or l pointer).  The shift/round counts
; are one less than the two-edge dc versions above since only half as
; many pixels are summed.
%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [%2q]               ; 4 edge pixels
    pxor                    m1, m1
    psadbw                  m0, m1                  ; horizontal sum
    pmulhrsw                m0, [pw_8192]           ; (sum + 2) >> 2
    pshufb                  m0, m1                  ; broadcast DC byte
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*1], m0
    RET

INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [%2q]               ; 8 edge pixels
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pxor                    m1, m1
    psadbw                  m0, m1
    pmulhrsw                m0, [pw_4096]           ; (sum + 4) >> 3
    pshufb                  m0, m1
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    movq      [dstq+strideq*2], m0
    movq      [dstq+stride3q ], m0
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]               ; 16 edge pixels
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2                  ; one partial sum per qword
    movhlps                 m1, m0
    paddw                   m0, m1                  ; combine into word 0
    pmulhrsw                m0, [pw_2048]           ; (sum + 8) >> 4
    pshufb                  m0, m2
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]               ; 32 edge pixels
    mova                    m1, [%2q+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2
    psadbw                  m1, m2
    paddw                   m0, m1
    movhlps                 m1, m0
    paddw                   m0, m1                  ; combine partial sums into word 0
    pmulhrsw                m0, [pw_1024]           ; (sum + 16) >> 5
    pshufb                  m0, m2
    mov                   cntd, 8
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m0
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m0
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                    m0, [%2q]               ; 32 edge pixels in one ymm
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pxor                    m2, m2
    psadbw                  m0, m2                  ; four partial sums
    vextracti128           xm1, m0, 1
    paddw                  xm0, xm1
    movhlps                xm1, xm0
    paddw                  xm0, xm1                 ; combine into word 0
    pmulhrsw               xm0, [pw_1024]           ; (sum + 16) >> 5
    vpbroadcastb            m0, xm0
    mov                   cntd, 4
.loop:                                              ; 8 rows per iteration
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
%endmacro

DC_1D_FUNCS top,  a
DC_1D_FUNCS left, l
339
340; v
341
; v_8x8: vertical prediction -- replicate the 8 top neighbour pixels
; into all 8 rows.  Pure stores; no arithmetic.
INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]                ; top row, copied to every line
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    lea                   dstq, [dstq+strideq*2]
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*1], m0
    RET
357
; v_16x16: vertical prediction -- replicate the 16 top pixels into all
; 16 rows, 4 rows per loop iteration.
INIT_XMM sse2
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]                ; top row
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
373
; v_32x32: vertical prediction -- replicate the 32 top pixels (two xmm
; halves) into all 32 rows, 4 rows per loop iteration.
INIT_XMM sse2
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    mova                    m0, [aq]                ; top row, left half
    mova                    m1, [aq+16]             ; top row, right half
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 8
.loop:
    mova   [dstq+strideq*0+ 0], m0
    mova   [dstq+strideq*0+16], m1
    mova   [dstq+strideq*1+ 0], m0
    mova   [dstq+strideq*1+16], m1
    mova   [dstq+strideq*2+ 0], m0
    mova   [dstq+strideq*2+16], m1
    mova   [dstq+stride3q + 0], m0
    mova   [dstq+stride3q +16], m1
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
394
%if HAVE_AVX2_EXTERNAL
; AVX2 variant: the whole 32-pixel top row fits one ymm register;
; 8 rows are written per loop iteration.
INIT_YMM avx2
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    mova                    m0, [aq]                ; top row
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    mov                   cntd, 4
.loop:
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m0
    mova      [dstq+strideq*2], m0
    mova      [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET
%endif
417
418; h
419
; h_4x4: horizontal prediction -- each row is filled with its left
; neighbour.  Note the left buffer is stored bottom-to-top: the mask
; puts l[3] in the low dword, so the top row gets l[3] and the bottom
; row gets l[0].
INIT_XMM ssse3
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
    movd                    m0, [lq]                ; 4 left pixels
    pshufb                  m0, [pb_4x3_4x2_4x1_4x0] ; dwords: 4x l3, 4x l2, 4x l1, 4x l0
    lea               stride3q, [strideq*3]
    movd      [dstq+strideq*0], m0
    psrldq                  m0, 4                   ; advance to next row's value
    movd      [dstq+strideq*1], m0
    psrldq                  m0, 4
    movd      [dstq+strideq*2], m0
    psrldq                  m0, 4
    movd      [dstq+stride3q ], m0
    RET
433
; H_XMM_FUNCS: horizontal prediction for 8x8/16x16/32x32.  Four rows
; are produced per iteration from 4 left pixels; the left buffer is
; stored bottom-to-top (cf. h_4x4), so cnt counts down from the end of
; the buffer while dst advances forward.
%macro H_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
    mova                    m2, [pb_8x1_8x0]
    mova                    m3, [pb_8x3_8x2]
    lea               stride3q, [strideq*3]
    mov                   cntq, 1
.loop:
    movd                    m0, [lq+cntq*4]         ; 4 left pixels for these 4 rows
    pshufb                  m1, m0, m3              ; low qword = 8x byte3, high = 8x byte2
    pshufb                  m0, m2                  ; low qword = 8x byte1, high = 8x byte0
    movq      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    movq      [dstq+strideq*2], m0
    movhps    [dstq+stride3q ], m0
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop                                       ; iterates cnt = 1, 0
    RET

INIT_XMM %1
cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4                  ; pb_0: broadcast byte 0
    lea               stride3q, [strideq*3]
    mov                   cntq, 3
.loop:
    movd                    m3, [lq+cntq*4]         ; 4 left pixels
    pshufb                  m0, m3, m7              ; 16x byte3
    pshufb                  m1, m3, m6              ; 16x byte2
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufb                  m2, m3, m5              ; 16x byte1
    pshufb                  m3, m4                  ; 16x byte0
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4
    lea               stride3q, [strideq*3]
    mov                   cntq, 7
.loop:
    movd                    m3, [lq+cntq*4]         ; 4 left pixels
    pshufb                  m0, m3, m7
    pshufb                  m1, m3, m6
    mova   [dstq+strideq*0+ 0], m0                  ; each row: same value in both halves
    mova   [dstq+strideq*0+16], m0
    mova   [dstq+strideq*1+ 0], m1
    mova   [dstq+strideq*1+16], m1
    pshufb                  m2, m3, m5
    pshufb                  m3, m4
    mova   [dstq+strideq*2+ 0], m2
    mova   [dstq+strideq*2+16], m2
    mova   [dstq+stride3q + 0], m3
    mova   [dstq+stride3q +16], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET
%endmacro

H_XMM_FUNCS ssse3
H_XMM_FUNCS avx
507
%if HAVE_AVX2_EXTERNAL
; AVX2 h_32x32: same scheme as the xmm version, but each 32-byte row
; is a single store; the 4 left pixels are mirrored into both 128-bit
; lanes so in-lane pshufb broadcasts work across the full row.
INIT_YMM avx2
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                    m5, [pb_1]
    mova                    m6, [pb_2]
    mova                    m7, [pb_3]
    pxor                    m4, m4                  ; pb_0
    lea               stride3q, [strideq*3]
    mov                   cntq, 7
.loop:
    movd                   xm3, [lq+cntq*4]         ; 4 left pixels
    vinserti128             m3, m3, xm3, 1          ; duplicate into high lane
    pshufb                  m0, m3, m7              ; 32x byte3
    pshufb                  m1, m3, m6              ; 32x byte2
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m1
    pshufb                  m2, m3, m5              ; 32x byte1
    pshufb                  m3, m4                  ; 32x byte0
    mova      [dstq+strideq*2], m2
    mova      [dstq+stride3q ], m3
    lea                   dstq, [dstq+strideq*4]
    dec                   cntq
    jge .loop
    RET
%endif
533
534; tm
535
; tm_4x4: TrueMotion prediction: dst[y][x] = clip(left[y] + top[x] -
; topleft).  pw_m256 (words 0xFF00) as a pshufb mask zero-extends byte
; 0 of the source into every word lane; pw_m255 (0xFF01) does the same
; with byte 1 -- so a single pinsrw of two left pixels yields both row
; values per iteration.
INIT_MMX ssse3
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
    pxor                    m1, m1
    pinsrw                  m2, [aq-1], 0           ; byte 0 = topleft pixel
    movd                    m0, [aq]                ; 4 top pixels
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m3, [pw_m256]
    mova                    m4, [pw_m255]
    pshufb                  m2, m3                  ; topleft broadcast as words
    punpcklbw               m0, m1                  ; top pixels widened to words
    psubw                   m0, m2                  ; m0 = top - topleft
    mov                   cntq, 1
.loop:
    pinsrw                  m2, [lq+cntq*2], 0      ; two left pixels in bytes 0/1
    pshufb                  m1, m2, m4              ; broadcast left pixel (byte 1)
    pshufb                  m2, m3                  ; broadcast left pixel (byte 0)
    paddw                   m1, m0                  ; left + top - topleft, per pixel
    paddw                   m2, m0
    packuswb                m1, m1                  ; saturate to 0..255
    packuswb                m2, m2
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop                                       ; cnt = 1, 0 (left stored bottom-to-top)
    RET
562
; TM_XMM_FUNCS: TrueMotion prediction, dst[y][x] = clip(left[y] +
; top[x] - topleft), for 8x8/16x16/32x32.  The (top - topleft) word
; vectors are precomputed once; each loop iteration broadcasts two left
; pixels (via the pw_m256/pw_m255 pshufb masks, which zero-extend byte
; 0 resp. byte 1 into words), adds, and saturates back to bytes.
%macro TM_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
    pxor                    m1, m1
    pinsrw                  m2, [aq-1], 0           ; byte 0 = topleft
    movh                    m0, [aq]                ; 8 top pixels
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m3, [pw_m256]
    mova                    m4, [pw_m255]
    pshufb                  m2, m3                  ; topleft as words
    punpcklbw               m0, m1
    psubw                   m0, m2                  ; m0 = top - topleft (8 words)
    mov                   cntq, 3
.loop:
    pinsrw                  m2, [lq+cntq*2], 0      ; two left pixels
    pshufb                  m1, m2, m4              ; row A's left value
    pshufb                  m2, m3                  ; row B's left value
    paddw                   m1, m0
    paddw                   m2, m0
    packuswb                m1, m2                  ; clip both rows
    movh      [dstq+strideq*0], m1
    movhps    [dstq+strideq*1], m1
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    pinsrw                  m2, [aq-1], 0           ; byte 0 = topleft
    mova                    m0, [aq]                ; 16 top pixels
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m4, [pw_m256]
    mova                    m5, [pw_m255]
    pshufb                  m2, m4                  ; topleft as words
    punpckhbw               m1, m0, m3              ; widen top to two word vectors
    punpcklbw               m0, m3
    psubw                   m1, m2                  ; m0/m1 = top - topleft
    psubw                   m0, m2
    mov                   cntq, 7
.loop:
    pinsrw                  m7, [lq+cntq*2], 0      ; two left pixels
    pshufb                  m3, m7, m5              ; row A's left value
    pshufb                  m7, m4                  ; row B's left value
    paddw                   m2, m3, m0              ; row A, low/high halves
    paddw                   m3, m1
    paddw                   m6, m7, m0              ; row B, low/high halves
    paddw                   m7, m1
    packuswb                m2, m3
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET

%if ARCH_X86_64
; x86-64 only: needs 14 xmm registers (four top-minus-topleft word
; vectors kept live across the loop).
INIT_XMM %1
cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
    pxor                    m5, m5
    pinsrw                  m4, [aq-1], 0           ; byte 0 = topleft
    mova                    m0, [aq]                ; 32 top pixels
    mova                    m2, [aq+16]
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m8, [pw_m256]
    mova                    m9, [pw_m255]
    pshufb                  m4, m8                  ; topleft as words
    punpckhbw               m1, m0,  m5             ; widen top to four word vectors
    punpckhbw               m3, m2,  m5
    punpcklbw               m0, m5
    punpcklbw               m2, m5
    psubw                   m1, m4                  ; m0..m3 = top - topleft
    psubw                   m0, m4
    psubw                   m3, m4
    psubw                   m2, m4
    mov                   cntq, 15
.loop:
    pinsrw                 m13, [lq+cntq*2], 0      ; two left pixels
    pshufb                  m7, m13, m9             ; row A's left value
    pshufb                 m13, m8                  ; row B's left value
    paddw                   m4, m7,  m0             ; row A, all four quarters
    paddw                   m5, m7,  m1
    paddw                   m6, m7,  m2
    paddw                   m7, m3
    paddw                  m10, m13, m0             ; row B, all four quarters
    paddw                  m11, m13, m1
    paddw                  m12, m13, m2
    paddw                  m13, m3
    packuswb                m4, m5
    packuswb                m6, m7
    packuswb               m10, m11
    packuswb               m12, m13
    mova   [dstq+strideq*0+ 0], m4
    mova   [dstq+strideq*0+16], m6
    mova   [dstq+strideq*1+ 0], m10
    mova   [dstq+strideq*1+16], m12
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endif
%endmacro

TM_XMM_FUNCS ssse3
TM_XMM_FUNCS avx
670
%if HAVE_AVX2_EXTERNAL
; AVX2 tm_32x32: same algorithm as the xmm version, two rows per
; iteration; topleft and the per-row left pixels are mirrored into
; both 128-bit lanes so the in-lane pshufb broadcasts cover the row.
INIT_YMM avx2
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
    pxor                    m3, m3
    pinsrw                 xm2, [aq-1], 0           ; byte 0 = topleft
    vinserti128             m2, m2, xm2, 1          ; duplicate into high lane
    mova                    m0, [aq]                ; 32 top pixels
    DEFINE_ARGS dst, stride, l, cnt
    mova                    m4, [pw_m256]
    mova                    m5, [pw_m255]
    pshufb                  m2, m4                  ; topleft as words
    punpckhbw               m1, m0, m3              ; widen top to two word vectors
    punpcklbw               m0, m3
    psubw                   m1, m2                  ; m0/m1 = top - topleft
    psubw                   m0, m2
    mov                   cntq, 15
.loop:
    pinsrw                 xm7, [lq+cntq*2], 0      ; two left pixels
    vinserti128             m7, m7, xm7, 1
    pshufb                  m3, m7, m5              ; row A's left value
    pshufb                  m7, m4                  ; row B's left value
    paddw                   m2, m3, m0
    paddw                   m3, m1
    paddw                   m6, m7, m0
    paddw                   m7, m1
    packuswb                m2, m3                  ; clip to 0..255
    packuswb                m6, m7
    mova      [dstq+strideq*0], m2
    mova      [dstq+strideq*1], m6
    lea                   dstq, [dstq+strideq*2]
    dec                   cntq
    jge .loop
    RET
%endif
705
706; dl
707
; LOWPASS: per-byte 3-tap filter (left + 2*center + right + 2) >> 2
; without widening to words.  pavgb rounds up, so the low bit lost from
; (left + right) / 2 -- which is (left ^ right) & 1 -- is subtracted
; first to get a floored average before averaging with the center tap.
%macro LOWPASS 4 ; left [dst], center, right, tmp
    pxor                   m%4, m%1, m%3
    pand                   m%4, [pb_1]              ; tmp = (left ^ right) & 1
    pavgb                  m%1, m%3                 ; (left + right + 1) >> 1
    psubusb                m%1, m%4                 ; floored (left + right) >> 1
    pavgb                  m%1, m%2                 ; average with center = full filter
%endmacro
715
; dl_4x4: down-left (45 degree) prediction from the top row.  The
; filtered diagonal is built once; each output row is the diagonal
; shifted one more pixel to the left.
INIT_MMX ssse3
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq]                ; 8 top pixels
    pshufb                  m0, m1, [pb_0to5_2x7]   ; edge with last pixel replicated
    pshufb                  m2, m1, [pb_2to6_3x7]   ; edge shifted by 2
    psrlq                   m1, 8                   ; edge shifted by 1 (center tap)
    LOWPASS                  0, 1, 2, 3             ; filtered diagonal in m0

    pshufw                  m1, m0, q3321           ; rows 2/3: diagonal shifted by 2
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    psrlq                   m0, 8                   ; shift for the odd rows
    psrlq                   m1, 8
    add                   dstq, strideq
    movd      [dstq+strideq*0], m0
    movd      [dstq+strideq*2], m1
    RET
733
; DL_XMM_FUNCS: down-left (45 degree) prediction for 8x8/16x16/32x32.
; The top edge is lowpass-filtered once (pb_1toE_2xF acts as a
; shift-left-by-one with last-pixel replication); each successive row
; is the filtered diagonal advanced by one pixel, with positions past
; the edge clamped to the last filtered pixel.
%macro DL_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
    movq                    m0, [aq]                ; 8 top pixels
    lea               stride5q, [strideq*5]
    pshufb                  m1, m0, [pb_1to6_10x7]  ; edge shifted by 1, tail replicated
    psrldq                  m2, m1, 1               ; edge shifted by 2
    shufps                  m0, m1, q3210           ; edge with replicated tail appended
    LOWPASS                  0, 1, 2, 3             ; filtered diagonal in m0

    pshufd                  m1, m0, q3321           ; rows 4..7: diagonal advanced by 4
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1                   ; advance one pixel per row
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    lea                   dstq, [dstq+strideq*2]
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*0], m0
    movq      [dstq+strideq*4], m1
    psrldq                  m0, 1
    psrldq                  m1, 1
    movq      [dstq+strideq*1], m0
    movq      [dstq+stride5q ], m1
    RET

INIT_XMM %1
cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m5, [pb_1toE_2xF]       ; shift-by-1 mask, kept for the loop
    mova                    m0, [aq]                ; 16 top pixels
    pshufb                  m1, m0, m5              ; edge shifted by 1
    pshufb                  m2, m1, m5              ; edge shifted by 2
    pshufb                  m4, m0, [pb_15]         ; last pixel replicated (row tails)
    LOWPASS                  0, 1, 2, 3             ; filtered diagonal in m0
    DEFINE_ARGS dst, stride, cnt, stride9
    lea               stride9q, [strideq*3]
    mov                   cntd, 4
    lea               stride9q, [stride9q*3]        ; stride9 = 9 * stride

.loop:                                              ; writes rows i, i+1, i+8, i+9
    movhlps                 m4, m0                  ; row i+8 = bytes 8.. of row i + tail
    mova      [dstq+strideq*0], m0
    pshufb                  m0, m5                  ; advance diagonal by one pixel
    mova      [dstq+strideq*8], m4
    movhlps                 m4, m0
    mova      [dstq+strideq*1], m0
    pshufb                  m0, m5
    mova      [dstq+stride9q ], m4
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
    mova                    m5, [pb_1toE_2xF]
    mova                    m0, [aq]                ; 32 top pixels in m0/m1
    mova                    m1, [aq+16]
    palignr                 m2, m1, m0, 1           ; edge shifted by 1 (across halves)
    palignr                 m3, m1, m0, 2           ; edge shifted by 2
    LOWPASS                  0, 2, 3, 4             ; filtered diagonal, low half
    pshufb                  m2, m1, m5              ; high half shifted by 1, tail clamped
    pshufb                  m3, m2, m5              ; high half shifted by 2
    pshufb                  m6, m1, [pb_15]         ; replicated last filtered pixel
    LOWPASS                  1, 2, 3, 4             ; filtered diagonal, high half
    mova                    m7, m6
    lea                 dst16q, [dstq  +strideq*8]
    mov                   cntd, 8
    lea                 dst16q, [dst16q+strideq*8]  ; dst16 = row 16
.loop:                                              ; writes rows i, i+8, i+16, i+24
    movhlps                 m7, m1                  ; bytes 8.. of high half + clamp tail
    mova [dstq  +strideq*0+ 0], m0
    mova [dstq  +strideq*0+16], m1
    movhps [dstq+strideq*8+ 0], m0                  ; row i+8 = diagonal advanced by 8
    movq [dstq  +strideq*8+ 8], m1
    mova [dstq  +strideq*8+16], m7
    mova [dst16q+strideq*0+ 0], m1                  ; row i+16 = advanced by 16
    mova [dst16q+strideq*0+16], m6
    mova [dst16q+strideq*8+ 0], m7                  ; row i+24 = advanced by 24
    mova [dst16q+strideq*8+16], m6
%if cpuflag(avx)
    vpalignr                m0, m1, m0, 1           ; advance diagonal by one pixel
    pshufb                  m1, m5
%else                                               ; ssse3: palignr is destructive
    palignr                 m2, m1, m0, 1
    pshufb                  m1, m5
    mova                    m0, m2
%endif
    add                   dstq, strideq
    add                 dst16q, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

DL_XMM_FUNCS ssse3
DL_XMM_FUNCS avx
833
834; dr
835
; dr_4x4: down-right (135 degree) prediction.  The edge pixels
; (left, topleft, top) are lowpass-filtered into one diagonal; each
; row going up is the diagonal shifted one pixel to the right, so the
; bottom row is written first from the unshifted value.
INIT_MMX ssse3
cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]                ; 4 left pixels
    punpckldq               m0, [aq-1]              ; + topleft and 3 top pixels
    movd                    m1, [aq+3]              ; last top pixel
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    palignr                 m1, m0, 1               ; edge shifted by 1 (center tap)
    psrlq                   m2, m1, 8               ; edge shifted by 2
    LOWPASS                  0, 1, 2, 3             ; filtered diagonal in m0

    movd      [dstq+stride3q ], m0                  ; bottom row first
    psrlq                   m0, 8                   ; shift right one pixel per row up
    movd      [dstq+strideq*2], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*1], m0
    psrlq                   m0, 8
    movd      [dstq+strideq*0], m0
    RET
855
; DR_XMM_FUNCS: down-right (135 degree) prediction for 8x8/16x16/32x32.
; The concatenated edge (left pixels, topleft, top pixels) is lowpass-
; filtered once; each row down shifts the diagonal one byte left
; (pslldq/palignr), pulling in one more filtered left pixel.
%macro DR_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m1, [lq]                ; 8 left pixels...
    movhps                  m1, [aq-1]              ; ...topleft + 7 top pixels
    movd                    m2, [aq+7]              ; last top pixel
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pslldq                  m0, m1, 1               ; edge shifted right by 1
    palignr                 m2, m1, 1               ; edge shifted left by 1
    LOWPASS                  0, 1, 2, 3             ; filtered diagonal in m0

    movhps    [dstq+strideq*0], m0                  ; top row = high 8 bytes
    pslldq                  m0, 1                   ; slide window down-left per row
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*4]
    movhps    [dstq+strideq*0], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m0
    pslldq                  m0, 1
    movhps    [dstq+strideq*2], m0
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m0
    RET

INIT_XMM %1
cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m1, [lq]                ; 16 left pixels
    movu                    m2, [aq-1]              ; topleft + 15 top pixels
    movd                    m4, [aq+15]             ; last top pixel
    DEFINE_ARGS dst, stride, stride9, cnt
    lea               stride9q, [strideq *3]
    mov                   cntd, 4
    lea               stride9q, [stride9q*3]        ; stride9 = 9 * stride
    palignr                 m4, m2, 1               ; top part shifted left by 1
    palignr                 m3, m2, m1, 15          ; top part shifted right by 1
    LOWPASS                  3,  2, 4, 5            ; m3 = filtered top half of diagonal
    pslldq                  m0, m1, 1               ; left part shifted right by 1
    palignr                 m2, m1, 1               ; left part shifted left by 1
    LOWPASS                  0,  1, 2, 4            ; m0 = filtered left half of diagonal

.loop:                                              ; rows i and i+8 per iteration
    mova    [dstq+strideq*0  ], m3
    movhps  [dstq+strideq*8+0], m0                  ; row i+8 = diagonal advanced by 8
    movq    [dstq+strideq*8+8], m3
    palignr                 m3, m0, 15              ; slide one filtered left pixel in
    pslldq                  m0, 1
    mova    [dstq+strideq*1  ], m3
    movhps  [dstq+stride9q +0], m0
    movq    [dstq+stride9q +8], m3
    palignr                 m3, m0, 15
    pslldq                  m0, 1
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
    mova                    m1, [lq]                ; 32 left pixels in m1/m2
    mova                    m2, [lq+16]
    movu                    m3, [aq-1]              ; topleft + top pixels
    movu                    m4, [aq+15]
    movd                    m5, [aq+31]             ; last top pixel
    DEFINE_ARGS dst, stride, stride8, cnt
    lea               stride8q, [strideq*8]
    palignr                 m5, m4, 1               ; filter each 16-byte segment of the
    palignr                 m6, m4, m3, 15          ; 64-byte edge with its neighbours
    LOWPASS                  5,  4,  6,  7
    palignr                 m4, m3, 1
    palignr                 m6, m3, m2, 15
    LOWPASS                  4,  3,  6,  7
    palignr                 m3, m2, 1
    palignr                 m6, m2, m1, 15
    LOWPASS                  3,  2,  6,  7
    palignr                 m2, m1, 1
    pslldq                  m0, m1, 1
    LOWPASS                  2,  1,  0,  6
    mov                   cntd, 16

    ; out=m2/m3/m4/m5                               ; filtered diagonal, low to high
.loop:                                              ; rows i and i+16 per iteration
    mova  [dstq+stride8q*0+ 0], m4
    mova  [dstq+stride8q*0+16], m5
    mova  [dstq+stride8q*2+ 0], m3                  ; row i+16 = diagonal advanced by 16
    mova  [dstq+stride8q*2+16], m4
    palignr                 m5, m4, 15              ; slide whole diagonal one byte
    palignr                 m4, m3, 15
    palignr                 m3, m2, 15
    pslldq                  m2, 1
    add                   dstq, strideq
    dec                   cntd
    jg .loop
    RET
%endmacro

DR_XMM_FUNCS ssse3
DR_XMM_FUNCS avx
959
960; vl
961
INIT_MMX ssse3
; void ff_vp9_ipred_vl_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
;                                const uint8_t *l, const uint8_t *a)
; 4x4 vertical-left prediction, from the top edge only. Even rows come
; from the 2-tap average of adjacent top pixels, odd rows from the 3-tap
; LOWPASS filter; each successive row pair starts one pixel further right.
cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m0, [aq]                ; a[0..7]
    psrlq                   m1, m0, 8               ; a[1..7]
    psrlq                   m2, m1, 8               ; a[2..7]
    LOWPASS                  2,  1, 0, 3            ; m2 = 3-tap filtered top edge
    pavgb                   m1, m0                  ; m1 = avg(a[i], a[i+1]), rounded
    movd      [dstq+strideq*0], m1                  ; row 0
    movd      [dstq+strideq*1], m2                  ; row 1
    lea                   dstq, [dstq+strideq*2]
    psrlq                   m1, 8                   ; advance both rows one pixel
    psrlq                   m2, 8
    movd      [dstq+strideq*0], m1                  ; row 2
    movd      [dstq+strideq*1], m2                  ; row 3
    RET
977
; Vertical-left prediction, 8x8/16x16/32x32. %1 = instruction set suffix
; (ssse3 or avx). Even rows = pavgb of adjacent top pixels, odd rows =
; 3-tap LOWPASS; each row pair shifts one pixel left along the top edge,
; with the last top pixel replicated once the edge runs out.
%macro VL_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m0, [aq]
    pshufb                  m0, [pb_0to6_9x7]       ; a[0..6] then a[7] replicated
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    psrldq                  m1, m0, 1               ; edge shifted by 1
    psrldq                  m2, m0, 2               ; edge shifted by 2
    LOWPASS                  2,  1,  0,  3          ; m2 = odd rows (3-tap filter)
    pavgb                   m1, m0                  ; m1 = even rows (2-tap average)

    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1                   ; each new row pair starts one
    psrldq                  m2, 1                   ; pixel further along the edge
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    lea                   dstq, [dstq+strideq*4]
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*0], m1
    movq      [dstq+strideq*1], m2
    psrldq                  m1, 1
    psrldq                  m2, 1
    movq      [dstq+strideq*2], m1
    movq      [dstq+stride3q ], m2
    RET

INIT_XMM %1
cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
    mova                    m0, [aq]
    mova                    m4, [pb_1toE_2xF]       ; shuffle = shift left 1, clamp last byte
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    pshufb                  m1, m0, m4              ; edge shifted by 1 (last px repeated)
    pshufb                  m2, m1, m4              ; edge shifted by 2
    LOWPASS                  2,  1,  0, 3           ; m2 = odd rows
    pavgb                   m1, m0                  ; m1 = even rows
    mov                   cntd, 4
.loop:                                              ; 4 iterations x 4 rows
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*1], m2
    pshufb                  m1, m4                  ; advance one pixel per row pair
    pshufb                  m2, m4
    mova      [dstq+strideq*2], m1
    mova      [dstq+stride3q ], m2
    pshufb                  m1, m4
    pshufb                  m2, m4
    lea                   dstq, [dstq+strideq*4]
    dec                   cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
    mova                    m0, [aq]                ; a[0..15]
    mova                    m5, [aq+16]             ; a[16..31]
    mova                    m4, [pb_1toE_2xF]       ; shift-left-1-with-clamp shuffle
    DEFINE_ARGS dst, stride, dst16, cnt
    palignr                 m2, m5, m0, 1
    palignr                 m3, m5, m0, 2
    lea                 dst16q, [dstq  +strideq*8]
    LOWPASS                  3,  2,  0, 6           ; m3 = odd rows, low 16 px
    pavgb                   m2, m0                  ; m2 = even rows, low 16 px
    pshufb                  m0, m5, m4
    pshufb                  m1, m0, m4
    lea                 dst16q, [dst16q+strideq*8]  ; dst16q = dst + 16 rows
    LOWPASS                  1,  0,  5, 6           ; m1 = odd rows, high 16 px
    pavgb                   m0, m5                  ; m0 = even rows, high 16 px
    pshufb                  m5, [pb_15]             ; m5 = last top pixel broadcast
                                                    ; (fills the run-out region)
    mov                   cntd, 8

.loop:
; %%write: emit one 32-px row at dstq+offset and the matching row 16 lines
; down (same data shifted 16 px, padded with the replicated last pixel in
; m5), then advance the %2:%3 pair one pixel along the edge.
%macro %%write 3
    mova    [dstq+stride%1+ 0], %2
    mova    [dstq+stride%1+16], %3
    movhps  [dst16q+stride%1 ], %2                  ; row+16 = row shifted 16 px
    movu  [dst16q+stride%1+ 8], %3
    movq  [dst16q+stride%1+24], m5                  ; pad with replicated last px
%if cpuflag(avx)
    palignr                 %2, %3, %2, 1           ; AVX: 3-operand shift in place
    pshufb                  %3, m4
%else
    palignr                 m6, %3, %2, 1           ; SSSE3: needs a temporary
    pshufb                  %3, m4
    mova                    %2, m6
%endif
%endmacro

    %%write                q*0, m2, m0              ; even row
    %%write                q*1, m3, m1              ; odd row
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop                                        ; 8 iterations x (2+2) rows
    RET
%endmacro
1076
1077VL_XMM_FUNCS ssse3
1078VL_XMM_FUNCS avx
1079
1080; vr
1081
INIT_MMX ssse3
; void ff_vp9_ipred_vr_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
;                                const uint8_t *l, const uint8_t *a)
; 4x4 vertical-right prediction: row 0 is the 2-tap average of the top
; edge with its left neighbour, the remaining rows are built from the
; 3-tap-filtered left/topleft pixels (see diagram below).
cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
    movq                    m1, [aq-1]              ; topleft + a[0..6]
    punpckldq               m2, [lq]                ; l[0..3] into m2's high dword
                                                    ; (low dword is don't-care)
    movd                    m0, [aq]                ; a[0..3]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1                  ; avg(a[i-1], a[i]) -> row 0
    palignr                 m1, m2, 5               ; line up left px + topleft + top
    psrlq                   m2, m1, 8
    psllq                   m3, m1, 8
    LOWPASS                  2,  1, 3, 4            ; 3-tap filtered edge

    ; ABCD <- for the following predictor:
    ; EFGH
    ; IABC  | m0 contains ABCDxxxx
    ; JEFG  | m2 contains xJIEFGHx

    punpckldq               m0, m2                  ; m0 = ABCD,EFGH
    pshufb                  m2, [pb_13456_3xm1]     ; rows 1/3 source bytes
    movd      [dstq+strideq*0], m0                  ; row 0: ABCD
    pshufb                  m0, [pb_6012_4xm1]      ; row 2: IABC
    movd      [dstq+stride3q ], m2                  ; row 3: JEFG
    psrlq                   m2, 8
    movd      [dstq+strideq*2], m0                  ; row 2
    movd      [dstq+strideq*1], m2                  ; row 1: EFGH
    RET
1109
; Vertical-right prediction, 8x8/16x16/32x32. %1 = instruction set suffix
; (ssse3 or avx). Even output rows come from the averaged top edge, odd
; rows from the filtered topleft row; every two rows one filtered left
; pixel is shifted in from the left side.
%macro VR_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
    movu                    m1, [aq-1]              ; topleft + a[0..14]
    movhps                  m2, [lq]                ; l[0..7] in high half
    movq                    m0, [aq]                ; a[0..7]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    pavgb                   m0, m1                  ; avg(a[i-1], a[i]) -> even rows
    palignr                 m1, m2, 9               ; left px + topleft + top in one reg
    pslldq                  m2, m1, 1
    pslldq                  m3, m1, 2
    LOWPASS                  1,  2, 3, 4            ; 3-tap filtered edge -> odd rows

    ; ABCDEFGH <- for the following predictor:
    ; IJKLMNOP
    ; QABCDEFG  | m0 contains ABCDEFGHxxxxxxxx
    ; RIJKLMNO  | m1 contains xxVUTSRQIJKLMNOP
    ; SQABCDEF
    ; TRIJKLMN
    ; USQABCDE
    ; VTRIJKLM

    punpcklqdq              m0, m1 ; ABCDEFGHxxVUTSRQ
    movq      [dstq+strideq*0], m0                  ; row 0: ABCDEFGH
    pshufb                  m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
    movhps    [dstq+strideq*1], m1                  ; row 1: IJKLMNOP
    pshufb                  m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
    movhps    [dstq+strideq*2], m0                  ; row 2: QABCDEFG
    pslldq                  m0, 1
    movhps    [dstq+stride3q ], m1                  ; row 3: RIJKLMNO
    lea                   dstq, [dstq+strideq*4]
    pslldq                  m1, 1
    movhps    [dstq+strideq*0], m0                  ; row 4: SQABCDEF
    pslldq                  m0, 1
    movhps    [dstq+strideq*1], m1                  ; row 5: TRIJKLMN
    pslldq                  m1, 1
    movhps    [dstq+strideq*2], m0                  ; row 6: USQABCDE
    movhps    [dstq+stride3q ], m1                  ; row 7: VTRIJKLM
    RET

INIT_XMM %1
cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
    mova                    m0, [aq]                ; a[0..15]
    movu                    m1, [aq-1]              ; topleft + a[0..14]
    mova                    m2, [lq]                ; l[0..15]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea               stride3q, [strideq*3]
    palignr                 m3, m1, m2, 15          ; l[15] + topleft + a[0..13]
    LOWPASS                  3,  1,  0,  4          ; m3 = odd rows (filtered)
    pavgb                   m0, m1                  ; m0 = even rows (averaged)
    palignr                 m1, m2,  1
    pslldq                  m4, m2,  1
    LOWPASS                  1,  2,  4,  5          ; filtered left edge
    pshufb                  m1, [pb_02468ACE_13579BDF] ; split filtered left pixels
                                                    ; into even/odd halves, so one
                                                    ; pslldq by 2 feeds both row types
    mov                   cntd, 4
                                                    ; even rows shift in from m1's
                                                    ; low half, odd rows from the
                                                    ; high half (via movlhps->m2)
.loop:
    movlhps                 m2, m1
    mova      [dstq+strideq*0], m0
    mova      [dstq+strideq*1], m3
    palignr                 m4, m0, m1, 15          ; rows shifted right one px,
    palignr                 m5, m3, m2, 15          ; new left pixel shifted in
    mova      [dstq+strideq*2], m4
    mova      [dstq+stride3q ], m5
    lea                   dstq, [dstq+strideq*4]
    palignr                 m0, m1, 14              ; advance base rows by two px
    palignr                 m3, m2, 14
    pslldq                  m1, 2                   ; consume two left pixels
    dec                   cntd
    jg .loop                                        ; 4 iterations x 4 rows
    RET

; 32x32 needs 9 XMM registers, hence x86-64 only.
%if ARCH_X86_64
INIT_XMM %1
cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
    mova                    m0, [aq]                ; a[0..15]
    mova                    m2, [aq+16]             ; a[16..31]
    movu                    m1, [aq-1]              ; topleft + a[0..14]
    palignr                 m3, m2, m0, 15
    palignr                 m4, m2, m0, 14
    LOWPASS                  4,  3,  2,  5          ; m4 = odd rows, high 16 px
    pavgb                   m3, m2                  ; m3 = even rows, high 16 px
    mova                    m2, [lq+16]             ; l[16..31]
    palignr                 m5, m1, m2, 15
    LOWPASS                  5,  1,  0,  6          ; m5 = odd rows, low 16 px
    pavgb                   m0, m1                  ; m0 = even rows, low 16 px
    mova                    m6, [lq]                ; l[0..15]
    palignr                 m1, m2,  1
    palignr                 m7, m2, m6, 15
    LOWPASS                  1,  2,  7,  8          ; filtered left, upper segment
    palignr                 m2, m6,  1
    pslldq                  m7, m6,  1
    LOWPASS                  2,  6,  7,  8          ; filtered left, lower segment
    pshufb                  m1, [pb_02468ACE_13579BDF] ; split even/odd left px
    pshufb                  m2, [pb_02468ACE_13579BDF]
    DEFINE_ARGS dst, stride, dst16, cnt
    lea                 dst16q, [dstq  +strideq*8]
    lea                 dst16q, [dst16q+strideq*8]  ; dst16q = dst + 16 rows
    SBUTTERFLY             qdq,  2,  1,  6          ; regroup: m1 feeds even rows,
                                                    ; m2 feeds odd rows
    mov                   cntd, 8

.loop:
    ; even lines (0, 2, 4, ...): m1 | m0, m3
    ;  odd lines (1, 3, 5, ...): m2 | m5, m4
; %%write: emit one 32-px row at dstq+offset and its 16-rows-down partner
; (same data shifted 16 px right), then shift the %2:%3:%4 pipeline one px.
%macro %%write 4
    mova    [dstq+stride%1+ 0], %3
    mova    [dstq+stride%1+16], %4
    movhps  [dst16q+stride%1 ], %2                  ; row+16 = row shifted 16 px
    movu  [dst16q+stride%1+ 8], %3
    movq  [dst16q+stride%1+24], %4
    palignr                 %4, %3, 15
    palignr                 %3, %2, 15
    pslldq                  %2,  1
%endmacro

    %%write                q*0, m1, m0, m3          ; even row
    %%write                q*1, m2, m5, m4          ; odd row
    lea                   dstq, [dstq  +strideq*2]
    lea                 dst16q, [dst16q+strideq*2]
    dec                   cntd
    jg .loop                                        ; 8 iterations x (2+2) rows
    RET
%endif
%endmacro
1235
1236VR_XMM_FUNCS ssse3
1237VR_XMM_FUNCS avx
1238
1239; hd
1240
INIT_MMX ssse3
; void ff_vp9_ipred_hd_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
;                                const uint8_t *l, const uint8_t *a)
; 4x4 horizontal-down prediction. The left/topleft edge is averaged (m1)
; and 3-tap filtered (m2); interleaving the two gives the pairwise
; pattern in the diagram, read out bottom-to-top.
cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
    movd                    m0, [lq]                ; l[0..3]
    punpckldq               m0, [aq-1]              ; + topleft + a[0..2]
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    psrlq                   m1, m0, 8               ; edge shifted by 1
    psrlq                   m2, m1, 8               ; edge shifted by 2
    LOWPASS                  2,  1, 0,  3           ; m2 = 3-tap filtered edge
    pavgb                   m1, m0                  ; m1 = 2-tap averaged edge

    ; DHIJ <- for the following predictor:
    ; CGDH
    ; BFCG  | m1 contains ABCDxxxx
    ; AEBF  | m2 contains EFGHIJxx

    punpcklbw               m1, m2                  ; interleave avg/filtered pairs
    punpckhdq               m0, m1, m2

    ; m1 contains AEBFCGDH
    ; m0 contains CGDHIJxx

    movd      [dstq+stride3q ], m1                  ; bottom row
    movd      [dstq+strideq*1], m0
    psrlq                   m1, 16                  ; move up-right by one pair
    psrlq                   m0, 16
    movd      [dstq+strideq*2], m1
    movd      [dstq+strideq*0], m0                  ; top row
    RET
1270
; Horizontal-down prediction, 8x8/16x16/32x32. %1 = instruction set
; suffix (ssse3 or avx). The averaged and 3-tap-filtered edge pixels are
; interleaved bytewise (SBUTTERFLY bw), then each row moving upwards
; shifts the pattern two bytes (one avg/filter pair) to the right.
%macro HD_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
    movq                    m0, [lq]                ; l[0..7]
    movhps                  m0, [aq-1]              ; + topleft + a[0..6]
    DEFINE_ARGS dst, stride, stride3, dst4
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]
    psrldq                  m1, m0, 1
    psrldq                  m2, m1, 1
    LOWPASS                  2,  1,  0,  3          ; m2 = 3-tap filtered edge
    pavgb                   m1, m0                  ; m1 = 2-tap averaged edge

    ; HPQRSTUV <- for the following predictor
    ; GOHPQRST
    ; FNGOHPQR  | m1 contains ABCDEFGHxxxxxxxx
    ; EMFNGOHP  | m2 contains IJKLMNOPQRSTUVxx
    ; DLEMFNGO
    ; CKDLEMFN
    ; BJCKDLEM
    ; AIBJCKDL

    punpcklbw               m1, m2                  ; interleave avg/filtered pairs
    movhlps                 m2, m2

    ; m1 contains AIBJCKDLEMFNGOHP
    ; m2 contains QRSTUVxxxxxxxxxx

    ; Rows are written bottom-to-top; each step up shifts the m2:m1 window
    ; two bytes (one pixel pair) further right.
    movhps   [dstq +stride3q ], m1
    movq     [dst4q+stride3q ], m1
    palignr                 m3, m2, m1, 2
    movhps   [dstq +strideq*2], m3
    movq     [dst4q+strideq*2], m3
    palignr                 m3, m2, m1, 4
    movhps   [dstq +strideq*1], m3
    movq     [dst4q+strideq*1], m3
    palignr                 m2, m1, 6
    movhps   [dstq +strideq*0], m2
    movq     [dst4q+strideq*0], m2
    RET

INIT_XMM %1
cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
    mova                    m0, [lq]                ; l[0..15]
    movu                    m3, [aq-1]              ; topleft + a[0..14]
    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
    lea               stride4q, [strideq*4]
    lea                  dst4q, [dstq +stride4q]    ; four interleaved row groups
    lea                  dst8q, [dst4q+stride4q]
    lea                 dst12q, [dst8q+stride4q]
    psrldq                  m4, m3,  1
    psrldq                  m5, m3,  2
    LOWPASS                  5,  4,  3,  6          ; m5 = filtered top segment
    palignr                 m1, m3, m0,  1
    palignr                 m2, m3, m0,  2
    LOWPASS                  2,  1,  0,  6          ; m2 = filtered left segment
    pavgb                   m1, m0                  ; m1 = averaged left segment
    SBUTTERFLY              bw,  1,  2,  6          ; interleave into pair stream

    ; NOTE(review): original author's note read "I PROBABLY INVERTED L0 and
    ; L16 here" — the row-group ordering below may be swapped; verify
    ; against the C reference if touching this.
    ; live registers: m1, m2, m5
.loop:
    ; stride4q counts down 3*stride, 2*stride, stride, 0: it is both the
    ; loop counter and the row offset, writing each group bottom-to-top.
    ; The SSE stores/shuffles below do not touch EFLAGS, so the flags from
    ; this sub are still valid at the jg.
    sub               stride4q, strideq
    movhps [dstq +stride4q +0], m2
    movq   [dstq +stride4q +8], m5
    mova   [dst4q+stride4q   ], m2
    movhps [dst8q+stride4q +0], m1
    movq   [dst8q+stride4q +8], m2
    mova  [dst12q+stride4q   ], m1
%if cpuflag(avx)
    palignr                 m1, m2, m1, 2           ; shift pair stream by one pair
    palignr                 m2, m5, m2, 2
%else
    palignr                 m3, m2, m1, 2           ; SSSE3 needs temporaries
    palignr                 m0, m5, m2, 2
    mova                    m1, m3
    mova                    m2, m0
%endif
    psrldq                  m5, 2
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
    mova                    m0, [lq]                ; l[0..15]
    mova                    m1, [lq+16]             ; l[16..31]
    movu                    m2, [aq-1]              ; topleft + a[0..14]
    movu                    m3, [aq+15]             ; a[15..30]
    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
    lea               stride8q, [strideq*8]
    lea                  dst8q, [dstq  +stride8q]   ; four 8-row groups
    lea                 dst16q, [dst8q +stride8q]
    lea                 dst24q, [dst16q+stride8q]
    psrldq                  m4, m3,  1
    psrldq                  m5, m3,  2
    LOWPASS                  5,  4,  3,  6          ; m5 = filtered a[15..] segment
    palignr                 m4, m3, m2,  2
    palignr                 m3, m2,  1
    LOWPASS                  4,  3,  2,  6          ; m4 = filtered topleft segment
    palignr                 m3, m2, m1,  2
    palignr                 m2, m1,  1
    LOWPASS                  3,  2,  1,  6          ; m3 = filtered l[16..] segment
    pavgb                   m2, m1                  ; m2 = averaged l[16..] segment
    palignr                 m6, m1, m0,  1
    palignr                 m1, m0,  2
    LOWPASS                  1,  6,  0,  7          ; m1 = filtered l[0..] segment
    pavgb                   m0, m6                  ; m0 = averaged l[0..] segment
    SBUTTERFLY              bw,  2,  3,  6          ; interleave avg/filter pairs
    SBUTTERFLY              bw,  0,  1,  6

    ; pipeline: m0, m1, m2, m3, m4, m5 (96 bytes), shifted one pair per row
.loop:
    ; stride8q doubles as countdown counter and row offset (see 16x16);
    ; SSE instructions below leave EFLAGS intact for the jg.
    sub               stride8q, strideq
    mova  [dstq  +stride8q+ 0], m3
    mova  [dstq  +stride8q+16], m4
    mova  [dst8q +stride8q+ 0], m2
    mova  [dst8q +stride8q+16], m3
    mova  [dst16q+stride8q+ 0], m1
    mova  [dst16q+stride8q+16], m2
    mova  [dst24q+stride8q+ 0], m0
    mova  [dst24q+stride8q+16], m1
%if cpuflag(avx)
    palignr                 m0, m1, m0, 2           ; shift whole pipeline one pair
    palignr                 m1, m2, m1, 2
    palignr                 m2, m3, m2, 2
    palignr                 m3, m4, m3, 2
    palignr                 m4, m5, m4, 2
    psrldq                  m5, 2
%else
    psrldq                  m6, m5, 2               ; SSSE3 2-operand palignr
    palignr                 m5, m4, 2               ; shifts in place, so rotate
    palignr                 m4, m3, 2               ; through m6 and copy back
    palignr                 m3, m2, 2
    palignr                 m2, m1, 2
    palignr                 m1, m0, 2
    mova                    m0, m1
    mova                    m1, m2
    mova                    m2, m3
    mova                    m3, m4
    mova                    m4, m5
    mova                    m5, m6
%endif
    jg .loop
    RET
%endmacro
1416
1417HD_XMM_FUNCS ssse3
1418HD_XMM_FUNCS avx
1419
INIT_MMX ssse3
; void ff_vp9_ipred_hu_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
;                                const uint8_t *l)
; 4x4 horizontal-up prediction — built from the left edge only (no top
; edge argument). The left pixels are reordered (pb_3to1_5x0 presumably
; reverses them and replicates the last one — defined elsewhere in this
; file), averaged and 3-tap filtered, then interleaved pairwise.
cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
    movd                    m0, [lq]                ; l[0..3]
    pshufb                  m0, [pb_3to1_5x0]       ; reorder + replicate last px
    psrlq                   m1, m0, 8
    psrlq                   m2, m1, 8
    LOWPASS                  2,  1, 0, 3            ; m2 = 3-tap filtered edge
    pavgb                   m1, m0                  ; m1 = 2-tap averaged edge
    DEFINE_ARGS dst, stride, stride3
    lea               stride3q, [strideq*3]
    SBUTTERFLY              bw,  1, 2, 0            ; interleave avg/filter pairs
    palignr                 m2, m1, 2               ; rows advance one pair each
    movd      [dstq+strideq*0], m1
    movd      [dstq+strideq*1], m2
    punpckhdq               m1, m1                  ; move to the next pair window
    punpckhdq               m2, m2
    movd      [dstq+strideq*2], m1
    movd      [dstq+stride3q ], m2
    RET
1439
; Horizontal-up prediction, 8x8/16x16/32x32. %1 = instruction set suffix
; (ssse3 or avx). Left-edge only (3-argument functions, no top pointer).
; The averaged and 3-tap-filtered left pixels are interleaved pairwise;
; each output row shifts the stream one pair (2 bytes), with the final
; pixel replicated once the edge runs out.
%macro HU_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
    movq                    m0, [lq]                ; l[0..7]
    pshufb                  m0, [pb_7to1_9x0]       ; reorder + replicate last px
                                                    ; (table defined elsewhere)
    psrldq                  m1, m0, 1
    psrldq                  m2, m1, 1
    LOWPASS                  2,  1, 0, 3            ; m2 = 3-tap filtered edge
    pavgb                   m1, m0                  ; m1 = 2-tap averaged edge
    DEFINE_ARGS dst, stride, stride3, dst4
    lea               stride3q, [strideq*3]
    lea                  dst4q, [dstq+strideq*4]
    SBUTTERFLY              bw,  1, 2, 0            ; interleave avg/filter pairs
    ; Rows n and n+4 come from the low/high halves of the same register;
    ; each step shifts the pair stream by one pair (2 bytes).
    movq     [dstq +strideq*0], m1
    movhps   [dst4q+strideq*0], m1
    palignr                 m0, m2, m1, 2
    movq     [dstq +strideq*1], m0
    movhps   [dst4q+strideq*1], m0
    palignr                 m0, m2, m1, 4
    movq     [dstq +strideq*2], m0
    movhps   [dst4q+strideq*2], m0
    palignr                 m2, m1, 6
    movq     [dstq +stride3q ], m2
    movhps   [dst4q+stride3q ], m2
    RET

INIT_XMM %1
cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
    mova                    m0, [lq]
    pshufb                  m0, [pb_Fto0]           ; reverse the 16 left pixels
    mova                    m3, [pb_2toE_3xF]       ; shift-by-2-with-clamp shuffle
    pshufb                  m1, m0, [pb_1toE_2xF]   ; edge shifted by 1, clamped
    pshufb                  m2, m0, m3              ; edge shifted by 2, clamped
    LOWPASS                  2,  1,  0,  4          ; m2 = 3-tap filtered edge
    pavgb                   m1, m0                  ; m1 = 2-tap averaged edge
    DEFINE_ARGS dst, stride, stride9, cnt
    lea                stride9q, [strideq *3]
    mov                   cntd,  4
    lea                stride9q, [stride9q*3]       ; stride9q = 9*stride
    SBUTTERFLY              bw,  1,  2,  0          ; m1/m2 = interleaved pair stream

.loop:                                              ; rows n, n+1 and n+8, n+9
    mova      [dstq+strideq*0], m1
    mova      [dstq+strideq*8], m2
    palignr                 m0, m2, m1, 2           ; advance one pair
    pshufb                  m2, m3                  ; bottom half: shift + clamp
    mova      [dstq+strideq*1], m0
    mova      [dstq+stride9q ], m2
    palignr                 m1, m2, m0, 2
    pshufb                  m2, m3
    lea                   dstq, [dstq+strideq*2]
    dec                   cntd
    jg .loop                                        ; 4 iterations x 4 rows
    RET

INIT_XMM %1
cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
    mova                    m0, [lq]                ; l[0..15]
    mova                    m1, [lq+16]             ; l[16..31]
    mova                    m2, [pb_Fto0]           ; byte-reverse shuffle
    mova                    m4, [pb_2toE_3xF]       ; shift-by-2-with-clamp shuffle
    pshufb                  m0, m2                  ; reverse both edge halves
    pshufb                  m1, m2
    palignr                 m2, m0, m1,  1
    palignr                 m3, m0, m1,  2
    LOWPASS                  3,  2,  1,  5          ; m3 = filtered, first half
    pavgb                   m2, m1                  ; m2 = averaged, first half
    pshufb                  m1, m0, m4
    pshufb                  m5, m0, [pb_1toE_2xF]
    LOWPASS                  1,  5,  0,  6          ; m1 = filtered, second half
    pavgb                   m0, m5                  ; m0 = averaged, second half
    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
    mov                   cntd,  8
    xor               stride0q, stride0q           ; running row offset, 0..7*stride
    lea                  dst8q, [dstq  +strideq*8] ; four 8-row groups
    lea                 dst16q, [dst8q +strideq*8]
    lea                 dst24q, [dst16q+strideq*8]
    SBUTTERFLY              bw,  0,  1,  5          ; interleave avg/filter pairs
    SBUTTERFLY              bw,  2,  3,  5
    pshufb                  m6, m1, [pb_15]         ; m6 = last pixel broadcast
                                                    ; (fills the run-out region)

.loop:
    ; Row n / n+8 / n+16 / n+24 are the same pair stream shifted 16 bytes
    ; per group; m6 pads the tail.
    mova  [dstq  +stride0q+ 0], m2
    mova  [dstq  +stride0q+16], m3
    mova  [dst8q +stride0q+ 0], m3
    mova  [dst8q +stride0q+16], m0
    mova  [dst16q+stride0q+ 0], m0
    mova  [dst16q+stride0q+16], m1
    mova  [dst24q+stride0q+ 0], m1
    mova  [dst24q+stride0q+16], m6
%if cpuflag(avx)
    palignr                 m2, m3, m2, 2           ; shift pipeline one pair
    palignr                 m3, m0, m3, 2
    palignr                 m0, m1, m0, 2
    pshufb                  m1, m4                  ; last segment: shift + clamp
%else
    pshufb                  m5, m1, m4              ; SSSE3: rotate via m5
    palignr                 m1, m0, 2
    palignr                 m0, m3, 2
    palignr                 m3, m2, 2
    mova                    m2, m3
    mova                    m3, m0
    mova                    m0, m1
    mova                    m1, m5
%endif
    add               stride0q, strideq
    dec                   cntd
    jg .loop                                        ; 8 iterations x 4 rows
    RET
%endmacro
1550
1551HU_XMM_FUNCS ssse3
1552HU_XMM_FUNCS avx
1553
1554; FIXME 127, 128, 129 ?
1555