1;******************************************************************************
2;* H.264 intra prediction asm optimizations
3;* Copyright (c) 2010 Fiona Glaser
4;* Copyright (c) 2010 Holger Lubitz
5;* Copyright (c) 2010 Loren Merritt
6;* Copyright (c) 2010 Ronald S. Bultje
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA
28
29tm_shuf: times 8 db 0x03, 0x80
30pw_ff00: times 8 dw 0xff00
31plane_shuf:  db -8, -7, -6, -5, -4, -3, -2, -1
32             db  1,  2,  3,  4,  5,  6,  7,  8
33plane8_shuf: db -4, -3, -2, -1,  0,  0,  0,  0
34             db  1,  2,  3,  4,  0,  0,  0,  0
35pw_0to7:     dw  0,  1,  2,  3,  4,  5,  6,  7
36pw_1to8:     dw  1,  2,  3,  4,  5,  6,  7,  8
37pw_m8tom1:   dw -8, -7, -6, -5, -4, -3, -2, -1
38pw_m4to4:    dw -4, -3, -2, -1,  1,  2,  3,  4
39
40SECTION .text
41
42cextern pb_1
43cextern pb_3
44cextern pw_4
45cextern pw_5
46cextern pw_8
47cextern pw_16
48cextern pw_17
49cextern pw_32
50
51;-----------------------------------------------------------------------------
52; void ff_pred16x16_vertical_8(uint8_t *src, ptrdiff_t stride)
53;-----------------------------------------------------------------------------
54
55INIT_MMX mmx
56cglobal pred16x16_vertical_8, 2,3
57    sub   r0, r1
58    mov   r2, 8
59    movq mm0, [r0+0]
60    movq mm1, [r0+8]
61.loop:
62    movq [r0+r1*1+0], mm0
63    movq [r0+r1*1+8], mm1
64    movq [r0+r1*2+0], mm0
65    movq [r0+r1*2+8], mm1
66    lea   r0, [r0+r1*2]
67    dec   r2
68    jg .loop
69    REP_RET
70
71INIT_XMM sse
72cglobal pred16x16_vertical_8, 2,3
73    sub   r0, r1
74    mov   r2, 4
75    movaps xmm0, [r0]
76.loop:
77    movaps [r0+r1*1], xmm0
78    movaps [r0+r1*2], xmm0
79    lea   r0, [r0+r1*2]
80    movaps [r0+r1*1], xmm0
81    movaps [r0+r1*2], xmm0
82    lea   r0, [r0+r1*2]
83    dec   r2
84    jg .loop
85    REP_RET
86
87;-----------------------------------------------------------------------------
88; void ff_pred16x16_horizontal_8(uint8_t *src, ptrdiff_t stride)
89;-----------------------------------------------------------------------------
90
91%macro PRED16x16_H 0
92cglobal pred16x16_horizontal_8, 2,3
93    mov       r2, 8
94%if cpuflag(ssse3)
95    mova      m2, [pb_3]
96%endif
97.loop:
98    movd      m0, [r0+r1*0-4]
99    movd      m1, [r0+r1*1-4]
100
101%if cpuflag(ssse3)
102    pshufb    m0, m2
103    pshufb    m1, m2
104%else
105    punpcklbw m0, m0
106    punpcklbw m1, m1
107    SPLATW    m0, m0, 3
108    SPLATW    m1, m1, 3
109    mova [r0+r1*0+8], m0
110    mova [r0+r1*1+8], m1
111%endif
112
113    mova [r0+r1*0], m0
114    mova [r0+r1*1], m1
115    lea       r0, [r0+r1*2]
116    dec       r2
117    jg .loop
118    REP_RET
119%endmacro
120
121INIT_MMX mmx
122PRED16x16_H
123INIT_MMX mmxext
124PRED16x16_H
125INIT_XMM ssse3
126PRED16x16_H
127
128;-----------------------------------------------------------------------------
129; void ff_pred16x16_dc_8(uint8_t *src, ptrdiff_t stride)
130;-----------------------------------------------------------------------------
131
132%macro PRED16x16_DC 0
133cglobal pred16x16_dc_8, 2,7
134    mov       r4, r0
135    sub       r0, r1
136    pxor      mm0, mm0
137    pxor      mm1, mm1
138    psadbw    mm0, [r0+0]
139    psadbw    mm1, [r0+8]
140    dec        r0
141    movzx     r5d, byte [r0+r1*1]
142    paddw     mm0, mm1
143    movd      r6d, mm0
144    lea        r0, [r0+r1*2]
145%rep 7
146    movzx     r2d, byte [r0+r1*0]
147    movzx     r3d, byte [r0+r1*1]
148    add       r5d, r2d
149    add       r6d, r3d
150    lea        r0, [r0+r1*2]
151%endrep
152    movzx     r2d, byte [r0+r1*0]
153    add       r5d, r6d
154    lea       r2d, [r2+r5+16]
155    shr       r2d, 5
156%if cpuflag(ssse3)
157    pxor       m1, m1
158%endif
159    SPLATB_REG m0, r2, m1
160
161%if mmsize==8
162    mov       r3d, 8
163.loop:
164    mova [r4+r1*0+0], m0
165    mova [r4+r1*0+8], m0
166    mova [r4+r1*1+0], m0
167    mova [r4+r1*1+8], m0
168%else
169    mov       r3d, 4
170.loop:
171    mova [r4+r1*0], m0
172    mova [r4+r1*1], m0
173    lea   r4, [r4+r1*2]
174    mova [r4+r1*0], m0
175    mova [r4+r1*1], m0
176%endif
177    lea   r4, [r4+r1*2]
178    dec   r3d
179    jg .loop
180    REP_RET
181%endmacro
182
183INIT_MMX mmxext
184PRED16x16_DC
185INIT_XMM sse2
186PRED16x16_DC
187INIT_XMM ssse3
188PRED16x16_DC
189
190;-----------------------------------------------------------------------------
191; void ff_pred16x16_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
192;-----------------------------------------------------------------------------
193
194%macro PRED16x16_TM 0
195cglobal pred16x16_tm_vp8_8, 2,5
196    sub        r0, r1
197    pxor      mm7, mm7
198    movq      mm0, [r0+0]
199    movq      mm2, [r0+8]
200    movq      mm1, mm0
201    movq      mm3, mm2
202    punpcklbw mm0, mm7
203    punpckhbw mm1, mm7
204    punpcklbw mm2, mm7
205    punpckhbw mm3, mm7
206    movzx     r3d, byte [r0-1]
207    mov       r4d, 16
208.loop:
209    movzx     r2d, byte [r0+r1-1]
210    sub       r2d, r3d
211    movd      mm4, r2d
212    SPLATW    mm4, mm4, 0
213    movq      mm5, mm4
214    movq      mm6, mm4
215    movq      mm7, mm4
216    paddw     mm4, mm0
217    paddw     mm5, mm1
218    paddw     mm6, mm2
219    paddw     mm7, mm3
220    packuswb  mm4, mm5
221    packuswb  mm6, mm7
222    movq [r0+r1+0], mm4
223    movq [r0+r1+8], mm6
224    add        r0, r1
225    dec       r4d
226    jg .loop
227    REP_RET
228%endmacro
229
230INIT_MMX mmx
231PRED16x16_TM
232INIT_MMX mmxext
233PRED16x16_TM
234
235INIT_XMM sse2
236cglobal pred16x16_tm_vp8_8, 2,6,6
237    sub          r0, r1
238    pxor       xmm2, xmm2
239    movdqa     xmm0, [r0]
240    movdqa     xmm1, xmm0
241    punpcklbw  xmm0, xmm2
242    punpckhbw  xmm1, xmm2
243    movzx       r4d, byte [r0-1]
244    mov         r5d, 8
245.loop:
246    movzx       r2d, byte [r0+r1*1-1]
247    movzx       r3d, byte [r0+r1*2-1]
248    sub         r2d, r4d
249    sub         r3d, r4d
250    movd       xmm2, r2d
251    movd       xmm4, r3d
252    pshuflw    xmm2, xmm2, 0
253    pshuflw    xmm4, xmm4, 0
254    punpcklqdq xmm2, xmm2
255    punpcklqdq xmm4, xmm4
256    movdqa     xmm3, xmm2
257    movdqa     xmm5, xmm4
258    paddw      xmm2, xmm0
259    paddw      xmm3, xmm1
260    paddw      xmm4, xmm0
261    paddw      xmm5, xmm1
262    packuswb   xmm2, xmm3
263    packuswb   xmm4, xmm5
264    movdqa [r0+r1*1], xmm2
265    movdqa [r0+r1*2], xmm4
266    lea          r0, [r0+r1*2]
267    dec         r5d
268    jg .loop
269    REP_RET
270
271%if HAVE_AVX2_EXTERNAL
272INIT_YMM avx2
273cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
274    sub                       dstq, strideq
275    pmovzxbw                    m0, [dstq]
276    vpbroadcastb               xm1, [r0-1]
277    pmovzxbw                    m1, xm1
278    psubw                       m0, m1
279    mov                 iterationd, 4
280    lea                   stride3q, [strideq*3]
281.loop:
282    vpbroadcastb               xm1, [dstq+strideq*1-1]
283    vpbroadcastb               xm2, [dstq+strideq*2-1]
284    vpbroadcastb               xm3, [dstq+stride3q-1]
285    vpbroadcastb               xm4, [dstq+strideq*4-1]
286    pmovzxbw                    m1, xm1
287    pmovzxbw                    m2, xm2
288    pmovzxbw                    m3, xm3
289    pmovzxbw                    m4, xm4
290    paddw                       m1, m0
291    paddw                       m2, m0
292    paddw                       m3, m0
293    paddw                       m4, m0
294    vpackuswb                   m1, m1, m2
295    vpackuswb                   m3, m3, m4
296    vpermq                      m1, m1, q3120
297    vpermq                      m3, m3, q3120
298    movdqa        [dstq+strideq*1], xm1
299    vextracti128  [dstq+strideq*2], m1, 1
300    movdqa       [dstq+stride3q*1], xm3
301    vextracti128  [dstq+strideq*4], m3, 1
302    lea                       dstq, [dstq+strideq*4]
303    dec                 iterationd
304    jg .loop
305    REP_RET
306%endif
307
308;-----------------------------------------------------------------------------
309; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
310;-----------------------------------------------------------------------------
311
312%macro H264_PRED16x16_PLANE 1
313cglobal pred16x16_plane_%1_8, 2,9,7
314    mov          r2, r1           ; +stride
315    neg          r1               ; -stride
316
317    movh         m0, [r0+r1  -1]
318%if mmsize == 8
319    pxor         m4, m4
320    movh         m1, [r0+r1  +3 ]
321    movh         m2, [r0+r1  +8 ]
322    movh         m3, [r0+r1  +12]
323    punpcklbw    m0, m4
324    punpcklbw    m1, m4
325    punpcklbw    m2, m4
326    punpcklbw    m3, m4
327    pmullw       m0, [pw_m8tom1  ]
328    pmullw       m1, [pw_m8tom1+8]
329    pmullw       m2, [pw_1to8    ]
330    pmullw       m3, [pw_1to8  +8]
331    paddw        m0, m2
332    paddw        m1, m3
333%else ; mmsize == 16
334%if cpuflag(ssse3)
335    movhps       m0, [r0+r1  +8]
336    pmaddubsw    m0, [plane_shuf] ; H coefficients
337%else ; sse2
338    pxor         m2, m2
339    movh         m1, [r0+r1  +8]
340    punpcklbw    m0, m2
341    punpcklbw    m1, m2
342    pmullw       m0, [pw_m8tom1]
343    pmullw       m1, [pw_1to8]
344    paddw        m0, m1
345%endif
346    movhlps      m1, m0
347%endif
348    paddw        m0, m1
349%if cpuflag(mmxext)
350    PSHUFLW      m1, m0, 0xE
351%elif cpuflag(mmx)
352    mova         m1, m0
353    psrlq        m1, 32
354%endif
355    paddw        m0, m1
356%if cpuflag(mmxext)
357    PSHUFLW      m1, m0, 0x1
358%elif cpuflag(mmx)
359    mova         m1, m0
360    psrlq        m1, 16
361%endif
362    paddw        m0, m1           ; sum of H coefficients
363
364    lea          r4, [r0+r2*8-1]
365    lea          r3, [r0+r2*4-1]
366    add          r4, r2
367
368%if ARCH_X86_64
369%define e_reg r8
370%else
371%define e_reg r0
372%endif
373
374    movzx     e_reg, byte [r3+r2*2   ]
375    movzx        r5, byte [r4+r1     ]
376    sub          r5, e_reg
377
378    movzx     e_reg, byte [r3+r2     ]
379    movzx        r6, byte [r4        ]
380    sub          r6, e_reg
381    lea          r5, [r5+r6*2]
382
383    movzx     e_reg, byte [r3+r1     ]
384    movzx        r6, byte [r4+r2*2   ]
385    sub          r6, e_reg
386    lea          r5, [r5+r6*4]
387
388    movzx     e_reg, byte [r3        ]
389%if ARCH_X86_64
390    movzx        r7, byte [r4+r2     ]
391    sub          r7, e_reg
392%else
393    movzx        r6, byte [r4+r2     ]
394    sub          r6, e_reg
395    lea          r5, [r5+r6*4]
396    sub          r5, r6
397%endif
398
399    lea       e_reg, [r3+r1*4]
400    lea          r3, [r4+r2*4]
401
402    movzx        r4, byte [e_reg+r2  ]
403    movzx        r6, byte [r3        ]
404    sub          r6, r4
405%if ARCH_X86_64
406    lea          r6, [r7+r6*2]
407    lea          r5, [r5+r6*2]
408    add          r5, r6
409%else
410    lea          r5, [r5+r6*4]
411    lea          r5, [r5+r6*2]
412%endif
413
414    movzx        r4, byte [e_reg     ]
415%if ARCH_X86_64
416    movzx        r7, byte [r3   +r2  ]
417    sub          r7, r4
418    sub          r5, r7
419%else
420    movzx        r6, byte [r3   +r2  ]
421    sub          r6, r4
422    lea          r5, [r5+r6*8]
423    sub          r5, r6
424%endif
425
426    movzx        r4, byte [e_reg+r1  ]
427    movzx        r6, byte [r3   +r2*2]
428    sub          r6, r4
429%if ARCH_X86_64
430    add          r6, r7
431%endif
432    lea          r5, [r5+r6*8]
433
434    movzx        r4, byte [e_reg+r2*2]
435    movzx        r6, byte [r3   +r1  ]
436    sub          r6, r4
437    lea          r5, [r5+r6*4]
438    add          r5, r6           ; sum of V coefficients
439
440%if ARCH_X86_64 == 0
441    mov          r0, r0m
442%endif
443
444%ifidn %1, h264
445    lea          r5, [r5*5+32]
446    sar          r5, 6
447%elifidn %1, rv40
448    lea          r5, [r5*5]
449    sar          r5, 6
450%elifidn %1, svq3
451    test         r5, r5
452    lea          r6, [r5+3]
453    cmovs        r5, r6
454    sar          r5, 2            ; V/4
455    lea          r5, [r5*5]       ; 5*(V/4)
456    test         r5, r5
457    lea          r6, [r5+15]
458    cmovs        r5, r6
459    sar          r5, 4            ; (5*(V/4))/16
460%endif
461
462    movzx        r4, byte [r0+r1  +15]
463    movzx        r3, byte [r3+r2*2   ]
464    lea          r3, [r3+r4+1]
465    shl          r3, 4
466
467    movd        r1d, m0
468    movsx       r1d, r1w
469%ifnidn %1, svq3
470%ifidn %1, h264
471    lea         r1d, [r1d*5+32]
472%else ; rv40
473    lea         r1d, [r1d*5]
474%endif
475    sar         r1d, 6
476%else ; svq3
477    test        r1d, r1d
478    lea         r4d, [r1d+3]
479    cmovs       r1d, r4d
480    sar         r1d, 2           ; H/4
481    lea         r1d, [r1d*5]     ; 5*(H/4)
482    test        r1d, r1d
483    lea         r4d, [r1d+15]
484    cmovs       r1d, r4d
485    sar         r1d, 4           ; (5*(H/4))/16
486%endif
487    movd         m0, r1d
488
489    add         r1d, r5d
490    add         r3d, r1d
491    shl         r1d, 3
492    sub         r3d, r1d          ; a
493
494    movd         m1, r5d
495    movd         m3, r3d
496    SPLATW       m0, m0, 0        ; H
497    SPLATW       m1, m1, 0        ; V
498    SPLATW       m3, m3, 0        ; a
499%ifidn %1, svq3
500    SWAP          0, 1
501%endif
502    mova         m2, m0
503%if mmsize == 8
504    mova         m5, m0
505%endif
506    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
507%if mmsize == 16
508    psllw        m2, 3
509%else
510    psllw        m5, 3
511    psllw        m2, 2
512    mova         m6, m5
513    paddw        m6, m2
514%endif
515    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
516    paddw        m2, m0           ; a + {8,9,10,11,12,13,14,15}*H
517%if mmsize == 8
518    paddw        m5, m0           ; a + {8,9,10,11}*H
519    paddw        m6, m0           ; a + {12,13,14,15}*H
520%endif
521
522    mov          r4, 8
523.loop:
524    mova         m3, m0           ; b[0..7]
525    mova         m4, m2           ; b[8..15]
526    psraw        m3, 5
527    psraw        m4, 5
528    packuswb     m3, m4
529    mova       [r0], m3
530%if mmsize == 8
531    mova         m3, m5           ; b[8..11]
532    mova         m4, m6           ; b[12..15]
533    psraw        m3, 5
534    psraw        m4, 5
535    packuswb     m3, m4
536    mova     [r0+8], m3
537%endif
538    paddw        m0, m1
539    paddw        m2, m1
540%if mmsize == 8
541    paddw        m5, m1
542    paddw        m6, m1
543%endif
544
545    mova         m3, m0           ; b[0..7]
546    mova         m4, m2           ; b[8..15]
547    psraw        m3, 5
548    psraw        m4, 5
549    packuswb     m3, m4
550    mova    [r0+r2], m3
551%if mmsize == 8
552    mova         m3, m5           ; b[8..11]
553    mova         m4, m6           ; b[12..15]
554    psraw        m3, 5
555    psraw        m4, 5
556    packuswb     m3, m4
557    mova  [r0+r2+8], m3
558%endif
559    paddw        m0, m1
560    paddw        m2, m1
561%if mmsize == 8
562    paddw        m5, m1
563    paddw        m6, m1
564%endif
565
566    lea          r0, [r0+r2*2]
567    dec          r4
568    jg .loop
569    REP_RET
570%endmacro
571
572INIT_MMX mmx
573H264_PRED16x16_PLANE h264
574H264_PRED16x16_PLANE rv40
575H264_PRED16x16_PLANE svq3
576INIT_MMX mmxext
577H264_PRED16x16_PLANE h264
578H264_PRED16x16_PLANE rv40
579H264_PRED16x16_PLANE svq3
580INIT_XMM sse2
581H264_PRED16x16_PLANE h264
582H264_PRED16x16_PLANE rv40
583H264_PRED16x16_PLANE svq3
584INIT_XMM ssse3
585H264_PRED16x16_PLANE h264
586H264_PRED16x16_PLANE rv40
587H264_PRED16x16_PLANE svq3
588
589;-----------------------------------------------------------------------------
590; void ff_pred8x8_plane_8(uint8_t *src, ptrdiff_t stride)
591;-----------------------------------------------------------------------------
592
593%macro H264_PRED8x8_PLANE 0
594cglobal pred8x8_plane_8, 2,9,7
595    mov          r2, r1           ; +stride
596    neg          r1               ; -stride
597
598    movd         m0, [r0+r1  -1]
599%if mmsize == 8
600    pxor         m2, m2
601    movh         m1, [r0+r1  +4 ]
602    punpcklbw    m0, m2
603    punpcklbw    m1, m2
604    pmullw       m0, [pw_m4to4]
605    pmullw       m1, [pw_m4to4+8]
606%else ; mmsize == 16
607%if cpuflag(ssse3)
608    movhps       m0, [r0+r1  +4]   ; this reads 4 bytes more than necessary
609    pmaddubsw    m0, [plane8_shuf] ; H coefficients
610%else ; sse2
611    pxor         m2, m2
612    movd         m1, [r0+r1  +4]
613    punpckldq    m0, m1
614    punpcklbw    m0, m2
615    pmullw       m0, [pw_m4to4]
616%endif
617    movhlps      m1, m0
618%endif
619    paddw        m0, m1
620
621%if notcpuflag(ssse3)
622%if cpuflag(mmxext)
623    PSHUFLW      m1, m0, 0xE
624%elif cpuflag(mmx)
625    mova         m1, m0
626    psrlq        m1, 32
627%endif
628    paddw        m0, m1
629%endif ; !ssse3
630
631%if cpuflag(mmxext)
632    PSHUFLW      m1, m0, 0x1
633%elif cpuflag(mmx)
634    mova         m1, m0
635    psrlq        m1, 16
636%endif
637    paddw        m0, m1           ; sum of H coefficients
638
639    lea          r4, [r0+r2*4-1]
640    lea          r3, [r0     -1]
641    add          r4, r2
642
643%if ARCH_X86_64
644%define e_reg r8
645%else
646%define e_reg r0
647%endif
648
649    movzx     e_reg, byte [r3+r2*2   ]
650    movzx        r5, byte [r4+r1     ]
651    sub          r5, e_reg
652
653    movzx     e_reg, byte [r3        ]
654%if ARCH_X86_64
655    movzx        r7, byte [r4+r2     ]
656    sub          r7, e_reg
657    sub          r5, r7
658%else
659    movzx        r6, byte [r4+r2     ]
660    sub          r6, e_reg
661    lea          r5, [r5+r6*4]
662    sub          r5, r6
663%endif
664
665    movzx     e_reg, byte [r3+r1     ]
666    movzx        r6, byte [r4+r2*2   ]
667    sub          r6, e_reg
668%if ARCH_X86_64
669    add          r6, r7
670%endif
671    lea          r5, [r5+r6*4]
672
673    movzx     e_reg, byte [r3+r2     ]
674    movzx        r6, byte [r4        ]
675    sub          r6, e_reg
676    lea          r6, [r5+r6*2]
677
678    lea          r5, [r6*9+16]
679    lea          r5, [r5+r6*8]
680    sar          r5, 5
681
682%if ARCH_X86_64 == 0
683    mov          r0, r0m
684%endif
685
686    movzx        r3, byte [r4+r2*2  ]
687    movzx        r4, byte [r0+r1  +7]
688    lea          r3, [r3+r4+1]
689    shl          r3, 4
690    movd        r1d, m0
691    movsx       r1d, r1w
692    imul        r1d, 17
693    add         r1d, 16
694    sar         r1d, 5
695    movd         m0, r1d
696    add         r1d, r5d
697    sub         r3d, r1d
698    add         r1d, r1d
699    sub         r3d, r1d          ; a
700
701    movd         m1, r5d
702    movd         m3, r3d
703    SPLATW       m0, m0, 0        ; H
704    SPLATW       m1, m1, 0        ; V
705    SPLATW       m3, m3, 0        ; a
706%if mmsize == 8
707    mova         m2, m0
708%endif
709    pmullw       m0, [pw_0to7]    ; 0*H, 1*H, ..., 7*H  (words)
710    paddw        m0, m3           ; a + {0,1,2,3,4,5,6,7}*H
711%if mmsize == 8
712    psllw        m2, 2
713    paddw        m2, m0           ; a + {4,5,6,7}*H
714%endif
715
716    mov          r4, 4
717ALIGN 16
718.loop:
719%if mmsize == 16
720    mova         m3, m0           ; b[0..7]
721    paddw        m0, m1
722    psraw        m3, 5
723    mova         m4, m0           ; V+b[0..7]
724    paddw        m0, m1
725    psraw        m4, 5
726    packuswb     m3, m4
727    movh       [r0], m3
728    movhps  [r0+r2], m3
729%else ; mmsize == 8
730    mova         m3, m0           ; b[0..3]
731    mova         m4, m2           ; b[4..7]
732    paddw        m0, m1
733    paddw        m2, m1
734    psraw        m3, 5
735    psraw        m4, 5
736    mova         m5, m0           ; V+b[0..3]
737    mova         m6, m2           ; V+b[4..7]
738    paddw        m0, m1
739    paddw        m2, m1
740    psraw        m5, 5
741    psraw        m6, 5
742    packuswb     m3, m4
743    packuswb     m5, m6
744    mova       [r0], m3
745    mova    [r0+r2], m5
746%endif
747
748    lea          r0, [r0+r2*2]
749    dec          r4
750    jg .loop
751    REP_RET
752%endmacro
753
754INIT_MMX mmx
755H264_PRED8x8_PLANE
756INIT_MMX mmxext
757H264_PRED8x8_PLANE
758INIT_XMM sse2
759H264_PRED8x8_PLANE
760INIT_XMM ssse3
761H264_PRED8x8_PLANE
762
763;-----------------------------------------------------------------------------
764; void ff_pred8x8_vertical_8(uint8_t *src, ptrdiff_t stride)
765;-----------------------------------------------------------------------------
766
767INIT_MMX mmx
768cglobal pred8x8_vertical_8, 2,2
769    sub    r0, r1
770    movq  mm0, [r0]
771%rep 3
772    movq [r0+r1*1], mm0
773    movq [r0+r1*2], mm0
774    lea    r0, [r0+r1*2]
775%endrep
776    movq [r0+r1*1], mm0
777    movq [r0+r1*2], mm0
778    RET
779
780;-----------------------------------------------------------------------------
781; void ff_pred8x8_horizontal_8(uint8_t *src, ptrdiff_t stride)
782;-----------------------------------------------------------------------------
783
784%macro PRED8x8_H 0
785cglobal pred8x8_horizontal_8, 2,3
786    mov       r2, 4
787%if cpuflag(ssse3)
788    mova      m2, [pb_3]
789%endif
790.loop:
791    SPLATB_LOAD m0, r0+r1*0-1, m2
792    SPLATB_LOAD m1, r0+r1*1-1, m2
793    mova [r0+r1*0], m0
794    mova [r0+r1*1], m1
795    lea       r0, [r0+r1*2]
796    dec       r2
797    jg .loop
798    REP_RET
799%endmacro
800
801INIT_MMX mmx
802PRED8x8_H
803INIT_MMX mmxext
804PRED8x8_H
805INIT_MMX ssse3
806PRED8x8_H
807
808;-----------------------------------------------------------------------------
809; void ff_pred8x8_top_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
810;-----------------------------------------------------------------------------
811INIT_MMX mmxext
812cglobal pred8x8_top_dc_8, 2,5
813    sub         r0, r1
814    movq       mm0, [r0]
815    pxor       mm1, mm1
816    pxor       mm2, mm2
817    lea         r2, [r0+r1*2]
818    punpckhbw  mm1, mm0
819    punpcklbw  mm0, mm2
820    psadbw     mm1, mm2        ; s1
821    lea         r3, [r2+r1*2]
822    psadbw     mm0, mm2        ; s0
823    psrlw      mm1, 1
824    psrlw      mm0, 1
825    pavgw      mm1, mm2
826    lea         r4, [r3+r1*2]
827    pavgw      mm0, mm2
828    pshufw     mm1, mm1, 0
829    pshufw     mm0, mm0, 0     ; dc0 (w)
830    packuswb   mm0, mm1        ; dc0,dc1 (b)
831    movq [r0+r1*1], mm0
832    movq [r0+r1*2], mm0
833    lea         r0, [r3+r1*2]
834    movq [r2+r1*1], mm0
835    movq [r2+r1*2], mm0
836    movq [r3+r1*1], mm0
837    movq [r3+r1*2], mm0
838    movq [r0+r1*1], mm0
839    movq [r0+r1*2], mm0
840    RET
841
842;-----------------------------------------------------------------------------
843; void ff_pred8x8_dc_8_mmxext(uint8_t *src, ptrdiff_t stride)
844;-----------------------------------------------------------------------------
845
846INIT_MMX mmxext
847cglobal pred8x8_dc_8, 2,5
848    sub       r0, r1
849    pxor      m7, m7
850    movd      m0, [r0+0]
851    movd      m1, [r0+4]
852    psadbw    m0, m7            ; s0
853    mov       r4, r0
854    psadbw    m1, m7            ; s1
855
856    movzx    r2d, byte [r0+r1*1-1]
857    movzx    r3d, byte [r0+r1*2-1]
858    lea       r0, [r0+r1*2]
859    add      r2d, r3d
860    movzx    r3d, byte [r0+r1*1-1]
861    add      r2d, r3d
862    movzx    r3d, byte [r0+r1*2-1]
863    add      r2d, r3d
864    lea       r0, [r0+r1*2]
865    movd      m2, r2d            ; s2
866    movzx    r2d, byte [r0+r1*1-1]
867    movzx    r3d, byte [r0+r1*2-1]
868    lea       r0, [r0+r1*2]
869    add      r2d, r3d
870    movzx    r3d, byte [r0+r1*1-1]
871    add      r2d, r3d
872    movzx    r3d, byte [r0+r1*2-1]
873    add      r2d, r3d
874    movd      m3, r2d            ; s3
875
876    punpcklwd m0, m1
877    mov       r0, r4
878    punpcklwd m2, m3
879    punpckldq m0, m2            ; s0, s1, s2, s3
880    pshufw    m3, m0, 11110110b ; s2, s1, s3, s3
881    lea       r2, [r0+r1*2]
882    pshufw    m0, m0, 01110100b ; s0, s1, s3, s1
883    paddw     m0, m3
884    lea       r3, [r2+r1*2]
885    psrlw     m0, 2
886    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
887    lea       r4, [r3+r1*2]
888    packuswb  m0, m0
889    punpcklbw m0, m0
890    movq      m1, m0
891    punpcklbw m0, m0
892    punpckhbw m1, m1
893    movq [r0+r1*1], m0
894    movq [r0+r1*2], m0
895    movq [r2+r1*1], m0
896    movq [r2+r1*2], m0
897    movq [r3+r1*1], m1
898    movq [r3+r1*2], m1
899    movq [r4+r1*1], m1
900    movq [r4+r1*2], m1
901    RET
902
903;-----------------------------------------------------------------------------
904; void ff_pred8x8_dc_rv40_8(uint8_t *src, ptrdiff_t stride)
905;-----------------------------------------------------------------------------
906
907INIT_MMX mmxext
908cglobal pred8x8_dc_rv40_8, 2,7
909    mov       r4, r0
910    sub       r0, r1
911    pxor      mm0, mm0
912    psadbw    mm0, [r0]
913    dec        r0
914    movzx     r5d, byte [r0+r1*1]
915    movd      r6d, mm0
916    lea        r0, [r0+r1*2]
917%rep 3
918    movzx     r2d, byte [r0+r1*0]
919    movzx     r3d, byte [r0+r1*1]
920    add       r5d, r2d
921    add       r6d, r3d
922    lea        r0, [r0+r1*2]
923%endrep
924    movzx     r2d, byte [r0+r1*0]
925    add       r5d, r6d
926    lea       r2d, [r2+r5+8]
927    shr       r2d, 4
928    movd      mm0, r2d
929    punpcklbw mm0, mm0
930    pshufw    mm0, mm0, 0
931    mov       r3d, 4
932.loop:
933    movq [r4+r1*0], mm0
934    movq [r4+r1*1], mm0
935    lea   r4, [r4+r1*2]
936    dec   r3d
937    jg .loop
938    REP_RET
939
940;-----------------------------------------------------------------------------
941; void ff_pred8x8_tm_vp8_8(uint8_t *src, ptrdiff_t stride)
942;-----------------------------------------------------------------------------
943
944%macro PRED8x8_TM 0
945cglobal pred8x8_tm_vp8_8, 2,6
946    sub        r0, r1
947    pxor      mm7, mm7
948    movq      mm0, [r0]
949    movq      mm1, mm0
950    punpcklbw mm0, mm7
951    punpckhbw mm1, mm7
952    movzx     r4d, byte [r0-1]
953    mov       r5d, 4
954.loop:
955    movzx     r2d, byte [r0+r1*1-1]
956    movzx     r3d, byte [r0+r1*2-1]
957    sub       r2d, r4d
958    sub       r3d, r4d
959    movd      mm2, r2d
960    movd      mm4, r3d
961    SPLATW    mm2, mm2, 0
962    SPLATW    mm4, mm4, 0
963    movq      mm3, mm2
964    movq      mm5, mm4
965    paddw     mm2, mm0
966    paddw     mm3, mm1
967    paddw     mm4, mm0
968    paddw     mm5, mm1
969    packuswb  mm2, mm3
970    packuswb  mm4, mm5
971    movq [r0+r1*1], mm2
972    movq [r0+r1*2], mm4
973    lea        r0, [r0+r1*2]
974    dec       r5d
975    jg .loop
976    REP_RET
977%endmacro
978
979INIT_MMX mmx
980PRED8x8_TM
981INIT_MMX mmxext
982PRED8x8_TM
983
984INIT_XMM sse2
985cglobal pred8x8_tm_vp8_8, 2,6,4
986    sub          r0, r1
987    pxor       xmm1, xmm1
988    movq       xmm0, [r0]
989    punpcklbw  xmm0, xmm1
990    movzx       r4d, byte [r0-1]
991    mov         r5d, 4
992.loop:
993    movzx       r2d, byte [r0+r1*1-1]
994    movzx       r3d, byte [r0+r1*2-1]
995    sub         r2d, r4d
996    sub         r3d, r4d
997    movd       xmm2, r2d
998    movd       xmm3, r3d
999    pshuflw    xmm2, xmm2, 0
1000    pshuflw    xmm3, xmm3, 0
1001    punpcklqdq xmm2, xmm2
1002    punpcklqdq xmm3, xmm3
1003    paddw      xmm2, xmm0
1004    paddw      xmm3, xmm0
1005    packuswb   xmm2, xmm3
1006    movq   [r0+r1*1], xmm2
1007    movhps [r0+r1*2], xmm2
1008    lea          r0, [r0+r1*2]
1009    dec         r5d
1010    jg .loop
1011    REP_RET
1012
1013INIT_XMM ssse3
1014cglobal pred8x8_tm_vp8_8, 2,3,6
1015    sub          r0, r1
1016    movdqa     xmm4, [tm_shuf]
1017    pxor       xmm1, xmm1
1018    movq       xmm0, [r0]
1019    punpcklbw  xmm0, xmm1
1020    movd       xmm5, [r0-4]
1021    pshufb     xmm5, xmm4
1022    mov         r2d, 4
1023.loop:
1024    movd       xmm2, [r0+r1*1-4]
1025    movd       xmm3, [r0+r1*2-4]
1026    pshufb     xmm2, xmm4
1027    pshufb     xmm3, xmm4
1028    psubw      xmm2, xmm5
1029    psubw      xmm3, xmm5
1030    paddw      xmm2, xmm0
1031    paddw      xmm3, xmm0
1032    packuswb   xmm2, xmm3
1033    movq   [r0+r1*1], xmm2
1034    movhps [r0+r1*2], xmm2
1035    lea          r0, [r0+r1*2]
1036    dec         r2d
1037    jg .loop
1038    REP_RET
1039
1040; dest, left, right, src, tmp
1041; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
1042%macro PRED4x4_LOWPASS 5
1043    mova    %5, %2
1044    pavgb   %2, %3
1045    pxor    %3, %5
1046    mova    %1, %4
1047    pand    %3, [pb_1]
1048    psubusb %2, %3
1049    pavgb   %1, %2
1050%endmacro
1051
1052;-----------------------------------------------------------------------------
1053; void ff_pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright,
1054;                           ptrdiff_t stride)
1055;-----------------------------------------------------------------------------
1056%macro PRED8x8L_TOP_DC 0
1057cglobal pred8x8l_top_dc_8, 4,4
1058    sub          r0, r3
1059    pxor        mm7, mm7
1060    movq        mm0, [r0-8]
1061    movq        mm3, [r0]
1062    movq        mm1, [r0+8]
1063    movq        mm2, mm3
1064    movq        mm4, mm3
1065    PALIGNR     mm2, mm0, 7, mm0
1066    PALIGNR     mm1, mm4, 1, mm4
1067    test        r1d, r1d ; top_left
1068    jz .fix_lt_2
1069    test        r2d, r2d ; top_right
1070    jz .fix_tr_1
1071    jmp .body
1072.fix_lt_2:
1073    movq        mm5, mm3
1074    pxor        mm5, mm2
1075    psllq       mm5, 56
1076    psrlq       mm5, 56
1077    pxor        mm2, mm5
1078    test        r2d, r2d ; top_right
1079    jnz .body
1080.fix_tr_1:
1081    movq        mm5, mm3
1082    pxor        mm5, mm1
1083    psrlq       mm5, 56
1084    psllq       mm5, 56
1085    pxor        mm1, mm5
1086.body:
1087    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1088    psadbw   mm7, mm0
1089    paddw    mm7, [pw_4]
1090    psrlw    mm7, 3
1091    pshufw   mm7, mm7, 0
1092    packuswb mm7, mm7
1093%rep 3
1094    movq [r0+r3*1], mm7
1095    movq [r0+r3*2], mm7
1096    lea    r0, [r0+r3*2]
1097%endrep
1098    movq [r0+r3*1], mm7
1099    movq [r0+r3*2], mm7
1100    RET
1101%endmacro
1102
1103INIT_MMX mmxext
1104PRED8x8L_TOP_DC
1105INIT_MMX ssse3
1106PRED8x8L_TOP_DC
1107
1108;-----------------------------------------------------------------------------
1109; void ff_pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright,
1110;                       ptrdiff_t stride)
1111;-----------------------------------------------------------------------------
1112
1113%macro PRED8x8L_DC 0
1114cglobal pred8x8l_dc_8, 4,5
1115    sub          r0, r3
1116    lea          r4, [r0+r3*2]
1117    movq        mm0, [r0+r3*1-8]
1118    punpckhbw   mm0, [r0+r3*0-8]
1119    movq        mm1, [r4+r3*1-8]
1120    punpckhbw   mm1, [r0+r3*2-8]
1121    mov          r4, r0
1122    punpckhwd   mm1, mm0
1123    lea          r0, [r0+r3*4]
1124    movq        mm2, [r0+r3*1-8]
1125    punpckhbw   mm2, [r0+r3*0-8]
1126    lea          r0, [r0+r3*2]
1127    movq        mm3, [r0+r3*1-8]
1128    punpckhbw   mm3, [r0+r3*0-8]
1129    punpckhwd   mm3, mm2
1130    punpckhdq   mm3, mm1
1131    lea          r0, [r0+r3*2]
1132    movq        mm0, [r0+r3*0-8]
1133    movq        mm1, [r4]
1134    mov          r0, r4
1135    movq        mm4, mm3
1136    movq        mm2, mm3
1137    PALIGNR     mm4, mm0, 7, mm0
1138    PALIGNR     mm1, mm2, 1, mm2
1139    test        r1d, r1d
1140    jnz .do_left
1141.fix_lt_1:
1142    movq        mm5, mm3
1143    pxor        mm5, mm4
1144    psrlq       mm5, 56
1145    psllq       mm5, 48
1146    pxor        mm1, mm5
1147    jmp .do_left
1148.fix_lt_2:
1149    movq        mm5, mm3
1150    pxor        mm5, mm2
1151    psllq       mm5, 56
1152    psrlq       mm5, 56
1153    pxor        mm2, mm5
1154    test        r2d, r2d
1155    jnz .body
1156.fix_tr_1:
1157    movq        mm5, mm3
1158    pxor        mm5, mm1
1159    psrlq       mm5, 56
1160    psllq       mm5, 56
1161    pxor        mm1, mm5
1162    jmp .body
1163.do_left:
1164    movq        mm0, mm4
1165    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1166    movq        mm4, mm0
1167    movq        mm7, mm2
1168    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1169    psllq       mm1, 56
1170    PALIGNR     mm7, mm1, 7, mm3
1171    movq        mm0, [r0-8]
1172    movq        mm3, [r0]
1173    movq        mm1, [r0+8]
1174    movq        mm2, mm3
1175    movq        mm4, mm3
1176    PALIGNR     mm2, mm0, 7, mm0
1177    PALIGNR     mm1, mm4, 1, mm4
1178    test        r1d, r1d
1179    jz .fix_lt_2
1180    test        r2d, r2d
1181    jz .fix_tr_1
1182.body:
1183    lea          r1, [r0+r3*2]
1184    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1185    pxor        mm0, mm0
1186    pxor        mm1, mm1
1187    lea          r2, [r1+r3*2]
1188    psadbw      mm0, mm7
1189    psadbw      mm1, mm6
1190    paddw       mm0, [pw_8]
1191    paddw       mm0, mm1
1192    lea          r4, [r2+r3*2]
1193    psrlw       mm0, 4
1194    pshufw      mm0, mm0, 0
1195    packuswb    mm0, mm0
1196    movq [r0+r3*1], mm0
1197    movq [r0+r3*2], mm0
1198    movq [r1+r3*1], mm0
1199    movq [r1+r3*2], mm0
1200    movq [r2+r3*1], mm0
1201    movq [r2+r3*2], mm0
1202    movq [r4+r3*1], mm0
1203    movq [r4+r3*2], mm0
1204    RET
1205%endmacro
1206
1207INIT_MMX mmxext
1208PRED8x8L_DC
1209INIT_MMX ssse3
1210PRED8x8L_DC
1211
1212;-----------------------------------------------------------------------------
1213; void ff_pred8x8l_horizontal_8(uint8_t *src, int has_topleft,
1214;                               int has_topright, ptrdiff_t stride)
1215;-----------------------------------------------------------------------------
1216
1217%macro PRED8x8L_HORIZONTAL 0
1218cglobal pred8x8l_horizontal_8, 4,4
1219    sub          r0, r3
1220    lea          r2, [r0+r3*2]
1221    movq        mm0, [r0+r3*1-8]
1222    test        r1d, r1d
1223    lea          r1, [r0+r3]
1224    cmovnz       r1, r0
1225    punpckhbw   mm0, [r1+r3*0-8]
1226    movq        mm1, [r2+r3*1-8]
1227    punpckhbw   mm1, [r0+r3*2-8]
1228    mov          r2, r0
1229    punpckhwd   mm1, mm0
1230    lea          r0, [r0+r3*4]
1231    movq        mm2, [r0+r3*1-8]
1232    punpckhbw   mm2, [r0+r3*0-8]
1233    lea          r0, [r0+r3*2]
1234    movq        mm3, [r0+r3*1-8]
1235    punpckhbw   mm3, [r0+r3*0-8]
1236    punpckhwd   mm3, mm2
1237    punpckhdq   mm3, mm1
1238    lea          r0, [r0+r3*2]
1239    movq        mm0, [r0+r3*0-8]
1240    movq        mm1, [r1+r3*0-8]
1241    mov          r0, r2
1242    movq        mm4, mm3
1243    movq        mm2, mm3
1244    PALIGNR     mm4, mm0, 7, mm0
1245    PALIGNR     mm1, mm2, 1, mm2
1246    movq        mm0, mm4
1247    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1248    movq        mm4, mm0
1249    movq        mm7, mm2
1250    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1251    psllq       mm1, 56
1252    PALIGNR     mm7, mm1, 7, mm3
1253    movq        mm3, mm7
1254    lea         r1, [r0+r3*2]
1255    movq       mm7, mm3
1256    punpckhbw  mm3, mm3
1257    punpcklbw  mm7, mm7
1258    pshufw     mm0, mm3, 0xff
1259    pshufw     mm1, mm3, 0xaa
1260    lea         r2, [r1+r3*2]
1261    pshufw     mm2, mm3, 0x55
1262    pshufw     mm3, mm3, 0x00
1263    pshufw     mm4, mm7, 0xff
1264    pshufw     mm5, mm7, 0xaa
1265    pshufw     mm6, mm7, 0x55
1266    pshufw     mm7, mm7, 0x00
1267    movq [r0+r3*1], mm0
1268    movq [r0+r3*2], mm1
1269    movq [r1+r3*1], mm2
1270    movq [r1+r3*2], mm3
1271    movq [r2+r3*1], mm4
1272    movq [r2+r3*2], mm5
1273    lea         r0, [r2+r3*2]
1274    movq [r0+r3*1], mm6
1275    movq [r0+r3*2], mm7
1276    RET
1277%endmacro
1278
1279INIT_MMX mmxext
1280PRED8x8L_HORIZONTAL
1281INIT_MMX ssse3
1282PRED8x8L_HORIZONTAL
1283
1284;-----------------------------------------------------------------------------
1285; void ff_pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright,
1286;                             ptrdiff_t stride)
1287;-----------------------------------------------------------------------------
1288
1289%macro PRED8x8L_VERTICAL 0
1290cglobal pred8x8l_vertical_8, 4,4
1291    sub          r0, r3
1292    movq        mm0, [r0-8]
1293    movq        mm3, [r0]
1294    movq        mm1, [r0+8]
1295    movq        mm2, mm3
1296    movq        mm4, mm3
1297    PALIGNR     mm2, mm0, 7, mm0
1298    PALIGNR     mm1, mm4, 1, mm4
1299    test        r1d, r1d ; top_left
1300    jz .fix_lt_2
1301    test        r2d, r2d ; top_right
1302    jz .fix_tr_1
1303    jmp .body
1304.fix_lt_2:
1305    movq        mm5, mm3
1306    pxor        mm5, mm2
1307    psllq       mm5, 56
1308    psrlq       mm5, 56
1309    pxor        mm2, mm5
1310    test        r2d, r2d ; top_right
1311    jnz .body
1312.fix_tr_1:
1313    movq        mm5, mm3
1314    pxor        mm5, mm1
1315    psrlq       mm5, 56
1316    psllq       mm5, 56
1317    pxor        mm1, mm5
1318.body:
1319    PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
1320%rep 3
1321    movq [r0+r3*1], mm0
1322    movq [r0+r3*2], mm0
1323    lea    r0, [r0+r3*2]
1324%endrep
1325    movq [r0+r3*1], mm0
1326    movq [r0+r3*2], mm0
1327    RET
1328%endmacro
1329
1330INIT_MMX mmxext
1331PRED8x8L_VERTICAL
1332INIT_MMX ssse3
1333PRED8x8L_VERTICAL
1334
1335;-----------------------------------------------------------------------------
1336; void ff_pred8x8l_down_left_8(uint8_t *src, int has_topleft,
1337;                              int has_topright, ptrdiff_t stride)
1338;-----------------------------------------------------------------------------
1339
1340INIT_MMX mmxext
1341cglobal pred8x8l_down_left_8, 4,5
1342    sub          r0, r3
1343    movq        mm0, [r0-8]
1344    movq        mm3, [r0]
1345    movq        mm1, [r0+8]
1346    movq        mm2, mm3
1347    movq        mm4, mm3
1348    PALIGNR     mm2, mm0, 7, mm0
1349    PALIGNR     mm1, mm4, 1, mm4
1350    test        r1d, r1d
1351    jz .fix_lt_2
1352    test        r2d, r2d
1353    jz .fix_tr_1
1354    jmp .do_top
1355.fix_lt_2:
1356    movq        mm5, mm3
1357    pxor        mm5, mm2
1358    psllq       mm5, 56
1359    psrlq       mm5, 56
1360    pxor        mm2, mm5
1361    test        r2d, r2d
1362    jnz .do_top
1363.fix_tr_1:
1364    movq        mm5, mm3
1365    pxor        mm5, mm1
1366    psrlq       mm5, 56
1367    psllq       mm5, 56
1368    pxor        mm1, mm5
1369    jmp .do_top
1370.fix_tr_2:
1371    punpckhbw   mm3, mm3
1372    pshufw      mm1, mm3, 0xFF
1373    jmp .do_topright
1374.do_top:
1375    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1376    movq        mm7, mm4
1377    test        r2d, r2d
1378    jz .fix_tr_2
1379    movq        mm0, [r0+8]
1380    movq        mm5, mm0
1381    movq        mm2, mm0
1382    movq        mm4, mm0
1383    psrlq       mm5, 56
1384    PALIGNR     mm2, mm3, 7, mm3
1385    PALIGNR     mm5, mm4, 1, mm4
1386    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1387.do_topright:
1388    lea          r1, [r0+r3*2]
1389    movq        mm6, mm1
1390    psrlq       mm1, 56
1391    movq        mm4, mm1
1392    lea          r2, [r1+r3*2]
1393    movq        mm2, mm6
1394    PALIGNR     mm2, mm7, 1, mm0
1395    movq        mm3, mm6
1396    PALIGNR     mm3, mm7, 7, mm0
1397    PALIGNR     mm4, mm6, 1, mm0
1398    movq        mm5, mm7
1399    movq        mm1, mm7
1400    movq        mm7, mm6
1401    lea          r4, [r2+r3*2]
1402    psllq       mm1, 8
1403    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1404    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1405    movq  [r4+r3*2], mm1
1406    movq        mm2, mm0
1407    psllq       mm1, 8
1408    psrlq       mm2, 56
1409    psllq       mm0, 8
1410    por         mm1, mm2
1411    movq  [r4+r3*1], mm1
1412    movq        mm2, mm0
1413    psllq       mm1, 8
1414    psrlq       mm2, 56
1415    psllq       mm0, 8
1416    por         mm1, mm2
1417    movq  [r2+r3*2], mm1
1418    movq        mm2, mm0
1419    psllq       mm1, 8
1420    psrlq       mm2, 56
1421    psllq       mm0, 8
1422    por         mm1, mm2
1423    movq  [r2+r3*1], mm1
1424    movq        mm2, mm0
1425    psllq       mm1, 8
1426    psrlq       mm2, 56
1427    psllq       mm0, 8
1428    por         mm1, mm2
1429    movq  [r1+r3*2], mm1
1430    movq        mm2, mm0
1431    psllq       mm1, 8
1432    psrlq       mm2, 56
1433    psllq       mm0, 8
1434    por         mm1, mm2
1435    movq  [r1+r3*1], mm1
1436    movq        mm2, mm0
1437    psllq       mm1, 8
1438    psrlq       mm2, 56
1439    psllq       mm0, 8
1440    por         mm1, mm2
1441    movq  [r0+r3*2], mm1
1442    psllq       mm1, 8
1443    psrlq       mm0, 56
1444    por         mm1, mm0
1445    movq  [r0+r3*1], mm1
1446    RET
1447
1448%macro PRED8x8L_DOWN_LEFT 0
1449cglobal pred8x8l_down_left_8, 4,4
1450    sub          r0, r3
1451    movq        mm0, [r0-8]
1452    movq        mm3, [r0]
1453    movq        mm1, [r0+8]
1454    movq        mm2, mm3
1455    movq        mm4, mm3
1456    PALIGNR     mm2, mm0, 7, mm0
1457    PALIGNR     mm1, mm4, 1, mm4
1458    test        r1d, r1d ; top_left
1459    jz .fix_lt_2
1460    test        r2d, r2d ; top_right
1461    jz .fix_tr_1
1462    jmp .do_top
1463.fix_lt_2:
1464    movq        mm5, mm3
1465    pxor        mm5, mm2
1466    psllq       mm5, 56
1467    psrlq       mm5, 56
1468    pxor        mm2, mm5
1469    test        r2d, r2d ; top_right
1470    jnz .do_top
1471.fix_tr_1:
1472    movq        mm5, mm3
1473    pxor        mm5, mm1
1474    psrlq       mm5, 56
1475    psllq       mm5, 56
1476    pxor        mm1, mm5
1477    jmp .do_top
1478.fix_tr_2:
1479    punpckhbw   mm3, mm3
1480    pshufw      mm1, mm3, 0xFF
1481    jmp .do_topright
1482.do_top:
1483    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1484    movq2dq    xmm3, mm4
1485    test        r2d, r2d ; top_right
1486    jz .fix_tr_2
1487    movq        mm0, [r0+8]
1488    movq        mm5, mm0
1489    movq        mm2, mm0
1490    movq        mm4, mm0
1491    psrlq       mm5, 56
1492    PALIGNR     mm2, mm3, 7, mm3
1493    PALIGNR     mm5, mm4, 1, mm4
1494    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
1495.do_topright:
1496    movq2dq    xmm4, mm1
1497    psrlq       mm1, 56
1498    movq2dq    xmm5, mm1
1499    lea         r1, [r0+r3*2]
1500    pslldq    xmm4, 8
1501    por       xmm3, xmm4
1502    movdqa    xmm2, xmm3
1503    psrldq    xmm2, 1
1504    pslldq    xmm5, 15
1505    por       xmm2, xmm5
1506    lea         r2, [r1+r3*2]
1507    movdqa    xmm1, xmm3
1508    pslldq    xmm1, 1
1509INIT_XMM cpuname
1510    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1511    psrldq    xmm0, 1
1512    movq [r0+r3*1], xmm0
1513    psrldq    xmm0, 1
1514    movq [r0+r3*2], xmm0
1515    psrldq    xmm0, 1
1516    lea         r0, [r2+r3*2]
1517    movq [r1+r3*1], xmm0
1518    psrldq    xmm0, 1
1519    movq [r1+r3*2], xmm0
1520    psrldq    xmm0, 1
1521    movq [r2+r3*1], xmm0
1522    psrldq    xmm0, 1
1523    movq [r2+r3*2], xmm0
1524    psrldq    xmm0, 1
1525    movq [r0+r3*1], xmm0
1526    psrldq    xmm0, 1
1527    movq [r0+r3*2], xmm0
1528    RET
1529%endmacro
1530
1531INIT_MMX sse2
1532PRED8x8L_DOWN_LEFT
1533INIT_MMX ssse3
1534PRED8x8L_DOWN_LEFT
1535
1536;-----------------------------------------------------------------------------
1537; void ff_pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft,
1538;                                      int has_topright, ptrdiff_t stride)
1539;-----------------------------------------------------------------------------
1540
1541INIT_MMX mmxext
1542cglobal pred8x8l_down_right_8, 4,5
1543    sub          r0, r3
1544    lea          r4, [r0+r3*2]
1545    movq        mm0, [r0+r3*1-8]
1546    punpckhbw   mm0, [r0+r3*0-8]
1547    movq        mm1, [r4+r3*1-8]
1548    punpckhbw   mm1, [r0+r3*2-8]
1549    mov          r4, r0
1550    punpckhwd   mm1, mm0
1551    lea          r0, [r0+r3*4]
1552    movq        mm2, [r0+r3*1-8]
1553    punpckhbw   mm2, [r0+r3*0-8]
1554    lea          r0, [r0+r3*2]
1555    movq        mm3, [r0+r3*1-8]
1556    punpckhbw   mm3, [r0+r3*0-8]
1557    punpckhwd   mm3, mm2
1558    punpckhdq   mm3, mm1
1559    lea          r0, [r0+r3*2]
1560    movq        mm0, [r0+r3*0-8]
1561    movq        mm1, [r4]
1562    mov          r0, r4
1563    movq        mm4, mm3
1564    movq        mm2, mm3
1565    PALIGNR     mm4, mm0, 7, mm0
1566    PALIGNR     mm1, mm2, 1, mm2
1567    test        r1d, r1d ; top_left
1568    jz .fix_lt_1
1569.do_left:
1570    movq        mm0, mm4
1571    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1572    movq        mm4, mm0
1573    movq        mm7, mm2
1574    movq        mm6, mm2
1575    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1576    psllq       mm1, 56
1577    PALIGNR     mm7, mm1, 7, mm3
1578    movq        mm0, [r0-8]
1579    movq        mm3, [r0]
1580    movq        mm1, [r0+8]
1581    movq        mm2, mm3
1582    movq        mm4, mm3
1583    PALIGNR     mm2, mm0, 7, mm0
1584    PALIGNR     mm1, mm4, 1, mm4
1585    test        r1d, r1d ; top_left
1586    jz .fix_lt_2
1587    test        r2d, r2d ; top_right
1588    jz .fix_tr_1
1589.do_top:
1590    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1591    movq        mm5, mm4
1592    jmp .body
1593.fix_lt_1:
1594    movq        mm5, mm3
1595    pxor        mm5, mm4
1596    psrlq       mm5, 56
1597    psllq       mm5, 48
1598    pxor        mm1, mm5
1599    jmp .do_left
1600.fix_lt_2:
1601    movq        mm5, mm3
1602    pxor        mm5, mm2
1603    psllq       mm5, 56
1604    psrlq       mm5, 56
1605    pxor        mm2, mm5
1606    test        r2d, r2d ; top_right
1607    jnz .do_top
1608.fix_tr_1:
1609    movq        mm5, mm3
1610    pxor        mm5, mm1
1611    psrlq       mm5, 56
1612    psllq       mm5, 56
1613    pxor        mm1, mm5
1614    jmp .do_top
1615.body:
1616    lea         r1, [r0+r3*2]
1617    movq       mm1, mm7
1618    movq       mm7, mm5
1619    movq       mm5, mm6
1620    movq       mm2, mm7
1621    lea         r2, [r1+r3*2]
1622    PALIGNR    mm2, mm6, 1, mm0
1623    movq       mm3, mm7
1624    PALIGNR    mm3, mm6, 7, mm0
1625    movq       mm4, mm7
1626    lea         r4, [r2+r3*2]
1627    psrlq      mm4, 8
1628    PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
1629    PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
1630    movq [r4+r3*2], mm0
1631    movq       mm2, mm1
1632    psrlq      mm0, 8
1633    psllq      mm2, 56
1634    psrlq      mm1, 8
1635    por        mm0, mm2
1636    movq [r4+r3*1], mm0
1637    movq       mm2, mm1
1638    psrlq      mm0, 8
1639    psllq      mm2, 56
1640    psrlq      mm1, 8
1641    por        mm0, mm2
1642    movq [r2+r3*2], mm0
1643    movq       mm2, mm1
1644    psrlq      mm0, 8
1645    psllq      mm2, 56
1646    psrlq      mm1, 8
1647    por        mm0, mm2
1648    movq [r2+r3*1], mm0
1649    movq       mm2, mm1
1650    psrlq      mm0, 8
1651    psllq      mm2, 56
1652    psrlq      mm1, 8
1653    por        mm0, mm2
1654    movq [r1+r3*2], mm0
1655    movq       mm2, mm1
1656    psrlq      mm0, 8
1657    psllq      mm2, 56
1658    psrlq      mm1, 8
1659    por        mm0, mm2
1660    movq [r1+r3*1], mm0
1661    movq       mm2, mm1
1662    psrlq      mm0, 8
1663    psllq      mm2, 56
1664    psrlq      mm1, 8
1665    por        mm0, mm2
1666    movq [r0+r3*2], mm0
1667    psrlq      mm0, 8
1668    psllq      mm1, 56
1669    por        mm0, mm1
1670    movq [r0+r3*1], mm0
1671    RET
1672
1673%macro PRED8x8L_DOWN_RIGHT 0
1674cglobal pred8x8l_down_right_8, 4,5
1675    sub          r0, r3
1676    lea          r4, [r0+r3*2]
1677    movq        mm0, [r0+r3*1-8]
1678    punpckhbw   mm0, [r0+r3*0-8]
1679    movq        mm1, [r4+r3*1-8]
1680    punpckhbw   mm1, [r0+r3*2-8]
1681    mov          r4, r0
1682    punpckhwd   mm1, mm0
1683    lea          r0, [r0+r3*4]
1684    movq        mm2, [r0+r3*1-8]
1685    punpckhbw   mm2, [r0+r3*0-8]
1686    lea          r0, [r0+r3*2]
1687    movq        mm3, [r0+r3*1-8]
1688    punpckhbw   mm3, [r0+r3*0-8]
1689    punpckhwd   mm3, mm2
1690    punpckhdq   mm3, mm1
1691    lea          r0, [r0+r3*2]
1692    movq        mm0, [r0+r3*0-8]
1693    movq        mm1, [r4]
1694    mov          r0, r4
1695    movq        mm4, mm3
1696    movq        mm2, mm3
1697    PALIGNR     mm4, mm0, 7, mm0
1698    PALIGNR     mm1, mm2, 1, mm2
1699    test        r1d, r1d
1700    jz .fix_lt_1
1701    jmp .do_left
1702.fix_lt_1:
1703    movq        mm5, mm3
1704    pxor        mm5, mm4
1705    psrlq       mm5, 56
1706    psllq       mm5, 48
1707    pxor        mm1, mm5
1708    jmp .do_left
1709.fix_lt_2:
1710    movq        mm5, mm3
1711    pxor        mm5, mm2
1712    psllq       mm5, 56
1713    psrlq       mm5, 56
1714    pxor        mm2, mm5
1715    test        r2d, r2d
1716    jnz .do_top
1717.fix_tr_1:
1718    movq        mm5, mm3
1719    pxor        mm5, mm1
1720    psrlq       mm5, 56
1721    psllq       mm5, 56
1722    pxor        mm1, mm5
1723    jmp .do_top
1724.do_left:
1725    movq        mm0, mm4
1726    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1727    movq        mm4, mm0
1728    movq        mm7, mm2
1729    movq2dq    xmm3, mm2
1730    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
1731    psllq       mm1, 56
1732    PALIGNR     mm7, mm1, 7, mm3
1733    movq2dq    xmm1, mm7
1734    movq        mm0, [r0-8]
1735    movq        mm3, [r0]
1736    movq        mm1, [r0+8]
1737    movq        mm2, mm3
1738    movq        mm4, mm3
1739    PALIGNR     mm2, mm0, 7, mm0
1740    PALIGNR     mm1, mm4, 1, mm4
1741    test        r1d, r1d
1742    jz .fix_lt_2
1743    test        r2d, r2d
1744    jz .fix_tr_1
1745.do_top:
1746    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
1747    movq2dq   xmm4, mm4
1748    lea         r1, [r0+r3*2]
1749    movdqa    xmm0, xmm3
1750    pslldq    xmm4, 8
1751    por       xmm3, xmm4
1752    lea         r2, [r1+r3*2]
1753    pslldq    xmm4, 1
1754    por       xmm1, xmm4
1755    psrldq    xmm0, 7
1756    pslldq    xmm0, 15
1757    psrldq    xmm0, 7
1758    por       xmm1, xmm0
1759    lea         r0, [r2+r3*2]
1760    movdqa    xmm2, xmm3
1761    psrldq    xmm2, 1
1762INIT_XMM cpuname
1763    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
1764    movdqa    xmm1, xmm0
1765    psrldq    xmm1, 1
1766    movq [r0+r3*2], xmm0
1767    movq [r0+r3*1], xmm1
1768    psrldq    xmm0, 2
1769    psrldq    xmm1, 2
1770    movq [r2+r3*2], xmm0
1771    movq [r2+r3*1], xmm1
1772    psrldq    xmm0, 2
1773    psrldq    xmm1, 2
1774    movq [r1+r3*2], xmm0
1775    movq [r1+r3*1], xmm1
1776    psrldq    xmm0, 2
1777    psrldq    xmm1, 2
1778    movq [r4+r3*2], xmm0
1779    movq [r4+r3*1], xmm1
1780    RET
1781%endmacro
1782
1783INIT_MMX sse2
1784PRED8x8L_DOWN_RIGHT
1785INIT_MMX ssse3
1786PRED8x8L_DOWN_RIGHT
1787
1788;-----------------------------------------------------------------------------
1789; void ff_pred8x8l_vertical_right_8(uint8_t *src, int has_topleft,
1790;                                   int has_topright, ptrdiff_t stride)
1791;-----------------------------------------------------------------------------
1792
1793INIT_MMX mmxext
1794cglobal pred8x8l_vertical_right_8, 4,5
1795    sub          r0, r3
1796    lea          r4, [r0+r3*2]
1797    movq        mm0, [r0+r3*1-8]
1798    punpckhbw   mm0, [r0+r3*0-8]
1799    movq        mm1, [r4+r3*1-8]
1800    punpckhbw   mm1, [r0+r3*2-8]
1801    mov          r4, r0
1802    punpckhwd   mm1, mm0
1803    lea          r0, [r0+r3*4]
1804    movq        mm2, [r0+r3*1-8]
1805    punpckhbw   mm2, [r0+r3*0-8]
1806    lea          r0, [r0+r3*2]
1807    movq        mm3, [r0+r3*1-8]
1808    punpckhbw   mm3, [r0+r3*0-8]
1809    punpckhwd   mm3, mm2
1810    punpckhdq   mm3, mm1
1811    lea          r0, [r0+r3*2]
1812    movq        mm0, [r0+r3*0-8]
1813    movq        mm1, [r4]
1814    mov          r0, r4
1815    movq        mm4, mm3
1816    movq        mm2, mm3
1817    PALIGNR     mm4, mm0, 7, mm0
1818    PALIGNR     mm1, mm2, 1, mm2
1819    test        r1d, r1d
1820    jz .fix_lt_1
1821    jmp .do_left
1822.fix_lt_1:
1823    movq        mm5, mm3
1824    pxor        mm5, mm4
1825    psrlq       mm5, 56
1826    psllq       mm5, 48
1827    pxor        mm1, mm5
1828    jmp .do_left
1829.fix_lt_2:
1830    movq        mm5, mm3
1831    pxor        mm5, mm2
1832    psllq       mm5, 56
1833    psrlq       mm5, 56
1834    pxor        mm2, mm5
1835    test        r2d, r2d
1836    jnz .do_top
1837.fix_tr_1:
1838    movq        mm5, mm3
1839    pxor        mm5, mm1
1840    psrlq       mm5, 56
1841    psllq       mm5, 56
1842    pxor        mm1, mm5
1843    jmp .do_top
1844.do_left:
1845    movq        mm0, mm4
1846    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1847    movq        mm7, mm2
1848    movq        mm0, [r0-8]
1849    movq        mm3, [r0]
1850    movq        mm1, [r0+8]
1851    movq        mm2, mm3
1852    movq        mm4, mm3
1853    PALIGNR     mm2, mm0, 7, mm0
1854    PALIGNR     mm1, mm4, 1, mm4
1855    test        r1d, r1d
1856    jz .fix_lt_2
1857    test        r2d, r2d
1858    jz .fix_tr_1
1859.do_top:
1860    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1861    lea         r1, [r0+r3*2]
1862    movq       mm2, mm6
1863    movq       mm3, mm6
1864    PALIGNR    mm3, mm7, 7, mm0
1865    PALIGNR    mm6, mm7, 6, mm1
1866    movq       mm4, mm3
1867    pavgb      mm3, mm2
1868    lea         r2, [r1+r3*2]
1869    PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
1870    movq [r0+r3*1], mm3
1871    movq [r0+r3*2], mm0
1872    movq       mm5, mm0
1873    movq       mm6, mm3
1874    movq       mm1, mm7
1875    movq       mm2, mm1
1876    psllq      mm2, 8
1877    movq       mm3, mm1
1878    psllq      mm3, 16
1879    lea         r4, [r2+r3*2]
1880    PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
1881    PALIGNR    mm6, mm0, 7, mm2
1882    movq [r1+r3*1], mm6
1883    psllq      mm0, 8
1884    PALIGNR    mm5, mm0, 7, mm1
1885    movq [r1+r3*2], mm5
1886    psllq      mm0, 8
1887    PALIGNR    mm6, mm0, 7, mm2
1888    movq [r2+r3*1], mm6
1889    psllq      mm0, 8
1890    PALIGNR    mm5, mm0, 7, mm1
1891    movq [r2+r3*2], mm5
1892    psllq      mm0, 8
1893    PALIGNR    mm6, mm0, 7, mm2
1894    movq [r4+r3*1], mm6
1895    psllq      mm0, 8
1896    PALIGNR    mm5, mm0, 7, mm1
1897    movq [r4+r3*2], mm5
1898    RET
1899
1900%macro PRED8x8L_VERTICAL_RIGHT 0
1901cglobal pred8x8l_vertical_right_8, 4,5,7
1902    ; manually spill XMM registers for Win64 because
1903    ; the code here is initialized with INIT_MMX
1904    WIN64_SPILL_XMM 7
1905    sub          r0, r3
1906    lea          r4, [r0+r3*2]
1907    movq        mm0, [r0+r3*1-8]
1908    punpckhbw   mm0, [r0+r3*0-8]
1909    movq        mm1, [r4+r3*1-8]
1910    punpckhbw   mm1, [r0+r3*2-8]
1911    mov          r4, r0
1912    punpckhwd   mm1, mm0
1913    lea          r0, [r0+r3*4]
1914    movq        mm2, [r0+r3*1-8]
1915    punpckhbw   mm2, [r0+r3*0-8]
1916    lea          r0, [r0+r3*2]
1917    movq        mm3, [r0+r3*1-8]
1918    punpckhbw   mm3, [r0+r3*0-8]
1919    punpckhwd   mm3, mm2
1920    punpckhdq   mm3, mm1
1921    lea          r0, [r0+r3*2]
1922    movq        mm0, [r0+r3*0-8]
1923    movq        mm1, [r4]
1924    mov          r0, r4
1925    movq        mm4, mm3
1926    movq        mm2, mm3
1927    PALIGNR     mm4, mm0, 7, mm0
1928    PALIGNR     mm1, mm2, 1, mm2
1929    test        r1d, r1d
1930    jnz .do_left
1931.fix_lt_1:
1932    movq        mm5, mm3
1933    pxor        mm5, mm4
1934    psrlq       mm5, 56
1935    psllq       mm5, 48
1936    pxor        mm1, mm5
1937    jmp .do_left
1938.fix_lt_2:
1939    movq        mm5, mm3
1940    pxor        mm5, mm2
1941    psllq       mm5, 56
1942    psrlq       mm5, 56
1943    pxor        mm2, mm5
1944    test        r2d, r2d
1945    jnz .do_top
1946.fix_tr_1:
1947    movq        mm5, mm3
1948    pxor        mm5, mm1
1949    psrlq       mm5, 56
1950    psllq       mm5, 56
1951    pxor        mm1, mm5
1952    jmp .do_top
1953.do_left:
1954    movq        mm0, mm4
1955    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
1956    movq2dq    xmm0, mm2
1957    movq        mm0, [r0-8]
1958    movq        mm3, [r0]
1959    movq        mm1, [r0+8]
1960    movq        mm2, mm3
1961    movq        mm4, mm3
1962    PALIGNR     mm2, mm0, 7, mm0
1963    PALIGNR     mm1, mm4, 1, mm4
1964    test        r1d, r1d
1965    jz .fix_lt_2
1966    test        r2d, r2d
1967    jz .fix_tr_1
1968.do_top:
1969    PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
1970    lea           r1, [r0+r3*2]
1971    movq2dq     xmm4, mm6
1972    pslldq      xmm4, 8
1973    por         xmm0, xmm4
1974    movdqa      xmm6, [pw_ff00]
1975    movdqa      xmm1, xmm0
1976    lea           r2, [r1+r3*2]
1977    movdqa      xmm2, xmm0
1978    movdqa      xmm3, xmm0
1979    pslldq      xmm0, 1
1980    pslldq      xmm1, 2
1981    pavgb       xmm2, xmm0
1982INIT_XMM cpuname
1983    PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
1984    pandn       xmm6, xmm4
1985    movdqa      xmm5, xmm4
1986    psrlw       xmm4, 8
1987    packuswb    xmm6, xmm4
1988    movhlps     xmm4, xmm6
1989    movhps [r0+r3*2], xmm5
1990    movhps [r0+r3*1], xmm2
1991    psrldq      xmm5, 4
1992    movss       xmm5, xmm6
1993    psrldq      xmm2, 4
1994    movss       xmm2, xmm4
1995    lea           r0, [r2+r3*2]
1996    psrldq      xmm5, 1
1997    psrldq      xmm2, 1
1998    movq        [r0+r3*2], xmm5
1999    movq        [r0+r3*1], xmm2
2000    psrldq      xmm5, 1
2001    psrldq      xmm2, 1
2002    movq        [r2+r3*2], xmm5
2003    movq        [r2+r3*1], xmm2
2004    psrldq      xmm5, 1
2005    psrldq      xmm2, 1
2006    movq        [r1+r3*2], xmm5
2007    movq        [r1+r3*1], xmm2
2008    RET
2009%endmacro
2010
2011INIT_MMX sse2
2012PRED8x8L_VERTICAL_RIGHT
2013INIT_MMX ssse3
2014PRED8x8L_VERTICAL_RIGHT
2015
2016;-----------------------------------------------------------------------------
2017; void ff_pred8x8l_vertical_left_8(uint8_t *src, int has_topleft,
2018;                                  int has_topright, ptrdiff_t stride)
2019;-----------------------------------------------------------------------------
2020
2021%macro PRED8x8L_VERTICAL_LEFT 0
2022cglobal pred8x8l_vertical_left_8, 4,4
2023    sub          r0, r3
2024    movq        mm0, [r0-8]
2025    movq        mm3, [r0]
2026    movq        mm1, [r0+8]
2027    movq        mm2, mm3
2028    movq        mm4, mm3
2029    PALIGNR     mm2, mm0, 7, mm0
2030    PALIGNR     mm1, mm4, 1, mm4
2031    test        r1d, r1d
2032    jz .fix_lt_2
2033    test        r2d, r2d
2034    jz .fix_tr_1
2035    jmp .do_top
2036.fix_lt_2:
2037    movq        mm5, mm3
2038    pxor        mm5, mm2
2039    psllq       mm5, 56
2040    psrlq       mm5, 56
2041    pxor        mm2, mm5
2042    test        r2d, r2d
2043    jnz .do_top
2044.fix_tr_1:
2045    movq        mm5, mm3
2046    pxor        mm5, mm1
2047    psrlq       mm5, 56
2048    psllq       mm5, 56
2049    pxor        mm1, mm5
2050    jmp .do_top
2051.fix_tr_2:
2052    punpckhbw   mm3, mm3
2053    pshufw      mm1, mm3, 0xFF
2054    jmp .do_topright
2055.do_top:
2056    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2057    movq2dq    xmm4, mm4
2058    test        r2d, r2d
2059    jz .fix_tr_2
2060    movq        mm0, [r0+8]
2061    movq        mm5, mm0
2062    movq        mm2, mm0
2063    movq        mm4, mm0
2064    psrlq       mm5, 56
2065    PALIGNR     mm2, mm3, 7, mm3
2066    PALIGNR     mm5, mm4, 1, mm4
2067    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2068.do_topright:
2069    movq2dq   xmm3, mm1
2070    lea         r1, [r0+r3*2]
2071    pslldq    xmm3, 8
2072    por       xmm4, xmm3
2073    movdqa    xmm2, xmm4
2074    movdqa    xmm1, xmm4
2075    movdqa    xmm3, xmm4
2076    psrldq    xmm2, 1
2077    pslldq    xmm1, 1
2078    pavgb     xmm3, xmm2
2079    lea         r2, [r1+r3*2]
2080INIT_XMM cpuname
2081    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
2082    psrldq    xmm0, 1
2083    movq [r0+r3*1], xmm3
2084    movq [r0+r3*2], xmm0
2085    lea         r0, [r2+r3*2]
2086    psrldq    xmm3, 1
2087    psrldq    xmm0, 1
2088    movq [r1+r3*1], xmm3
2089    movq [r1+r3*2], xmm0
2090    psrldq    xmm3, 1
2091    psrldq    xmm0, 1
2092    movq [r2+r3*1], xmm3
2093    movq [r2+r3*2], xmm0
2094    psrldq    xmm3, 1
2095    psrldq    xmm0, 1
2096    movq [r0+r3*1], xmm3
2097    movq [r0+r3*2], xmm0
2098    RET
2099%endmacro
2100
2101INIT_MMX sse2
2102PRED8x8L_VERTICAL_LEFT
2103INIT_MMX ssse3
2104PRED8x8L_VERTICAL_LEFT
2105
2106;-----------------------------------------------------------------------------
2107; void ff_pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft,
2108;                                  int has_topright, ptrdiff_t stride)
2109;-----------------------------------------------------------------------------
2110
2111%macro PRED8x8L_HORIZONTAL_UP 0
2112cglobal pred8x8l_horizontal_up_8, 4,4
2113    sub          r0, r3
2114    lea          r2, [r0+r3*2]
2115    movq        mm0, [r0+r3*1-8]
2116    test        r1d, r1d
2117    lea          r1, [r0+r3]
2118    cmovnz       r1, r0
2119    punpckhbw   mm0, [r1+r3*0-8]
2120    movq        mm1, [r2+r3*1-8]
2121    punpckhbw   mm1, [r0+r3*2-8]
2122    mov          r2, r0
2123    punpckhwd   mm1, mm0
2124    lea          r0, [r0+r3*4]
2125    movq        mm2, [r0+r3*1-8]
2126    punpckhbw   mm2, [r0+r3*0-8]
2127    lea          r0, [r0+r3*2]
2128    movq        mm3, [r0+r3*1-8]
2129    punpckhbw   mm3, [r0+r3*0-8]
2130    punpckhwd   mm3, mm2
2131    punpckhdq   mm3, mm1
2132    lea          r0, [r0+r3*2]
2133    movq        mm0, [r0+r3*0-8]
2134    movq        mm1, [r1+r3*0-8]
2135    mov          r0, r2
2136    movq        mm4, mm3
2137    movq        mm2, mm3
2138    PALIGNR     mm4, mm0, 7, mm0
2139    PALIGNR     mm1, mm2, 1, mm2
2140    movq       mm0, mm4
2141    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2142    movq       mm4, mm0
2143    movq       mm7, mm2
2144    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2145    psllq      mm1, 56
2146    PALIGNR    mm7, mm1, 7, mm3
2147    lea         r1, [r0+r3*2]
2148    pshufw     mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
2149    psllq      mm7, 56             ; l7 .. .. .. .. .. .. ..
2150    movq       mm2, mm0
2151    psllw      mm0, 8
2152    psrlw      mm2, 8
2153    por        mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
2154    movq       mm3, mm2
2155    movq       mm4, mm2
2156    movq       mm5, mm2
2157    psrlq      mm2, 8
2158    psrlq      mm3, 16
2159    lea         r2, [r1+r3*2]
2160    por        mm2, mm7            ; l7 l7 l6 l5 l4 l3 l2 l1
2161    punpckhbw  mm7, mm7
2162    por        mm3, mm7            ; l7 l7 l7 l6 l5 l4 l3 l2
2163    pavgb      mm4, mm2
2164    PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
2165    movq       mm5, mm4
2166    punpcklbw  mm4, mm1            ; p4 p3 p2 p1
2167    punpckhbw  mm5, mm1            ; p8 p7 p6 p5
2168    movq       mm6, mm5
2169    movq       mm7, mm5
2170    movq       mm0, mm5
2171    PALIGNR    mm5, mm4, 2, mm1
2172    pshufw     mm1, mm6, 11111001b
2173    PALIGNR    mm6, mm4, 4, mm2
2174    pshufw     mm2, mm7, 11111110b
2175    PALIGNR    mm7, mm4, 6, mm3
2176    pshufw     mm3, mm0, 11111111b
2177    movq [r0+r3*1], mm4
2178    movq [r0+r3*2], mm5
2179    lea         r0, [r2+r3*2]
2180    movq [r1+r3*1], mm6
2181    movq [r1+r3*2], mm7
2182    movq [r2+r3*1], mm0
2183    movq [r2+r3*2], mm1
2184    movq [r0+r3*1], mm2
2185    movq [r0+r3*2], mm3
2186    RET
2187%endmacro
2188
2189INIT_MMX mmxext
2190PRED8x8L_HORIZONTAL_UP
2191INIT_MMX ssse3
2192PRED8x8L_HORIZONTAL_UP
2193
2194;-----------------------------------------------------------------------------
2195; void ff_pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft,
2196;                                    int has_topright, ptrdiff_t stride)
2197;-----------------------------------------------------------------------------
2198
2199INIT_MMX mmxext
2200cglobal pred8x8l_horizontal_down_8, 4,5
2201    sub          r0, r3
2202    lea          r4, [r0+r3*2]
2203    movq        mm0, [r0+r3*1-8]
2204    punpckhbw   mm0, [r0+r3*0-8]
2205    movq        mm1, [r4+r3*1-8]
2206    punpckhbw   mm1, [r0+r3*2-8]
2207    mov          r4, r0
2208    punpckhwd   mm1, mm0
2209    lea          r0, [r0+r3*4]
2210    movq        mm2, [r0+r3*1-8]
2211    punpckhbw   mm2, [r0+r3*0-8]
2212    lea          r0, [r0+r3*2]
2213    movq        mm3, [r0+r3*1-8]
2214    punpckhbw   mm3, [r0+r3*0-8]
2215    punpckhwd   mm3, mm2
2216    punpckhdq   mm3, mm1
2217    lea          r0, [r0+r3*2]
2218    movq        mm0, [r0+r3*0-8]
2219    movq        mm1, [r4]
2220    mov          r0, r4
2221    movq        mm4, mm3
2222    movq        mm2, mm3
2223    PALIGNR     mm4, mm0, 7, mm0
2224    PALIGNR     mm1, mm2, 1, mm2
2225    test        r1d, r1d
2226    jnz .do_left
2227.fix_lt_1:
2228    movq        mm5, mm3
2229    pxor        mm5, mm4
2230    psrlq       mm5, 56
2231    psllq       mm5, 48
2232    pxor        mm1, mm5
2233    jmp .do_left
2234.fix_lt_2:
2235    movq        mm5, mm3
2236    pxor        mm5, mm2
2237    psllq       mm5, 56
2238    psrlq       mm5, 56
2239    pxor        mm2, mm5
2240    test        r2d, r2d
2241    jnz .do_top
2242.fix_tr_1:
2243    movq        mm5, mm3
2244    pxor        mm5, mm1
2245    psrlq       mm5, 56
2246    psllq       mm5, 56
2247    pxor        mm1, mm5
2248    jmp .do_top
2249.do_left:
2250    movq        mm0, mm4
2251    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2252    movq        mm4, mm0
2253    movq        mm7, mm2
2254    movq        mm6, mm2
2255    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2256    psllq       mm1, 56
2257    PALIGNR     mm7, mm1, 7, mm3
2258    movq        mm0, [r0-8]
2259    movq        mm3, [r0]
2260    movq        mm1, [r0+8]
2261    movq        mm2, mm3
2262    movq        mm4, mm3
2263    PALIGNR     mm2, mm0, 7, mm0
2264    PALIGNR     mm1, mm4, 1, mm4
2265    test        r1d, r1d
2266    jz .fix_lt_2
2267    test        r2d, r2d
2268    jz .fix_tr_1
2269.do_top:
2270    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2271    movq       mm5, mm4
2272    lea         r1, [r0+r3*2]
2273    psllq      mm7, 56
2274    movq       mm2, mm5
2275    movq       mm3, mm6
2276    movq       mm4, mm2
2277    PALIGNR    mm2, mm6, 7, mm5
2278    PALIGNR    mm6, mm7, 7, mm0
2279    lea         r2, [r1+r3*2]
2280    PALIGNR    mm4, mm3, 1, mm7
2281    movq       mm5, mm3
2282    pavgb      mm3, mm6
2283    PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
2284    movq       mm4, mm2
2285    movq       mm1, mm2
2286    lea         r4, [r2+r3*2]
2287    psrlq      mm4, 16
2288    psrlq      mm1, 8
2289    PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
2290    movq       mm7, mm3
2291    punpcklbw  mm3, mm0
2292    punpckhbw  mm7, mm0
2293    movq       mm1, mm7
2294    movq       mm0, mm7
2295    movq       mm4, mm7
2296    movq [r4+r3*2], mm3
2297    PALIGNR    mm7, mm3, 2, mm5
2298    movq [r4+r3*1], mm7
2299    PALIGNR    mm1, mm3, 4, mm5
2300    movq [r2+r3*2], mm1
2301    PALIGNR    mm0, mm3, 6, mm3
2302    movq [r2+r3*1], mm0
2303    movq       mm2, mm6
2304    movq       mm3, mm6
2305    movq [r1+r3*2], mm4
2306    PALIGNR    mm6, mm4, 2, mm5
2307    movq [r1+r3*1], mm6
2308    PALIGNR    mm2, mm4, 4, mm5
2309    movq [r0+r3*2], mm2
2310    PALIGNR    mm3, mm4, 6, mm4
2311    movq [r0+r3*1], mm3
2312    RET
2313
2314%macro PRED8x8L_HORIZONTAL_DOWN 0
2315cglobal pred8x8l_horizontal_down_8, 4,5
2316    sub          r0, r3
2317    lea          r4, [r0+r3*2]
2318    movq        mm0, [r0+r3*1-8]
2319    punpckhbw   mm0, [r0+r3*0-8]
2320    movq        mm1, [r4+r3*1-8]
2321    punpckhbw   mm1, [r0+r3*2-8]
2322    mov          r4, r0
2323    punpckhwd   mm1, mm0
2324    lea          r0, [r0+r3*4]
2325    movq        mm2, [r0+r3*1-8]
2326    punpckhbw   mm2, [r0+r3*0-8]
2327    lea          r0, [r0+r3*2]
2328    movq        mm3, [r0+r3*1-8]
2329    punpckhbw   mm3, [r0+r3*0-8]
2330    punpckhwd   mm3, mm2
2331    punpckhdq   mm3, mm1
2332    lea          r0, [r0+r3*2]
2333    movq        mm0, [r0+r3*0-8]
2334    movq        mm1, [r4]
2335    mov          r0, r4
2336    movq        mm4, mm3
2337    movq        mm2, mm3
2338    PALIGNR     mm4, mm0, 7, mm0
2339    PALIGNR     mm1, mm2, 1, mm2
2340    test        r1d, r1d
2341    jnz .do_left
2342.fix_lt_1:
2343    movq        mm5, mm3
2344    pxor        mm5, mm4
2345    psrlq       mm5, 56
2346    psllq       mm5, 48
2347    pxor        mm1, mm5
2348    jmp .do_left
2349.fix_lt_2:
2350    movq        mm5, mm3
2351    pxor        mm5, mm2
2352    psllq       mm5, 56
2353    psrlq       mm5, 56
2354    pxor        mm2, mm5
2355    test        r2d, r2d
2356    jnz .do_top
2357.fix_tr_1:
2358    movq        mm5, mm3
2359    pxor        mm5, mm1
2360    psrlq       mm5, 56
2361    psllq       mm5, 56
2362    pxor        mm1, mm5
2363    jmp .do_top
2364.fix_tr_2:
2365    punpckhbw   mm3, mm3
2366    pshufw      mm1, mm3, 0xFF
2367    jmp .do_topright
2368.do_left:
2369    movq        mm0, mm4
2370    PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
2371    movq2dq    xmm0, mm2
2372    pslldq     xmm0, 8
2373    movq        mm4, mm0
2374    PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
2375    movq2dq    xmm2, mm1
2376    pslldq     xmm2, 15
2377    psrldq     xmm2, 8
2378    por        xmm0, xmm2
2379    movq        mm0, [r0-8]
2380    movq        mm3, [r0]
2381    movq        mm1, [r0+8]
2382    movq        mm2, mm3
2383    movq        mm4, mm3
2384    PALIGNR     mm2, mm0, 7, mm0
2385    PALIGNR     mm1, mm4, 1, mm4
2386    test        r1d, r1d
2387    jz .fix_lt_2
2388    test        r2d, r2d
2389    jz .fix_tr_1
2390.do_top:
2391    PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
2392    movq2dq    xmm1, mm4
2393    test        r2d, r2d
2394    jz .fix_tr_2
2395    movq        mm0, [r0+8]
2396    movq        mm5, mm0
2397    movq        mm2, mm0
2398    movq        mm4, mm0
2399    psrlq       mm5, 56
2400    PALIGNR     mm2, mm3, 7, mm3
2401    PALIGNR     mm5, mm4, 1, mm4
2402    PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
2403.do_topright:
2404    movq2dq    xmm5, mm1
2405    pslldq     xmm5, 8
2406    por        xmm1, xmm5
2407INIT_XMM cpuname
2408    lea         r2, [r4+r3*2]
2409    movdqa    xmm2, xmm1
2410    movdqa    xmm3, xmm1
2411    PALIGNR   xmm1, xmm0, 7, xmm4
2412    PALIGNR   xmm2, xmm0, 9, xmm5
2413    lea         r1, [r2+r3*2]
2414    PALIGNR   xmm3, xmm0, 8, xmm0
2415    movdqa    xmm4, xmm1
2416    pavgb     xmm4, xmm3
2417    lea         r0, [r1+r3*2]
2418    PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
2419    punpcklbw xmm4, xmm0
2420    movhlps   xmm0, xmm4
2421    movq   [r0+r3*2], xmm4
2422    movq   [r2+r3*2], xmm0
2423    psrldq xmm4, 2
2424    psrldq xmm0, 2
2425    movq   [r0+r3*1], xmm4
2426    movq   [r2+r3*1], xmm0
2427    psrldq xmm4, 2
2428    psrldq xmm0, 2
2429    movq   [r1+r3*2], xmm4
2430    movq   [r4+r3*2], xmm0
2431    psrldq xmm4, 2
2432    psrldq xmm0, 2
2433    movq   [r1+r3*1], xmm4
2434    movq   [r4+r3*1], xmm0
2435    RET
2436%endmacro
2437
2438INIT_MMX sse2
2439PRED8x8L_HORIZONTAL_DOWN
2440INIT_MMX ssse3
2441PRED8x8L_HORIZONTAL_DOWN
2442
2443;-------------------------------------------------------------------------------
2444; void ff_pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright,
2445;                             ptrdiff_t stride)
2446;-------------------------------------------------------------------------------
2447
2448INIT_MMX mmxext
2449cglobal pred4x4_dc_8, 3,5
2450    pxor   mm7, mm7
2451    mov     r4, r0
2452    sub     r0, r2
2453    movd   mm0, [r0]
2454    psadbw mm0, mm7
2455    movzx  r1d, byte [r0+r2*1-1]
2456    movd   r3d, mm0
2457    add    r3d, r1d
2458    movzx  r1d, byte [r0+r2*2-1]
2459    lea     r0, [r0+r2*2]
2460    add    r3d, r1d
2461    movzx  r1d, byte [r0+r2*1-1]
2462    add    r3d, r1d
2463    movzx  r1d, byte [r0+r2*2-1]
2464    add    r3d, r1d
2465    add    r3d, 4
2466    shr    r3d, 3
2467    imul   r3d, 0x01010101
2468    mov   [r4+r2*0], r3d
2469    mov   [r0+r2*0], r3d
2470    mov   [r0+r2*1], r3d
2471    mov   [r0+r2*2], r3d
2472    RET
2473
2474;-----------------------------------------------------------------------------
2475; void ff_pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
2476;                                 ptrdiff_t stride)
2477;-----------------------------------------------------------------------------
2478
2479%macro PRED4x4_TM 0
2480cglobal pred4x4_tm_vp8_8, 3,6
2481    sub        r0, r2
2482    pxor      mm7, mm7
2483    movd      mm0, [r0]
2484    punpcklbw mm0, mm7
2485    movzx     r4d, byte [r0-1]
2486    mov       r5d, 2
2487.loop:
2488    movzx     r1d, byte [r0+r2*1-1]
2489    movzx     r3d, byte [r0+r2*2-1]
2490    sub       r1d, r4d
2491    sub       r3d, r4d
2492    movd      mm2, r1d
2493    movd      mm4, r3d
2494%if cpuflag(mmxext)
2495    pshufw    mm2, mm2, 0
2496    pshufw    mm4, mm4, 0
2497%else
2498    punpcklwd mm2, mm2
2499    punpcklwd mm4, mm4
2500    punpckldq mm2, mm2
2501    punpckldq mm4, mm4
2502%endif
2503    paddw     mm2, mm0
2504    paddw     mm4, mm0
2505    packuswb  mm2, mm2
2506    packuswb  mm4, mm4
2507    movd [r0+r2*1], mm2
2508    movd [r0+r2*2], mm4
2509    lea        r0, [r0+r2*2]
2510    dec       r5d
2511    jg .loop
2512    REP_RET
2513%endmacro
2514
2515INIT_MMX mmx
2516PRED4x4_TM
2517INIT_MMX mmxext
2518PRED4x4_TM
2519
2520INIT_XMM ssse3
2521cglobal pred4x4_tm_vp8_8, 3,3
2522    sub         r0, r2
2523    movq       mm6, [tm_shuf]
2524    pxor       mm1, mm1
2525    movd       mm0, [r0]
2526    punpcklbw  mm0, mm1
2527    movd       mm7, [r0-4]
2528    pshufb     mm7, mm6
2529    lea         r1, [r0+r2*2]
2530    movd       mm2, [r0+r2*1-4]
2531    movd       mm3, [r0+r2*2-4]
2532    movd       mm4, [r1+r2*1-4]
2533    movd       mm5, [r1+r2*2-4]
2534    pshufb     mm2, mm6
2535    pshufb     mm3, mm6
2536    pshufb     mm4, mm6
2537    pshufb     mm5, mm6
2538    psubw      mm0, mm7
2539    paddw      mm2, mm0
2540    paddw      mm3, mm0
2541    paddw      mm4, mm0
2542    paddw      mm5, mm0
2543    packuswb   mm2, mm2
2544    packuswb   mm3, mm3
2545    packuswb   mm4, mm4
2546    packuswb   mm5, mm5
2547    movd [r0+r2*1], mm2
2548    movd [r0+r2*2], mm3
2549    movd [r1+r2*1], mm4
2550    movd [r1+r2*2], mm5
2551    RET
2552
2553;-----------------------------------------------------------------------------
2554; void ff_pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright,
2555;                                       ptrdiff_t stride)
2556;-----------------------------------------------------------------------------
2557
2558INIT_MMX mmxext
2559cglobal pred4x4_vertical_vp8_8, 3,3
2560    sub       r0, r2
2561    movd      m1, [r0-1]
2562    movd      m0, [r0]
2563    mova      m2, m0   ;t0 t1 t2 t3
2564    punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
2565    lea       r1, [r0+r2*2]
2566    psrlq     m0, 8    ;t1 t2 t3 t4
2567    PRED4x4_LOWPASS m3, m1, m0, m2, m4
2568    movd [r0+r2*1], m3
2569    movd [r0+r2*2], m3
2570    movd [r1+r2*1], m3
2571    movd [r1+r2*2], m3
2572    RET
2573
2574;-----------------------------------------------------------------------------
2575; void ff_pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright,
2576;                                    ptrdiff_t stride)
2577;-----------------------------------------------------------------------------
2578INIT_MMX mmxext
2579cglobal pred4x4_down_left_8, 3,3
2580    sub       r0, r2
2581    movq      m1, [r0]
2582    punpckldq m1, [r1]
2583    movq      m2, m1
2584    movq      m3, m1
2585    psllq     m1, 8
2586    pxor      m2, m1
2587    psrlq     m2, 8
2588    pxor      m2, m3
2589    PRED4x4_LOWPASS m0, m1, m2, m3, m4
2590    lea       r1, [r0+r2*2]
2591    psrlq     m0, 8
2592    movd      [r0+r2*1], m0
2593    psrlq     m0, 8
2594    movd      [r0+r2*2], m0
2595    psrlq     m0, 8
2596    movd      [r1+r2*1], m0
2597    psrlq     m0, 8
2598    movd      [r1+r2*2], m0
2599    RET
2600
2601;------------------------------------------------------------------------------
2602; void ff_pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright,
2603;                                        ptrdiff_t stride)
2604;------------------------------------------------------------------------------
2605
2606INIT_MMX mmxext
2607cglobal pred4x4_vertical_left_8, 3,3
2608    sub       r0, r2
2609    movq      m1, [r0]
2610    punpckldq m1, [r1]
2611    movq      m3, m1
2612    movq      m2, m1
2613    psrlq     m3, 8
2614    psrlq     m2, 16
2615    movq      m4, m3
2616    pavgb     m4, m1
2617    PRED4x4_LOWPASS m0, m1, m2, m3, m5
2618    lea       r1, [r0+r2*2]
2619    movh      [r0+r2*1], m4
2620    movh      [r0+r2*2], m0
2621    psrlq     m4, 8
2622    psrlq     m0, 8
2623    movh      [r1+r2*1], m4
2624    movh      [r1+r2*2], m0
2625    RET
2626
2627;------------------------------------------------------------------------------
2628; void ff_pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright,
2629;                                        ptrdiff_t stride)
2630;------------------------------------------------------------------------------
2631
2632INIT_MMX mmxext
2633cglobal pred4x4_horizontal_up_8, 3,3
2634    sub       r0, r2
2635    lea       r1, [r0+r2*2]
2636    movd      m0, [r0+r2*1-4]
2637    punpcklbw m0, [r0+r2*2-4]
2638    movd      m1, [r1+r2*1-4]
2639    punpcklbw m1, [r1+r2*2-4]
2640    punpckhwd m0, m1
2641    movq      m1, m0
2642    punpckhbw m1, m1
2643    pshufw    m1, m1, 0xFF
2644    punpckhdq m0, m1
2645    movq      m2, m0
2646    movq      m3, m0
2647    movq      m7, m0
2648    psrlq     m2, 16
2649    psrlq     m3, 8
2650    pavgb     m7, m3
2651    PRED4x4_LOWPASS m4, m0, m2, m3, m5
2652    punpcklbw m7, m4
2653    movd    [r0+r2*1], m7
2654    psrlq    m7, 16
2655    movd    [r0+r2*2], m7
2656    psrlq    m7, 16
2657    movd    [r1+r2*1], m7
2658    movd    [r1+r2*2], m1
2659    RET
2660
2661;------------------------------------------------------------------------------
2662; void ff_pred4x4_horizontal_down_8_mmxext(uint8_t *src,
2663;                                          const uint8_t *topright,
2664;                                          ptrdiff_t stride)
2665;------------------------------------------------------------------------------
2666
2667INIT_MMX mmxext
2668cglobal pred4x4_horizontal_down_8, 3,3
2669    sub       r0, r2
2670    lea       r1, [r0+r2*2]
2671    movh      m0, [r0-4]      ; lt ..
2672    punpckldq m0, [r0]        ; t3 t2 t1 t0 lt .. .. ..
2673    psllq     m0, 8           ; t2 t1 t0 lt .. .. .. ..
2674    movd      m1, [r1+r2*2-4] ; l3
2675    punpcklbw m1, [r1+r2*1-4] ; l2 l3
2676    movd      m2, [r0+r2*2-4] ; l1
2677    punpcklbw m2, [r0+r2*1-4] ; l0 l1
2678    punpckhwd m1, m2          ; l0 l1 l2 l3
2679    punpckhdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
2680    movq      m0, m1
2681    movq      m2, m1
2682    movq      m5, m1
2683    psrlq     m0, 16          ; .. .. t2 t1 t0 lt l0 l1
2684    psrlq     m2, 8           ; .. t2 t1 t0 lt l0 l1 l2
2685    pavgb     m5, m2
2686    PRED4x4_LOWPASS m3, m1, m0, m2, m4
2687    punpcklbw m5, m3
2688    psrlq     m3, 32
2689    PALIGNR   m3, m5, 6, m4
2690    movh      [r1+r2*2], m5
2691    psrlq     m5, 16
2692    movh      [r1+r2*1], m5
2693    psrlq     m5, 16
2694    movh      [r0+r2*2], m5
2695    movh      [r0+r2*1], m3
2696    RET
2697
2698;-----------------------------------------------------------------------------
2699; void ff_pred4x4_vertical_right_8_mmxext(uint8_t *src,
2700;                                         const uint8_t *topright,
2701;                                         ptrdiff_t stride)
2702;-----------------------------------------------------------------------------
2703
2704INIT_MMX mmxext
2705cglobal pred4x4_vertical_right_8, 3,3
2706    sub     r0, r2
2707    lea     r1, [r0+r2*2]
2708    movh    m0, [r0]                    ; ........t3t2t1t0
2709    movq    m5, m0
2710    PALIGNR m0, [r0-8], 7, m1           ; ......t3t2t1t0lt
2711    pavgb   m5, m0
2712    PALIGNR m0, [r0+r2*1-8], 7, m1      ; ....t3t2t1t0ltl0
2713    movq    m1, m0
2714    PALIGNR m0, [r0+r2*2-8], 7, m2      ; ..t3t2t1t0ltl0l1
2715    movq    m2, m0
2716    PALIGNR m0, [r1+r2*1-8], 7, m3      ; t3t2t1t0ltl0l1l2
2717    PRED4x4_LOWPASS m3, m1, m0, m2, m4
2718    movq    m1, m3
2719    psrlq   m3, 16
2720    psllq   m1, 48
2721    movh    [r0+r2*1], m5
2722    movh    [r0+r2*2], m3
2723    PALIGNR m5, m1, 7, m2
2724    psllq   m1, 8
2725    movh    [r1+r2*1], m5
2726    PALIGNR m3, m1, 7, m1
2727    movh    [r1+r2*2], m3
2728    RET
2729
2730;-----------------------------------------------------------------------------
2731; void ff_pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright,
2732;                                     ptrdiff_t stride)
2733;-----------------------------------------------------------------------------
2734
2735INIT_MMX mmxext
2736cglobal pred4x4_down_right_8, 3,3
2737    sub       r0, r2
2738    lea       r1, [r0+r2*2]
2739    movq      m1, [r1-8]
2740    movq      m2, [r0+r2*1-8]
2741    punpckhbw m2, [r0-8]
2742    movh      m3, [r0]
2743    punpckhwd m1, m2
2744    PALIGNR   m3, m1, 5, m1
2745    movq      m1, m3
2746    PALIGNR   m3, [r1+r2*1-8], 7, m4
2747    movq      m2, m3
2748    PALIGNR   m3, [r1+r2*2-8], 7, m4
2749    PRED4x4_LOWPASS m0, m3, m1, m2, m4
2750    movh      [r1+r2*2], m0
2751    psrlq     m0, 8
2752    movh      [r1+r2*1], m0
2753    psrlq     m0, 8
2754    movh      [r0+r2*2], m0
2755    psrlq     m0, 8
2756    movh      [r0+r2*1], m0
2757    RET
2758