1;*****************************************************************************
2;* predict-a.asm: x86 intra prediction
3;*****************************************************************************
4;* Copyright (C) 2005-2021 x264 project
5;*
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;*          Holger Lubitz <holger@lubitz.org>
8;*          Fiona Glaser <fiona@x264.com>
9;*          Henrik Gramner <henrik@gramner.com>
10;*
11;* This program is free software; you can redistribute it and/or modify
12;* it under the terms of the GNU General Public License as published by
13;* the Free Software Foundation; either version 2 of the License, or
14;* (at your option) any later version.
15;*
16;* This program is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19;* GNU General Public License for more details.
20;*
21;* You should have received a copy of the GNU General Public License
22;* along with this program; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24;*
25;* This program is also available under a commercial proprietary license.
26;* For more information, contact us at licensing@x264.com.
27;*****************************************************************************
28
29%include "x86inc.asm"
30%include "x86util.asm"
31
SECTION_RODATA 32

; Constant tables for the intra-prediction routines.
pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 ; word ramp; not referenced in this chunk (presumably plane prediction) -- TODO confirm
pw_m3:       times 16 dw -3 ; not referenced in this chunk -- TODO confirm use
pw_m7:       times 16 dw -7 ; not referenced in this chunk -- TODO confirm use
pb_00s_ff:   times 8 db 0
pb_0s_ff:    times 7 db 0
             db 0xff
shuf_fixtr:  db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 ; pshufb mask: replicate byte 7 over the high half (synthesize missing top-right)
shuf_nop:    db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; identity mask; reached as shuf_fixtr+16 when top-right exists
shuf_hu:     db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0 ; not referenced in this chunk -- TODO confirm use
shuf_vr:     db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7 ; not referenced in this chunk -- TODO confirm use
pw_reverse:  db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 ; pshufb mask: reverse the order of 8 words
45
46SECTION .text
47
48cextern pb_0
49cextern pb_1
50cextern pb_3
51cextern pw_1
52cextern pw_2
53cextern pw_4
54cextern pw_8
55cextern pw_16
56cextern pw_00ff
57cextern pw_pixel_max
58cextern pw_0to15
59
; STORE8 reg
; Store one register to 8 consecutive rows starting at r0.
; Note: leaves r0 advanced by 4*FDEC_STRIDEB; STORE16 below relies on that.
%macro STORE8 1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    add  r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    mova [r0+2*FDEC_STRIDEB], %1
    mova [r0+3*FDEC_STRIDEB], %1
%endmacro
71
; STORE16 reg1 [, reg2 [, reg3, reg4]]
; Fill 16 rows starting at r0 with the given register(s) laid out side by side.
; Multi-register forms loop 2 rows at a time and clobber r1d as the counter;
; the 1-register form is fully unrolled (STORE8 plus 8 more rows).
; In all forms r0 is advanced past the stored rows.
%macro STORE16 1-4
%if %0 > 1
    mov  r1d, 2*%0                     ; loop count: 8 iterations of 2 rows (x2) or 4 (x4)
.loop:
    mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
    mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
    mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
    mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
%ifidn %0, 4
    mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
    mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
    mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
    mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
    add  r0, 2*FDEC_STRIDEB
%else ; %0 == 2
    add  r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
    mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
    mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
    mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
%endif
    dec  r1d
    jg .loop
%else ; %0 == 1
    STORE8 %1                          ; rows 0-7; leaves r0 4 rows in
%if HIGH_BIT_DEPTH ; Different code paths to reduce code size
    add  r0, 6*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    add  r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
%else
    add  r0, 8*FDEC_STRIDE
    mova [r0-4*FDEC_STRIDE], %1
    mova [r0-3*FDEC_STRIDE], %1
    mova [r0-2*FDEC_STRIDE], %1
    mova [r0-1*FDEC_STRIDE], %1
    mova [r0+0*FDEC_STRIDE], %1
    mova [r0+1*FDEC_STRIDE], %1
    mova [r0+2*FDEC_STRIDE], %1
    mova [r0+3*FDEC_STRIDE], %1
%endif ; HIGH_BIT_DEPTH
%endif
%endmacro
121
%macro PRED_H_LOAD 2 ; reg, offset
; Broadcast the left-neighbor pixel of row (r0 + offset*stride) into every
; lane of %1.  The 8-bit non-avx2 path clobbers m2 as SPLATB_LOAD scratch.
%if cpuflag(avx2)
    vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
%elif HIGH_BIT_DEPTH
    movd           %1, [r0+(%2)*FDEC_STRIDEB-4]   ; load 2 words; wanted pixel is word 1
    SPLATW         %1, %1, 1
%else
    SPLATB_LOAD    %1, r0+(%2)*FDEC_STRIDE-1, m2
%endif
%endmacro
132
%macro PRED_H_STORE 3 ; reg, offset, width
; Store one predicted row of %3 pixels at (r0 + offset*stride).
; An 8-byte row uses movq; wider rows are written in mmsize chunks.
%assign %%w %3*SIZEOF_PIXEL
%if %%w == 8
    movq [r0+(%2)*FDEC_STRIDEB], %1
%else
    %assign %%i 0
    %rep %%w/mmsize
        mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
    %assign %%i %%i+mmsize
    %endrep
%endif
%endmacro
145
%macro PRED_H_4ROWS 2 ; width, inc_ptr
; Emit 4 rows of horizontal prediction (each row = broadcast of its left
; neighbor).  If inc_ptr, r0 is advanced by 4 rows partway through, so the
; last two rows use offsets adjusted by -4 relative to the new r0.
    PRED_H_LOAD  m0, 0
    PRED_H_LOAD  m1, 1
    PRED_H_STORE m0, 0, %1
    PRED_H_STORE m1, 1, %1
    PRED_H_LOAD  m0, 2
%if %2
    add          r0, 4*FDEC_STRIDEB
%endif
    PRED_H_LOAD  m1, 3-4*%2
    PRED_H_STORE m0, 2-4*%2, %1
    PRED_H_STORE m1, 3-4*%2, %1
%endmacro
159
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; %2/%3 hold the pixel-shifted neighbors, %4 the center pixels.
; HIGH_BIT_DEPTH: words have headroom, so avg(src, (l+r)>>1) is exact.
; 8-bit: pavgb rounds up, so subtract ((l^r)&1) first to get floor((l+r)/2)
; before the final pavgb; %5 is a required scratch register in this path.
%macro PRED8x8_LOWPASS 4-5
%if HIGH_BIT_DEPTH
    paddw       %2, %3
    psrlw       %2, 1                  ; (l+r)>>1
    pavgw       %1, %4, %2
%else
    mova        %5, %2
    pavgb       %2, %3                 ; (l+r+1)>>1
    pxor        %3, %5
    pand        %3, [pb_1]             ; (l^r)&1: rounding correction
    psubusb     %2, %3                 ; floor((l+r)/2)
    pavgb       %1, %4, %2
%endif
%endmacro
176
177;-----------------------------------------------------------------------------
178; void predict_4x4_h( pixel *src )
179;-----------------------------------------------------------------------------
180%if HIGH_BIT_DEPTH
181INIT_XMM avx2
182cglobal predict_4x4_h, 1,1
183    PRED_H_4ROWS 4, 0
184    RET
185%endif
186
187;-----------------------------------------------------------------------------
188; void predict_4x4_ddl( pixel *src )
189;-----------------------------------------------------------------------------
190%macro PREDICT_4x4_DDL 0
191cglobal predict_4x4_ddl, 1,1
192    movu    m1, [r0-FDEC_STRIDEB]
193    PSLLPIX m2, m1, 1
194    mova    m0, m1
195%if HIGH_BIT_DEPTH
196    PSRLPIX m1, m1, 1
197    pshufhw m1, m1, q2210
198%else
199    pxor    m1, m2
200    PSRLPIX m1, m1, 1
201    pxor    m1, m0
202%endif
203    PRED8x8_LOWPASS m0, m2, m1, m0, m3
204
205%assign Y 0
206%rep 4
207    PSRLPIX m0, m0, 1
208    movh   [r0+Y*FDEC_STRIDEB], m0
209%assign Y (Y+1)
210%endrep
211
212    RET
213%endmacro
214
215%if HIGH_BIT_DEPTH
216INIT_XMM sse2
217PREDICT_4x4_DDL
218INIT_XMM avx
219PREDICT_4x4_DDL
220INIT_MMX mmx2
221cglobal predict_4x4_ddl, 1,2
222    movu    m1, [r0-FDEC_STRIDEB+4]
223    PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
224    mova    m3, [r0-FDEC_STRIDEB+8]
225    mova    [r0+0*FDEC_STRIDEB], m0
226    pshufw  m4, m3, q3321
227    PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
228    mova    [r0+3*FDEC_STRIDEB], m2
229    pshufw  m1, m0, q0021
230    punpckldq m1, m2
231    mova    [r0+1*FDEC_STRIDEB], m1
232    psllq   m0, 16
233    PALIGNR m2, m0, 6, m0
234    mova    [r0+2*FDEC_STRIDEB], m2
235    RET
236%else ; !HIGH_BIT_DEPTH
237INIT_MMX mmx2
238PREDICT_4x4_DDL
239%endif
240
241;-----------------------------------------------------------------------------
242; void predict_4x4_vr( pixel *src )
243;-----------------------------------------------------------------------------
244%if HIGH_BIT_DEPTH == 0
245INIT_MMX ssse3
246cglobal predict_4x4_vr, 1,1
247    movd    m1, [r0-1*FDEC_STRIDEB]        ; ........t3t2t1t0
248    mova    m4, m1
249    palignr m1, [r0-1*FDEC_STRIDEB-8], 7   ; ......t3t2t1t0lt
250    pavgb   m4, m1
251    palignr m1, [r0+0*FDEC_STRIDEB-8], 7   ; ....t3t2t1t0ltl0
252    mova    m0, m1
253    palignr m1, [r0+1*FDEC_STRIDEB-8], 7   ; ..t3t2t1t0ltl0l1
254    mova    m2, m1
255    palignr m1, [r0+2*FDEC_STRIDEB-8], 7   ; t3t2t1t0ltl0l1l2
256    PRED8x8_LOWPASS m2, m0, m1, m2, m3
257    pshufw  m0, m2, 0
258    psrlq   m2, 16
259    movd    [r0+0*FDEC_STRIDEB], m4
260    palignr m4, m0, 7
261    movd    [r0+1*FDEC_STRIDEB], m2
262    psllq   m0, 8
263    movd    [r0+2*FDEC_STRIDEB], m4
264    palignr m2, m0, 7
265    movd    [r0+3*FDEC_STRIDEB], m2
266    RET
267%endif ; !HIGH_BIT_DEPTH
268
269;-----------------------------------------------------------------------------
270; void predict_4x4_ddr( pixel *src )
271;-----------------------------------------------------------------------------
272%macro PREDICT_4x4 4
273cglobal predict_4x4_ddr, 1,1
274%if HIGH_BIT_DEPTH
275    movu      m2, [r0-1*FDEC_STRIDEB-8]
276    pinsrw    m2, [r0+0*FDEC_STRIDEB-2], 2
277    pinsrw    m2, [r0+1*FDEC_STRIDEB-2], 1
278    pinsrw    m2, [r0+2*FDEC_STRIDEB-2], 0
279    movhps    m3, [r0+3*FDEC_STRIDEB-8]
280%else ; !HIGH_BIT_DEPTH
281    movd      m0, [r0+2*FDEC_STRIDEB-4]
282    movd      m1, [r0+0*FDEC_STRIDEB-4]
283    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
284    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
285    punpckhwd m0, m1
286    movd      m2, [r0-1*FDEC_STRIDEB]
287%if cpuflag(ssse3)
288    palignr   m2, m0, 4
289%else
290    psllq     m2, 32
291    punpckhdq m0, m2
292    SWAP       2, 0
293%endif
294    movd      m3, [r0+3*FDEC_STRIDEB-4]
295    psllq     m3, 32
296%endif ; !HIGH_BIT_DEPTH
297
298    PSRLPIX   m1, m2, 1
299    mova      m0, m2
300    PALIGNR   m2, m3, 7*SIZEOF_PIXEL, m3
301    PRED8x8_LOWPASS m0, m2, m1, m0, m3
302%assign Y 3
303    movh      [r0+Y*FDEC_STRIDEB], m0
304%rep 3
305%assign Y (Y-1)
306    PSRLPIX   m0, m0, 1
307    movh      [r0+Y*FDEC_STRIDEB], m0
308%endrep
309    RET
310
311;-----------------------------------------------------------------------------
312; void predict_4x4_vr( pixel *src )
313;-----------------------------------------------------------------------------
314cglobal predict_4x4_vr, 1,1
315%if HIGH_BIT_DEPTH
316    movu      m1, [r0-1*FDEC_STRIDEB-8]
317    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 2
318    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 1
319    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 0
320%else ; !HIGH_BIT_DEPTH
321    movd      m0, [r0+2*FDEC_STRIDEB-4]
322    movd      m1, [r0+0*FDEC_STRIDEB-4]
323    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
324    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
325    punpckhwd m0, m1
326    movd      m1, [r0-1*FDEC_STRIDEB]
327%if cpuflag(ssse3)
328    palignr   m1, m0, 4
329%else
330    psllq     m1, 32
331    punpckhdq m0, m1
332    SWAP       1, 0
333%endif
334%endif ; !HIGH_BIT_DEPTH
335    PSRLPIX   m2, m1, 1
336    PSRLPIX   m0, m1, 2
337    pavg%1    m4, m1, m2
338    PSRLPIX   m4, m4, 3
339    PRED8x8_LOWPASS m2, m0, m1, m2, m3
340    PSLLPIX   m0, m2, 6
341    PSRLPIX   m2, m2, 2
342    movh      [r0+0*FDEC_STRIDEB], m4
343    PALIGNR   m4, m0, 7*SIZEOF_PIXEL, m3
344    movh      [r0+1*FDEC_STRIDEB], m2
345    PSLLPIX   m0, m0, 1
346    movh      [r0+2*FDEC_STRIDEB], m4
347    PALIGNR   m2, m0, 7*SIZEOF_PIXEL, m0
348    movh      [r0+3*FDEC_STRIDEB], m2
349    RET
350
351;-----------------------------------------------------------------------------
352; void predict_4x4_hd( pixel *src )
353;-----------------------------------------------------------------------------
354cglobal predict_4x4_hd, 1,1
355%if HIGH_BIT_DEPTH
356    movu      m1, [r0-1*FDEC_STRIDEB-8]
357    PSLLPIX   m1, m1, 1
358    pinsrw    m1, [r0+0*FDEC_STRIDEB-2], 3
359    pinsrw    m1, [r0+1*FDEC_STRIDEB-2], 2
360    pinsrw    m1, [r0+2*FDEC_STRIDEB-2], 1
361    pinsrw    m1, [r0+3*FDEC_STRIDEB-2], 0
362%else
363    movd      m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
364    punpckldq m0, [r0-1*FDEC_STRIDEB]   ; t3 t2 t1 t0 lt .. .. ..
365    PSLLPIX   m0, m0, 1                 ; t2 t1 t0 lt .. .. .. ..
366    movd      m1, [r0+3*FDEC_STRIDEB-4] ; l3
367    punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
368    movd      m2, [r0+1*FDEC_STRIDEB-4] ; l1
369    punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
370    punpckh%3 m1, m2                    ; l0 l1 l2 l3
371    punpckh%4 m1, m0                    ; t2 t1 t0 lt l0 l1 l2 l3
372%endif
373    PSRLPIX   m2, m1, 1                 ; .. t2 t1 t0 lt l0 l1 l2
374    PSRLPIX   m0, m1, 2                 ; .. .. t2 t1 t0 lt l0 l1
375    pavg%1    m5, m1, m2
376    PRED8x8_LOWPASS m3, m1, m0, m2, m4
377    punpckl%2 m5, m3
378    PSRLPIX   m3, m3, 4
379    PALIGNR   m3, m5, 6*SIZEOF_PIXEL, m4
380%assign Y 3
381    movh      [r0+Y*FDEC_STRIDEB], m5
382%rep 2
383%assign Y (Y-1)
384    PSRLPIX   m5, m5, 2
385    movh      [r0+Y*FDEC_STRIDEB], m5
386%endrep
387    movh      [r0+0*FDEC_STRIDEB], m3
388    RET
389%endmacro ; PREDICT_4x4
390
391;-----------------------------------------------------------------------------
392; void predict_4x4_ddr( pixel *src )
393;-----------------------------------------------------------------------------
394%if HIGH_BIT_DEPTH
395INIT_MMX mmx2
396cglobal predict_4x4_ddr, 1,1
397    mova      m0, [r0+1*FDEC_STRIDEB-8]
398    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
399    mova      m3, [r0+3*FDEC_STRIDEB-8]
400    punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
401    punpckhdq m3, m0
402
403    pshufw  m0, m3, q3321
404    pinsrw  m0, [r0-1*FDEC_STRIDEB-2], 3
405    pshufw  m1, m0, q3321
406    PRED8x8_LOWPASS m0, m1, m3, m0
407    movq    [r0+3*FDEC_STRIDEB], m0
408
409    movq    m2, [r0-1*FDEC_STRIDEB-0]
410    pshufw  m4, m2, q2100
411    pinsrw  m4, [r0-1*FDEC_STRIDEB-2], 0
412    movq    m1, m4
413    PALIGNR m4, m3, 6, m3
414    PRED8x8_LOWPASS m1, m4, m2, m1
415    movq    [r0+0*FDEC_STRIDEB], m1
416
417    pshufw  m2, m0, q3321
418    punpckldq m2, m1
419    psllq   m0, 16
420    PALIGNR m1, m0, 6, m0
421    movq    [r0+1*FDEC_STRIDEB], m1
422    movq    [r0+2*FDEC_STRIDEB], m2
423    movd    [r0+3*FDEC_STRIDEB+4], m1
424    RET
425
426;-----------------------------------------------------------------------------
427; void predict_4x4_hd( pixel *src )
428;-----------------------------------------------------------------------------
429cglobal predict_4x4_hd, 1,1
430    mova      m0, [r0+1*FDEC_STRIDEB-8]
431    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
432    mova      m1, [r0+3*FDEC_STRIDEB-8]
433    punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
434    punpckhdq m1, m0
435    mova      m0, m1
436
437    movu      m3, [r0-1*FDEC_STRIDEB-2]
438    pshufw    m4, m1, q0032
439    mova      m7, m3
440    punpckldq m4, m3
441    PALIGNR   m3, m1, 2, m2
442    PRED8x8_LOWPASS m2, m4, m1, m3
443
444    pavgw     m0, m3
445    punpcklwd m5, m0, m2
446    punpckhwd m4, m0, m2
447    mova      [r0+3*FDEC_STRIDEB], m5
448    mova      [r0+1*FDEC_STRIDEB], m4
449    psrlq     m5, 32
450    punpckldq m5, m4
451    mova      [r0+2*FDEC_STRIDEB], m5
452
453    pshufw    m4, m7, q2100
454    mova      m6, [r0-1*FDEC_STRIDEB+0]
455    pinsrw    m4, [r0+0*FDEC_STRIDEB-2], 0
456    PRED8x8_LOWPASS m3, m4, m6, m7
457    PALIGNR   m3, m0, 6, m0
458    mova      [r0+0*FDEC_STRIDEB], m3
459    RET
460
461INIT_XMM sse2
462PREDICT_4x4 w, wd, dq, qdq
463INIT_XMM ssse3
464PREDICT_4x4 w, wd, dq, qdq
465INIT_XMM avx
466PREDICT_4x4 w, wd, dq, qdq
467%else ; !HIGH_BIT_DEPTH
468INIT_MMX mmx2
469PREDICT_4x4 b, bw, wd, dq
470INIT_MMX ssse3
471%define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3
472PREDICT_4x4 b, bw, wd, dq
473%endif
474
475;-----------------------------------------------------------------------------
476; void predict_4x4_hu( pixel *src )
477;-----------------------------------------------------------------------------
478%if HIGH_BIT_DEPTH
479INIT_MMX
480cglobal predict_4x4_hu_mmx2, 1,1
481    movq      m0, [r0+0*FDEC_STRIDEB-8]
482    punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
483    movq      m1, [r0+2*FDEC_STRIDEB-8]
484    punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
485    punpckhdq m0, m1
486    pshufw    m1, m1, q3333
487    movq      [r0+3*FDEC_STRIDEB], m1
488    pshufw    m3, m0, q3321
489    pshufw    m4, m0, q3332
490    pavgw     m2, m0, m3
491    PRED8x8_LOWPASS m3, m0, m4, m3
492    punpcklwd m4, m2, m3
493    mova      [r0+0*FDEC_STRIDEB], m4
494    psrlq     m2, 16
495    psrlq     m3, 16
496    punpcklwd m2, m3
497    mova      [r0+1*FDEC_STRIDEB], m2
498    punpckhdq m2, m1
499    mova      [r0+2*FDEC_STRIDEB], m2
500    RET
501
502%else ; !HIGH_BIT_DEPTH
503INIT_MMX
504cglobal predict_4x4_hu_mmx2, 1,1
505    movd      m1, [r0+0*FDEC_STRIDEB-4]
506    punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
507    movd      m0, [r0+2*FDEC_STRIDEB-4]
508    punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
509    punpckhwd m1, m0
510    movq      m0, m1
511    punpckhbw m1, m1
512    pshufw    m1, m1, q3333
513    punpckhdq m0, m1
514    movq      m2, m0
515    movq      m3, m0
516    movq      m5, m0
517    psrlq     m3, 8
518    psrlq     m2, 16
519    pavgb     m5, m3
520    PRED8x8_LOWPASS m3, m0, m2, m3, m4
521    movd      [r0+3*FDEC_STRIDEB], m1
522    punpcklbw m5, m3
523    movd      [r0+0*FDEC_STRIDEB], m5
524    psrlq     m5, 16
525    movd      [r0+1*FDEC_STRIDEB], m5
526    psrlq     m5, 16
527    movd      [r0+2*FDEC_STRIDEB], m5
528    RET
529%endif ; HIGH_BIT_DEPTH
530
531;-----------------------------------------------------------------------------
532; void predict_4x4_vl( pixel *src )
533;-----------------------------------------------------------------------------
534%macro PREDICT_4x4_V1 1
535cglobal predict_4x4_vl, 1,1
536    movu        m1, [r0-FDEC_STRIDEB]
537    PSRLPIX     m3, m1, 1
538    PSRLPIX     m2, m1, 2
539    pavg%1      m4, m3, m1
540    PRED8x8_LOWPASS m0, m1, m2, m3, m5
541
542    movh        [r0+0*FDEC_STRIDEB], m4
543    movh        [r0+1*FDEC_STRIDEB], m0
544    PSRLPIX     m4, m4, 1
545    PSRLPIX     m0, m0, 1
546    movh        [r0+2*FDEC_STRIDEB], m4
547    movh        [r0+3*FDEC_STRIDEB], m0
548    RET
549%endmacro
550
551%if HIGH_BIT_DEPTH
552INIT_XMM sse2
553PREDICT_4x4_V1 w
554INIT_XMM avx
555PREDICT_4x4_V1 w
556
557INIT_MMX mmx2
558cglobal predict_4x4_vl, 1,4
559    mova    m1, [r0-FDEC_STRIDEB+0]
560    mova    m2, [r0-FDEC_STRIDEB+8]
561    mova    m0, m2
562    PALIGNR m2, m1, 4, m4
563    PALIGNR m0, m1, 2, m4
564    mova    m3, m0
565    pavgw   m3, m1
566    mova    [r0+0*FDEC_STRIDEB], m3
567    psrlq   m3, 16
568    mova    [r0+2*FDEC_STRIDEB], m3
569    PRED8x8_LOWPASS m0, m1, m2, m0
570    mova    [r0+1*FDEC_STRIDEB], m0
571    psrlq   m0, 16
572    mova    [r0+3*FDEC_STRIDEB], m0
573
574    movzx   r1d, word [r0-FDEC_STRIDEB+ 8]
575    movzx   r2d, word [r0-FDEC_STRIDEB+10]
576    movzx   r3d, word [r0-FDEC_STRIDEB+12]
577    lea     r1d, [r1+r2+1]
578    add     r3d, r2d
579    lea     r3d, [r3+r1+1]
580    shr     r1d, 1
581    shr     r3d, 2
582    mov     [r0+2*FDEC_STRIDEB+6], r1w
583    mov     [r0+3*FDEC_STRIDEB+6], r3w
584    RET
585%else ; !HIGH_BIT_DEPTH
586INIT_MMX mmx2
587PREDICT_4x4_V1 b
588%endif
589
590;-----------------------------------------------------------------------------
591; void predict_4x4_dc( pixel *src )
592;-----------------------------------------------------------------------------
593INIT_MMX mmx2
594%if HIGH_BIT_DEPTH
595cglobal predict_4x4_dc, 1,1
596    mova   m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
597    paddw  m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
598    paddw  m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
599    paddw  m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
600    psrlq  m2, 48
601    mova   m0, [r0-FDEC_STRIDEB]
602    HADDW  m0, m1
603    paddw  m0, [pw_4]
604    paddw  m0, m2
605    psrlw  m0, 3
606    SPLATW m0, m0
607    mova   [r0+0*FDEC_STRIDEB], m0
608    mova   [r0+1*FDEC_STRIDEB], m0
609    mova   [r0+2*FDEC_STRIDEB], m0
610    mova   [r0+3*FDEC_STRIDEB], m0
611    RET
612
613%else ; !HIGH_BIT_DEPTH
614cglobal predict_4x4_dc, 1,4
615    pxor   mm7, mm7
616    movd   mm0, [r0-FDEC_STRIDEB]
617    psadbw mm0, mm7
618    movd   r3d, mm0
619    movzx  r1d, byte [r0-1]
620%assign Y 1
621%rep 3
622    movzx  r2d, byte [r0+FDEC_STRIDEB*Y-1]
623    add    r1d, r2d
624%assign Y Y+1
625%endrep
626    lea    r1d, [r1+r3+4]
627    shr    r1d, 3
628    imul   r1d, 0x01010101
629    mov   [r0+FDEC_STRIDEB*0], r1d
630    mov   [r0+FDEC_STRIDEB*1], r1d
631    mov   [r0+FDEC_STRIDEB*2], r1d
632    mov   [r0+FDEC_STRIDEB*3], r1d
633    RET
634%endif ; HIGH_BIT_DEPTH
635
%macro PREDICT_FILTER 4
;-----------------------------------------------------------------------------
;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
; Build the filtered edge[] array consumed by the 8x8 prediction functions.
; Store layout (by the offsets written below): edge+6..15 = top-left corner +
; left column, edge+16 = top row, edge+24 = top-right (non-ssse3 path),
; edge+32 = last top-right pixel.
; i_filters bits (r3): 1 = filter left, 2 = filter top, 4 = filter top-right.
; i_neighbor bits (r2): 8 = MB_TOPLEFT available, 4 = MB_TOPRIGHT available.
; %1..%4 = punpck size suffixes for the current pixel size.
cglobal predict_8x8_filter, 4,6,6
    add          r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%if ARCH_X86_64 == 0
    mov          r4, r1
%define t1 r4
%define t4 r1
%else
%define t1 r1
%define t4 r4
%endif
    test       r3b, 1
    je .check_top                       ; left filtering not requested
    ; t4 = (i_neighbor & MB_TOPLEFT) ? -8 : 0, so the first ladder load reads
    ; row -1 (the top-left pixel) when available, else duplicates row 0
    mov        t4d, r2d
    and        t4d, 8
    neg         t4
    ; punpck ladder: gather the 8 left-column pixels into one register
    mova        m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
    mova        m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m1, m0
    mova        m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m3, m2
    punpckh%3%4 m3, m1
    mova        m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m1, [src-1*FDEC_STRIDEB]
    PALIGNR     m4, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR     m1, m1, m3, 1*SIZEOF_PIXEL, m2
    PRED8x8_LOWPASS m3, m1, m4, m3, m5
    mova        [t1+8*SIZEOF_PIXEL], m3
    ; bottom-most left pixel filtered as (3*l7 + l6 + 2) >> 2, written twice
    movzx      t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    movzx      r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    lea        t4d, [t4*3+2]
    add        t4d, r5d
    shr        t4d, 2
    mov         [t1+7*SIZEOF_PIXEL], t4%1
    mov         [t1+6*SIZEOF_PIXEL], t4%1
    test       r3b, 2
    je .done                            ; top filtering not requested
.check_top:
%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
INIT_XMM cpuname                        ; 16 8-bit top pixels fit one xmm reg
    movu        m3, [src-1*FDEC_STRIDEB]
    movhps      m0, [src-1*FDEC_STRIDEB-8]
    test       r2b, 8
    je .fix_lt_2                        ; no top-left macroblock
.do_top:
    and        r2d, 4
%if ARCH_X86_64
    ; RIP-relative base load: indexed absolute addressing isn't PIC-safe
    lea         r3, [shuf_fixtr]
    pshufb      m3, [r3+r2*4]
%else
    pshufb      m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
%endif
    psrldq      m1, m3, 15
    PALIGNR     m2, m3, m0, 15, m0
    PALIGNR     m1, m3, 1, m5
    PRED8x8_LOWPASS m0, m2, m1, m3, m5
    mova        [t1+16*SIZEOF_PIXEL], m0
    psrldq      m0, 15
    movd        [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    pslldq      m0, m3, 15              ; substitute t0 for the missing top-left
    jmp .do_top

%else
    mova        m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova        m3, [src-1*FDEC_STRIDEB]
    mova        m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
    test       r2b, 8
    je .fix_lt_2                        ; no top-left macroblock
    test       r2b, 4
    je .fix_tr_1                        ; no top-right macroblock
.do_top:
    PALIGNR     m2, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR     m0, m1, m3, 1*SIZEOF_PIXEL, m5
    PRED8x8_LOWPASS m4, m2, m0, m3, m5
    mova        [t1+16*SIZEOF_PIXEL], m4
    test       r3b, 4
    je .done                            ; top-right filtering not requested
    PSRLPIX     m5, m1, 7
    PALIGNR     m2, m1, m3, 7*SIZEOF_PIXEL, m3
    PALIGNR     m5, m1, 1*SIZEOF_PIXEL, m4
    PRED8x8_LOWPASS m0, m2, m5, m1, m4
    mova        [t1+24*SIZEOF_PIXEL], m0
    PSRLPIX     m0, m0, 7
    movd        [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    PSLLPIX     m0, m3, 7               ; substitute t0 for the missing top-left
    test       r2b, 4
    jne .do_top
.fix_tr_1:
    punpckh%1%2 m1, m3, m3              ; replicate t7 as the fake top-right
    pshuf%2     m1, m1, q3333
    jmp .do_top
%endif
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_FILTER w, d, q, dq
INIT_XMM ssse3
PREDICT_FILTER w, d, q, dq
INIT_XMM avx
PREDICT_FILTER w, d, q, dq
%else
INIT_MMX mmx2
PREDICT_FILTER b, w, d, q
INIT_MMX ssse3
PREDICT_FILTER b, w, d, q
%endif
758
759;-----------------------------------------------------------------------------
760; void predict_8x8_v( pixel *src, pixel *edge )
761;-----------------------------------------------------------------------------
762%macro PREDICT_8x8_V 0
763cglobal predict_8x8_v, 2,2
764    mova        m0, [r1+16*SIZEOF_PIXEL]
765    STORE8      m0
766    RET
767%endmacro
768
769%if HIGH_BIT_DEPTH
770INIT_XMM sse
771PREDICT_8x8_V
772%else
773INIT_MMX mmx2
774PREDICT_8x8_V
775%endif
776
777;-----------------------------------------------------------------------------
778; void predict_8x8_h( pixel *src, pixel edge[36] )
779;-----------------------------------------------------------------------------
780%macro PREDICT_8x8_H 2
781cglobal predict_8x8_h, 2,2
782    movu      m1, [r1+7*SIZEOF_PIXEL]
783    add       r0, 4*FDEC_STRIDEB
784    punpckl%1 m2, m1, m1
785    punpckh%1 m1, m1
786%assign Y 0
787%rep 8
788%assign i 1+Y/4
789    SPLAT%2 m0, m %+ i, (3-Y)&3
790    mova [r0+(Y-4)*FDEC_STRIDEB], m0
791%assign Y Y+1
792%endrep
793    RET
794%endmacro
795
796%if HIGH_BIT_DEPTH
797INIT_XMM sse2
798PREDICT_8x8_H wd, D
799%else
800INIT_MMX mmx2
801PREDICT_8x8_H bw, W
802%endif
803
804;-----------------------------------------------------------------------------
805; void predict_8x8_dc( pixel *src, pixel *edge );
806;-----------------------------------------------------------------------------
807%if HIGH_BIT_DEPTH
808INIT_XMM sse2
809cglobal predict_8x8_dc, 2,2
810    movu        m0, [r1+14]
811    paddw       m0, [r1+32]
812    HADDW       m0, m1
813    paddw       m0, [pw_8]
814    psrlw       m0, 4
815    SPLATW      m0, m0
816    STORE8      m0
817    RET
818
819%else ; !HIGH_BIT_DEPTH
820INIT_MMX mmx2
821cglobal predict_8x8_dc, 2,2
822    pxor        mm0, mm0
823    pxor        mm1, mm1
824    psadbw      mm0, [r1+7]
825    psadbw      mm1, [r1+16]
826    paddw       mm0, [pw_8]
827    paddw       mm0, mm1
828    psrlw       mm0, 4
829    pshufw      mm0, mm0, 0
830    packuswb    mm0, mm0
831    STORE8      mm0
832    RET
833%endif ; HIGH_BIT_DEPTH
834
835;-----------------------------------------------------------------------------
836; void predict_8x8_dc_top ( pixel *src, pixel *edge );
837; void predict_8x8_dc_left( pixel *src, pixel *edge );
838;-----------------------------------------------------------------------------
839%if HIGH_BIT_DEPTH
840%macro PREDICT_8x8_DC 3
841cglobal %1, 2,2
842    %3          m0, [r1+%2]
843    HADDW       m0, m1
844    paddw       m0, [pw_4]
845    psrlw       m0, 3
846    SPLATW      m0, m0
847    STORE8      m0
848    RET
849%endmacro
850INIT_XMM sse2
851PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
852PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
853
854%else ; !HIGH_BIT_DEPTH
855%macro PREDICT_8x8_DC 2
856cglobal %1, 2,2
857    pxor        mm0, mm0
858    psadbw      mm0, [r1+%2]
859    paddw       mm0, [pw_4]
860    psrlw       mm0, 3
861    pshufw      mm0, mm0, 0
862    packuswb    mm0, mm0
863    STORE8      mm0
864    RET
865%endmacro
866INIT_MMX
867PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
868PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
869%endif ; HIGH_BIT_DEPTH
870
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
; size on the 8-bit mmx functions below if we know sse2 is available.
%macro PREDICT_8x8_DDLR 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
; Diagonal-down-left: lowpass the 16 top/top-right edge pixels (edge+16..32),
; then each row advances one pixel along the filtered diagonal.
cglobal predict_8x8_ddl, 2,2,7
    mova        m0, [r1+16*SIZEOF_PIXEL]   ; top
    mova        m1, [r1+24*SIZEOF_PIXEL]   ; top-right
%if cpuflag(cache64)
    ; aligned loads + palignr instead of unaligned loads (cachesplit-safe)
    movd        m5, [r1+32*SIZEOF_PIXEL]
    palignr     m3, m1, m0, 1*SIZEOF_PIXEL
    palignr     m5, m5, m1, 1*SIZEOF_PIXEL
    palignr     m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu        m3, [r1+17*SIZEOF_PIXEL]
    movu        m4, [r1+23*SIZEOF_PIXEL]
    movu        m5, [r1+25*SIZEOF_PIXEL]
%endif
    PSLLPIX     m2, m0, 1
    add         r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m0, m2, m3, m0, m6     ; filtered low half of the diagonal
    PRED8x8_LOWPASS m1, m4, m5, m1, m6     ; filtered high half
    mova        [r0+3*FDEC_STRIDEB], m1
%assign Y 2
%rep 6
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m2 ; shift one pixel from low to high half
    PSLLPIX     m0, m0, 1
    mova        [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y-1)
%endrep
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m0
    mova        [r0+Y*FDEC_STRIDEB], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
; Diagonal-down-right: same shifting scheme, but the 16-pixel diagonal is
; left column + top-left + top row (edge+7..23).
cglobal predict_8x8_ddr, 2,2,7
    add         r0, FDEC_STRIDEB*4
    mova        m0, [r1+ 8*SIZEOF_PIXEL]
    mova        m1, [r1+16*SIZEOF_PIXEL]
    ; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit
    movu        m2, [r1+ 7*SIZEOF_PIXEL]
    movu        m5, [r1+17*SIZEOF_PIXEL]
%if cpuflag(cache64)
    palignr     m3, m1, m0, 1*SIZEOF_PIXEL
    palignr     m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu        m3, [r1+ 9*SIZEOF_PIXEL]
    movu        m4, [r1+15*SIZEOF_PIXEL]
%endif
    PRED8x8_LOWPASS m0, m2, m3, m0, m6     ; filtered left half
    PRED8x8_LOWPASS m1, m4, m5, m1, m6     ; filtered top half
    mova        [r0+3*FDEC_STRIDEB], m0
%assign Y -4
%rep 6
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX     m0, m0, 1
    mova        [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y+1)
%endrep
    PALIGNR     m1, m0, 7*SIZEOF_PIXEL, m0
    mova        [r0+Y*FDEC_STRIDEB], m1
    RET
%endmacro ; PREDICT_8x8_DDLR

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM cache64, ssse3
PREDICT_8x8_DDLR
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_DDLR
%endif
949
950;-----------------------------------------------------------------------------
951; void predict_8x8_hu( pixel *src, pixel *edge )
952;-----------------------------------------------------------------------------
953%macro PREDICT_8x8_HU 2
954cglobal predict_8x8_hu, 2,2,8
955    add       r0, 4*FDEC_STRIDEB
956%if HIGH_BIT_DEPTH
957%if cpuflag(ssse3)
958    movu      m5, [r1+7*SIZEOF_PIXEL]
959    pshufb    m5, [pw_reverse]
960%else
961    movq      m6, [r1+7*SIZEOF_PIXEL]
962    movq      m5, [r1+11*SIZEOF_PIXEL]
963    pshuflw   m6, m6, q0123
964    pshuflw   m5, m5, q0123
965    movlhps   m5, m6
966%endif ; cpuflag
967    psrldq    m2, m5, 2
968    pshufd    m3, m5, q0321
969    pshufhw   m2, m2, q2210
970    pshufhw   m3, m3, q1110
971    pavgw     m4, m5, m2
972%else ; !HIGH_BIT_DEPTH
973    movu      m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
974    pshufw    m0, m1, q0123           ; l6 l7 l4 l5 l2 l3 l0 l1
975    psllq     m1, 56                  ; l7 .. .. .. .. .. .. ..
976    mova      m2, m0
977    psllw     m0, 8
978    psrlw     m2, 8
979    por       m2, m0
980    mova      m3, m2
981    mova      m4, m2
982    mova      m5, m2                  ; l7 l6 l5 l4 l3 l2 l1 l0
983    psrlq     m3, 16
984    psrlq     m2, 8
985    por       m2, m1                  ; l7 l7 l6 l5 l4 l3 l2 l1
986    punpckhbw m1, m1
987    por       m3, m1                  ; l7 l7 l7 l6 l5 l4 l3 l2
988    pavgb     m4, m2
989%endif ; !HIGH_BIT_DEPTH
990    PRED8x8_LOWPASS m2, m3, m5, m2, m6
991    punpckh%2 m0, m4, m2              ; p8 p7 p6 p5
992    punpckl%2 m4, m2                  ; p4 p3 p2 p1
993    PALIGNR   m5, m0, m4, 2*SIZEOF_PIXEL, m3
994    pshuf%1   m1, m0, q3321
995    PALIGNR   m6, m0, m4, 4*SIZEOF_PIXEL, m3
996    pshuf%1   m2, m0, q3332
997    PALIGNR   m7, m0, m4, 6*SIZEOF_PIXEL, m3
998    pshuf%1   m3, m0, q3333
999    mova      [r0-4*FDEC_STRIDEB], m4
1000    mova      [r0-3*FDEC_STRIDEB], m5
1001    mova      [r0-2*FDEC_STRIDEB], m6
1002    mova      [r0-1*FDEC_STRIDEB], m7
1003    mova      [r0+0*FDEC_STRIDEB], m0
1004    mova      [r0+1*FDEC_STRIDEB], m1
1005    mova      [r0+2*FDEC_STRIDEB], m2
1006    mova      [r0+3*FDEC_STRIDEB], m3
1007    RET
1008%endmacro
1009
1010%if HIGH_BIT_DEPTH
1011INIT_XMM sse2
1012PREDICT_8x8_HU d, wd
1013INIT_XMM ssse3
1014PREDICT_8x8_HU d, wd
1015INIT_XMM avx
1016PREDICT_8x8_HU d, wd
1017%elif ARCH_X86_64 == 0
1018INIT_MMX mmx2
1019PREDICT_8x8_HU w, bw
1020%endif
1021
1022;-----------------------------------------------------------------------------
1023; void predict_8x8_vr( pixel *src, pixel *edge )
1024;-----------------------------------------------------------------------------
1025%macro PREDICT_8x8_VR 1
1026cglobal predict_8x8_vr, 2,3
1027    mova        m2, [r1+16*SIZEOF_PIXEL]
1028%ifidn cpuname, ssse3
1029    mova        m0, [r1+8*SIZEOF_PIXEL]
1030    palignr     m3, m2, m0, 7*SIZEOF_PIXEL
1031    palignr     m1, m2, m0, 6*SIZEOF_PIXEL
1032%else
1033    movu        m3, [r1+15*SIZEOF_PIXEL]
1034    movu        m1, [r1+14*SIZEOF_PIXEL]
1035%endif
1036    pavg%1      m4, m3, m2
1037    add         r0, FDEC_STRIDEB*4
1038    PRED8x8_LOWPASS m3, m1, m2, m3, m5
1039    mova        [r0-4*FDEC_STRIDEB], m4
1040    mova        [r0-3*FDEC_STRIDEB], m3
1041    mova        m1, [r1+8*SIZEOF_PIXEL]
1042    PSLLPIX     m0, m1, 1
1043    PSLLPIX     m2, m1, 2
1044    PRED8x8_LOWPASS m0, m1, m2, m0, m6
1045
1046%assign Y -2
1047%rep 5
1048    PALIGNR     m4, m0, 7*SIZEOF_PIXEL, m5
1049    mova        [r0+Y*FDEC_STRIDEB], m4
1050    PSLLPIX     m0, m0, 1
1051    SWAP 3, 4
1052%assign Y (Y+1)
1053%endrep
1054    PALIGNR     m4, m0, 7*SIZEOF_PIXEL, m0
1055    mova        [r0+Y*FDEC_STRIDEB], m4
1056    RET
1057%endmacro
1058
1059%if HIGH_BIT_DEPTH
1060INIT_XMM sse2
1061PREDICT_8x8_VR w
1062INIT_XMM ssse3
1063PREDICT_8x8_VR w
1064INIT_XMM avx
1065PREDICT_8x8_VR w
1066%elif ARCH_X86_64 == 0
1067INIT_MMX mmx2
1068PREDICT_8x8_VR b
1069%endif
1070
; Broadcast the three plane-prediction scalar arguments from the stack into
; vector registers: m0 = i00 (r1m), m2 = b (r2m), m4 = c (r3m), each
; splatted across all word lanes.  Stack-argument loads (rNm) are used so
; this works identically on x86-32 and x86-64.
%macro LOAD_PLANE_ARGS 0
%if cpuflag(avx2) && ARCH_X86_64 == 0
    vpbroadcastw m0, r1m        ; AVX2 can broadcast straight from memory
    vpbroadcastw m2, r2m
    vpbroadcastw m4, r3m
%elif mmsize == 8 ; MMX is only used on x86_32
    SPLATW       m0, r1m
    SPLATW       m2, r2m
    SPLATW       m4, r3m
%else
    ; load into the xmm view first, then splat to full register width
    movd        xm0, r1m
    movd        xm2, r2m
    movd        xm4, r3m
    SPLATW       m0, xm0
    SPLATW       m2, xm2
    SPLATW       m4, xm4
%endif
%endmacro
1089
1090;-----------------------------------------------------------------------------
1091; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
1092;-----------------------------------------------------------------------------
1093%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
1094%macro PREDICT_CHROMA_P_MMX 1
1095cglobal predict_8x%1c_p_core, 1,2
1096    LOAD_PLANE_ARGS
1097    movq        m1, m2
1098    pmullw      m2, [pw_0to15]
1099    psllw       m1, 2
1100    paddsw      m0, m2        ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
1101    paddsw      m1, m0        ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
1102    mov         r1d, %1
1103ALIGN 4
1104.loop:
1105    movq        m5, m0
1106    movq        m6, m1
1107    psraw       m5, 5
1108    psraw       m6, 5
1109    packuswb    m5, m6
1110    movq        [r0], m5
1111
1112    paddsw      m0, m4
1113    paddsw      m1, m4
1114    add         r0, FDEC_STRIDE
1115    dec         r1d
1116    jg .loop
1117    RET
1118%endmacro ; PREDICT_CHROMA_P_MMX
1119
1120INIT_MMX mmx2
1121PREDICT_CHROMA_P_MMX 8
1122PREDICT_CHROMA_P_MMX 16
1123%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
1124
; Planar chroma prediction, SSE2/AVX/AVX2, both bit depths.
; %1 = block height (8 or 16).  Same per-pixel formula as the MMX version:
; pixel = clip((i00 + x*b + y*c) >> 5); the AVX2 paths process two rows
; per iteration using the full ymm width.
%macro PREDICT_CHROMA_P 1
%if HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2,7
    LOAD_PLANE_ARGS               ; m0 = i00, m2 = b, m4 = c
    mova        m3, [pw_pixel_max]
    pxor        m1, m1            ; clip lower bound (0)
    pmullw      m2, [pw_43210123] ; b
%if %1 == 16
    pmullw      m5, m4, [pw_m7]   ; c
%else
    pmullw      m5, m4, [pw_m3]
%endif
    paddw       m5, [pw_16]       ; rounding bias folded into the row term
%if mmsize == 32
    ; two rows per ymm: upper lane leads the lower lane by one row of c
    mova       xm6, xm4
    paddw       m4, m4            ; step = 2*c per iteration
    paddw       m5, m6
%endif
    mov        r1d, %1/(mmsize/16)
.loop:
    paddsw      m6, m2, m5
    paddsw      m6, m0
    psraw       m6, 5
    CLIPW       m6, m1, m3        ; clamp to [0, pixel_max]
    paddw       m5, m4
%if mmsize == 32
    vextracti128 [r0], m6, 1      ; upper lane = earlier row
    mova [r0+FDEC_STRIDEB], xm6
    add         r0, 2*FDEC_STRIDEB
%else
    mova      [r0], m6
    add         r0, FDEC_STRIDEB
%endif
    dec        r1d
    jg .loop
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
%if mmsize == 32
    vbroadcasti128 m1, [pw_0to15]   ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
    pmullw      m2, m1
    mova       xm1, xm4             ; zero upper half
    paddsw      m4, m4
    paddsw      m0, m1
%else
    pmullw      m2, [pw_0to15]
%endif
    paddsw      m0, m2              ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
    paddsw      m1, m0, m4          ; next row(s): += c
    paddsw      m4, m4              ; per-iteration step (2 rows)
    mov        r1d, %1/(mmsize/8)
.loop:
    psraw       m2, m0, 5
    psraw       m3, m1, 5
    paddsw      m0, m4
    paddsw      m1, m4
    packuswb    m2, m3              ; clip to [0,255] and pack
%if mmsize == 32
    ; ymm lane order puts the earliest row in the high lane
    movq        [r0+FDEC_STRIDE*1], xm2
    movhps      [r0+FDEC_STRIDE*3], xm2
    vextracti128 xm2, m2, 1
    movq        [r0+FDEC_STRIDE*0], xm2
    movhps      [r0+FDEC_STRIDE*2], xm2
%else
    movq        [r0+FDEC_STRIDE*0], xm2
    movhps      [r0+FDEC_STRIDE*1], xm2
%endif
    add         r0, FDEC_STRIDE*mmsize/8
    dec        r1d
    jg .loop
    RET
%endif ; HIGH_BIT_DEPTH
%endmacro ; PREDICT_CHROMA_P

INIT_XMM sse2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_XMM avx
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_YMM avx2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
1209
1210;-----------------------------------------------------------------------------
1211; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
1212;-----------------------------------------------------------------------------
1213%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
1214INIT_MMX mmx2
1215cglobal predict_16x16_p_core, 1,2
1216    LOAD_PLANE_ARGS
1217    movq        mm5, mm2
1218    movq        mm1, mm2
1219    pmullw      mm5, [pw_0to15]
1220    psllw       mm2, 3
1221    psllw       mm1, 2
1222    movq        mm3, mm2
1223    paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
1224    paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1225    paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
1226    paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
1227
1228    mov         r1d, 16
1229ALIGN 4
1230.loop:
1231    movq        mm5, mm0
1232    movq        mm6, mm1
1233    psraw       mm5, 5
1234    psraw       mm6, 5
1235    packuswb    mm5, mm6
1236    movq        [r0], mm5
1237
1238    movq        mm5, mm2
1239    movq        mm6, mm3
1240    psraw       mm5, 5
1241    psraw       mm6, 5
1242    packuswb    mm5, mm6
1243    movq        [r0+8], mm5
1244
1245    paddsw      mm0, mm4
1246    paddsw      mm1, mm4
1247    paddsw      mm2, mm4
1248    paddsw      mm3, mm4
1249    add         r0, FDEC_STRIDE
1250    dec         r1d
1251    jg          .loop
1252    RET
1253%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
1254
; Planar 16x16 prediction, SSE2/AVX.
; args: r1m = i00, r2m = b, r3m = c.  10-bit path computes one 16-pixel row
; per iteration (two xmm stores) with clipping; 8-bit path unrolls two rows
; per iteration and relies on packuswb for clipping.
%macro PREDICT_16x16_P 0
cglobal predict_16x16_p_core, 1,2,8
    movd     m0, r1m
    movd     m1, r2m
    movd     m2, r3m
    SPLATW   m0, m0, 0    ; i00 in every word lane
    SPLATW   m1, m1, 0    ; b
    SPLATW   m2, m2, 0    ; c
    pmullw   m3, m1, [pw_0to15]   ; b * {0..7}
    psllw    m1, 3                ; 8*b (offset of the right half-row)
%if HIGH_BIT_DEPTH
    pxor     m6, m6       ; running y*c accumulator
    mov     r1d, 16
.loop:
    mova     m4, m0
    mova     m5, m0
    mova     m7, m3
    paddsw   m7, m6       ; x*b + y*c
    paddsw   m4, m7       ; left 8 pixels: i00 + x*b + y*c
    paddsw   m7, m1
    paddsw   m5, m7       ; right 8 pixels: + 8*b
    psraw    m4, 5
    psraw    m5, 5
    CLIPW    m4, [pb_0], [pw_pixel_max]
    CLIPW    m5, [pb_0], [pw_pixel_max]
    mova   [r0], m4
    mova [r0+16], m5
    add      r0, FDEC_STRIDEB
    paddw    m6, m2       ; next row: += c
%else ; !HIGH_BIT_DEPTH
    paddsw   m0, m3  ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw   m1, m0  ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
    paddsw   m7, m2, m2   ; 2*c = per-iteration step (2 rows/iter)
    mov     r1d, 8
ALIGN 4
.loop:
    psraw    m3, m0, 5
    psraw    m4, m1, 5
    paddsw   m5, m0, m2   ; second row of the pair: += c
    paddsw   m6, m1, m2
    psraw    m5, 5
    psraw    m6, 5
    packuswb m3, m4       ; clip + pack row 0
    packuswb m5, m6       ; clip + pack row 1
    mova [r0+FDEC_STRIDE*0], m3
    mova [r0+FDEC_STRIDE*1], m5
    paddsw   m0, m7
    paddsw   m1, m7
    add      r0, FDEC_STRIDE*2
%endif ; !HIGH_BIT_DEPTH
    dec     r1d
    jg .loop
    RET
%endmacro ; PREDICT_16x16_P

INIT_XMM sse2
PREDICT_16x16_P
%if HIGH_BIT_DEPTH == 0
INIT_XMM avx
PREDICT_16x16_P
%endif
1316
; Planar 16x16 prediction, AVX2: a full 16-pixel row fits one ymm (10-bit)
; or one ymm holds two packed rows (8-bit), so 2 (10-bit) / 4 (8-bit) rows
; are emitted per iteration.  Same formula as the variants above.
INIT_YMM avx2
cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
    LOAD_PLANE_ARGS               ; m0 = i00, m2 = b, m4 = c
%if HIGH_BIT_DEPTH
    pmullw       m2, [pw_0to15]   ; b * {0..15}
    pxor         m5, m5           ; running y*c accumulator
    pxor         m6, m6           ; clip lower bound
    mova         m7, [pw_pixel_max]
    mov         r1d, 8            ; 8 iterations x 2 rows
.loop:
    paddsw       m1, m2, m5
    paddw        m5, m4           ; += c for the second row
    paddsw       m1, m0
    paddsw       m3, m2, m5
    psraw        m1, 5
    paddsw       m3, m0
    psraw        m3, 5
    CLIPW        m1, m6, m7
    mova [r0+0*FDEC_STRIDEB], m1
    CLIPW        m3, m6, m7
    mova [r0+1*FDEC_STRIDEB], m3
    paddw        m5, m4           ; += c for the next iteration's first row
    add          r0, 2*FDEC_STRIDEB
%else ; !HIGH_BIT_DEPTH
    vbroadcasti128 m1, [pw_0to15]
    mova        xm3, xm4    ; zero high bits
    pmullw       m1, m2
    psllw        m2, 3      ; 8*b
    paddsw       m0, m3     ; high lane additionally offset by c (row X+1)
    paddsw       m0, m1     ; X+1*C X+0*C
    paddsw       m1, m0, m2 ; Y+1*C Y+0*C
    paddsw       m4, m4     ; step = 2*c
    mov         r1d, 4      ; 4 iterations x 4 rows
.loop:
    psraw        m2, m0, 5
    psraw        m3, m1, 5
    paddsw       m0, m4
    paddsw       m1, m4
    packuswb     m2, m3     ; X+1*C Y+1*C X+0*C Y+0*C
    vextracti128 [r0+0*FDEC_STRIDE], m2, 1
    mova         [r0+1*FDEC_STRIDE], xm2
    psraw        m2, m0, 5
    psraw        m3, m1, 5
    paddsw       m0, m4
    paddsw       m1, m4
    packuswb     m2, m3     ; X+3*C Y+3*C X+2*C Y+2*C
    vextracti128 [r0+2*FDEC_STRIDE], m2, 1
    mova         [r0+3*FDEC_STRIDE], xm2
    add          r0, FDEC_STRIDE*4
%endif ; !HIGH_BIT_DEPTH
    dec         r1d
    jg .loop
    RET
1370
%if HIGH_BIT_DEPTH == 0
; 8-bit 8x8 intra predictors that fit in xmm registers: ddl always; ddr/vl
; only for the non-ssse3 builds (the ssse3+ versions live elsewhere);
; plus an xmm vr whose shuffle strategy differs per ISA.
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Down-left diagonal: lowpass-filter the 16 top pixels, then each row is the
; filtered vector shifted one byte further right.
cglobal predict_8x8_ddl, 2,2
    mova        m0, [r1+16]       ; t0..t15
%ifidn cpuname, ssse3
    movd        m2, [r1+32]       ; t16 (needed for the last filter tap)
    palignr     m2, m0, 1
%else
    movu        m2, [r1+17]
%endif
    pslldq      m1, m0, 1
    add        r0, FDEC_STRIDE*4  ; address rows as -4..+3
    PRED8x8_LOWPASS m0, m1, m2, m0, m3

%assign Y -4
%rep 8
    psrldq      m0, 1             ; advance the diagonal by one pixel
    movq        [r0+Y*FDEC_STRIDE], m0
%assign Y (Y+1)
%endrep
    RET

%ifnidn cpuname, ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Down-right diagonal: lowpass the left+topleft+top span, then shift two
; pixels per row pair (m0/m1 are staggered by one).
cglobal predict_8x8_ddr, 2,2
    movu        m0, [r1+8]
    movu        m1, [r1+7]
    psrldq      m2, m0, 1
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3

    psrldq      m1, m0, 1
%assign Y 3
%rep 3
    movq        [r0+Y*FDEC_STRIDE], m0
    movq        [r0+(Y-1)*FDEC_STRIDE], m1
    psrldq      m0, 2
    psrldq      m1, 2
%assign Y (Y-2)
%endrep
    movq        [r0-3*FDEC_STRIDE], m0
    movq        [r0-4*FDEC_STRIDE], m1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Vertical-left: even rows from the 2-tap average, odd rows from the 3-tap
; filter, each pair shifted one pixel further.
cglobal predict_8x8_vl, 2,2
    mova        m0, [r1+16]       ; t0..t15
    pslldq      m1, m0, 1
    psrldq      m2, m0, 1
    pavgb       m3, m0, m2
    add         r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m5
; m0: (t0 + 2*t1 + t2 + 2) >> 2
; m3: (t0 + t1 + 1) >> 1

%assign Y -4
%rep 3
    psrldq      m0, 1
    movq        [r0+ Y   *FDEC_STRIDE], m3
    movq        [r0+(Y+1)*FDEC_STRIDE], m0
    psrldq      m3, 1
%assign Y (Y+2)
%endrep
    psrldq      m0, 1
    movq        [r0+ Y   *FDEC_STRIDE], m3
    movq        [r0+(Y+1)*FDEC_STRIDE], m0
    RET
%endif ; !ssse3

;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; Vertical-right: top two rows from avg/lowpass of the top edge, remaining
; rows interleave filtered left pixels; the interleave is done with pshufb
; (ssse3+) or a pack/shuffle sequence (sse2).
cglobal predict_8x8_vr, 2,2
    movu        m2, [r1+8]        ; l5..l0 lt t0..t7 window
    add         r0, 4*FDEC_STRIDE
    pslldq      m1, m2, 2
    pslldq      m0, m2, 1
    pavgb       m3, m2, m0        ; 2-tap averages
    PRED8x8_LOWPASS m0, m2, m1, m0, m4
    movhps      [r0-4*FDEC_STRIDE], m3
    movhps      [r0-3*FDEC_STRIDE], m0
%if cpuflag(ssse3)
    punpckhqdq  m3, m3
    pshufb      m0, [shuf_vr]     ; gather left-column taps + row pixels
    palignr     m3, m0, 13
%else
    ; emulate shuf_vr: split even/odd bytes with a 00ff mask, repack
    mova        m2, m0
    mova        m1, [pw_00ff]
    pand        m1, m0
    psrlw       m0, 8
    packuswb    m1, m0
    pslldq      m1, 4
    movhlps     m3, m1
    shufps      m1, m2, q3210
    psrldq      m3, 5
    psrldq      m1, 5
    SWAP         0, 1
%endif
    ; rows +3..-2, two at a time, shifting in one left pixel per pair
    movq        [r0+3*FDEC_STRIDE], m0
    movq        [r0+2*FDEC_STRIDE], m3
    psrldq      m0, 1
    psrldq      m3, 1
    movq        [r0+1*FDEC_STRIDE], m0
    movq        [r0+0*FDEC_STRIDE], m3
    psrldq      m0, 1
    psrldq      m3, 1
    movq        [r0-1*FDEC_STRIDE], m0
    movq        [r0-2*FDEC_STRIDE], m3
    RET
%endmacro ; PREDICT_8x8

INIT_XMM sse2
PREDICT_8x8
INIT_XMM ssse3
PREDICT_8x8
INIT_XMM avx
PREDICT_8x8

%endif ; !HIGH_BIT_DEPTH
1497
1498;-----------------------------------------------------------------------------
1499; void predict_8x8_vl( pixel *src, pixel *edge )
1500;-----------------------------------------------------------------------------
1501%macro PREDICT_8x8_VL_10 1
1502cglobal predict_8x8_vl, 2,2,8
1503    mova         m0, [r1+16*SIZEOF_PIXEL]
1504    mova         m1, [r1+24*SIZEOF_PIXEL]
1505    PALIGNR      m2, m1, m0, SIZEOF_PIXEL*1, m4
1506    PSRLPIX      m4, m1, 1
1507    pavg%1       m6, m0, m2
1508    pavg%1       m7, m1, m4
1509    add          r0, FDEC_STRIDEB*4
1510    mova         [r0-4*FDEC_STRIDEB], m6
1511    PALIGNR      m3, m7, m6, SIZEOF_PIXEL*1, m5
1512    mova         [r0-2*FDEC_STRIDEB], m3
1513    PALIGNR      m3, m7, m6, SIZEOF_PIXEL*2, m5
1514    mova         [r0+0*FDEC_STRIDEB], m3
1515    PALIGNR      m7, m7, m6, SIZEOF_PIXEL*3, m5
1516    mova         [r0+2*FDEC_STRIDEB], m7
1517    PALIGNR      m3, m1, m0, SIZEOF_PIXEL*7, m6
1518    PSLLPIX      m5, m0, 1
1519    PRED8x8_LOWPASS m0, m5, m2, m0, m7
1520    PRED8x8_LOWPASS m1, m3, m4, m1, m7
1521    PALIGNR      m4, m1, m0, SIZEOF_PIXEL*1, m2
1522    mova         [r0-3*FDEC_STRIDEB], m4
1523    PALIGNR      m4, m1, m0, SIZEOF_PIXEL*2, m2
1524    mova         [r0-1*FDEC_STRIDEB], m4
1525    PALIGNR      m4, m1, m0, SIZEOF_PIXEL*3, m2
1526    mova         [r0+1*FDEC_STRIDEB], m4
1527    PALIGNR      m1, m1, m0, SIZEOF_PIXEL*4, m2
1528    mova         [r0+3*FDEC_STRIDEB], m1
1529    RET
1530%endmacro
1531%if HIGH_BIT_DEPTH
1532INIT_XMM sse2
1533PREDICT_8x8_VL_10 w
1534INIT_XMM ssse3
1535PREDICT_8x8_VL_10 w
1536INIT_XMM avx
1537PREDICT_8x8_VL_10 w
1538%else
1539INIT_MMX mmx2
1540PREDICT_8x8_VL_10 b
1541%endif
1542
1543;-----------------------------------------------------------------------------
1544; void predict_8x8_hd( pixel *src, pixel *edge )
1545;-----------------------------------------------------------------------------
1546%macro PREDICT_8x8_HD 2
1547cglobal predict_8x8_hd, 2,2
1548    add       r0, 4*FDEC_STRIDEB
1549    mova      m0, [r1+ 8*SIZEOF_PIXEL]     ; lt l0 l1 l2 l3 l4 l5 l6
1550    movu      m1, [r1+ 7*SIZEOF_PIXEL]     ; l0 l1 l2 l3 l4 l5 l6 l7
1551%ifidn cpuname, ssse3
1552    mova      m2, [r1+16*SIZEOF_PIXEL]     ; t7 t6 t5 t4 t3 t2 t1 t0
1553    mova      m4, m2                       ; t7 t6 t5 t4 t3 t2 t1 t0
1554    palignr   m2, m0, 7*SIZEOF_PIXEL       ; t6 t5 t4 t3 t2 t1 t0 lt
1555    palignr   m4, m0, 1*SIZEOF_PIXEL       ; t0 lt l0 l1 l2 l3 l4 l5
1556%else
1557    movu      m2, [r1+15*SIZEOF_PIXEL]
1558    movu      m4, [r1+ 9*SIZEOF_PIXEL]
1559%endif ; cpuflag
1560    pavg%1    m3, m0, m1
1561    PRED8x8_LOWPASS m0, m4, m1, m0, m5
1562    PSRLPIX   m4, m2, 2                    ; .. .. t6 t5 t4 t3 t2 t1
1563    PSRLPIX   m1, m2, 1                    ; .. t6 t5 t4 t3 t2 t1 t0
1564    PRED8x8_LOWPASS m1, m4, m2, m1, m5
1565                                           ; .. p11 p10 p9
1566    punpckh%2 m2, m3, m0                   ; p8 p7 p6 p5
1567    punpckl%2 m3, m0                       ; p4 p3 p2 p1
1568    mova      [r0+3*FDEC_STRIDEB], m3
1569    PALIGNR   m0, m2, m3, 2*SIZEOF_PIXEL, m5
1570    mova      [r0+2*FDEC_STRIDEB], m0
1571    PALIGNR   m0, m2, m3, 4*SIZEOF_PIXEL, m5
1572    mova      [r0+1*FDEC_STRIDEB], m0
1573    PALIGNR   m0, m2, m3, 6*SIZEOF_PIXEL, m3
1574    mova      [r0+0*FDEC_STRIDEB], m0
1575    mova      [r0-1*FDEC_STRIDEB], m2
1576    PALIGNR   m0, m1, m2, 2*SIZEOF_PIXEL, m5
1577    mova      [r0-2*FDEC_STRIDEB], m0
1578    PALIGNR   m0, m1, m2, 4*SIZEOF_PIXEL, m5
1579    mova      [r0-3*FDEC_STRIDEB], m0
1580    PALIGNR   m1, m1, m2, 6*SIZEOF_PIXEL, m2
1581    mova      [r0-4*FDEC_STRIDEB], m1
1582    RET
1583%endmacro
1584
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HD w, wd
INIT_XMM ssse3
PREDICT_8x8_HD w, wd
INIT_XMM avx
PREDICT_8x8_HD w, wd
%else
INIT_MMX mmx2
PREDICT_8x8_HD b, bw

;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; 8-bit xmm version: all 16 predicted pixels fit one register after the
; avg/lowpass interleave, so rows are just 2-byte right shifts of m4 (bottom
; half) and m0 (top half).
%macro PREDICT_8x8_HD 0
cglobal predict_8x8_hd, 2,2
    add     r0, 4*FDEC_STRIDE     ; address rows as -4..+3
    movu    m1, [r1+7]            ; l0..l7 + top neighbours
    movu    m3, [r1+8]
    movu    m2, [r1+9]
    pavgb   m4, m1, m3            ; 2-tap averages
    PRED8x8_LOWPASS m0, m1, m2, m3, m5  ; 3-tap filtered
    punpcklbw m4, m0              ; interleave -> p1..p16
    movhlps m0, m4                ; top half for rows -4..-1

%assign Y 3
%rep 3
    movq   [r0+(Y)*FDEC_STRIDE], m4
    movq   [r0+(Y-4)*FDEC_STRIDE], m0
    psrldq m4, 2                  ; next row: slide window 2 pixels
    psrldq m0, 2
%assign Y (Y-1)
%endrep
    movq   [r0+(Y)*FDEC_STRIDE], m4
    movq   [r0+(Y-4)*FDEC_STRIDE], m0
    RET
%endmacro

INIT_XMM sse2
PREDICT_8x8_HD
INIT_XMM avx
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH
1628
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; 8-bit horizontal-up, SSE2 version.  Deliberately mixes MMX (byte-reversal
; and avg/lowpass setup) with XMM (movq2dq + wide interleave/shift for the
; first rows) -- hence raw mmN/xmmN names rather than the INIT_* templating.
INIT_MMX
cglobal predict_8x8_hu_sse2, 2,2
    add        r0, 4*FDEC_STRIDE    ; address rows as -4..+3
    movq      mm1, [r1+7]           ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw    mm0, mm1, q0123       ; l6 l7 l4 l5 l2 l3 l0 l1
    movq      mm2, mm0
    psllw     mm0, 8
    psrlw     mm2, 8
    por       mm2, mm0              ; l7 l6 l5 l4 l3 l2 l1 l0
    psllq     mm1, 56               ; l7 .. .. .. .. .. .. ..
    movq      mm3, mm2
    movq      mm4, mm2
    movq      mm5, mm2
    psrlq     mm2, 8
    psrlq     mm3, 16
    por       mm2, mm1              ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm1, mm1
    por       mm3, mm1              ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb     mm4, mm2              ; 2-tap averages
    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6  ; 3-tap filtered

    ; interleave avg/lowpass: low 8 pairs in xmm0 (rows -4..-1),
    ; high 4 pairs stay in mm4 (rows 0..3)
    movq2dq   xmm0, mm4
    movq2dq   xmm1, mm1
    punpcklbw xmm0, xmm1
    punpckhbw  mm4, mm1
%assign Y -4
%rep 3
    movq     [r0+Y*FDEC_STRIDE], xmm0
    psrldq    xmm0, 2               ; slide window 2 pixels per row
%assign Y (Y+1)
%endrep
    ; last rows pad with the final predicted value (lane 3 replicated)
    pshufw     mm5, mm4, q3321
    pshufw     mm6, mm4, q3332
    pshufw     mm7, mm4, q3333
    movq     [r0+Y*FDEC_STRIDE], xmm0
    movq     [r0+0*FDEC_STRIDE], mm4
    movq     [r0+1*FDEC_STRIDE], mm5
    movq     [r0+2*FDEC_STRIDE], mm6
    movq     [r0+3*FDEC_STRIDE], mm7
    RET

; SSSE3 version: pshufb does the byte reversal + l7 padding in one shot
; (shuf_hu), and pshufhw replicates the terminal pixel while shifting.
INIT_XMM
cglobal predict_8x8_hu_ssse3, 2,2
    add       r0, 4*FDEC_STRIDE
    movq      m3, [r1+7]
    pshufb    m3, [shuf_hu]         ; l7..l0 followed by zero padding
    psrldq    m1, m3, 1
    psrldq    m2, m3, 2
    pavgb     m0, m1, m3            ; 2-tap averages
    PRED8x8_LOWPASS m1, m3, m2, m1, m4  ; 3-tap filtered
    punpcklbw m0, m1                ; interleave into p1..p16
%assign Y -4
%rep 3
    movq   [r0+ Y   *FDEC_STRIDE], m0
    movhps [r0+(Y+4)*FDEC_STRIDE], m0
    psrldq    m0, 2                 ; slide 2 pixels
    pshufhw   m0, m0, q2210         ; keep last value replicated at the end
%assign Y (Y+1)
%endrep
    movq   [r0+ Y   *FDEC_STRIDE], m0
    movhps [r0+(Y+4)*FDEC_STRIDE], m0
    RET
%endif ; !HIGH_BIT_DEPTH
1696
1697;-----------------------------------------------------------------------------
1698; void predict_8x8c_v( uint8_t *src )
1699;-----------------------------------------------------------------------------
1700
1701%macro PREDICT_8x8C_V 0
1702cglobal predict_8x8c_v, 1,1
1703    mova        m0, [r0 - FDEC_STRIDEB]
1704    STORE8      m0
1705    RET
1706%endmacro
1707
1708%if HIGH_BIT_DEPTH
1709INIT_XMM sse
1710PREDICT_8x8C_V
1711%else
1712INIT_MMX mmx
1713PREDICT_8x8C_V
1714%endif
1715
%if HIGH_BIT_DEPTH

; 10-bit vertical chroma prediction using two mmx registers per 16-byte row
; (MMX fallback for machines without SSE xmm support in this path).
INIT_MMX
cglobal predict_8x8c_v_mmx, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]       ; top row, low half
    mova        m1, [r0 - FDEC_STRIDEB + 8]   ; top row, high half
%assign Y 0
%rep 8
    ; address pairs of rows via (Y&1), bumping r0 by 2 strides after each pair
    mova        [r0 + (Y&1)*FDEC_STRIDEB], m0
    mova        [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
%if (Y&1) && (Y!=7)
    add         r0, FDEC_STRIDEB*2
%endif
%assign Y Y+1
%endrep
    RET

%endif
1734
; Vertical prediction for 8x16 chroma (4:2:2): copy the row above into all
; 16 rows of the block.
%macro PREDICT_8x16C_V 0
cglobal predict_8x16c_v, 1,1
    mova        m0, [r0 - FDEC_STRIDEB]   ; row of top neighbours
    STORE16     m0                        ; replicate into all 16 rows
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x16C_V
%else
INIT_MMX mmx
PREDICT_8x16C_V
%endif
1749
1750;-----------------------------------------------------------------------------
1751; void predict_8x8c_h( uint8_t *src )
1752;-----------------------------------------------------------------------------
1753%macro PREDICT_C_H 0
1754cglobal predict_8x8c_h, 1,1
1755%if cpuflag(ssse3) && notcpuflag(avx2)
1756    mova  m2, [pb_3]
1757%endif
1758    PRED_H_4ROWS 8, 1
1759    PRED_H_4ROWS 8, 0
1760    RET
1761
1762cglobal predict_8x16c_h, 1,2
1763%if cpuflag(ssse3) && notcpuflag(avx2)
1764    mova  m2, [pb_3]
1765%endif
1766    mov  r1d, 4
1767.loop:
1768    PRED_H_4ROWS 8, 1
1769    dec  r1d
1770    jg .loop
1771    RET
1772%endmacro
1773
1774INIT_MMX mmx2
1775PREDICT_C_H
1776%if HIGH_BIT_DEPTH
1777INIT_XMM sse2
1778PREDICT_C_H
1779INIT_XMM avx2
1780PREDICT_C_H
1781%else
1782INIT_MMX ssse3
1783PREDICT_C_H
1784%endif
1785
1786;-----------------------------------------------------------------------------
1787; void predict_8x8c_dc( pixel *src )
1788;-----------------------------------------------------------------------------
1789%macro LOAD_LEFT 1
1790    movzx    r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
1791    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
1792    add      r1d, r2d
1793    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
1794    add      r1d, r2d
1795    movzx    r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
1796    add      r1d, r2d
1797%endmacro
1798
; 8x8 chroma DC prediction.  Computes four partial sums -- s0/s1 from the
; left/right halves of the top row, s2/s3 from the upper/lower halves of the
; left column -- then derives the four per-quadrant DC values
; {(s0+s2)/8, s1/4, s3/4, (s1+s3)/8} via shuffle + add + shift + pavgw,
; and splats each into its 4x4 quadrant.
%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
    pxor      m7, m7
%if HIGH_BIT_DEPTH
    movq      m0, [r0-FDEC_STRIDEB+0]
    movq      m1, [r0-FDEC_STRIDEB+8]
    HADDW     m0, m2              ; s0 = sum of top-left 4 pixels
    HADDW     m1, m2              ; s1 = sum of top-right 4 pixels
%else ; !HIGH_BIT_DEPTH
    movd      m0, [r0-FDEC_STRIDEB+0]
    movd      m1, [r0-FDEC_STRIDEB+4]
    psadbw    m0, m7            ; s0
    psadbw    m1, m7            ; s1
%endif
    add       r0, FDEC_STRIDEB*4  ; address rows as -4..+3

    LOAD_LEFT 0                 ; s2
    movd      m2, r1d
    LOAD_LEFT 4                 ; s3
    movd      m3, r1d

    punpcklwd m0, m1
    punpcklwd m2, m3
    punpckldq m0, m2            ; s0, s1, s2, s3
    pshufw    m3, m0, q3312     ; s2, s1, s3, s3
    pshufw    m0, m0, q1310     ; s0, s1, s3, s1
    paddw     m0, m3
    psrlw     m0, 2
    pavgw     m0, m7            ; s0+s2, s1, s3, s1+s3
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    ; widen the 4 DC words to two 8-word rows (left/right quadrant values)
    movq2dq   xmm0, m0
    punpcklwd xmm0, xmm0
    pshufd    xmm1, xmm0, q3322   ; bottom-half row
    punpckldq xmm0, xmm0          ; top-half row
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
%assign Y Y+1
%endrep
%else ; !sse2
    ; mmx: one register per quadrant value
    pshufw    m1, m0, q0000
    pshufw    m2, m0, q1111
    pshufw    m3, m0, q2222
    pshufw    m4, m0, q3333
%assign Y 0
%rep 8
%assign i (1 + (Y/4)*2)
%assign j (2 + (Y/4)*2)
    movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
    movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
%assign Y Y+1
%endrep
%endif
%else ; !HIGH_BIT_DEPTH
    ; pack DCs to bytes, then expand each to fill a 4-pixel half-row
    packuswb  m0, m0
    punpcklbw m0, m0
    movq      m1, m0
    punpcklbw m0, m0              ; top-half row pattern
    punpckhbw m1, m1              ; bottom-half row pattern
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
%assign Y Y+1
%endrep
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x8C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x8C_DC
%endif
1876
%if HIGH_BIT_DEPTH
; Store one 16-byte row value into 4 consecutive rows.
; 10-bit: %1 = xmm row (sse2) or %1/%2 = mmx low/high halves (pre-sse2);
; %3 = first row index relative to the r0 bias of -4 rows.
%macro STORE_4LINES 3
%if cpuflag(sse2)
    movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
%else
    movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
%endif
%endmacro
%else
; 8-bit: %1 = 8-byte row value, %2 = first row index (same -4 bias).
%macro STORE_4LINES 2
    movq [r0+FDEC_STRIDEB*(%2-4)], %1
    movq [r0+FDEC_STRIDEB*(%2-3)], %1
    movq [r0+FDEC_STRIDEB*(%2-2)], %1
    movq [r0+FDEC_STRIDEB*(%2-1)], %1
%endmacro
%endif
1903
; 8x16 chroma DC prediction (4:2:2).  Like predict_8x8c_dc but with six
; partial sums: s0/s1 from the top row halves and s2..s5 from four 4-pixel
; groups of the 16-pixel left column; yields eight per-quadrant DC values
; (two per 4-row band).
%macro PREDICT_8x16C_DC 0
cglobal predict_8x16c_dc, 1,3
    pxor      m7, m7
%if HIGH_BIT_DEPTH
    movq      m0, [r0-FDEC_STRIDEB+0]
    movq      m1, [r0-FDEC_STRIDEB+8]
    HADDW     m0, m2              ; s0
    HADDW     m1, m2              ; s1
%else
    movd      m0, [r0-FDEC_STRIDEB+0]
    movd      m1, [r0-FDEC_STRIDEB+4]
    psadbw    m0, m7            ; s0
    psadbw    m1, m7            ; s1
%endif
    punpcklwd m0, m1            ; s0, s1

    add       r0, FDEC_STRIDEB*4  ; bias for LOAD_LEFT's row indexing
    LOAD_LEFT 0                 ; s2
    pinsrw    m0, r1d, 2
    LOAD_LEFT 4                 ; s3
    pinsrw    m0, r1d, 3        ; s0, s1, s2, s3
    add       r0, FDEC_STRIDEB*8  ; lower half of the left column
    LOAD_LEFT 0                 ; s4
    pinsrw    m1, r1d, 2
    LOAD_LEFT 4                 ; s5
    pinsrw    m1, r1d, 3        ; s1, __, s4, s5
    sub       r0, FDEC_STRIDEB*8  ; restore the -4..+3 bias

    ; combine into the 8 DC terms, then /8 (sum pairs) or /4 (single sums)
    pshufw    m2, m0, q1310     ; s0, s1, s3, s1
    pshufw    m0, m0, q3312     ; s2, s1, s3, s3
    pshufw    m3, m1, q0302     ; s4, s1, s5, s1
    pshufw    m1, m1, q3322     ; s4, s4, s5, s5
    paddw     m0, m2
    paddw     m1, m3
    psrlw     m0, 2
    psrlw     m1, 2
    pavgw     m0, m7            ; rounded final shift
    pavgw     m1, m7
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    ; widen each DC word into an 8-word row and fill its 4-row band
    movq2dq xmm0, m0
    movq2dq xmm1, m1
    punpcklwd xmm0, xmm0
    punpcklwd xmm1, xmm1
    pshufd    xmm2, xmm0, q3322
    pshufd    xmm3, xmm1, q3322
    punpckldq xmm0, xmm0
    punpckldq xmm1, xmm1
    STORE_4LINES xmm0, xmm0, 0
    STORE_4LINES xmm2, xmm2, 4
    STORE_4LINES xmm1, xmm1, 8
    STORE_4LINES xmm3, xmm3, 12
%else
    ; mmx: one register per quadrant value, two registers per row
    pshufw    m2, m0, q0000
    pshufw    m3, m0, q1111
    pshufw    m4, m0, q2222
    pshufw    m5, m0, q3333
    STORE_4LINES m2, m3, 0
    STORE_4LINES m4, m5, 4
    pshufw    m2, m1, q0000
    pshufw    m3, m1, q1111
    pshufw    m4, m1, q2222
    pshufw    m5, m1, q3333
    STORE_4LINES m2, m3, 8
    STORE_4LINES m4, m5, 12
%endif
%else
    ; 8-bit: pack to bytes and expand to per-band row patterns
    packuswb  m0, m0            ; dc0, dc1, dc2, dc3
    packuswb  m1, m1            ; dc4, dc5, dc6, dc7
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q1100
    pshufw    m3, m0, q3322
    pshufw    m4, m1, q1100
    pshufw    m5, m1, q3322
    STORE_4LINES m2, 0
    STORE_4LINES m3, 4
    add       r0, FDEC_STRIDEB*8
    STORE_4LINES m4, 0
    STORE_4LINES m5, 4
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x16C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x16C_DC
%endif
1994
; DC-top prediction for 8x%1 chroma: each 4-pixel half of the top row is
; averaged (rounded) into its own DC value and replicated down every row.
; %1 = block height (8 or 16).  Uses raw suffixed names (non-templated)
; since each bit depth has exactly one ISA version.
%macro PREDICT_C_DC_TOP 1
%if HIGH_BIT_DEPTH
INIT_XMM
cglobal predict_8x%1c_dc_top_sse2, 1,1
    pxor        m2, m2
    ; horizontal add within each 4-word half via pairwise shuffles
    mova        m0, [r0 - FDEC_STRIDEB]
    pshufd      m1, m0, q2301
    paddw       m0, m1
    pshuflw     m1, m0, q2301
    pshufhw     m1, m1, q2301
    paddw       m0, m1
    psrlw       m0, 1
    pavgw       m0, m2            ; (sum + 2) >> 2, rounded
    STORE%1     m0                ; fill all %1 rows
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x%1c_dc_top_mmx2, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2        ; s1
    psadbw      mm0, mm2        ; s0
    psrlw       mm1, 1
    psrlw       mm0, 1
    pavgw       mm1, mm2        ; rounded (s1 + 2) >> 2
    pavgw       mm0, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0     ; dc0 (w)
    packuswb    mm0, mm1        ; dc0,dc1 (b)
    STORE%1     mm0
    RET
%endif
%endmacro

PREDICT_C_DC_TOP 8
PREDICT_C_DC_TOP 16
2034
2035;-----------------------------------------------------------------------------
2036; void predict_16x16_v( pixel *src )
2037;-----------------------------------------------------------------------------
2038
2039%macro PREDICT_16x16_V 0
2040cglobal predict_16x16_v, 1,2
2041%assign %%i 0
2042%rep 16*SIZEOF_PIXEL/mmsize
2043    mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
2044%assign %%i %%i+1
2045%endrep
2046%if 16*SIZEOF_PIXEL/mmsize == 4
2047    STORE16 m0, m1, m2, m3
2048%elif 16*SIZEOF_PIXEL/mmsize == 2
2049    STORE16 m0, m1
2050%else
2051    STORE16 m0
2052%endif
2053    RET
2054%endmacro
2055
2056INIT_MMX mmx2
2057PREDICT_16x16_V
2058INIT_XMM sse
2059PREDICT_16x16_V
2060%if HIGH_BIT_DEPTH
2061INIT_YMM avx
2062PREDICT_16x16_V
2063%endif
2064
2065;-----------------------------------------------------------------------------
2066; void predict_16x16_h( pixel *src )
2067;-----------------------------------------------------------------------------
; Horizontal prediction: each row is filled with the pixel to its left.
; The heavy lifting is done by PRED_H_4ROWS (defined elsewhere; presumably
; advances r0 by 4 rows per invocation -- see x86util), called 4 times to
; cover all 16 rows.
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    ; NOTE(review): m2 = [pb_3] is loop-invariant input to PRED_H_4ROWS,
    ; assumed to be a pshufb broadcast mask -- confirm in x86util.
    mova  m2, [pb_3]
%endif
    mov  r1d, 4
.loop:
    PRED_H_4ROWS 16, 1
    dec  r1d
    jg .loop
    RET
%endmacro
2080
; Instantiate predict_16x16_h per depth/ISA.
INIT_MMX mmx2
PREDICT_16x16_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_16x16_H
INIT_YMM avx2
PREDICT_16x16_H
%else
; no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
INIT_XMM ssse3
PREDICT_16x16_H
%endif
2093
2094;-----------------------------------------------------------------------------
2095; void predict_16x16_dc( pixel *src )
2096;-----------------------------------------------------------------------------
; t0 = scratch register for the DC helper below.
%if WIN64
DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
%else
DECLARE_REG_TMP 3
%endif
2102
INIT_XMM
; Returns the sum of the left pixels in r1d+r2d
; (r1d accumulates even rows 0,2,...,14; r2d accumulates odd rows 1,...,15).
; Internal helper with a private calling convention: r0 = src, preserved;
; used by predict_16x16_dc and predict_16x16_dc_left below.
cglobal predict_16x16_dc_left_internal, 0,4
    movzx r1d, pixel [r0-SIZEOF_PIXEL]              ; row 0, column -1
    movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL] ; row 1, column -1
%assign i 2*FDEC_STRIDEB
%rep 7 ; rows 2..15, two rows per repetition
    movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
    add   r1d, t0d
    movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
    add   r2d, t0d
%assign i i+2*FDEC_STRIDEB
%endrep
    RET
2117
; Sum the 16 pixels above the block, add the rounding/bias term %1 (which
; for the full DC case already contains the left-pixel sum), shift right by
; %2, then broadcast the resulting DC over the whole 16x16 block.
; %1 = bias operand (register or memory), %2 = shift (5 = top+left, 4 = top only).
%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
    mova      xm0, [r0 - FDEC_STRIDEB+ 0]  ; top row, words 0-7
    paddw     xm0, [r0 - FDEC_STRIDEB+16]  ; + words 8-15
    HADDW     xm0, xm2                     ; horizontal sum -> low word
    paddw     xm0, %1
    psrlw     xm0, %2
    SPLATW     m0, xm0                     ; broadcast dc to every word
%if mmsize == 32
    STORE16    m0
%else
    STORE16    m0, m0
%endif
%else ; !HIGH_BIT_DEPTH
    pxor        m0, m0
    psadbw      m0, [r0 - FDEC_STRIDE]     ; partial sums, one per qword half
    MOVHL       m1, m0
    paddw       m0, m1                     ; total sum of the 16 top pixels
    paddusw     m0, %1
    psrlw       m0, %2              ; dc
    SPLATW      m0, m0
    packuswb    m0, m0              ; dc in bytes
    STORE16     m0
%endif
%endmacro
2143
; Emits the three 16x16 DC predictors:
;   predict_16x16_dc      - dc = (top_sum + left_sum + 16) >> 5
;   predict_16x16_dc_top  - dc = (top_sum + 8) >> 4
;   predict_16x16_dc_left - dc = (left_sum + 8) >> 4
%macro PREDICT_16x16_DC 0
cglobal predict_16x16_dc, 1,3
    call predict_16x16_dc_left_internal
    lea          r1d, [r1+r2+16]    ; left_sum (r1+r2) + rounding bias
    movd         xm3, r1d
    PRED16x16_DC xm3, 5
    RET

cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC [pw_8], 4
    RET

cglobal predict_16x16_dc_left, 1,3
    call predict_16x16_dc_left_internal
    lea       r1d, [r1+r2+8]        ; left_sum + rounding bias
    shr       r1d, 4
    movd      xm0, r1d
    SPLATW     m0, xm0              ; broadcast dc to every word
%if HIGH_BIT_DEPTH && mmsize == 16
    STORE16    m0, m0
%else
%if HIGH_BIT_DEPTH == 0
    packuswb   m0, m0               ; dc in bytes
%endif
    STORE16    m0
%endif
    RET
%endmacro
2172
; Instantiate the 16x16 DC predictors; at high bit depth the AVX2 build
; uses ymm registers (full 32-byte rows), otherwise xmm suffices.
INIT_XMM sse2
PREDICT_16x16_DC
%if HIGH_BIT_DEPTH
INIT_YMM avx2
PREDICT_16x16_DC
%else
INIT_XMM avx2
PREDICT_16x16_DC
%endif
2182