1;******************************************************************************
2;* VP8 MMXEXT optimizations
3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
SECTION_RODATA

; Horizontal 4-tap coefficients as 16-bit words, interleaved in tap pairs so
; that one pmaddwd yields tap(n)*px(n) + tap(n+1)*px(n+1).
; Two 16-byte rows per filter: taps (0,1), then taps (2,3).
; put_vp8_epel4_h4 reads the rows at byte offsets mx*16-16 and mx*16.
fourtap_filter_hw_m:
    times 4 dw  -6, 123
    times 4 dw  12,  -1
    times 4 dw  -9,  93
    times 4 dw  50,  -6
    times 4 dw  -6,  50
    times 4 dw  93,  -9
    times 4 dw  -1,  12
    times 4 dw 123,  -6

; Horizontal 6-tap coefficients as 16-bit words, same pairing scheme.
; Three 16-byte rows per filter: taps (0,1), (2,3), (4,5).
; put_vp8_epel4_h6 reads the rows at byte offsets mx*24-48, -32 and -16.
sixtap_filter_hw_m:
    times 4 dw   2, -11
    times 4 dw 108,  36
    times 4 dw  -8,   1
    times 4 dw   3, -16
    times 4 dw  77,  77
    times 4 dw -16,   3
    times 4 dw   1,  -8
    times 4 dw  36, 108
    times 4 dw -11,   2

; 4-tap coefficients as signed bytes for pmaddubsw (SSSE3 code paths).
; Two 16-byte rows per filter: taps (0,1), then taps (2,3).
fourtap_filter_hb_m:
    times 8 db  -6, 123
    times 8 db  12,  -1
    times 8 db  -9,  93
    times 8 db  50,  -6
    times 8 db  -6,  50
    times 8 db  93,  -9
    times 8 db  -1,  12
    times 8 db 123,  -6

; 6-tap coefficients as signed bytes for pmaddubsw (SSSE3 code paths).
; Three 16-byte rows per filter, paired as taps (0,5), (1,2), (3,4) to match
; the pixel pairing produced by filter_h6_shuf1/2/3.
sixtap_filter_hb_m:
    times 8 db   2,   1
    times 8 db -11, 108
    times 8 db  36,  -8
    times 8 db   3,   3
    times 8 db -16,  77
    times 8 db  77, -16
    times 8 db   1,   2
    times 8 db  -8,  36
    times 8 db 108, -11
64
; 4-tap coefficients broadcast to 8 words each, one 16-byte row per tap
; (four rows = 64 bytes per filter). Used with pmullw; the users compute
; base + index*32 - 32 and then read the rows at +0, +16, +32 and +48.
fourtap_filter_v_m:
    times 8 dw  -6
    times 8 dw 123
    times 8 dw  12
    times 8 dw  -1

    times 8 dw  -9
    times 8 dw  93
    times 8 dw  50
    times 8 dw  -6

    times 8 dw  -6
    times 8 dw  50
    times 8 dw  93
    times 8 dw  -9

    times 8 dw  -1
    times 8 dw  12
    times 8 dw 123
    times 8 dw  -6

; 6-tap coefficients broadcast to 8 words each, one 16-byte row per tap
; (six rows = 96 bytes per filter). The users compute base + index*48 - 96
; and then read the rows at +0 ... +80.
sixtap_filter_v_m:
    times 8 dw   2
    times 8 dw -11
    times 8 dw 108
    times 8 dw  36
    times 8 dw  -8
    times 8 dw   1

    times 8 dw   3
    times 8 dw -16
    times 8 dw  77
    times 8 dw  77
    times 8 dw -16
    times 8 dw   3

    times 8 dw   1
    times 8 dw  -8
    times 8 dw  36
    times 8 dw 108
    times 8 dw -11
    times 8 dw   2
100
; Bilinear weights 1..7 as words, one 16-byte row per weight.
; Row (n-1) holds weight n; the complementary weight 8-n is fetched from the
; same table by indexing from the other end (row 7-n).
bilinear_filter_vw_m:
    times 8 dw 1
    times 8 dw 2
    times 8 dw 3
    times 8 dw 4
    times 8 dw 5
    times 8 dw 6
    times 8 dw 7

; Bilinear weights as byte pairs (8-n, n) for pmaddubsw, one 16-byte row
; per n = 1..7.
bilinear_filter_vb_m:
    times 8 db 7, 1
    times 8 db 6, 2
    times 8 db 5, 3
    times 8 db 4, 4
    times 8 db 3, 5
    times 8 db 2, 6
    times 8 db 1, 7
116
; Table name aliases used by the code below.
;  - non-PIC builds: each alias is the table symbol itself, no extra register.
;  - PIC builds:     each alias is picregq; every function first loads the
;                    address of the table it needs into picregq with lea, and
;                    npicregs reserves the one extra GPR this requires.
%ifndef PIC
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%else
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%endif
138
; pshufb masks that gather source bytes into adjacent pairs for pmaddubsw.
; h2: pairs (i, i+1); h4: pairs (i+2, i+3), for i = 0..7.
filter_h2_shuf:
    db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
filter_h4_shuf:
    db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

; 6-tap masks: shuf1 pairs (i, i+5), shuf2 pairs (i+1, i+2),
; shuf3 pairs (i+3, i+4), for i = 0..7.
filter_h6_shuf1:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
filter_h6_shuf2:
    db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
filter_h6_shuf3:
    db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

; 8-byte word constants (not referenced in the part of the file shown here).
pw_20091:
    times 4 dw 20091
pw_17734:
    times 4 dw 17734

; Word constants defined elsewhere.
cextern pw_3
cextern pw_4
cextern pw_64
cextern pw_256

SECTION .text
155
156;-------------------------------------------------------------------------------
157; subpel MC functions:
158;
159; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
160;                                                 uint8_t *src, int srcstride,
161;                                                 int height,   int mx, int my);
162;-------------------------------------------------------------------------------
163
; FILTER_SSSE3 width
;   Emits the SSSE3 h6/h4/v4/v6 sub-pel filters for blocks of %1 pixels.
;   All four use byte coefficients with pmaddubsw and finish with
;   pmulhrsw by 256, which evaluates to (sum + 64) >> 7.
%macro FILTER_SSSE3 1
; ---- horizontal, 6 taps ----
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      mxd, [mxq*3]                     ; mxq*8 below is therefore mx*24
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; byte coefficients, taps 0 and 5
    mova      m6, [sixtap_filter_hb+mxq*8-32] ; taps 1 and 2
    mova      m7, [sixtap_filter_hb+mxq*8-16] ; taps 3 and 4
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]

.row:
    movu      m0, [srcq-2]
    mova      m2, m0
    mova      m1, m0
    pshufb    m2, m4                          ; pixel pairs (3,4)
    pshufb    m1, m3                          ; pixel pairs (1,2)
%if mmsize == 8
; The 8-byte load above is one byte short of the 9 that the 4-pixel case
; needs, so pair pixels (0,5) by interleaving with a second memory operand.
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]           ; pixel pairs (0,5)
%endif
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]                    ; round and shift right by 7
    packuswb  m0, m0                          ; clip to 0..255
    movh  [dstq], m0

    ; advance to the next row
    add     srcq, srcstrideq
    add     dstq, dststrideq
    dec  heightd
    jg .row
    REP_RET

; ---- horizontal, 4 taps ----
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    shl      mxd, 4
    mova      m5, [fourtap_filter_hb+mxq-16]  ; byte coefficients, taps 0 and 1
    mova      m6, [fourtap_filter_hb+mxq]     ; taps 2 and 3
    mova      m4, [filter_h4_shuf]
    mova      m3, [filter_h2_shuf]
    mova      m2, [pw_256]

.row:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m1, m4                          ; pixel pairs (2,3)
    pshufb    m0, m3                          ; pixel pairs (0,1)
    pmaddubsw m1, m6
    pmaddubsw m0, m5
    paddsw    m0, m1
    pmulhrsw  m0, m2                          ; round and shift right by 7
    packuswb  m0, m0                          ; clip to 0..255
    movh  [dstq], m0

    ; advance to the next row
    add     srcq, srcstrideq
    add     dstq, dststrideq
    dec  heightd
    jg .row
    REP_RET

; ---- vertical, 4 taps ----
; m0/m1/m2 carry the three previously loaded source rows between iterations.
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    shl      myd, 4
    mova      m7, [pw_256]
    mova      m5, [fourtap_filter_hb+myq-16]  ; taps 0 and 1
    mova      m6, [fourtap_filter_hb+myq]     ; taps 2 and 3

    ; preload the rows at -1, 0 and +1 relative to src
    sub     srcq, srcstrideq
    movh      m2, [srcq+2*srcstrideq]
    movh      m1, [srcq+  srcstrideq]
    movh      m0, [srcq]
    add     srcq, srcstrideq

.row:
    movh      m3, [srcq+2*srcstrideq]         ; newest row (+2)
    mova      m4, m0
    punpcklbw m4, m1                          ; interleave rows -1 and 0
    mova      m0, m1                          ; slide the row window down by one
    mova      m1, m2
    punpcklbw m2, m3                          ; interleave rows +1 and +2
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    pmulhrsw  m4, m7                          ; round and shift right by 7
    packuswb  m4, m4                          ; clip to 0..255
    movh  [dstq], m4
    mova      m2, m3                          ; newest row joins the window

    ; advance to the next row
    add      srcq, srcstrideq
    add      dstq, dststrideq
    dec   heightd
    jg .row
    REP_RET

; ---- vertical, 6 taps ----
; m0..m4 carry the five previously loaded source rows between iterations;
; myq is turned into a pointer just past the selected coefficient rows.
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; preload the rows at -2 ... +2; srcq ends up one row below its entry value
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.row:
    movh      m5, [srcq+2*srcstrideq]         ; newest row
    mova      m7, m3
    punpcklbw m7, m4                          ; 4th and 5th row of the window
    mova      m6, m0
    punpcklbw m6, m5                          ; oldest and newest row
    mova      m0, m1                          ; window slides: m0 <- m1
    punpcklbw m1, m2                          ; 2nd and 3rd row of the window
    pmaddubsw m7, [myq-16]                    ; taps 3 and 4
    pmaddubsw m1, [myq-32]                    ; taps 1 and 2
    pmaddubsw m6, [myq-48]                    ; taps 0 and 5
    paddsw    m6, m1
    paddsw    m6, m7
    pmulhrsw  m6, [pw_256]                    ; round and shift right by 7
    packuswb  m6, m6                          ; clip to 0..255
    movh  [dstq], m6

    ; finish sliding the window
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m5

    ; advance to the next row
    add      srcq, srcstrideq
    add      dstq, dststrideq
    dec   heightd
    jg .row
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
324
; Width-4 horizontal 4-tap filter (mmxext).
; Each row is computed as two pixel pairs with pmaddwd on word coefficients;
; the result is (sum + 64) >> 7, clipped to 0..255.
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    shl       mxd, 4
    pxor       m6, m6                          ; zero, for byte->word unpacking
    movq       m7, [pw_64]                     ; rounding term
    movq       m4, [fourtap_filter_hw+mxq-16]  ; word coefficients F0/F1
    movq       m5, [fourtap_filter_hw+mxq]     ; word coefficients F2/F3

.row:
    movq       m2, [srcq-1]                    ; bytes ABCDEFGH
    pshufw     m0, m2, 9                       ; bytes CDEF in the low half
    movq       m1, m2
    punpcklbw  m0, m6                          ; words CDEF
    punpcklbw  m1, m6                          ; words ABCD
    punpckhbw  m2, m6                          ; words EFGH

    ; output pixels 0 and 1
    pshufw     m3, m1, 0x94                    ; words ABBC
    pshufw     m1, m0, 0x94                    ; words CDDE
    movq       m0, m1                          ; CDDE is needed again for pixels 2/3
    pmaddwd    m3, m4                          ; ABBC * F0/F1
    pmaddwd    m1, m5                          ; CDDE * F2/F3
    paddd      m3, m1

    ; output pixels 2 and 3
    pshufw     m1, m2, 0x94                    ; words EFFG
    pmaddwd    m0, m4                          ; CDDE * F0/F1
    pmaddwd    m1, m5                          ; EFFG * F2/F3
    paddd      m0, m1

    ; combine, round, clip and store 4 pixels
    packssdw   m3, m0
    paddsw     m3, m7
    psraw      m3, 7
    packuswb   m3, m6
    movd   [dstq], m3

    ; advance to the next row
    add      srcq, srcstrideq
    add      dstq, dststrideq
    dec   heightd
    jg .row
    REP_RET
372
; Width-4 horizontal 6-tap filter (mmxext).
; Same scheme as the 4-tap version with a third coefficient pair. m3 is the
; zero register at the top of every iteration, is reused as a temporary in
; the middle, and is cleared again before the final unpack/pack.
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    lea       mxd, [mxq*3]
    pxor       m3, m3
    movq       m7, [pw_64]                      ; rounding term
    movq       m4, [sixtap_filter_hw+mxq*8-48]  ; 6-tap word coefficients F0/F1
    movq       m5, [sixtap_filter_hw+mxq*8-32]  ; F2/F3
    movq       m6, [sixtap_filter_hw+mxq*8-16]  ; F4/F5

.row:
    movq       m1, [srcq-2]                     ; bytes ABCDEFGH
    pshufw     m0, m1, 0x9                      ; bytes CDEF in the low half
    movq       m2, m1
    punpcklbw  m1, m3                           ; words ABCD
    punpcklbw  m0, m3                           ; words CDEF
    punpckhbw  m2, m3                           ; words EFGH
    pshufw     m1, m1, 0x94                     ; words ABBC
    pshufw     m2, m2, 0x94                     ; words EFFG
    pshufw     m3, m0, 0x94                     ; words CDDE (m3 is no longer zero)

    ; output pixels 0 and 1
    movq       m0, m3                           ; keep CDDE for pixels 2/3
    pmaddwd    m1, m4                           ; ABBC * F0/F1
    pmaddwd    m3, m5                           ; CDDE * F2/F3
    paddd      m1, m3
    movq       m3, m2                           ; keep EFFG for pixels 2/3
    pmaddwd    m2, m6                           ; EFFG * F4/F5
    paddd      m1, m2

    ; output pixels 2 and 3
    pmaddwd    m0, m4                           ; CDDE * F0/F1
    pmaddwd    m3, m5                           ; EFFG * F2/F3
    paddd      m0, m3
    pxor       m3, m3                           ; back to zero
    movd       m2, [srcq+3]                     ; bytes FGHI; a 4-byte load avoids overreading
    punpcklbw  m2, m3                           ; words FGHI
    pshufw     m2, m2, 0xE9                     ; words GHHI
    pmaddwd    m2, m6                           ; GHHI * F4/F5
    paddd      m0, m2

    ; combine, round, clip and store 4 pixels
    packssdw   m1, m0
    paddsw     m1, m7
    psraw      m1, 7
    packuswb   m1, m3
    movd   [dstq], m1

    ; advance to the next row
    add      srcq, srcstrideq
    add      dstq, dststrideq
    dec   heightd
    jg .row
    REP_RET
430
; Width-8 horizontal 4-tap filter (sse2).
; Four byte loads at offsets -1..+2 are widened to words and multiplied by
; one broadcast coefficient each; result is (sum + 64) >> 7, clipped.
; When registers m8/m9 exist the last two coefficient rows are cached in
; them, otherwise they are read from memory inside the loop.
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    shl      mxd, 5
    lea      mxq, [fourtap_filter_v+mxq-32]   ; mxq -> first coefficient row
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
    mova      m4, [pw_64]                     ; rounding term
    pxor      m7, m7                          ; zero, for byte->word unpacking

.row:
    ; taps 0 and 1
    movq      m0, [srcq-1]
    punpcklbw m0, m7
    pmullw    m0, m5
    movq      m1, [srcq-0]
    punpcklbw m1, m7
    pmullw    m1, m6
    paddsw    m0, m1

    ; taps 2 and 3
    movq      m2, [srcq+1]
    punpcklbw m2, m7
    movq      m3, [srcq+2]
    punpcklbw m3, m7
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m2, m3

    ; combine, round, clip and store 8 pixels
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0

    ; advance to the next row
    add     srcq, srcstrideq
    add     dstq, dststrideq
    dec  heightd
    jg .row
    REP_RET
478
; Width-8 horizontal 6-tap filter (sse2).
; Six byte loads at offsets -2..+3 are widened to words and multiplied by
; one broadcast coefficient each; result is (sum + 64) >> 7, clipped.
; When registers m8..m13 exist the coefficient rows are cached in them,
; otherwise they are read from memory inside the loop.
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxd, [mxq*3]
    shl      mxd, 4                           ; mx*48
    lea      mxq, [sixtap_filter_v+mxq-96]    ; mxq -> first coefficient row
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
    mova      m6, [pw_64]                     ; rounding term
    pxor      m7, m7                          ; zero, for byte->word unpacking

.row:
    ; load six shifted copies of the row and widen them to words
    movq      m0, [srcq-2]
    punpcklbw m0, m7
    movq      m1, [srcq-1]
    punpcklbw m1, m7
    movq      m2, [srcq-0]
    punpcklbw m2, m7
    movq      m3, [srcq+1]
    punpcklbw m3, m7
    movq      m4, [srcq+2]
    punpcklbw m4, m7
    movq      m5, [srcq+3]
    punpcklbw m5, m7

    ; one coefficient per copy
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif

    ; saturating accumulation in two chains, then merge
    paddsw    m0, m5                          ; tap 0 + tap 5
    paddsw    m0, m3                          ;       + tap 3
    paddsw    m1, m4                          ; tap 1 + tap 4
    paddsw    m1, m2                          ;       + tap 2
    paddsw    m0, m1

    ; round, clip and store 8 pixels
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0

    ; advance to the next row
    add     srcq, srcstrideq
    add     dstq, dststrideq
    dec  heightd
    jg .row
    REP_RET
541
; FILTER_V width
;   Emits the vertical 4-tap and 6-tap sub-pel filters for blocks of %1
;   pixels using word multiplies (pmullw) on broadcast coefficients.
;   Previously loaded rows stay in registers as words between iterations.
;   Result is (sum + 64) >> 7, clipped to 0..255.
%macro FILTER_V 1
; ---- vertical, 4 taps ----
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    shl      myd, 5
    lea      myq, [fourtap_filter_v+myq-32]   ; myq -> first coefficient row
    pxor      m7, m7                          ; zero, for byte->word unpacking
    mova      m6, [pw_64]                     ; rounding term
    mova      m5, [myq+48]                    ; last tap, kept in a register

    ; preload the rows at -1, 0 and +1 relative to src, widened to words
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    punpcklbw m0, m7
    movh      m1, [srcq+  srcstrideq]
    punpcklbw m1, m7
    movh      m2, [srcq+2*srcstrideq]
    punpcklbw m2, m7
    add     srcq, srcstrideq

.row:
    ; the two outer taps are summed first, as in the original ordering
    ; (its comment: negative taps first, to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]         ; newest row (+2)
    punpcklbw m4, m7
    pmullw    m0, [myq+0]
    mova      m3, m4                          ; unscaled copy for the next iteration
    pmullw    m4, m5
    paddsw    m4, m0

    ; then the two inner taps, sliding the row window as we go
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2

    ; round, clip and store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4
    mova      m2, m3                          ; newest row joins the window

    ; advance to the next row
    add     srcq, srcstrideq
    add     dstq, dststrideq
    dec  heightd
    jg .row
    REP_RET


; ---- vertical, 6 taps ----
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]                     ; my*48
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]    ; myq -> first coefficient row
    pxor      m7, m7                          ; zero, for byte->word unpacking

    ; preload the rows at -2 ... +2, widened to words;
    ; srcq ends up one row below its entry value
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    punpcklbw m0, m7
    movh      m1, [srcq+srcstrideq]
    punpcklbw m1, m7
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m2, m7
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    punpcklbw m3, m7
    movh      m4, [srcq+srcstrideq]
    punpcklbw m4, m7

.row:
    ; taps 1 and 4 are summed first, as in the original ordering
    ; (its comment: negative taps first, to prevent losing positive overflows)
    mova      m6, m4
    pmullw    m6, [myq+64]
    mova      m5, m1
    pmullw    m5, [myq+16]
    paddsw    m6, m5

    ; then the remaining taps, sliding the row window as we go
    movh      m5, [srcq+2*srcstrideq]         ; newest row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round, clip and store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; advance to the next row
    add     srcq, srcstrideq
    add     dstq, dststrideq
    dec  heightd
    jg .row
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
665
; FILTER_BILINEAR width
;   Emits vertical and horizontal bilinear filters for blocks of %1 pixels
;   using word multiplies. Two output rows are produced per iteration.
;   With weight w = my (or mx): out = (a*(8-w) + b*w + 4) >> 3, computed as
;   psraw by 2 followed by pavgw against zero.
%macro FILTER_BILINEAR 1
; ---- vertical ----
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    shl      myd, 4
    pxor      m6, m6                          ; zero: unpack partner and pavgw operand
    mova      m5, [bilinear_filter_vw+myq-1*16] ; weight my
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16] ; weight 8-my

.row_pair:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                          ; middle row feeds both outputs

    ; first output row: rows 0 and 1
    pmullw    m0, m4
    pmullw    m1, m5
    paddsw    m0, m1
    psraw     m0, 2
    pavgw     m0, m6

    ; second output row: rows 1 and 2
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m2, m3
    psraw     m2, 2
    pavgw     m2, m6

%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2                          ; both rows in one register
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    ; advance two rows
    lea     srcq, [srcq+srcstrideq*2]
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .row_pair
    REP_RET

; ---- horizontal ----
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    shl      mxd, 4
    pxor      m6, m6                          ; zero: unpack partner and pavgw operand
    mova      m5, [bilinear_filter_vw+mxq-1*16] ; weight mx
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16] ; weight 8-mx

.row_pair:
    ; first output row: pixels x and x+1 of source row 0
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    pmullw    m0, m4
    pmullw    m1, m5
    paddsw    m0, m1
    psraw     m0, 2
    pavgw     m0, m6

    ; second output row: pixels x and x+1 of source row 1
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m2, m3
    psraw     m2, 2
    pavgw     m2, m6

%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2                          ; both rows in one register
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    ; advance two rows
    lea     srcq, [srcq+srcstrideq*2]
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .row_pair
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
761
; FILTER_BILINEAR_SSSE3 width
;   SSSE3 bilinear filters for blocks of %1 pixels. Neighbouring pixels are
;   paired as bytes and weighted with a single pmaddubsw against the byte
;   pair (8-w, w). Two output rows are produced per iteration.
;   out = (sum + 4) >> 3, computed as psraw by 2 plus pavgw against zero.
%macro FILTER_BILINEAR_SSSE3 1
; ---- vertical ----
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    shl      myd, 4
    mova      m3, [bilinear_filter_vb+myq-16] ; byte weights (8-my, my)
    pxor      m4, m4                          ; zero operand for pavgw

.row_pair:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1                          ; pair rows 0 and 1
    punpcklbw m1, m2                          ; pair rows 1 and 2

    pmaddubsw m0, m3
    psraw     m0, 2
    pavgw     m0, m4

    pmaddubsw m1, m3
    psraw     m1, 2
    pavgw     m1, m4

%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1                          ; both rows in one register
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    ; advance two rows
    lea     srcq, [srcq+srcstrideq*2]
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .row_pair
    REP_RET

; ---- horizontal ----
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    shl      mxd, 4
    mova      m3, [bilinear_filter_vb+mxq-16] ; byte weights (8-mx, mx)
    mova      m2, [filter_h2_shuf]            ; pairs pixel x with pixel x+1
    pxor      m4, m4                          ; zero operand for pavgw

.row_pair:
    movu      m0, [srcq+srcstrideq*0]
    pshufb    m0, m2
    pmaddubsw m0, m3
    psraw     m0, 2
    pavgw     m0, m4

    movu      m1, [srcq+srcstrideq*1]
    pshufb    m1, m2
    pmaddubsw m1, m3
    psraw     m1, 2
    pavgw     m1, m4

%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1                          ; both rows in one register
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    ; advance two rows
    lea     srcq, [srcq+srcstrideq*2]
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .row_pair
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
840
; Copies an 8-byte-wide block, two rows per iteration, with 8-byte moves.
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.row_pair:
    movq     m0, [srcq+srcstrideq*0]
    movq     m1, [srcq+srcstrideq*1]
    movq [dstq+dststrideq*0], m0
    movq [dstq+dststrideq*1], m1
    lea    srcq, [srcq+srcstrideq*2]
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .row_pair
    REP_RET
853
%if ARCH_X86_32
; Copies a 16-byte-wide block, two rows per iteration, as two 8-byte halves
; per row. Only assembled for 32-bit x86.
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.row_pair:
    movq     m0, [srcq+srcstrideq*0+0]
    movq     m1, [srcq+srcstrideq*0+8]
    movq     m2, [srcq+srcstrideq*1+0]
    movq     m3, [srcq+srcstrideq*1+8]
    movq [dstq+dststrideq*0+0], m0
    movq [dstq+dststrideq*0+8], m1
    movq [dstq+dststrideq*1+0], m2
    movq [dstq+dststrideq*1+8], m3
    lea    srcq, [srcq+srcstrideq*2]
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .row_pair
    REP_RET
%endif
872
; Copies a 16-byte-wide block, two rows per iteration.
; Loads are unaligned (movups); stores use movaps, so dst rows must be
; 16-byte aligned.
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.row_pair:
    movups   m0, [srcq+srcstrideq*0]
    movups   m1, [srcq+srcstrideq*1]
    movaps [dstq+dststrideq*0], m0
    movaps [dstq+dststrideq*1], m1
    lea    srcq, [srcq+srcstrideq*2]
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .row_pair
    REP_RET
885
886;-----------------------------------------------------------------------------
887; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
888;-----------------------------------------------------------------------------
889
; ADD_DC add_reg, sub_reg, offset, mov_op
;   Applies a signed DC value to four rows of pixels with unsigned byte
;   saturation: each row gets add_reg added (paddusb) and then sub_reg
;   subtracted (psubusb).
;   %1 = register added to every row
;   %2 = register subtracted from every row
;   %3 = byte offset applied to every row address
;   %4 = move instruction used for the loads and stores
;   Rows are dst1q, dst1q+strideq, dst2q, dst2q+strideq. Clobbers m2-m5.
;   All four rows are loaded before any row is written back.
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]

    paddusb   m2, %1
    psubusb   m2, %2
    paddusb   m3, %1
    psubusb   m3, %2
    paddusb   m4, %1
    psubusb   m4, %2
    paddusb   m5, %1
    psubusb   m5, %2

    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro
908
; DC-only inverse transform for one 4x4 block (mmx).
; dc = (block[0] + 4) >> 3 is added to the four rows of four pixels at dst.
; m0 ends up holding the unsigned-clamped dc and m1 the unsigned-clamped -dc,
; each replicated across the low four bytes, for use by ADD_DC.
; The first 4 bytes of block are cleared.
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    pxor       m1, m1
    movd       m0, [blockq]                   ; fetch the DC coefficient
    movd [blockq], m1                         ; and clear it in the block

    paddw      m0, [pw_4]
    psraw      m0, 3                          ; dc = (coef + 4) >> 3
    psubw      m1, m0                         ; -dc

    ; positive part, broadcast to four bytes
    packuswb   m0, m0
    punpcklbw  m0, m0
    punpcklwd  m0, m0

    ; negative part, broadcast to four bytes
    packuswb   m1, m1
    punpcklbw  m1, m1
    punpcklwd  m1, m1

    ; blockq is no longer needed; its register becomes the second row pointer
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET
932
; DC-only inverse transform for one 4x4 block (sse4).
; All 16 pixels are widened to words, dc = (block[0] + 4) >> 3 is added,
; and the result is packed back with unsigned saturation.
; The first 4 bytes of block are cleared.
INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    pxor       m1, m1                         ; zero: clears the block, unpack partner
    movd       m0, [blockq]                   ; fetch the DC coefficient
    movd [blockq], m1                         ; and clear it in the block

    paddw      m0, [pw_4]
    psraw      m0, 3                          ; dc = (coef + 4) >> 3
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0                         ; dc broadcast to all 8 words

    ; blockq is no longer needed; its register becomes the second row pointer
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]

    ; gather rows 0/1 into m2 and rows 2/3 into m4, widened to words
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    punpckldq  m2, m3
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1

    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4                         ; clip; one row per dword

    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET
963
964;-----------------------------------------------------------------------------
965; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
966;-----------------------------------------------------------------------------
967
%if ARCH_X86_32
; DC-only inverse transform for four horizontally adjacent 4x4 blocks (mmx).
; The DC coefficients A, B, C, D sit 32 bytes apart in block; each becomes
; (coef + 4) >> 3 and is added to its own 4-pixel-wide column of the
; 16x4 area at dst. The first 4 bytes of each of the four blocks are cleared.
; Only assembled for 32-bit x86.
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; gather the four DC coefficients into one register
    movd      m0, [blockq+32*0] ; A
    punpcklwd m0, [blockq+32*1] ; A B
    movd      m1, [blockq+32*2] ; C
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D

    ; clear them in the blocks
    pxor      m6, m6
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6

    paddw     m0, [pw_4]
    psraw     m0, 3             ; dc = (coef + 4) >> 3
    psubw     m6, m0            ; -dc

    ; positive parts: m0 = AAAABBBB, m1 = CCCCDDDD
    packuswb  m0, m0
    punpcklbw m0, m0            ; AABBCCDD
    movq      m1, m0
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD

    ; negative parts in the same layout: m6 and m7
    packuswb  m6, m6
    punpcklbw m6, m6            ; AABBCCDD
    movq      m7, m6
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; blockq is no longer needed; its register becomes the second row pointer
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova   ; left 8 pixels of each row
    ADD_DC    m1, m7, 8, mova   ; right 8 pixels of each row
    RET
%endif
1005
; DC-only inverse transform for four horizontally adjacent 4x4 blocks (sse2).
; The DC coefficients A, B, C, D sit 32 bytes apart in block; each becomes
; (coef + 4) >> 3 and is added to its own 4-pixel-wide column of the
; 16x4 area at dst. The first 4 bytes of each of the four blocks are cleared.
INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; gather the four DC coefficients into one register
    movd      m0, [blockq+32*0] ; A
    punpcklwd m0, [blockq+32*1] ; A B
    movd      m1, [blockq+32*2] ; C
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D

    ; clear them in the blocks
    pxor      m1, m1
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1

    paddw     m0, [pw_4]
    psraw     m0, 3             ; dc = (coef + 4) >> 3
    psubw     m1, m0            ; -dc

    ; positive part: every DC replicated to four bytes (AAAABBBBCCCCDDDD)
    packuswb  m0, m0
    punpcklbw m0, m0
    punpcklbw m0, m0

    ; negative part in the same layout
    packuswb  m1, m1
    punpcklbw m1, m1
    punpcklbw m1, m1

    ; blockq is no longer needed; its register becomes the second row pointer
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET
1036
1037;-----------------------------------------------------------------------------
1038; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
1039;-----------------------------------------------------------------------------
1040
INIT_MMX mmx
; DC-only inverse transform for four chroma 4x4 blocks (A, B, C, D) whose
; coefficient arrays lie 2*16 bytes apart in block[]. The replicated DCs of
; A/B are applied at dst, those of C/D four rows further down. ADD_DC is
; defined outside this excerpt -- presumably it adds the first register and
; subtracts the second with unsigned saturation; confirm against its
; definition.
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; gather the DC word of every block into one register
    movd      m0, [blockq+2*16*0]   ; A
    punpcklwd m0, [blockq+2*16*1]   ; A B
    movd      m1, [blockq+2*16*2]   ; C
    punpcklwd m1, [blockq+2*16*3]   ; C D
    punpckldq m0, m1                ; A B C D

    ; the DCs are consumed: zero the first dword of each block
    pxor      m6, m6
    movd [blockq+2*16*0], m6
    movd [blockq+2*16*1], m6
    movd [blockq+2*16*2], m6
    movd [blockq+2*16*3], m6

    ; m0 = (dc + 4) >> 3, m6 = -m0
    paddw     m0, [pw_4]
    psraw     m0, 3
    psubw     m6, m0

    ; non-negative part: clamp to bytes and replicate each DC four times
    packuswb  m0, m0
    punpcklbw m0, m0                ; AABBCCDD
    movq      m1, m0
    punpcklbw m0, m0                ; AAAABBBB
    punpckhbw m1, m1                ; CCCCDDDD

    ; negated part, same byte layout
    packuswb  m6, m6
    punpcklbw m6, m6                ; AABBCCDD
    movq      m7, m6
    punpcklbw m6, m6                ; AAAABBBB
    punpckhbw m7, m7                ; CCCCDDDD

    ; blocks A/B first, then step both row pointers down by four rows for C/D
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+2*strideq]
    ADD_DC    m0, m6, 0, mova
    lea    dst1q, [dst1q+4*strideq]
    lea    dst2q, [dst2q+4*strideq]
    ADD_DC    m1, m7, 0, mova
    RET
1078
1079;-----------------------------------------------------------------------------
1080; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
1081;-----------------------------------------------------------------------------
1082
; Rotation step of the VP8 inverse DCT:
;   new %1 = mul_35468(%1) - mul_20091(%2)
;   new %2 = mul_20091(%1) + mul_35468(%2)
; where both right-hand sides use the incoming values of %1 and %2.
; %3 and %4 are scratch registers and are overwritten.
; The macro expects m6 to hold words of 20091 and m7 words of 17734:
;   mul_20091(x) is formed as x + high16(x * 20091)
;   mul_35468(x) is formed as high16((x + x) * 17734), 35468 being 2 * 17734
%macro VP8_MULTIPLY_SUMSUB 4
    ; scratch 3 = mul_20091(first input)
    mova      %3, %1
    pmulhw    %3, m6
    paddw     %3, %1
    ; scratch 4 = mul_20091(second input)
    mova      %4, %2
    pmulhw    %4, m6
    paddw     %4, %2
    ; first input = mul_35468(first input)
    paddw     %1, %1
    pmulhw    %1, m7
    ; second input = mul_35468(second input)
    paddw     %2, %2
    pmulhw    %2, m7
    ; combine the cross terms
    psubw     %1, %4
    paddw     %2, %3
%endmacro
1099
; One-dimensional pass of the VP8 4x4 inverse DCT over registers m%1..m%4:
;   even half: x0 = %1 + %3
;              x1 = %1 - %3
;   odd half:  x2 = mul_35468(%2) - mul_20091(%4)
;              x3 = mul_20091(%2) + mul_35468(%4)
;   results:   %1 = x0 + x3 (tmp0)     %2 = x1 + x2 (tmp1)
;              %3 = x1 - x2 (tmp2)     %4 = x0 - x3 (tmp3)
; %5 and %6 name temporary registers that get overwritten.
; m6/m7 are assumed to hold the constant words 20091/17734.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ; odd half:  t2, t3
    SUMSUB_BA         w, %3,  %1,  %5      ; even half: t0, t1
    SUMSUB_BA         w, %2,  %1,  %5      ; tmp1, tmp2
    SUMSUB_BA         w, %4,  %3,  %5      ; tmp0, tmp3
    ; rename registers so the results appear in tmp0..tmp3 order
    SWAP                 %1,  %4
    SWAP                 %3,  %4
%endmacro
1113
%macro VP8_IDCT_ADD 0
; Inverse DCT of one 4x4 block of coefficients; the result is handed to
; STORE_DIFFx2 together with the destination rows. The 32-byte coefficient
; buffer is zeroed as soon as it has been read.
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; fetch the four coefficient rows (four words each)
    movq         m0, [blockq]
    movq         m1, [blockq+8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]

    ; wipe the coefficient buffer
%if cpuflag(sse)
    ; two 16-byte stores; movaps needs blockq to be 16-byte aligned
    xorps      xmm0, xmm0
    movaps [blockq], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+24], m4
    movq [blockq+16], m4
    movq [blockq+8], m4
    movq [blockq], m4
%endif

    ; multiplier constants that the 1-D transform expects in m6/m7
    movq         m7, [pw_17734]
    movq         m6, [pw_20091]

    ; two 1-D passes, each followed by a transpose; the rounding bias of 4
    ; is added to the first row ahead of the second pass
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store: rows 0/1 through dst1, rows 2/3 through dst2, shift count 3,
    ; m4 supplied as an all-zero register and m6/m7 reused as scratch
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+strideq*2]
    pxor         m4, m4
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro

; the plain-MMX build (four 8-byte clearing stores) is only emitted for
; 32-bit x86
%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
; MMX arithmetic with the SSE clearing path (xorps/movaps)
INIT_MMX sse
VP8_IDCT_ADD
1158
1159;-----------------------------------------------------------------------------
1160; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
1161;-----------------------------------------------------------------------------
1162
; Scatter the eight words held in m%1 and m%2 into the first coefficient of
; eight different blocks, blocks being 2*16 bytes apart:
;   word j of m%1 -> block number 4*j + %3
;   word j of m%2 -> block number 4*j + %3 + 1        (j = 0..3)
; Clobbers m%1 and m%2 (shifted right by 32 bits) and uses the dc1/dc2
; general-purpose registers of the enclosing function as scratch.
%macro SCATTER_WHT 3
    ; words 0 and 1 travel through the general-purpose registers first
    movd dc1d, m%1
    movd dc2d, m%2
    ; the low dwords are copied out, bring words 2 and 3 down meanwhile
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+32*(%3+0)], dc1w
    mov [blockq+32*(%3+1)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+32*(%3+4)], dc1w
    mov [blockq+32*(%3+5)], dc2w
    ; same again for words 2 and 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+32*(%3+8)], dc1w
    mov [blockq+32*(%3+9)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+32*(%3+12)], dc1w
    mov [blockq+32*(%3+13)], dc2w
%endmacro
1183
; One-dimensional 4-point Hadamard pass over the word registers m%1..m%4:
; two rounds of paired sum/difference butterflies followed by a register
; rename. Assuming SUMSUB_BADC leaves each sum in the first register of a
; pair and each difference in the second (confirm in x86util), with inputs
; a, b, c, d in %1..%4 the results are
;   %1 = a+b+c+d   %2 = a+b-c-d   %3 = a-b-c+d   %4 = a-b+c-d
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    ; rename only, no data is moved
    SWAP %1, %4
    SWAP %4, %3
%endmacro
1189
; Transform of the 16 DC values in dc[]: two Hadamard passes with a transpose
; in between, a rounding bias of 3 before the second pass and an arithmetic
; shift right by 3 afterwards. The results are scattered into the first
; coefficient of the 16 blocks in block[], and dc[] is zeroed after reading.
%macro VP8_DC_WHT 0
; dc1 is the dc[] pointer on entry; dc1 and dc2 later serve as scratch for
; SCATTER_WHT
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    ; read the 4x4 matrix of DC values, four words per register
    movq          m0, [dc1q+ 0]
    movq          m1, [dc1q+ 8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]

    ; clear the 32 bytes of dc[]
%if cpuflag(sse)
    ; two 16-byte stores; movaps needs dc1q to be 16-byte aligned
    xorps       xmm0, xmm0
    movaps [dc1q+16], xmm0
    movaps [dc1q+ 0], xmm0
%else
    pxor          m4, m4
    movq   [dc1q+24], m4
    movq   [dc1q+16], m4
    movq   [dc1q+ 8], m4
    movq   [dc1q+ 0], m4
%endif

    ; first pass, transpose, bias on the first register, second pass
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3

    ; final scaling
    psraw         m3, 3
    psraw         m2, 3
    psraw         m1, 3
    psraw         m0, 3

    ; write every result into the first coefficient of its block
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

; the plain-MMX build (four 8-byte clearing stores) is only emitted for
; 32-bit x86
%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
; MMX arithmetic with the SSE clearing path (xorps/movaps)
INIT_MMX sse
VP8_DC_WHT
1226