1;******************************************************************************
2;* VP8 MMXEXT optimizations
3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
25SECTION_RODATA
26
; 4-tap coefficients as 16-bit word pairs, laid out for pmaddwd in the MMXEXT
; put_vp8_epel4_h4. Each row is one pair repeated 4 times (16 bytes); a filter
; is two consecutive rows, (F0,F1) then (F2,F3). The consumer addresses it as
; table + mx*16 - 16, so mx = 1 selects the first filter.
27fourtap_filter_hw_m: times 4 dw  -6, 123
28                     times 4 dw  12,  -1
29                     times 4 dw  -9,  93
30                     times 4 dw  50,  -6
31                     times 4 dw  -6,  50
32                     times 4 dw  93,  -9
33                     times 4 dw  -1,  12
34                     times 4 dw 123,  -6
35
; 6-tap coefficients as 16-bit word pairs for pmaddwd (MMXEXT
; put_vp8_epel4_h6). Three 16-byte rows per filter: (F0,F1), (F2,F3), (F4,F5).
; Addressed as table + mx*24 - 48, so mx = 2 selects the first filter.
36sixtap_filter_hw_m:  times 4 dw   2, -11
37                     times 4 dw 108,  36
38                     times 4 dw  -8,   1
39                     times 4 dw   3, -16
40                     times 4 dw  77,  77
41                     times 4 dw -16,   3
42                     times 4 dw   1,  -8
43                     times 4 dw  36, 108
44                     times 4 dw -11,   2
45
; 4-tap coefficients as signed byte pairs for pmaddubsw (SSSE3 h4/v4).
; Same row order as fourtap_filter_hw_m, each pair repeated 8 times (16 bytes).
46fourtap_filter_hb_m: times 8 db  -6, 123
47                     times 8 db  12,  -1
48                     times 8 db  -9,  93
49                     times 8 db  50,  -6
50                     times 8 db  -6,  50
51                     times 8 db  93,  -9
52                     times 8 db  -1,  12
53                     times 8 db 123,  -6
54
; 6-tap coefficients as signed byte pairs for pmaddubsw (SSSE3 h6/v6).
; Note the pairing differs from the word table: the three rows of a filter are
; (F0,F5), (F1,F2), (F3,F4), matching the pixel pairs built by
; filter_h6_shuf1/2/3 and by the row interleaves in the vertical version.
55sixtap_filter_hb_m:  times 8 db   2,   1
56                     times 8 db -11, 108
57                     times 8 db  36,  -8
58                     times 8 db   3,   3
59                     times 8 db -16,  77
60                     times 8 db  77, -16
61                     times 8 db   1,   2
62                     times 8 db  -8,  36
63                     times 8 db 108, -11
64
; 4-tap coefficients, one tap per 16-byte row broadcast to 8 words, for pmullw.
; Four rows (F0..F3) per filter. Despite the "_v" name this table is also used
; by the SSE2 horizontal put_vp8_epel8_h4.
65fourtap_filter_v_m:  times 8 dw  -6
66                     times 8 dw 123
67                     times 8 dw  12
68                     times 8 dw  -1
69                     times 8 dw  -9
70                     times 8 dw  93
71                     times 8 dw  50
72                     times 8 dw  -6
73                     times 8 dw  -6
74                     times 8 dw  50
75                     times 8 dw  93
76                     times 8 dw  -9
77                     times 8 dw  -1
78                     times 8 dw  12
79                     times 8 dw 123
80                     times 8 dw  -6
81
; 6-tap coefficients, one tap per 16-byte row broadcast to 8 words, for pmullw.
; Six rows (F0..F5) per filter; also used by the SSE2 put_vp8_epel8_h6.
82sixtap_filter_v_m:   times 8 dw   2
83                     times 8 dw -11
84                     times 8 dw 108
85                     times 8 dw  36
86                     times 8 dw  -8
87                     times 8 dw   1
88                     times 8 dw   3
89                     times 8 dw -16
90                     times 8 dw  77
91                     times 8 dw  77
92                     times 8 dw -16
93                     times 8 dw   3
94                     times 8 dw   1
95                     times 8 dw  -8
96                     times 8 dw  36
97                     times 8 dw 108
98                     times 8 dw -11
99                     times 8 dw   2
100
; Bilinear weights 1..7 broadcast to 8 words, one 16-byte row per weight.
; The non-SSSE3 bilinear code reads row (k-1) for weight k and row (7-k) for
; the complementary weight 8-k.
101bilinear_filter_vw_m: times 8 dw 1
102                      times 8 dw 2
103                      times 8 dw 3
104                      times 8 dw 4
105                      times 8 dw 5
106                      times 8 dw 6
107                      times 8 dw 7
108
; Bilinear weights as byte pairs (8-k, k) for pmaddubsw, one row per k = 1..7.
109bilinear_filter_vb_m: times 8 db 7, 1
110                      times 8 db 6, 2
111                      times 8 db 5, 3
112                      times 8 db 4, 4
113                      times 8 db 3, 5
114                      times 8 db 2, 6
115                      times 8 db 1, 7
116
; Table access indirection.
; PIC builds: every table name expands to picregq. Each function first loads
; the address of the table it needs (lea picregq, [<table>_m]) and indexes off
; that register; npicregs = 1 is added to the register count in the cglobal
; declarations that take picreg as an extra trailing register.
; Non-PIC builds: the names expand to the table labels themselves and
; npicregs = 0.
117%ifdef PIC
118%define fourtap_filter_hw  picregq
119%define sixtap_filter_hw   picregq
120%define fourtap_filter_hb  picregq
121%define sixtap_filter_hb   picregq
122%define fourtap_filter_v   picregq
123%define sixtap_filter_v    picregq
124%define bilinear_filter_vw picregq
125%define bilinear_filter_vb picregq
126%define npicregs 1
127%else
128%define fourtap_filter_hw  fourtap_filter_hw_m
129%define sixtap_filter_hw   sixtap_filter_hw_m
130%define fourtap_filter_hb  fourtap_filter_hb_m
131%define sixtap_filter_hb   sixtap_filter_hb_m
132%define fourtap_filter_v   fourtap_filter_v_m
133%define sixtap_filter_v    sixtap_filter_v_m
134%define bilinear_filter_vw bilinear_filter_vw_m
135%define bilinear_filter_vb bilinear_filter_vb_m
136%define npicregs 0
137%endif
138
; pshufb control masks that turn a run of consecutive source bytes into
; adjacent byte pairs for pmaddubsw. For output pixel i:
;   filter_h2_shuf -> bytes (i,   i+1)
;   filter_h4_shuf -> bytes (i+2, i+3)
139filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
140filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
141
; 6-tap variants, matching the (F0,F5), (F1,F2), (F3,F4) coefficient pairing of
; sixtap_filter_hb_m:
;   filter_h6_shuf1 -> bytes (i,   i+5)
;   filter_h6_shuf2 -> bytes (i+1, i+2)
;   filter_h6_shuf3 -> bytes (i+3, i+4)
142filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
143filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
144filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11
145
; Word constants (4 words each). Not referenced by the code visible in this
; part of the file -- presumably consumed by IDCT code further down; confirm.
146pw_20091: times 4 dw 20091
147pw_17734: times 4 dw 17734
148
; Shared word constants defined in another object file.
149cextern pw_3
150cextern pw_4
151cextern pw_64
152cextern pw_256
153
154SECTION .text
155
156;-------------------------------------------------------------------------------
157; subpel MC functions:
158;
159; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t dststride,
160;                                                 uint8_t *src, ptrdiff_t srcstride,
161;                                                 int height,   int mx, int my);
162;-------------------------------------------------------------------------------
163
; FILTER_SSSE3 <width>
; Emits four SSSE3 sub-pixel motion-compensation functions for blocks <width>
; pixels wide: put_vp8_epel<width>_{h6,h4,v4,v6}. Every variant arranges the
; source bytes into pairs, multiplies them with pmaddubsw against the byte-pair
; coefficient tables (*_filter_hb), sums the partial products with saturation
; and rounds with pmulhrsw by pw_256 (for words of 256 that equals
; (x + 64) >> 7; pw_256 is external, so its value is assumed from its name).
; One output row is produced per loop iteration; the loop runs while the
; decremented height is > 0.
164%macro FILTER_SSSE3 1
; Horizontal 6-tap filter. mx selects the filter: mx*3 scaled by 8 minus 48
; gives a 48-byte stride per two mx steps, i.e. three 16-byte rows per filter.
165cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
166    lea      mxd, [mxq*3]
167    mova      m3, [filter_h6_shuf2]
168    mova      m4, [filter_h6_shuf3]
169%ifdef PIC
170    lea  picregq, [sixtap_filter_hb_m]
171%endif
172    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
173    mova      m6, [sixtap_filter_hb+mxq*8-32]
174    mova      m7, [sixtap_filter_hb+mxq*8-16]
175
176.nextrow:
    ; the taps cover src[-2..+3], so the row is loaded starting 2 bytes early
177    movu      m0, [srcq-2]
178    mova      m1, m0
179    mova      m2, m0
180%if mmsize == 8
181; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
182; shuffle with a memory operand
    ; interleaving with the bytes 5 positions further on yields the same
    ; (i, i+5) pairs that filter_h6_shuf1 produces in the XMM version
183    punpcklbw m0, [srcq+3]
184%else
185    pshufb    m0, [filter_h6_shuf1]
186%endif
187    pshufb    m1, m3
188    pshufb    m2, m4
189    pmaddubsw m0, m5
190    pmaddubsw m1, m6
191    pmaddubsw m2, m7
192    paddsw    m0, m1
193    paddsw    m0, m2
194    pmulhrsw  m0, [pw_256]
195    packuswb  m0, m0
196    movh  [dstq], m0        ; store
197
198    ; go to next line
199    add     dstq, dststrideq
200    add     srcq, srcstrideq
201    dec  heightd            ; next row
202    jg .nextrow
203    REP_RET
204
; Horizontal 4-tap filter. mx*16 - 16 addresses the filter: two 16-byte rows
; per filter, (F0,F1) in m5 and (F2,F3) in m6.
205cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
206    shl      mxd, 4
207    mova      m2, [pw_256]
208    mova      m3, [filter_h2_shuf]
209    mova      m4, [filter_h4_shuf]
210%ifdef PIC
211    lea  picregq, [fourtap_filter_hb_m]
212%endif
213    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
214    mova      m6, [fourtap_filter_hb+mxq]
215
216.nextrow:
    ; the taps cover src[-1..+2]
217    movu      m0, [srcq-1]
218    mova      m1, m0
219    pshufb    m0, m3        ; pairs (i, i+1)
220    pshufb    m1, m4        ; pairs (i+2, i+3)
221    pmaddubsw m0, m5
222    pmaddubsw m1, m6
223    paddsw    m0, m1
224    pmulhrsw  m0, m2
225    packuswb  m0, m0
226    movh  [dstq], m0        ; store
227
228    ; go to next line
229    add     dstq, dststrideq
230    add     srcq, srcstrideq
231    dec  heightd            ; next row
232    jg .nextrow
233    REP_RET
234
; Vertical 4-tap filter. The sixth argument slot is named picreg (there is no
; mx argument name here), so the PIC base register needs no extra GPR.
; m0..m2 hold the three most recent source rows across iterations.
235cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
236    shl      myd, 4
237%ifdef PIC
238    lea  picregq, [fourtap_filter_hb_m]
239%endif
240    mova      m5, [fourtap_filter_hb+myq-16]
241    mova      m6, [fourtap_filter_hb+myq]
242    mova      m7, [pw_256]
243
244    ; read 3 lines
245    sub     srcq, srcstrideq
246    movh      m0, [srcq]
247    movh      m1, [srcq+  srcstrideq]
248    movh      m2, [srcq+2*srcstrideq]
249    add     srcq, srcstrideq
250
251.nextrow:
252    movh      m3, [srcq+2*srcstrideq]      ; read new row
253    mova      m4, m0
254    mova      m0, m1                       ; rotate the row window: m0 <- m1
255    punpcklbw m4, m1                       ; byte pairs (row-1, row0)
256    mova      m1, m2                       ; m1 <- m2
257    punpcklbw m2, m3                       ; byte pairs (row+1, row+2)
258    pmaddubsw m4, m5
259    pmaddubsw m2, m6
260    paddsw    m4, m2
261    mova      m2, m3                       ; m2 <- newly read row
262    pmulhrsw  m4, m7
263    packuswb  m4, m4
264    movh  [dstq], m4
265
266    ; go to next line
267    add      dstq, dststrideq
268    add      srcq, srcstrideq
269    dec   heightd                          ; next row
270    jg .nextrow
271    REP_RET
272
; Vertical 6-tap filter. myq is turned into a pointer just past the selected
; filter (3 rows of 16 bytes), so the coefficient rows are read at
; [myq-48], [myq-32] and [myq-16]. m0..m4 hold the five most recent rows.
273cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
274    lea      myd, [myq*3]
275%ifdef PIC
276    lea  picregq, [sixtap_filter_hb_m]
277%endif
278    lea      myq, [sixtap_filter_hb+myq*8]
279
280    ; read 5 lines
281    sub     srcq, srcstrideq
282    sub     srcq, srcstrideq
283    movh      m0, [srcq]
284    movh      m1, [srcq+srcstrideq]
285    movh      m2, [srcq+srcstrideq*2]
286    lea     srcq, [srcq+srcstrideq*2]
287    add     srcq, srcstrideq
288    movh      m3, [srcq]
289    movh      m4, [srcq+srcstrideq]
290
291.nextrow:
292    movh      m5, [srcq+2*srcstrideq]      ; read new row
293    mova      m6, m0
294    punpcklbw m6, m5                       ; pairs (oldest row, newest row) -> (F0,F5)
295    mova      m0, m1
296    punpcklbw m1, m2                       ; pairs for (F1,F2)
297    mova      m7, m3
298    punpcklbw m7, m4                       ; pairs for (F3,F4)
299    pmaddubsw m6, [myq-48]
300    pmaddubsw m1, [myq-32]
301    pmaddubsw m7, [myq-16]
302    paddsw    m6, m1
303    paddsw    m6, m7
304    mova      m1, m2                       ; rotate the row window by one
305    mova      m2, m3
306    pmulhrsw  m6, [pw_256]
307    mova      m3, m4
308    packuswb  m6, m6
309    mova      m4, m5
310    movh  [dstq], m6
311
312    ; go to next line
313    add      dstq, dststrideq
314    add      srcq, srcstrideq
315    dec   heightd                          ; next row
316    jg .nextrow
317    REP_RET
318%endmacro
319
; 4-wide versions use 8-byte MMX registers, 8-wide versions use XMM registers.
320INIT_MMX ssse3
321FILTER_SSSE3 4
322INIT_XMM ssse3
323FILTER_SSSE3 8
324
325; 4-pixel-wide block, H-only 4-tap filter (height is an argument)
; Works on 16-bit words with pmaddwd: every output pixel is two pmaddwd
; products (F0/F1 and F2/F3) summed as dwords. Result is rounded by adding
; pw_64 and shifting right by 7, then clipped to bytes.
; Register roles: mm4 = (F0,F1) pairs, mm5 = (F2,F3) pairs, mm6 = zero,
; mm7 = rounding constant.
326INIT_MMX mmxext
327cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
328    shl       mxd, 4
329%ifdef PIC
330    lea   picregq, [fourtap_filter_hw_m]
331%endif
332    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
333    movq      mm5, [fourtap_filter_hw+mxq]
334    movq      mm7, [pw_64]
335    pxor      mm6, mm6
336
337.nextrow:
338    movq      mm1, [srcq-1]                ; (ABCDEFGH) load 8 horizontal pixels
339
340    ; first set of 2 pixels
341    movq      mm2, mm1                     ; byte ABCD..
342    punpcklbw mm1, mm6                     ; byte->word ABCD
343    pshufw    mm0, mm2, 9                  ; byte CDEF..
344    punpcklbw mm0, mm6                     ; byte->word CDEF
345    pshufw    mm3, mm1, 0x94               ; word ABBC
346    pshufw    mm1, mm0, 0x94               ; word CDDE
347    pmaddwd   mm3, mm4                     ; multiply 2px with F0/F1
348    movq      mm0, mm1                     ; backup for second set of pixels
349    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
350    paddd     mm3, mm1                     ; finish 1st 2px
351
352    ; second set of 2 pixels, use backup of above
353    punpckhbw mm2, mm6                     ; byte->word EFGH
354    pmaddwd   mm0, mm4                     ; multiply backed up 2px with F0/F1
355    pshufw    mm1, mm2, 0x94               ; word EFFG
356    pmaddwd   mm1, mm5                     ; multiply 2px with F2/F3
357    paddd     mm0, mm1                     ; finish 2nd 2px
358
359    ; merge two sets of 2 pixels into one set of 4, round/clip/store
360    packssdw  mm3, mm0                     ; merge dword->word (4px)
361    paddsw    mm3, mm7                     ; rounding
362    psraw     mm3, 7
363    packuswb  mm3, mm6                     ; clip and word->bytes
364    movd   [dstq], mm3                     ; store
365
366    ; go to next line
367    add      dstq, dststrideq
368    add      srcq, srcstrideq
369    dec   heightd                          ; next row
370    jg .nextrow
371    REP_RET
372
373; 4-pixel-wide block, H-only 6-tap filter (height is an argument)
; Same word/pmaddwd scheme as put_vp8_epel4_h4 with a third coefficient pair.
; Register roles: mm4 = (F0,F1), mm5 = (F2,F3), mm6 = (F4,F5), mm7 = rounding
; constant. mm3 is the zero register for unpacking but is also used as a
; temporary inside the loop, so it is re-zeroed before the last unpack and is
; zero again when the next iteration starts.
374INIT_MMX mmxext
375cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
376    lea       mxd, [mxq*3]
377%ifdef PIC
378    lea   picregq, [sixtap_filter_hw_m]
379%endif
380    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
381    movq      mm5, [sixtap_filter_hw+mxq*8-32]
382    movq      mm6, [sixtap_filter_hw+mxq*8-16]
383    movq      mm7, [pw_64]
384    pxor      mm3, mm3
385
386.nextrow:
387    movq      mm1, [srcq-2]                ; (ABCDEFGH) load 8 horizontal pixels
388
389    ; first set of 2 pixels
390    movq      mm2, mm1                     ; byte ABCD..
391    punpcklbw mm1, mm3                     ; byte->word ABCD
392    pshufw    mm0, mm2, 0x9                ; byte CDEF..
393    punpckhbw mm2, mm3                     ; byte->word EFGH
394    punpcklbw mm0, mm3                     ; byte->word CDEF
395    pshufw    mm1, mm1, 0x94               ; word ABBC
396    pshufw    mm2, mm2, 0x94               ; word EFFG
397    pmaddwd   mm1, mm4                     ; multiply 2px with F0/F1
398    pshufw    mm3, mm0, 0x94               ; word CDDE
399    movq      mm0, mm3                     ; backup for second set of pixels
400    pmaddwd   mm3, mm5                     ; multiply 2px with F2/F3
401    paddd     mm1, mm3                     ; add to 1st 2px cache
402    movq      mm3, mm2                     ; backup for second set of pixels
403    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
404    paddd     mm1, mm2                     ; finish 1st 2px
405
406    ; second set of 2 pixels, use backup of above
407    movd      mm2, [srcq+3]                ; byte FGHI (prevent overreads)
408    pmaddwd   mm0, mm4                     ; multiply 1st backed up 2px with F0/F1
409    pmaddwd   mm3, mm5                     ; multiply 2nd backed up 2px with F2/F3
410    paddd     mm0, mm3                     ; add to 2nd 2px cache
411    pxor      mm3, mm3                     ; restore the zero register
412    punpcklbw mm2, mm3                     ; byte->word FGHI
413    pshufw    mm2, mm2, 0xE9               ; word GHHI
414    pmaddwd   mm2, mm6                     ; multiply 2px with F4/F5
415    paddd     mm0, mm2                     ; finish 2nd 2px
416
417    ; merge two sets of 2 pixels into one set of 4, round/clip/store
418    packssdw  mm1, mm0                     ; merge dword->word (4px)
419    paddsw    mm1, mm7                     ; rounding
420    psraw     mm1, 7
421    packuswb  mm1, mm3                     ; clip and word->bytes
422    movd   [dstq], mm1                     ; store
423
424    ; go to next line
425    add      dstq, dststrideq
426    add      srcq, srcstrideq
427    dec   heightd                          ; next row
428    jg .nextrow
429    REP_RET
430
; 8-pixel-wide block, horizontal 4-tap filter, SSE2.
; Loads the row four times at byte offsets -1..+2, widens each copy to words
; and multiplies it by one broadcast tap (pmullw) from the one-tap-per-row
; table fourtap_filter_v. mxq becomes a pointer to the selected filter
; (mx*32 - 32: four 16-byte rows per filter).
; Register roles: m7 = zero, m4 = rounding constant, m5/m6 = taps F0/F1.
; Taps F2/F3 live in m8/m9 when those registers exist, otherwise they are
; read from memory on every row (m8 is presumably only defined by x86inc when
; more than 8 vector registers are available, i.e. x86-64 -- confirm).
431INIT_XMM sse2
432cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
433    shl      mxd, 5
434%ifdef PIC
435    lea  picregq, [fourtap_filter_v_m]
436%endif
437    lea      mxq, [fourtap_filter_v+mxq-32]
438    pxor      m7, m7
439    mova      m4, [pw_64]
440    mova      m5, [mxq+ 0]
441    mova      m6, [mxq+16]
442%ifdef m8
443    mova      m8, [mxq+32]
444    mova      m9, [mxq+48]
445%endif
446.nextrow:
447    movq      m0, [srcq-1]
448    movq      m1, [srcq-0]
449    movq      m2, [srcq+1]
450    movq      m3, [srcq+2]
451    punpcklbw m0, m7
452    punpcklbw m1, m7
453    punpcklbw m2, m7
454    punpcklbw m3, m7
455    pmullw    m0, m5
456    pmullw    m1, m6
457%ifdef m8
458    pmullw    m2, m8
459    pmullw    m3, m9
460%else
461    pmullw    m2, [mxq+32]
462    pmullw    m3, [mxq+48]
463%endif
464    paddsw    m0, m1
465    paddsw    m2, m3
466    paddsw    m0, m2
467    paddsw    m0, m4        ; rounding
468    psraw     m0, 7
469    packuswb  m0, m7        ; clip and word->bytes
470    movh  [dstq], m0        ; store
471
472    ; go to next line
473    add     dstq, dststrideq
474    add     srcq, srcstrideq
475    dec  heightd            ; next row
476    jg .nextrow
477    REP_RET
478
; 8-pixel-wide block, horizontal 6-tap filter, SSE2.
; Same scheme as put_vp8_epel8_h4 with six shifted copies of the row (byte
; offsets -2..+3) and six broadcast taps from sixtap_filter_v. mxq becomes a
; pointer to the selected filter (mx*48 - 96: six 16-byte rows per filter).
; Register roles: m7 = zero, m6 = rounding constant; taps are cached in
; m8..m13 when those registers exist, otherwise read from memory per row.
479INIT_XMM sse2
480cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
481    lea      mxd, [mxq*3]
482    shl      mxd, 4
483%ifdef PIC
484    lea  picregq, [sixtap_filter_v_m]
485%endif
486    lea      mxq, [sixtap_filter_v+mxq-96]
487    pxor      m7, m7
488    mova      m6, [pw_64]
489%ifdef m8
490    mova      m8, [mxq+ 0]
491    mova      m9, [mxq+16]
492    mova     m10, [mxq+32]
493    mova     m11, [mxq+48]
494    mova     m12, [mxq+64]
495    mova     m13, [mxq+80]
496%endif
497.nextrow:
498    movq      m0, [srcq-2]
499    movq      m1, [srcq-1]
500    movq      m2, [srcq-0]
501    movq      m3, [srcq+1]
502    movq      m4, [srcq+2]
503    movq      m5, [srcq+3]
504    punpcklbw m0, m7
505    punpcklbw m1, m7
506    punpcklbw m2, m7
507    punpcklbw m3, m7
508    punpcklbw m4, m7
509    punpcklbw m5, m7
510%ifdef m8
511    pmullw    m0, m8
512    pmullw    m1, m9
513    pmullw    m2, m10
514    pmullw    m3, m11
515    pmullw    m4, m12
516    pmullw    m5, m13
517%else
518    pmullw    m0, [mxq+ 0]
519    pmullw    m1, [mxq+16]
520    pmullw    m2, [mxq+32]
521    pmullw    m3, [mxq+48]
522    pmullw    m4, [mxq+64]
523    pmullw    m5, [mxq+80]
524%endif
    ; saturating sums: products of taps 1 and 4 and of taps 0 and 5 are
    ; combined first, the products of taps 2 and 3 are added afterwards
525    paddsw    m1, m4
526    paddsw    m0, m5
527    paddsw    m1, m2
528    paddsw    m0, m3
529    paddsw    m0, m1
530    paddsw    m0, m6        ; rounding
531    psraw     m0, 7
532    packuswb  m0, m7        ; clip and word->bytes
533    movh  [dstq], m0        ; store
534
535    ; go to next line
536    add     dstq, dststrideq
537    add     srcq, srcstrideq
538    dec  heightd            ; next row
539    jg .nextrow
540    REP_RET
541
; FILTER_V <width>
; Emits put_vp8_epel<width>_v4 and put_vp8_epel<width>_v6 for the currently
; selected register set (instantiated below as MMXEXT/width 4 and SSE2/width 8).
; Source rows are kept widened to 16-bit words in registers and rotated by one
; row per iteration; each row is multiplied by a broadcast tap (pmullw) from
; the one-tap-per-row tables, summed with saturation, rounded with +64 >> 7
; and clipped to bytes. m7 is the zero register in both functions.
542%macro FILTER_V 1
543; %1-pixel-wide block, V-only 4-tap filter
; myq becomes a pointer to the selected filter (my*32 - 32); tap F3 is cached
; in m5, the other taps are read from memory. m0..m2 = three previous rows.
544cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
545    shl      myd, 5
546%ifdef PIC
547    lea  picregq, [fourtap_filter_v_m]
548%endif
549    lea      myq, [fourtap_filter_v+myq-32]
550    mova      m6, [pw_64]
551    pxor      m7, m7
552    mova      m5, [myq+48]
553
554    ; read 3 lines
555    sub     srcq, srcstrideq
556    movh      m0, [srcq]
557    movh      m1, [srcq+  srcstrideq]
558    movh      m2, [srcq+2*srcstrideq]
559    add     srcq, srcstrideq
560    punpcklbw m0, m7
561    punpcklbw m1, m7
562    punpcklbw m2, m7
563
564.nextrow:
565    ; first calculate negative taps (to prevent losing positive overflows)
566    movh      m4, [srcq+2*srcstrideq]      ; read new row
567    punpcklbw m4, m7
568    mova      m3, m4                       ; keep the unscaled new row
569    pmullw    m0, [myq+0]                  ; oldest row * F0
570    pmullw    m4, m5                       ; newest row * F3
571    paddsw    m4, m0
572
573    ; then calculate positive taps
574    mova      m0, m1                       ; rotate rows while consuming them
575    pmullw    m1, [myq+16]
576    paddsw    m4, m1
577    mova      m1, m2
578    pmullw    m2, [myq+32]
579    paddsw    m4, m2
580    mova      m2, m3
581
582    ; round/clip/store
583    paddsw    m4, m6
584    psraw     m4, 7
585    packuswb  m4, m7
586    movh  [dstq], m4
587
588    ; go to next line
589    add     dstq, dststrideq
590    add     srcq, srcstrideq
591    dec  heightd                           ; next row
592    jg .nextrow
593    REP_RET
594
595
596; %1-pixel-wide block, V-only 6-tap filter
; myq becomes a pointer to the selected filter (my*48 - 96); all six taps are
; read from memory. m0..m4 = five previous rows, m5/m6 = temporaries.
597cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
598    shl      myd, 4
599    lea      myq, [myq*3]
600%ifdef PIC
601    lea  picregq, [sixtap_filter_v_m]
602%endif
603    lea      myq, [sixtap_filter_v+myq-96]
604    pxor      m7, m7
605
606    ; read 5 lines
607    sub     srcq, srcstrideq
608    sub     srcq, srcstrideq
609    movh      m0, [srcq]
610    movh      m1, [srcq+srcstrideq]
611    movh      m2, [srcq+srcstrideq*2]
612    lea     srcq, [srcq+srcstrideq*2]
613    add     srcq, srcstrideq
614    movh      m3, [srcq]
615    movh      m4, [srcq+srcstrideq]
616    punpcklbw m0, m7
617    punpcklbw m1, m7
618    punpcklbw m2, m7
619    punpcklbw m3, m7
620    punpcklbw m4, m7
621
622.nextrow:
623    ; first calculate negative taps (to prevent losing positive overflows)
624    mova      m5, m1
625    pmullw    m5, [myq+16]                 ; F1
626    mova      m6, m4
627    pmullw    m6, [myq+64]                 ; F4
628    paddsw    m6, m5
629
630    ; then calculate positive taps
631    movh      m5, [srcq+2*srcstrideq]      ; read new row
632    punpcklbw m5, m7
633    pmullw    m0, [myq+0]
634    paddsw    m6, m0
635    mova      m0, m1                       ; rotate rows while consuming them
636    mova      m1, m2
637    pmullw    m2, [myq+32]
638    paddsw    m6, m2
639    mova      m2, m3
640    pmullw    m3, [myq+48]
641    paddsw    m6, m3
642    mova      m3, m4
643    mova      m4, m5
644    pmullw    m5, [myq+80]
645    paddsw    m6, m5
646
647    ; round/clip/store
648    paddsw    m6, [pw_64]
649    psraw     m6, 7
650    packuswb  m6, m7
651    movh  [dstq], m6
652
653    ; go to next line
654    add     dstq, dststrideq
655    add     srcq, srcstrideq
656    dec  heightd                           ; next row
657    jg .nextrow
658    REP_RET
659%endmacro
660
661INIT_MMX mmxext
662FILTER_V 4
663INIT_XMM sse2
664FILTER_V 8
665
; FILTER_BILINEAR <width>
; Emits put_vp8_bilinear<width>_v and put_vp8_bilinear<width>_h for the
; currently selected instruction set. Two output rows are produced per loop
; iteration (height is decreased by 2 each time round).
; For fraction k (my or mx) each output is a*(8-k) + b*k, where a/b are the
; two neighbouring pixels; the sum is shifted right by 2 and then averaged
; with zero (pavgw), i.e. ((sum >> 2) + 1) >> 1.
;   SSSE3 path : pmaddubsw on byte pairs against bilinear_filter_vb (8-k, k).
;   other path : pmullw on widened words against bilinear_filter_vw, with
;                m5 = k and m4 = 8-k (row 7-k of the table, reached by
;                negating the scaled index).
666%macro FILTER_BILINEAR 1
667%if cpuflag(ssse3)
; vertical, SSSE3: m3 = weight pairs, m4 = zero
668cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
669    shl      myd, 4
670%ifdef PIC
671    lea  picregq, [bilinear_filter_vb_m]
672%endif
673    pxor      m4, m4
674    mova      m3, [bilinear_filter_vb+myq-16]
675.nextrow:
676    movh      m0, [srcq+srcstrideq*0]
677    movh      m1, [srcq+srcstrideq*1]
678    movh      m2, [srcq+srcstrideq*2]
679    punpcklbw m0, m1        ; pairs (row0, row1) -> output row 0
680    punpcklbw m1, m2        ; pairs (row1, row2) -> output row 1
681    pmaddubsw m0, m3
682    pmaddubsw m1, m3
683    psraw     m0, 2
684    psraw     m1, 2
685    pavgw     m0, m4
686    pavgw     m1, m4
687%if mmsize==8
688    packuswb  m0, m0
689    packuswb  m1, m1
690    movh   [dstq+dststrideq*0], m0
691    movh   [dstq+dststrideq*1], m1
692%else
    ; both rows fit in one XMM register: low half = row 0, high half = row 1
693    packuswb  m0, m1
694    movh   [dstq+dststrideq*0], m0
695    movhps [dstq+dststrideq*1], m0
696%endif
697%else ; cpuflag(ssse3)
; vertical, non-SSSE3: m4 = 8-my, m5 = my, m6 = zero
698cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
699    shl      myd, 4
700%ifdef PIC
701    lea  picregq, [bilinear_filter_vw_m]
702%endif
703    pxor      m6, m6
704    mova      m5, [bilinear_filter_vw+myq-1*16]
705    neg      myq
706    mova      m4, [bilinear_filter_vw+myq+7*16]
707.nextrow:
708    movh      m0, [srcq+srcstrideq*0]
709    movh      m1, [srcq+srcstrideq*1]
710    movh      m3, [srcq+srcstrideq*2]
711    punpcklbw m0, m6
712    punpcklbw m1, m6
713    punpcklbw m3, m6
714    mova      m2, m1        ; row 1 is used twice: bottom of row 0, top of row 1
715    pmullw    m0, m4
716    pmullw    m1, m5
717    pmullw    m2, m4
718    pmullw    m3, m5
719    paddsw    m0, m1
720    paddsw    m2, m3
721    psraw     m0, 2
722    psraw     m2, 2
723    pavgw     m0, m6
724    pavgw     m2, m6
725%if mmsize == 8
726    packuswb  m0, m0
727    packuswb  m2, m2
728    movh   [dstq+dststrideq*0], m0
729    movh   [dstq+dststrideq*1], m2
730%else
731    packuswb  m0, m2
732    movh   [dstq+dststrideq*0], m0
733    movhps [dstq+dststrideq*1], m0
734%endif
735%endif ; cpuflag(ssse3)
736
    ; shared loop tail of put_vp8_bilinear<width>_v: advance two rows
737    lea     dstq, [dstq+dststrideq*2]
738    lea     srcq, [srcq+srcstrideq*2]
739    sub  heightd, 2
740    jg .nextrow
741    REP_RET
742
743%if cpuflag(ssse3)
; horizontal, SSSE3: m2 = (i, i+1) shuffle mask, m3 = weight pairs, m4 = zero.
; The bilinear_filter_vb table is shared with the vertical version.
744cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
745    shl      mxd, 4
746%ifdef PIC
747    lea  picregq, [bilinear_filter_vb_m]
748%endif
749    pxor      m4, m4
750    mova      m2, [filter_h2_shuf]
751    mova      m3, [bilinear_filter_vb+mxq-16]
752.nextrow:
753    movu      m0, [srcq+srcstrideq*0]
754    movu      m1, [srcq+srcstrideq*1]
755    pshufb    m0, m2
756    pshufb    m1, m2
757    pmaddubsw m0, m3
758    pmaddubsw m1, m3
759    psraw     m0, 2
760    psraw     m1, 2
761    pavgw     m0, m4
762    pavgw     m1, m4
763%if mmsize==8
764    packuswb  m0, m0
765    packuswb  m1, m1
766    movh   [dstq+dststrideq*0], m0
767    movh   [dstq+dststrideq*1], m1
768%else
769    packuswb  m0, m1
770    movh   [dstq+dststrideq*0], m0
771    movhps [dstq+dststrideq*1], m0
772%endif
773%else ; cpuflag(ssse3)
; horizontal, non-SSSE3: m4 = 8-mx, m5 = mx, m6 = zero
774cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
775    shl      mxd, 4
776%ifdef PIC
777    lea  picregq, [bilinear_filter_vw_m]
778%endif
779    pxor      m6, m6
780    mova      m5, [bilinear_filter_vw+mxq-1*16]
781    neg      mxq
782    mova      m4, [bilinear_filter_vw+mxq+7*16]
783.nextrow:
    ; each row is loaded twice, at byte offsets 0 and 1
784    movh      m0, [srcq+srcstrideq*0+0]
785    movh      m1, [srcq+srcstrideq*0+1]
786    movh      m2, [srcq+srcstrideq*1+0]
787    movh      m3, [srcq+srcstrideq*1+1]
788    punpcklbw m0, m6
789    punpcklbw m1, m6
790    punpcklbw m2, m6
791    punpcklbw m3, m6
792    pmullw    m0, m4
793    pmullw    m1, m5
794    pmullw    m2, m4
795    pmullw    m3, m5
796    paddsw    m0, m1
797    paddsw    m2, m3
798    psraw     m0, 2
799    psraw     m2, 2
800    pavgw     m0, m6
801    pavgw     m2, m6
802%if mmsize == 8
803    packuswb  m0, m0
804    packuswb  m2, m2
805    movh   [dstq+dststrideq*0], m0
806    movh   [dstq+dststrideq*1], m2
807%else
808    packuswb  m0, m2
809    movh   [dstq+dststrideq*0], m0
810    movhps [dstq+dststrideq*1], m0
811%endif
812%endif ; cpuflag(ssse3)
813
    ; shared loop tail of put_vp8_bilinear<width>_h: advance two rows
814    lea     dstq, [dstq+dststrideq*2]
815    lea     srcq, [srcq+srcstrideq*2]
816    sub  heightd, 2
817    jg .nextrow
818    REP_RET
819%endmacro
820
; Four instantiations: 4-wide (MMX registers) and 8-wide (XMM registers), each
; once without and once with SSSE3.
821INIT_MMX mmxext
822FILTER_BILINEAR 4
823INIT_XMM sse2
824FILTER_BILINEAR 8
825INIT_MMX ssse3
826FILTER_BILINEAR 4
827INIT_XMM ssse3
828FILTER_BILINEAR 8
829
; Plain copy of an 8-byte-wide block, two rows per iteration.
; Both source rows are read before either destination row is written; the
; loop repeats while height - 2 stays above zero.
830INIT_MMX mmx
831cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
832.nextrow:
833    mova      m0, [srcq]                   ; source row 0
834    mova      m1, [srcq+srcstrideq]        ; source row 1
835    mova  [dstq], m0
836    mova  [dstq+dststrideq], m1
837    lea     srcq, [srcq+srcstrideq*2]      ; step both pointers two rows
838    lea     dstq, [dstq+dststrideq*2]
839    sub  heightd, 2
840    jg .nextrow
841    REP_RET
842
; Plain copy of a 16-byte-wide block with 8-byte MMX moves (x86-32 builds
; only), two rows per iteration. All four loads of an iteration are issued
; before the first store.
843%if ARCH_X86_32
844INIT_MMX mmx
845cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
846.nextrow:
847    mova      m0, [srcq]                   ; row 0, bytes 0-7
848    mova      m1, [srcq+8]                 ; row 0, bytes 8-15
849    mova      m2, [srcq+srcstrideq]        ; row 1, bytes 0-7
850    mova      m3, [srcq+srcstrideq+8]      ; row 1, bytes 8-15
851    mova  [dstq], m0
852    mova  [dstq+8], m1
853    mova  [dstq+dststrideq], m2
854    mova  [dstq+dststrideq+8], m3
855    lea     srcq, [srcq+srcstrideq*2]      ; step both pointers two rows
856    lea     dstq, [dstq+dststrideq*2]
857    sub  heightd, 2
858    jg .nextrow
859    REP_RET
860%endif
861
; Plain copy of a 16-byte-wide block with SSE moves, two rows per iteration.
; Loads are unaligned (movups); stores use movaps, so the destination rows
; must be 16-byte aligned.
862INIT_XMM sse
863cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
864.nextrow:
865    movups    m0, [srcq]                   ; source row 0
866    movups    m1, [srcq+srcstrideq]        ; source row 1
867    movaps [dstq], m0
868    movaps [dstq+dststrideq], m1
869    lea     srcq, [srcq+srcstrideq*2]      ; step both pointers two rows
870    lea     dstq, [dstq+dststrideq*2]
871    sub  heightd, 2
872    jg .nextrow
873    REP_RET
874
875;-----------------------------------------------------------------------------
876; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
877;-----------------------------------------------------------------------------
878
; ADD_DC <add>, <sub>, <offset>, <mov>
; Applies a signed DC value to four rows of pixels using unsigned saturating
; byte arithmetic: each row gets <add> added (paddusb) and then <sub>
; subtracted (psubusb). Callers pass the DC clamped to bytes in <add> and the
; negated DC clamped to bytes in <sub>, so one of the two is all zero.
;   %1 = register holding the bytes to add
;   %2 = register holding the bytes to subtract
;   %3 = byte offset applied to every row address
;   %4 = move instruction used for the loads and stores (sets the row width)
; Rows are [dst1q], [dst1q+strideq], [dst2q], [dst2q+strideq].
; Clobbers m2-m5. All four rows are loaded before any row is stored.
879%macro ADD_DC 4
880    %4        m2, [dst1q+%3]
881    %4        m3, [dst1q+strideq+%3]
882    %4        m4, [dst2q+%3]
883    %4        m5, [dst2q+strideq+%3]
884    paddusb   m2, %1
885    paddusb   m3, %1
886    paddusb   m4, %1
887    paddusb   m5, %1
888    psubusb   m2, %2
889    psubusb   m3, %2
890    psubusb   m4, %2
891    psubusb   m5, %2
892    %4 [dst1q+%3], m2
893    %4 [dst1q+strideq+%3], m3
894    %4 [dst2q+%3], m4
895    %4 [dst2q+strideq+%3], m5
896%endmacro
897
; MMX DC-only IDCT + add for one 4x4 block (x86-32 builds only).
; dc = (block[0] + 4) >> 3 is added to 4 rows of 4 pixels at dst with
; saturation. The first 4 bytes of block (coefficients 0 and 1) are zeroed.
898%if ARCH_X86_32
899INIT_MMX mmx
900cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
901    ; load data
902    movd       m0, [blockq]
903
904    ; calculate DC
905    paddw      m0, [pw_4]
906    pxor       m1, m1
907    psraw      m0, 3
908    movd [blockq], m1                      ; clear the consumed coefficient
909    psubw      m1, m0                      ; m1 = -dc
910    packuswb   m0, m0                      ; clamp +dc to 0..255
911    packuswb   m1, m1                      ; clamp -dc to 0..255
    ; replicate the low byte so the low 4 bytes all hold the DC magnitude
912    punpcklbw  m0, m0
913    punpcklbw  m1, m1
914    punpcklwd  m0, m0
915    punpcklwd  m1, m1
916
917    ; add DC
    ; block is no longer needed: its register is renamed to dst2 (rows 2-3)
918    DEFINE_ARGS dst1, dst2, stride
919    lea     dst2q, [dst1q+strideq*2]
920    ADD_DC     m0, m1, 0, movh
921    RET
922%endif
923
; VP8_IDCT_DC_ADD
; Emits vp8_idct_dc_add for the currently selected XMM instruction set
; (instantiated below for SSE2 and SSE4). Unlike the MMX version it widens the
; 16 destination pixels to words, adds the broadcast dc = (block[0] + 4) >> 3
; with paddw and lets packuswb do the clamping. The first 4 bytes of block are
; zeroed. The SSE4 build scatters the four result rows with pextrd; the SSE2
; build shifts the register down 4 bytes at a time instead.
924%macro VP8_IDCT_DC_ADD 0
925cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
926    ; load data
927    movd       m0, [blockq]
928    pxor       m1, m1
929
930    ; calculate DC
931    paddw      m0, [pw_4]
932    movd [blockq], m1                      ; clear the consumed coefficient
    ; block is no longer needed: its register is renamed to dst2 (rows 2-3)
933    DEFINE_ARGS dst1, dst2, stride
934    lea     dst2q, [dst1q+strideq*2]
935    movd       m2, [dst1q]
936    movd       m3, [dst1q+strideq]
937    movd       m4, [dst2q]
938    movd       m5, [dst2q+strideq]
939    psraw      m0, 3
940    pshuflw    m0, m0, 0                   ; broadcast dc to the low 4 words
941    punpcklqdq m0, m0                      ; ... and to all 8 words
942    punpckldq  m2, m3                      ; rows 0,1 in one register
943    punpckldq  m4, m5                      ; rows 2,3 in one register
944    punpcklbw  m2, m1                      ; byte->word
945    punpcklbw  m4, m1
946    paddw      m2, m0
947    paddw      m4, m0
948    packuswb   m2, m4                      ; clamp; dwords 0..3 = rows 0..3
949    movd   [dst1q], m2
950%if cpuflag(sse4)
951    pextrd [dst1q+strideq], m2, 1
952    pextrd [dst2q], m2, 2
953    pextrd [dst2q+strideq], m2, 3
954%else
955    psrldq     m2, 4
956    movd [dst1q+strideq], m2
957    psrldq     m2, 4
958    movd [dst2q], m2
959    psrldq     m2, 4
960    movd [dst2q+strideq], m2
961%endif
962    RET
963%endmacro
964
965INIT_XMM sse2
966VP8_IDCT_DC_ADD
967INIT_XMM sse4
968VP8_IDCT_DC_ADD
969
970;-----------------------------------------------------------------------------
971; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
972;-----------------------------------------------------------------------------
973
; MMX DC-only IDCT + add for four horizontally adjacent 4x4 blocks, i.e. a
; 16-pixel-wide, 4-row strip (x86-32 builds only).
; The DC of block n is read at blockq + 32*n (16 coefficients of 2 bytes per
; block); the first 4 bytes of every block are zeroed. Each dc is
; (coef + 4) >> 3. The 8-byte mova accesses in ADD_DC cover bytes 0-7
; (blocks A,B) and bytes 8-15 (blocks C,D) of each row.
974%if ARCH_X86_32
975INIT_MMX mmx
976cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
977    ; load data
978    movd      m0, [blockq+32*0] ; A
979    movd      m1, [blockq+32*2] ; C
980    punpcklwd m0, [blockq+32*1] ; A B
981    punpcklwd m1, [blockq+32*3] ; C D
982    punpckldq m0, m1        ; A B C D
983    pxor      m6, m6
984
985    ; calculate DC
986    paddw     m0, [pw_4]
987    movd [blockq+32*0], m6
988    movd [blockq+32*1], m6
989    movd [blockq+32*2], m6
990    movd [blockq+32*3], m6
991    psraw     m0, 3
992    psubw     m6, m0        ; m6 = negated DCs
993    packuswb  m0, m0        ; clamp +dc to bytes
994    packuswb  m6, m6        ; clamp -dc to bytes
995    punpcklbw m0, m0 ; AABBCCDD
996    punpcklbw m6, m6 ; AABBCCDD
997    movq      m1, m0
998    movq      m7, m6
999    punpcklbw m0, m0 ; AAAABBBB
1000    punpckhbw m1, m1 ; CCCCDDDD
1001    punpcklbw m6, m6 ; AAAABBBB
1002    punpckhbw m7, m7 ; CCCCDDDD
1003
1004    ; add DC
    ; block is no longer needed: its register is renamed to dst2 (rows 2-3)
1005    DEFINE_ARGS dst1, dst2, stride
1006    lea    dst2q, [dst1q+strideq*2]
1007    ADD_DC    m0, m6, 0, mova
1008    ADD_DC    m1, m7, 8, mova
1009    RET
1010%endif
1011
; SSE2 DC-only IDCT + add for four horizontally adjacent 4x4 blocks
; (16 pixels wide, 4 rows). Same DC computation as the MMX version, but all
; four replicated DCs fit in one XMM register, so one ADD_DC with 16-byte mova
; accesses handles the whole strip; dst rows must therefore be 16-byte aligned.
; The first 4 bytes of each of the four blocks are zeroed.
1012INIT_XMM sse2
1013cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
1014    ; load data
1015    movd      m0, [blockq+32*0] ; A
1016    movd      m1, [blockq+32*2] ; C
1017    punpcklwd m0, [blockq+32*1] ; A B
1018    punpcklwd m1, [blockq+32*3] ; C D
1019    punpckldq m0, m1        ; A B C D
1020    pxor      m1, m1
1021
1022    ; calculate DC
1023    paddw     m0, [pw_4]
1024    movd [blockq+32*0], m1
1025    movd [blockq+32*1], m1
1026    movd [blockq+32*2], m1
1027    movd [blockq+32*3], m1
1028    psraw     m0, 3
1029    psubw     m1, m0        ; m1 = negated DCs
1030    packuswb  m0, m0        ; clamp +dc to bytes
1031    packuswb  m1, m1        ; clamp -dc to bytes
    ; two byte interleaves replicate every DC byte four times:
    ; AAAABBBBCCCCDDDD
1032    punpcklbw m0, m0
1033    punpcklbw m1, m1
1034    punpcklbw m0, m0
1035    punpcklbw m1, m1
1036
1037    ; add DC
    ; block is no longer needed: its register is renamed to dst2 (rows 2-3)
1038    DEFINE_ARGS dst1, dst2, stride
1039    lea    dst2q, [dst1q+strideq*2]
1040    ADD_DC    m0, m1, 0, mova
1041    RET
1042
1043;-----------------------------------------------------------------------------
1044; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
1045;-----------------------------------------------------------------------------
1046
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; Pick up the first word of each of the four coefficient blocks (the
    ; blocks are 32 bytes apart) and pack them into m0 as words A B C D.
    movd        m0, [blockq+32*0]       ; A
    punpcklwd   m0, [blockq+32*1]       ; A B
    movd        m1, [blockq+32*2]       ; C
    punpcklwd   m1, [blockq+32*3]       ; C D
    punpckldq   m0, m1                  ; A B C D

    ; The inputs have been read: clear the first dword of every block.
    pxor        m6, m6
    movd        [blockq+32*0], m6
    movd        [blockq+32*1], m6
    movd        [blockq+32*2], m6
    movd        [blockq+32*3], m6

    ; dc = (coef + pw_4) >> 3 in m0; m6 is still zero, so it becomes -dc.
    paddw       m0, [pw_4]
    psraw       m0, 3
    psubw       m6, m0

    ; Positive part: saturate to unsigned bytes, then replicate every DC
    ; value four times (m0 = blocks A/B, m1 = blocks C/D).
    packuswb    m0, m0
    punpcklbw   m0, m0                  ; AABBCCDD
    movq        m1, m0
    punpcklbw   m0, m0                  ; AAAABBBB
    punpckhbw   m1, m1                  ; CCCCDDDD

    ; Negated part, identical layout (m6 = blocks A/B, m7 = blocks C/D).
    packuswb    m6, m6
    punpcklbw   m6, m6                  ; AABBCCDD
    movq        m7, m6
    punpcklbw   m6, m6                  ; AAAABBBB
    punpckhbw   m7, m7                  ; CCCCDDDD

    ; Apply to the destination. Blocks A/B are applied at dst1/dst2 first;
    ; both pointers are then moved down four rows for blocks C/D.
    ; ADD_DC is defined elsewhere.
    DEFINE_ARGS dst1, dst2, stride
    lea         dst2q, [dst1q+strideq*2]
    ADD_DC      m0, m6, 0, mova
    lea         dst1q, [dst1q+strideq*4]
    lea         dst2q, [dst2q+strideq*4]
    ADD_DC      m1, m7, 0, mova
    RET
1084
1085;-----------------------------------------------------------------------------
1086; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
1087;-----------------------------------------------------------------------------
1088
; VP8_MULTIPLY_SUMSUB a, b, tmp_a, tmp_b  (all four are register names)
;   on exit: %1 = mul_35468(%1) - mul_20091(%2)
;            %2 = mul_20091(%1) + mul_35468(%2)   (%1 as it was on entry)
;   %3 and %4 are overwritten with mul_20091(%1) and mul_20091(%2).
;   m6/m7 must already hold the words 20091/17734.
%macro VP8_MULTIPLY_SUMSUB 4
    ; first temporary = mul_20091(first operand):
    ; high half of x*20091, plus x itself
    mova      %3, %1
    pmulhw    %3, m6
    paddw     %3, %1

    ; second temporary = mul_20091(second operand)
    mova      %4, %2
    pmulhw    %4, m6
    paddw     %4, %2

    ; first operand = mul_35468(itself) - mul_20091(second operand):
    ; doubled input times 17734 stands in for a multiply by 35468
    paddw     %1, %1
    pmulhw    %1, m7
    psubw     %1, %4

    ; second operand = mul_35468(itself) + mul_20091(original first operand)
    paddw     %2, %2
    pmulhw    %2, m7
    paddw     %2, %3
%endmacro
1105
; VP8_IDCT_TRANSFORM4x4_1D: one 1-D pass of the 4-point inverse transform.
; All six arguments are register numbers (the macro itself forms m%N).
; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
;           %5/%6 are temporary registers
;           we assume m6/m7 have constant words 20091/17734 loaded in them
;           the outputs are put into %1..%4 order by the two SWAPs at the end,
;           which rename registers rather than move data
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    ; sum/difference of inputs 1 and 3 (SUMSUB_BA is defined elsewhere)
    SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
    ; rotation of inputs 2 and 4; needs the constants in m6/m7
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
    SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
    SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
    ; rename so that arguments 1..4 name tmp0..tmp3 in that order
    SWAP                 %4,  %1
    SWAP                 %4,  %3
%endmacro
1119
; Expands to vp8_idct_add for the currently selected instruction set.
; The function reads the 4x4 block of 16-bit coefficients, clears it in
; memory, runs two 1-D transform passes with a transpose after each, and hands
; the four result rows to STORE_DIFFx2 (defined elsewhere) for dst.
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    ; four rows of 8 bytes each go into m0..m3
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    ; multiplier constants that VP8_MULTIPLY_SUMSUB expects in m6/m7
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%if cpuflag(sse)
    ; clear the 32-byte block with two aligned 16-byte stores; xmm0 is named
    ; explicitly and is a different register from the MMX m0 loaded above
    ; (it is a volatile register in both the SysV and Win64 ABIs)
    xorps      xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    ; no SSE: clear the 32-byte block with four 8-byte stores
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    ; m4/m5 are scratch for the transform, m4 for the transpose
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    ; bias the first input of the second pass: it feeds both x0 and x1 and
    ; therefore reaches every output (presumably the rounding term for the
    ; shift by 3 requested from STORE_DIFFx2 below)
    paddw        m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    ; m4 = zero for STORE_DIFFx2; m6/m7 are passed as well, presumably as
    ; temporaries now that the constants are no longer needed
    pxor         m4, m4
    ; the block pointer is dead: its register is renamed dst2 and pointed
    ; two rows below dst1
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

    RET
%endmacro
1157
; The plain-MMX build of vp8_idct_add is only assembled for 32-bit x86.
; The SSE build is always assembled; inside the macro, cpuflag(sse) selects
; the xorps/movaps way of clearing the coefficient block.
%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD
1164
1165;-----------------------------------------------------------------------------
1166; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
1167;-----------------------------------------------------------------------------
1168
; SCATTER_WHT r1, r2, n
;   Writes the four words of m<r1> to the first coefficient of blocks
;   n+0, n+4, n+8, n+12 and the four words of m<r2> to the first coefficient
;   of blocks n+1, n+5, n+9, n+13 (each block is 2*16 bytes).
;   Expects blockq to point at the blocks; clobbers dc1, dc2, m<r1>, m<r2>.
%macro SCATTER_WHT 3
    ; first register: words 0 and 1
    movd      dc1d, m%1
    mov       [blockq+2*16*(0+%3)], dc1w
    shr       dc1d, 16
    mov       [blockq+2*16*(4+%3)], dc1w
    ; first register: words 2 and 3
    psrlq     m%1, 32
    movd      dc1d, m%1
    mov       [blockq+2*16*(8+%3)], dc1w
    shr       dc1d, 16
    mov       [blockq+2*16*(12+%3)], dc1w

    ; second register: words 0 and 1
    movd      dc2d, m%2
    mov       [blockq+2*16*(1+%3)], dc2w
    shr       dc2d, 16
    mov       [blockq+2*16*(5+%3)], dc2w
    ; second register: words 2 and 3
    psrlq     m%2, 32
    movd      dc2d, m%2
    mov       [blockq+2*16*(9+%3)], dc2w
    shr       dc2d, 16
    mov       [blockq+2*16*(13+%3)], dc2w
%endmacro
1189
; HADAMARD4_1D: one 1-D pass over the four word registers numbered %1..%4,
; built from two SUMSUB_BADC butterfly stages (SUMSUB_BADC is defined
; elsewhere). The closing three-operand SWAP renames registers; presumably
; it brings the outputs back into %1..%4 order -- confirm against the
; SUMSUB_BADC and SWAP definitions before relying on a particular ordering.
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro
1195
; Expands to vp8_luma_dc_wht for the currently selected instruction set.
; The function reads 16 words through its second argument, clears them in
; memory, runs HADAMARD4_1D twice with a transpose in between, shifts the
; results right by 3 and scatters them into the first coefficient of 16
; consecutive 32-byte blocks at its first argument.
%macro VP8_DC_WHT 0
; two arguments, three general-purpose registers: dc1 carries the input
; pointer, dc2 is an extra scratch register
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    ; four rows of four words each go into m0..m3
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
%if cpuflag(sse)
    ; clear the 32 input bytes with two aligned 16-byte stores; xmm0 is named
    ; explicitly and is a different register from the MMX m0 loaded above
    xorps      xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    ; no SSE: clear the 32 input bytes with four 8-byte stores
    pxor         m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    ; bias applied to the first register ahead of the shift by 3 below
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    ; the input pointer is dead from here on: SCATTER_WHT uses dc1/dc2 as
    ; scratch. Word j of m0..m3 is stored to block 4*j+0 .. 4*j+3.
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro
1225
; The plain-MMX build of vp8_luma_dc_wht is only assembled for 32-bit x86.
; The SSE build is always assembled; inside the macro, cpuflag(sse) selects
; the xorps/movaps way of clearing the input words.
%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT
1232