;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_256
cextern pw_64

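; Tap layouts for the three table variants built below: byte pairs (for
; pmaddubsw in the ssse3 kernels), one full word vector per tap (for pmullw
; in the mmxext/sse2 kernels), and word pairs, presumably for the 16 bpp
; filter kernels, which are not in this file.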
%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro

%macro F8_16BPP_TAPS 8
times 8 dw %1, %2
times 8 dw %3, %4
times 8 dw %5, %6
times 8 dw %7, %8
%endmacro

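; Each FILTER expansion emits three filter sets (smooth, regular, sharp),
; each with taps for the 15 non-zero sub-pixel positions; position 0 (the
; identity filter) is omitted.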
%macro FILTER 1
const filters_%1 ; smooth
                    F8_TAPS -3, -1,  32,  64,  38,   1, -3,  0
                    F8_TAPS -2, -2,  29,  63,  41,   2, -3,  0
                    F8_TAPS -2, -2,  26,  63,  43,   4, -4,  0
                    F8_TAPS -2, -3,  24,  62,  46,   5, -4,  0
                    F8_TAPS -2, -3,  21,  60,  49,   7, -4,  0
                    F8_TAPS -1, -4,  18,  59,  51,   9, -4,  0
                    F8_TAPS -1, -4,  16,  57,  53,  12, -4, -1
                    F8_TAPS -1, -4,  14,  55,  55,  14, -4, -1
                    F8_TAPS -1, -4,  12,  53,  57,  16, -4, -1
                    F8_TAPS  0, -4,   9,  51,  59,  18, -4, -1
                    F8_TAPS  0, -4,   7,  49,  60,  21, -3, -2
                    F8_TAPS  0, -4,   5,  46,  62,  24, -3, -2
                    F8_TAPS  0, -4,   4,  43,  63,  26, -2, -2
                    F8_TAPS  0, -3,   2,  41,  63,  29, -2, -2
                    F8_TAPS  0, -3,   1,  38,  64,  32, -1, -3
                    ; regular
                    F8_TAPS  0,  1,  -5, 126,   8,  -3,  1,  0
                    F8_TAPS -1,  3, -10, 122,  18,  -6,  2,  0
                    F8_TAPS -1,  4, -13, 118,  27,  -9,  3, -1
                    F8_TAPS -1,  4, -16, 112,  37, -11,  4, -1
                    F8_TAPS -1,  5, -18, 105,  48, -14,  4, -1
                    F8_TAPS -1,  5, -19,  97,  58, -16,  5, -1
                    F8_TAPS -1,  6, -19,  88,  68, -18,  5, -1
                    F8_TAPS -1,  6, -19,  78,  78, -19,  6, -1
                    F8_TAPS -1,  5, -18,  68,  88, -19,  6, -1
                    F8_TAPS -1,  5, -16,  58,  97, -19,  5, -1
                    F8_TAPS -1,  4, -14,  48, 105, -18,  5, -1
                    F8_TAPS -1,  4, -11,  37, 112, -16,  4, -1
                    F8_TAPS -1,  3,  -9,  27, 118, -13,  4, -1
                    F8_TAPS  0,  2,  -6,  18, 122, -10,  3, -1
                    F8_TAPS  0,  1,  -3,   8, 126,  -5,  1,  0
                    ; sharp
                    F8_TAPS -1,  3,  -7, 127,   8,  -3,  1,  0
                    F8_TAPS -2,  5, -13, 125,  17,  -6,  3, -1
                    F8_TAPS -3,  7, -17, 121,  27, -10,  5, -2
                    F8_TAPS -4,  9, -20, 115,  37, -13,  6, -2
                    F8_TAPS -4, 10, -23, 108,  48, -16,  8, -3
                    F8_TAPS -4, 10, -24, 100,  59, -19,  9, -3
                    F8_TAPS -4, 11, -24,  90,  70, -21, 10, -4
                    F8_TAPS -4, 11, -23,  80,  80, -23, 11, -4
                    F8_TAPS -4, 10, -21,  70,  90, -24, 11, -4
                    F8_TAPS -3,  9, -19,  59, 100, -24, 10, -4
                    F8_TAPS -3,  8, -16,  48, 108, -23, 10, -4
                    F8_TAPS -2,  6, -13,  37, 115, -20,  9, -4
                    F8_TAPS -2,  5, -10,  27, 121, -17,  7, -3
                    F8_TAPS -1,  3,  -6,  17, 125, -13,  5, -2
                    F8_TAPS  0,  1,  -3,   8, 127,  -7,  3, -1
%endmacro

%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
%define F8_TAPS F8_16BPP_TAPS
; int16_t ff_filters_16bpp[3][15][4][16]
FILTER 16bpp

SECTION .text

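; 8-tap 1D horizontal filter, mmxext/sse2 flavour: 4 (mmx) or 8 (xmm) output
; pixels per iteration. Source bytes are zero-extended to words, multiplied
; by per-tap word coefficients with pmullw, summed, rounded and packed back
; to bytes. A rough scalar sketch of one output pixel (8 bpp):
;
;     int sum = 64;                           // rounding bias
;     for (int i = 0; i < 8; i++)
;         sum += src[x - 3 + i] * filter[i];  // 8 taps centred on x
;     dst[x] = av_clip_uint8(sum >> 7);       // taps sum to 128 = 1 << 7
;
; The avg variants additionally pavgb the result with the existing dst row.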
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor        m5, m5
    mova        m6, [pw_64]
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [srcq+2]
    movh        m3, [srcq+3]
    movh        m4, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg

INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg

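; 8-tap 1D horizontal filter, ssse3 flavour: neighbouring source bytes are
; interleaved and multiplied against byte-pair taps with pmaddubsw, then
; rounded with pmulhrsw by 256, which computes (x * 256 + 0x4000) >> 15 =
; (x + 64) >> 7, i.e. the same rounding shift as the sse2 version above.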
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
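; Full-register-width horizontal variant (16 pixels per row for xmm, 32 for
; ymm); x86-64 only, since it uses 14 SIMD registers.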
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova       m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

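; 8-tap 1D vertical filter, mmxext/sse2 flavour: same arithmetic as the
; horizontal version, but the 8 taps are applied across 8 consecutive rows.
; Two base pointers are kept (srcq at row -3, src4q at row +1) so that rows
; -3..+4 around the current line can be addressed as base + {0,1,2,3}*stride.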
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    pxor        m5, m5
    mova        m6, [pw_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova       m10, [filteryq+ 48]
    mova       m11, [filteryq+ 64]
    mova       m12, [filteryq+ 80]
    mova       m13, [filteryq+ 96]
    mova       m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    movh        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg

INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg

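; 8-tap 1D vertical filter, ssse3 flavour: bytes from adjacent rows are
; interleaved and fed to pmaddubsw, with the same pmulhrsw rounding as the
; horizontal ssse3 version.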
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

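; Full-register-width vertical variant (ssse3/avx2), x86-64 only.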
%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova       m13, [pw_256]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova       m10, [filteryq+64]
    mova       m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add      src4q, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

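; Full-pel copy/average functions. Macro parameters: %1 = put/avg, %2 = block
; width in bytes, %3-%5 = address offsets for the 2nd-4th load/store of each
; iteration (the macro prepends s/d so that e.g. strideq resolves to
; sstrideq/dstrideq), %6 = rows handled per iteration, %7 = bit-depth suffix
; for avg (default 0: no suffix, used by put), %8 = number of SIMD registers
; (default 4).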
%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %7 == 8
%define %%pavg pavgb
%define %%szsuf _8
%elif %7 == 16
%define %%pavg pavgw
%define %%szsuf _16
%else
%define %%szsuf
%endif

%if %2 <= mmsize
cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea  sstride3q, [sstrideq*3]
    lea  dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
%if %2/mmsize == 8
    %%srcfn     m4, [srcq+mmsize*4]
    %%srcfn     m5, [srcq+mmsize*5]
    %%srcfn     m6, [srcq+mmsize*6]
    %%srcfn     m7, [srcq+mmsize*7]
%endif
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    %%pavg      m0, [dstq]
    %%pavg      m1, [dstq+d%3]
    %%pavg      m2, [dstq+d%4]
    %%pavg      m3, [dstq+d%5]
%if %2/mmsize == 8
    %%pavg      m4, [dstq+mmsize*4]
    %%pavg      m5, [dstq+mmsize*5]
    %%pavg      m6, [dstq+mmsize*6]
    %%pavg      m7, [dstq+mmsize*7]
%endif
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
%if %2/mmsize == 8
    %%dstfn [dstq+mmsize*4], m4
    %%dstfn [dstq+mmsize*5], m5
    %%dstfn [dstq+mmsize*6], m6
    %%dstfn [dstq+mmsize*7], m7
%endif
    lea       dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4,  strideq, strideq*2, stride3q, 4
fpel_fn put, 8,  strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1, 0, 8
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2, 8
fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize,  strideq,   strideq+mmsize, 2
fpel_fn put, 128, mmsize, mmsize*2,  mmsize*3, 1
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
fpel_fn avg, 64, mmsize,  strideq,   strideq+mmsize, 2, 8
%endif
INIT_MMX mmxext
fpel_fn avg,  8,  strideq, strideq*2, stride3q, 4, 16
INIT_XMM sse2
fpel_fn avg,  16, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  32, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg,  64, mmsize,  mmsize*2,  mmsize*3, 1, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg,  32, strideq, strideq*2, stride3q, 4, 16
fpel_fn avg,  64, mmsize,  strideq,   strideq+mmsize, 2, 16
fpel_fn avg, 128, mmsize,  mmsize*2,  mmsize*3, 1, 16
%endif
%undef s16
%undef d16
%undef s32
%undef d32
