;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pd_64: times 8 dd 64

cextern pw_1023
cextern pw_4095

SECTION .text

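; Conventions shared by all 8-tap routines below (as implied by the loads and
; arithmetic in the loop bodies):
; - filteryq is expected to point at 128 bytes of filter data, laid out as
;   four 32-byte blocks of word coefficient pairs (taps 0/1, 2/3, 4/5, 6/7),
;   so that one punpcklwd + pmaddwd accumulates two taps per output pixel
; - intermediates are kept as dwords, rounded with pd_64 and shifted right
;   by 7 (the 8-tap coefficients sum to 128), then packed back to words
; - m5 holds the bitdepth maximum (pw_1023 for 10 bit, pw_4095 for 12 bit);
;   the _12 entry points only load a different clamp value and jump into the
;   corresponding _10 body
; - without SSE4, packssdw is used instead of packusdw, so negative results
;   additionally have to be clipped to zero with pmaxsw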
%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
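    ; the loads below start 3 samples to the left and end 4 samples to the
    ; right of the current position (byte offsets -6..+8), one per tap;
    ; punpcklwd interleaves adjacent taps' samples so that pmaddwd against a
    ; coefficient pair accumulates two taps at once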
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movu        m1, [srcq+4]
    movu        m3, [srcq+6]
    paddd       m0, m2
    movu        m2, [srcq+8]
    add       srcq, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

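; as filter_h4_fn, but for blocks mmsize/2 pixels wide (8 with XMM, 16 with
; YMM); m0 accumulates the even output pixels and m1 the odd ones, and the
; two halves are re-interleaved with punpcklwd after packing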
%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add       srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    punpcklwd   m0, m1
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif

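; 4-pixel-wide vertical 8-tap filter; a second base pointer (src4q) is used
; so that all 8 source rows can be addressed with only sstrideq/sstride3q
; offsets. On x86-32 only the first four arguments are loaded into registers,
; so h and filtery are picked up from the stack (r4mp/r5mp).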
%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
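    ; srcq now points 3 rows above the output row and src4q 1 row below it,
    ; so the loop reads the 8 tap rows as srcq+{0..3}*sstride and
    ; src4q+{0..3}*sstride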
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+ 32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m3, [filteryq+ 96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

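; as filter_v4_fn, but for blocks mmsize/2 pixels wide (8 with XMM, 16 with
; YMM); SBUTTERFLY splits each pair of rows into low/high word interleaves,
; so two dword accumulators (m0, m1) together cover the full output width.
; The default register count is 13 here: on x86-64 the rounding constant is
; kept in m11 and the zero for the non-SSE4 clip in m12.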
%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m12, m12
%endif
%if ARCH_X86_64
    mova       m11, [pd_64]
%endif
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+  0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movu        m4, [src4q]
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+ 32]
    pmaddwd     m3, [filteryq+ 32]
%endif
    paddd       m0, m2
    paddd       m1, m3
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+ 64]
    pmaddwd     m2, [filteryq+ 64]
%endif
    paddd       m0, m4
    paddd       m1, m2
    movu        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+ 96]
    pmaddwd     m4, [filteryq+ 96]
%endif
    paddd       m0, m3
    paddd       m1, m4
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m12
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif