1;******************************************************************************
2;* MMX/SSE2-optimized functions for the RV40 decoder
3;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
SECTION_RODATA

; pmulhrsw multiplier used by the SSSE3 weighting code:
; (x * 1024 + 0x4000) >> 15 == (x + 16) >> 5
pw_1024:   times 8 dw 1 << (16 - 6)

; Byte coefficient pairs for pmaddubsw (SSSE3 filters).
; Each filter occupies 32 bytes: first the pair applied to the outer taps
; (taps 0/1, and taps 5/4 with the pixel pair swapped), then the pair
; applied to the two centre taps (taps 2/3).
sixtap_filter_hb_m:  times 8 db   1, -5
                     times 8 db  52, 20
                     ; multiplied by 2 to have the same shift
                     times 8 db   2, -10
                     times 8 db  40,  40
                     ; back to normal
                     times 8 db   1, -5
                     times 8 db  20, 52

; Word coefficients for pmullw (MMX/SSE2 filters).
; Each filter occupies 64 bytes, read as COEFF05, COEFF14, COEFF2, COEFF3:
; taps 0 and 5 share a coefficient, as do taps 1 and 4.
sixtap_filter_v_m:   times 8 dw   1
                     times 8 dw  -5
                     times 8 dw  52
                     times 8 dw  20
                     ; multiplied by 2 to have the same shift
                     times 8 dw   2
                     times 8 dw -10
                     times 8 dw  40
                     times 8 dw  40
                     ; back to normal
                     times 8 dw   1
                     times 8 dw  -5
                     times 8 dw  20
                     times 8 dw  52

; Table base as seen by the LOAD macro: under PIC the base address lives in
; picregq (loaded with lea by each function), otherwise the symbol is used
; directly.
%ifdef PIC
%define sixtap_filter_hw   picregq
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
; There is no separate horizontal word table: the word-based horizontal
; filter reads the same coefficients as the vertical one, so the alias must
; resolve to sixtap_filter_v_m (a sixtap_filter_hw_m symbol does not exist).
%define sixtap_filter_hw   sixtap_filter_v_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define sixtap_filter_v    sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks for the SSSE3 horizontal filter, applied to 16 bytes loaded
; from src-2. Each output word is fed by one byte pair:
;   shuf1: pixels (x-2, x-1)    shuf2: pixels (x, x+1)
;   shuf3: pixels (x+3, x+2)    (swapped so the outer pair can be reused)
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

cextern  pw_32
cextern  pw_16
cextern  pw_512

SECTION .text
75
76;-----------------------------------------------------------------------------
77; subpel MC functions:
78;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
82;----------------------------------------------------------------------
; LOAD index, table
; Converts the coefficient-set byte offset held in argument register %1 into
; the address of that coefficient set: %1q = table base + %1d.
;   %1 = argument name holding the offset (my / mx in this file)
;   %2 = table base; only referenced in non-PIC builds. Under PIC the base
;        is taken from picregq, which the caller must have loaded already.
%macro LOAD  2
%if ARCH_X86_64
    ; The offset is passed as a C int. Neither 64-bit calling convention
    ; guarantees the upper 32 bits of an int argument (register or stack
    ; slot), so widen it before it is used in 64-bit address arithmetic.
    movsxd   %1q, %1d
%endif
%ifdef PIC
    add      %1q, picregq
%else
    add      %1q, %2
%endif
%endmacro
93
; STORE result, scratch, op
; Packs the words in %1 to unsigned bytes with saturation and writes one
; movh-sized group to [dstq].
;   %1 = register holding the filtered words (clobbered)
;   %2 = scratch register, only written when %3 is avg
;   %3 = put: overwrite the destination
;        avg: average the result with the bytes already at [dstq] (PAVGB)
%macro STORE 3
    packuswb  %1, %1
%ifidn %3, avg
    movh      %2, [dstq]                   ; current destination pixels
    PAVGB     %1, %2
%endif
    movh  [dstq], %1
%endmacro
104
; FILTER_V op
; Emits <op>_rv40_qpel_v: vertical 6-tap filter using word multiplies.
;   %1 = put or avg (forwarded to STORE)
; Arguments of the generated function: dst, dststride, src, srcstride,
; height (rows to produce), my (byte offset of the coefficient set inside
; sixtap_filter_v_m).
; Register roles inside the loop:
;   m0..m4 = the five most recent source rows, unpacked to words
;   m5     = newly read row         m6 = accumulator        m7 = zero
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    LOAD      my, sixtap_filter_v
    pxor      m7, m7

    ; prime the window with the rows at -2, -1, 0, +1, +2 relative to src
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq               ; srcq now sits one row below src
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m3, m7
    punpcklbw m4, m7

    ; keep the coefficients in registers when m8..m11 exist, otherwise
    ; use them as memory operands
%ifndef m8
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%else
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova     m10, [myq+32]
    mova     m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%endif

.row:
    ; outer taps: (row0 + row5) * COEFF05
    movh      m5, [srcq+2*srcstrideq]      ; sixth row of the window
    punpcklbw m5, m7
    paddw     m0, m5
    pmullw    m0, COEFF05

    ; taps 1 and 4: (row1 + row4) * COEFF14
    mova      m6, m1
    paddw     m6, m4
    pmullw    m6, COEFF14
    paddw     m6, m0
    paddw     m6, [pw_32]                  ; rounding term for the >> 6

    ; centre taps; the window slides down by one row as each old row
    ; is consumed
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3
    mova      m3, m4
    mova      m4, m5

    ; scale, clip and write (m5 is free to be used as scratch now)
    psraw     m6, 6
    STORE     m6, m5, %1

    ; advance to the next output row
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd
    jg .row
    REP_RET
%endmacro
175
; FILTER_H op
; Emits <op>_rv40_qpel_h: horizontal 6-tap filter using word multiplies.
;   %1 = put or avg (forwarded to STORE)
; Arguments of the generated function: dst, dststride, src, srcstride,
; height (rows to produce), mx (byte offset of the coefficient set).
; The word coefficients are shared with the vertical filter, which is why
; the sixtap_filter_v table is addressed here.
; Register roles inside the loop:
;   m0..m5 = pixels at x-2 .. x+3 as words, m6 = pw_32 (rounding), m7 = zero
%macro FILTER_H  1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    LOAD      mx, sixtap_filter_v
    mova      m6, [pw_32]
    pxor      m7, m7

    ; keep the coefficients in registers when m8..m11 exist, otherwise
    ; use them as memory operands
%ifndef m8
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%else
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%endif

.row:
    ; six overlapping loads, one per tap position
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7

    ; symmetric taps are summed before multiplying
    paddw     m0, m5
    paddw     m1, m4
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3

    ; accumulate, round, scale
    paddw     m1, m2
    paddw     m0, m6
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; advance to the next row
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd
    jg .row
    REP_RET
%endmacro
232
; MMX-register variants are only assembled for 32-bit x86 targets.
%if ARCH_X86_32
; put needs nothing beyond plain MMX
INIT_MMX  mmx
FILTER_V  put
FILTER_H  put

; avg relies on PAVGB, so it is built once per byte-average flavour
INIT_MMX  mmxext
FILTER_V  avg
FILTER_H  avg

INIT_MMX  3dnow
FILTER_V  avg
FILTER_H  avg
%endif

; SSE2 variants, assembled for every target
INIT_XMM  sse2
FILTER_V  put
FILTER_H  put
FILTER_V  avg
FILTER_H  avg
252
; FILTER_SSSE3 op
; Emits <op>_rv40_qpel_v and <op>_rv40_qpel_h built on pmaddubsw, reading
; byte coefficient pairs from sixtap_filter_hb_m.
;   %1 = put or avg (forwarded to STORE)
; Both functions take dst, dststride, src, srcstride, height and the byte
; offset (my / mx) of the coefficient set inside the table.
; Rounding and the final shift are folded into one pmulhrsw by pw_512:
; (x * 512 + 0x4000) >> 15 == (x + 32) >> 6.
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    LOAD      my, sixtap_filter_hb
    mova      m5, [myq]                    ; outer-tap coefficient pairs

    ; prime m0..m4 with the rows at -2 .. +2 relative to src (kept as bytes)
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea     srcq, [srcq+2*srcstrideq]      ; srcq -> row +3, the next one to read

.row:
    movh      m7, [srcq]                   ; sixth row of the window

    ; rows 0/1 with the outer pair
    punpcklbw m0, m1
    pmaddubsw m0, m5

    ; rows 2/3 with the centre pair
    mova      m6, m2
    punpcklbw m6, m3
    pmaddubsw m6, [myq+16]
    paddw     m6, m0

    ; slide the window down by one row
    mova      m0, m1
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7

    ; rows 5/4 (in that order) reuse the outer pair
    punpcklbw m7, m3
    pmaddubsw m7, m5
    paddw     m6, m7

    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; advance to the next output row
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec       heightd
    jg       .row
    REP_RET

cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    LOAD      mx, sixtap_filter_hb
    mova      m5, [mxq]                    ; outer-tap coefficient pairs
    mova      m6, [mxq+16]                 ; centre-tap coefficient pairs
    mova      m7, [filter_h6_shuf1]        ; pixel pairs (x-2, x-1)
    mova      m3, [filter_h6_shuf2]        ; pixel pairs (x,   x+1)
    mova      m4, [filter_h6_shuf3]        ; pixel pairs (x+3, x+2)

.row:
    ; a single unaligned load feeds all three shuffles
    movu      m0, [srcq-2]
    mova      m2, m0
    mova      m1, m0
    pshufb    m2, m4
    pshufb    m1, m3
    pshufb    m0, m7
    pmaddubsw m2, m5
    pmaddubsw m1, m6
    pmaddubsw m0, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; advance to the next row
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd
    jg .row
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg
336
; RV40_WCORE use5bit, dst, src1, src2 [, stride]
; Blends two mmsize/2-byte groups from each source and stores the packed
; result at [dst + r6].
;   %1 = 0: 14-bit weights, anything else: 5-bit weights
;   %2 = dst, %3 = src1, %4 = src2 (address expressions, indexed by r6)
;   %5 = stride; only passed for 8x8 blocks with SSE2, where the second
;        group is the next row instead of the next mmsize/2 bytes
; Expected on entry: m0 = zero, m1 = rounding constant (pw_16, or pw_1024
; for the pmulhrsw form), m3 = weight applied to src1, m2 = weight applied
; to src2 (for SSSE3 with 5-bit weights m3 holds both, byte-interleaved).
; Clobbers m4..m7. r6 has the same value on exit as on entry plus, in the
; 5-operand form, one stride.
%macro RV40_WCORE  4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 5
    ; 8x8 block and SSE2, stride was provided: second group is the next row
%define OFFSET r6
    add        r6, r5
%else
%define OFFSET r6 + mmsize / 2
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 != 0
    ; 5-bit weights
%if cpuflag(ssse3)
    ; interleave src1/src2 bytes so one pmaddubsw applies both weights
    punpcklbw  m4, m5
    pmaddubsw  m4, m3
    punpcklbw  m6, m7
    pmaddubsw  m6, m3
%else
    ; src1 * m3
    punpcklbw  m4, m0
    punpcklbw  m6, m0
    pmullw     m4, m3
    pmullw     m6, m3
    ; src2 * m2
    punpcklbw  m5, m0
    punpcklbw  m7, m0
    pmullw     m5, m2
    pmullw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%endif

%else
    ; 14-bit weights: pre-shift the pixels by 7 and keep the high half of
    ; the product
    ; src1 * m3
    punpcklbw  m4, m0
    punpcklbw  m6, m0
    psllw      m4, 7
    psllw      m6, 7
    pmulhw     m4, m3
    pmulhw     m6, m3
    ; src2 * m2
    punpcklbw  m5, m0
    punpcklbw  m7, m0
    psllw      m5, 7
    psllw      m7, 7
    pmulhw     m5, m2
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and SSE2: low half goes to the first row,
    ; high half to the row r6 was advanced to above
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro
415
416
; MAIN_LOOP size, use5bit
; One iteration of the weighting loop.
;   %1 = block size (8 or 16)
;   %2 = forwarded to RV40_WCORE as its weight-precision selector
; Uses r0 = dst, r1 = src1, r2 = src2, r5 = stride, r6 = running offset.
; The final add leaves the flags set for the caller's loop branch.
%macro MAIN_LOOP   2
%if mmsize == 8
    ; MMX: 8 bytes per call, so a 16-wide row takes two calls
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif
%elif %1 == 8
    ; XMM, 8-wide block: one call covers two rows and already advances r6
    ; by one stride
    RV40_WCORE %2, r0, r1, r2, r5
%else
    ; XMM, 16-wide block: one call covers one full row
    RV40_WCORE %2, r0, r1, r2
%endif

    ; Step r6 to the next unprocessed row
    add        r6, r5

%endmacro
439
; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=rnd|nornd  %2=block size (8 or 16)  %3=log2(%2)
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; The rnd variants run the 14-bit weight path of RV40_WCORE and the nornd
; variants the 5-bit one; the original note says the simplified form
; applies when the weights are multiples of 2^9 (that check is not made
; in this macro).
; Register roles: r0=dst, r1=src1, r2=src2, r3=w1, r4=w2, r5=stride,
; r6=row offset counting up from -(stride << %3) to zero.
%macro RV40_WEIGHT  3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
    pxor       m0, m0
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    movd       m2, r3d
    movd       m3, r4d

    ; Point r0..r2 past the end of the block and index with a negative
    ; offset, so the loop ends when r6 reaches zero
    mov        r6, r5
    shl        r6, %3
    add        r2, r6
    add        r1, r6
    add        r0, r6
    neg        r6

    ; Broadcast the weights in the layout RV40_WCORE expects
%ifnidn %1,rnd
%define  RND   1
%if cpuflag(ssse3)
    punpcklbw  m3, m2
%else
    SPLATW     m2, m2
%endif
%else
%define  RND   0
    SPLATW     m2, m2
%endif
    SPLATW     m3, m3

.next:
    MAIN_LOOP  %2, RND
    jnz        .next
    REP_RET
%endmacro
483
; Emits the four weight functions (rnd/nornd x 8/16) for the instruction
; set selected by the preceding INIT_* line.
%macro RV40_WEIGHT_ALL 0
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4
%endmacro

INIT_MMX mmxext
RV40_WEIGHT_ALL

INIT_XMM sse2
RV40_WEIGHT_ALL

INIT_XMM ssse3
RV40_WEIGHT_ALL
501