;******************************************************************************
;* mpeg4 qpel
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Shared x86 macro layer. The cglobal / INIT_MMX / mova / movh / SWAP / PAVGB /
; REP_RET abstractions used below are not defined in this file.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Constants defined elsewhere in the project (only referenced here).
; pw_3 and pw_20 are used as filter multipliers; pw_15 / pw_16 are selected
; as PW_ROUND by the instantiation blocks further down.
cextern pb_1
cextern pw_3
cextern pw_15
cextern pw_16
cextern pw_20


SECTION .text

; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; Writes h rows of 8 bytes to dst. Each byte is the average of the matching
; src1 and src2 bytes rounded DOWN: PAVGB rounds up, so both inputs and the
; result are complemented, giving ~avg(~a, ~b) == (a + b) >> 1.
;
; Register roles:
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5d = h
;
; src2 is consumed as tightly packed 8-byte rows (it advances by 8 per row,
; not by a stride argument).
;
; Row-count contract: one row is peeled off when h is odd, then the loop
; produces 4 rows per pass and only leaves when the counter reaches exactly
; zero. h must therefore be a non-zero multiple of 4, optionally plus 1.
%macro PUT_NO_RND_PIXELS8_L2 0
cglobal put_no_rnd_pixels8_l2, 6,6
    movsxdifnidn r4, r4d
    movsxdifnidn r3, r3d
    pcmpeqb      m6, m6             ; m6 = all ones, the complement mask
    test        r5d, 1
    je .loop
    ; odd h: emit a single row first so the remaining count is even
    mova         m0, [r1]
    mova         m1, [r2]
    add          r1, r4
    add          r2, 8
    pxor         m0, m6
    pxor         m1, m6
    PAVGB        m0, m1
    pxor         m0, m6
    mova       [r0], m0
    add          r0, r3
    dec r5d
.loop:
    ; rows 0 and 1 of this pass
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r1]
    add          r1, r4
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    add          r0, r3
    mova       [r0], m1
    add          r0, r3
    ; rows 2 and 3 of this pass
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r1]
    add          r1, r4
    mova         m2, [r2+16]
    mova         m3, [r2+24]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    add          r0, r3
    mova       [r0], m1
    add          r0, r3
    add          r2, 32             ; four packed src2 rows consumed
    sub         r5d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2


; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; 16-byte-wide variant of the round-down average: every output row is
; ~avg(~src1, ~src2) == (a + b) >> 1, computed as two 8-byte halves.
;
; Register roles:
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5d = h
;
; src2 is consumed as tightly packed 16-byte rows (it advances by 16 per row).
;
; Row-count contract: one row is peeled off when h is odd, then the loop
; produces 2 rows per pass and only leaves when the counter reaches exactly
; zero. The count left after the optional peel must be even and non-zero.
%macro PUT_NO_RND_PIXELS16_l2 0
cglobal put_no_rnd_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pcmpeqb      m6, m6             ; m6 = all ones, the complement mask
    test        r5d, 1
    je .loop
    ; odd h: emit a single row first so the remaining count is even
    mova         m0, [r1]
    mova         m1, [r1+8]
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    add          r1, r4
    add          r2, 16
    mova       [r0], m0
    mova     [r0+8], m1
    add          r0, r3
    dec r5d
.loop:
    ; first row of this pass
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    mova     [r0+8], m1
    add          r0, r3
    ; second row of this pass
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    mova         m2, [r2+16]
    mova         m3, [r2+24]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    mova     [r0+8], m1
    add          r0, r3
    add          r2, 32             ; two packed src2 rows consumed
    sub         r5d, 2
    jne .loop
    REP_RET
%endmacro

; The same body is emitted once per instruction-set flavour.
INIT_MMX mmxext
PUT_NO_RND_PIXELS16_l2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS16_l2

; MPEG4_QPEL16_H_LOWPASS prefix
;
; Emits <prefix>_mpeg4_qpel16_h_lowpass: a horizontal 8-tap low-pass filter
; producing 16 output bytes per row.
;
; Register roles as used by the body:
;   r0 = destination, r1 = source, r2 = destination stride,
;   r3 = source stride, r4d = number of rows (must be non-zero: the loop
;   only leaves when the counter reaches exactly zero)
;
; Every output is
;   (20*(a+b) - 6*(c+d) + 3*(e+f) - (g+h) + PW_ROUND) >> 5
; saturated to a byte by packuswb, where a,b are the two nearest source
; pixels and each following pair lies one step farther out on either side.
; Only source bytes 0..16 of a row are loaded ([r1], [r1+5], [r1+9]); taps
; that would fall outside that range are built by shuffling in-row pixels.
;
; Expansion-time context (must be defined by the caller of the macro):
;   PW_ROUND = label of the word rounding constant added before the shift
;   OP_MOV   = store macro taking (dst_mem, src_reg, tmp_reg)
;
; The 16-byte stack reservation holds the first four word results at
; [rsp+8] while the next four are being computed.
%macro MPEG4_QPEL16_H_LOWPASS 1
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7             ; zero, for byte -> word unpacking
.loop:
    ; ---- outputs 0..3 ----
    mova         m0, [r1]
    mova         m1, m0
    mova         m2, m0
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    pshufw       m5, m0, 0x90
    pshufw       m6, m0, 0x41
    mova         m3, m2
    mova         m4, m2
    psllq        m2, 8
    psllq        m3, 16
    psllq        m4, 24
    punpckhbw    m2, m7
    punpckhbw    m3, m7
    punpckhbw    m4, m7
    paddw        m5, m3
    paddw        m6, m2
    paddw        m5, m5
    psubw        m6, m5
    pshufw       m5, m0, 6
    pmullw       m6, [pw_3]
    paddw        m0, m4
    paddw        m5, m1
    pmullw       m0, [pw_20]
    psubw        m0, m5
    paddw        m6, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    mova    [rsp+8], m0             ; spill outputs 0..3 (words)
    ; ---- outputs 4..7 ----
    mova         m0, [r1+5]
    mova         m5, m0
    mova         m6, m0
    psrlq        m0, 8
    psrlq        m5, 16
    punpcklbw    m0, m7
    punpcklbw    m5, m7
    paddw        m2, m0
    paddw        m3, m5
    paddw        m2, m2
    psubw        m3, m2
    mova         m2, m6
    psrlq        m6, 24
    punpcklbw    m2, m7
    punpcklbw    m6, m7
    pmullw       m3, [pw_3]
    paddw        m1, m2
    paddw        m4, m6
    pmullw       m1, [pw_20]
    psubw        m3, m4
    paddw        m1, [PW_ROUND]
    paddw        m3, m1
    psraw        m3, 5
    mova         m1, [rsp+8]        ; reload outputs 0..3
    packuswb     m1, m3
    OP_MOV     [r0], m1, m4         ; store left 8 bytes (m4 is scratch)
    ; ---- outputs 8..11 ----
    mova         m1, [r1+9]
    mova         m4, m1
    mova         m3, m1
    psrlq        m1, 8
    psrlq        m4, 16
    punpcklbw    m1, m7
    punpcklbw    m4, m7
    paddw        m5, m1
    paddw        m0, m4
    paddw        m5, m5
    psubw        m0, m5
    mova         m5, m3
    psrlq        m3, 24
    pmullw       m0, [pw_3]
    punpcklbw    m3, m7
    paddw        m2, m3
    psubw        m0, m2
    mova         m2, m5
    punpcklbw    m2, m7
    punpckhbw    m5, m7
    paddw        m6, m2
    pmullw       m6, [pw_20]
    paddw        m0, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    ; ---- outputs 12..15 (right edge taps come from shuffles of m5) ----
    paddw        m3, m5
    pshufw       m6, m5, 0xf9
    paddw        m6, m4
    pshufw       m4, m5, 0xbe
    pshufw       m5, m5, 0x6f
    paddw        m4, m1
    paddw        m5, m2
    paddw        m6, m6
    psubw        m4, m6
    pmullw       m3, [pw_20]
    pmullw       m4, [pw_3]
    psubw        m3, m5
    paddw        m4, [PW_ROUND]
    paddw        m4, m3
    psraw        m4, 5
    packuswb     m0, m4
    OP_MOV   [r0+8], m0, m4         ; store right 8 bytes
    add          r1, r3
    add          r0, r2
    dec r4d
    jne .loop
    REP_RET
%endmacro

; PUT_OP dst, src [, tmp]
; Plain full-register store of src to dst. The optional third argument is
; accepted only so the call shape matches AVG_OP; it is not used.
%macro PUT_OP 2-3
    mova %1, %2
%endmacro

; AVG_OP dst, src, tmp
; Averaging store: loads the current contents of dst into tmp, averages src
; with it using pavgb, and writes the result back to dst.
; Clobbers src and tmp. The macro is declared with 2-3 parameters, but the
; body always uses the third one, so callers must pass a scratch register.
%macro AVG_OP 2-3
    mova  %3, %1
    pavgb %2, %3
    mova  %1, %2
%endmacro

; Three flavours of the 16-wide horizontal filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to pick the rounding constant and the
; store operation.
INIT_MMX mmxext
; put: rounding constant pw_16, plain store
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put
; avg: rounding constant pw_16, result averaged with the destination
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL16_H_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put_no_rnd



; MPEG4_QPEL8_H_LOWPASS prefix
;
; Emits <prefix>_mpeg4_qpel8_h_lowpass: a horizontal 8-tap low-pass filter
; producing 8 output bytes per row.
;
; Register roles as used by the body:
;   r0 = destination, r1 = source, r2 = destination stride,
;   r3 = source stride, r4d = number of rows (must be non-zero: the loop
;   only leaves when the counter reaches exactly zero)
;
; Every output is
;   (20*(a+b) - 6*(c+d) + 3*(e+f) - (g+h) + PW_ROUND) >> 5
; saturated to a byte by packuswb, where a,b are the two nearest source
; pixels and each following pair lies one step farther out on either side.
; Taps beyond the loaded pixels are built by shuffling in-row pixels.
;
; Expansion-time context (must be defined by the caller of the macro):
;   PW_ROUND = label of the word rounding constant added before the shift
;   OP_MOV   = store macro taking (dst_mem, src_reg, tmp_reg)
;
; NOTE(review): cglobal reserves 8 bytes of stack here, but nothing in this
; body references rsp.
%macro MPEG4_QPEL8_H_LOWPASS 1
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7             ; zero, for byte -> word unpacking
.loop:
    ; ---- outputs 0..3 ----
    mova         m0, [r1]
    mova         m1, m0
    mova         m2, m0
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    pshufw       m5, m0, 0x90
    pshufw       m6, m0, 0x41
    mova         m3, m2
    mova         m4, m2
    psllq        m2, 8
    psllq        m3, 16
    psllq        m4, 24
    punpckhbw    m2, m7
    punpckhbw    m3, m7
    punpckhbw    m4, m7
    paddw        m5, m3
    paddw        m6, m2
    paddw        m5, m5
    psubw        m6, m5
    pshufw       m5, m0, 0x6
    pmullw       m6, [pw_3]
    paddw        m0, m4
    paddw        m5, m1
    pmullw       m0, [pw_20]
    psubw        m0, m5
    paddw        m6, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    ; ---- outputs 4..7 (right edge taps come from shuffles of m5) ----
    movh         m5, [r1+5]
    punpcklbw    m5, m7
    pshufw       m6, m5, 0xf9
    paddw        m1, m5
    paddw        m2, m6
    pshufw       m6, m5, 0xbe
    pshufw       m5, m5, 0x6f
    paddw        m3, m6
    paddw        m4, m5
    paddw        m2, m2
    psubw        m3, m2
    pmullw       m1, [pw_20]
    pmullw       m3, [pw_3]
    psubw        m3, m4
    paddw        m1, [PW_ROUND]
    paddw        m3, m1
    psraw        m3, 5
    packuswb     m0, m3
    OP_MOV     [r0], m0, m4         ; store the row (m4 is scratch)
    add          r1, r3
    add          r0, r2
    dec r4d
    jne .loop
    REP_RET
%endmacro

; Three flavours of the 8-wide horizontal filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to pick the rounding constant and the
; store operation.
INIT_MMX mmxext
; put: rounding constant pw_16, plain store
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put
; avg: rounding constant pw_16, result averaged with the destination
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL8_H_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put_no_rnd



; QPEL_V_LOW far_up, mid_up, near_up, far_down, dst
;
; One vertical filter step on four columns of 16-bit samples.
;
; On entry m0..m3 hold four consecutive rows of the sliding window (m0 and
; m1 are the two rows nearest to the output position). The memory operands
; supply the remaining taps:
;   arg 3 = row just before m0     (paired with m2, weight -6)
;   arg 2 = row two before m0      (paired with m3, weight +3)
;   arg 1 = row three before m0    (paired with arg 4, weight -1)
;   arg 4 = row following m3       (loaded into the old m0)
;   arg 5 = destination memory operand
;
; Result: (20*(m0+m1) - 6*(arg3+m2) + 3*(arg2+m3) - (arg1+arg4) + PW_ROUND) >> 5,
; saturated to bytes and stored through OP_MOV.
;
; Afterwards SWAP rotates the register names so the window has advanced by
; one row: old m1..m3 become m0..m2 and the freshly loaded row becomes m3.
;
; Clobbers m4, m5, m6; m7 is handed to OP_MOV as scratch.
; PW_ROUND and OP_MOV must be defined at expansion time.
%macro QPEL_V_LOW 5
    paddw      m0, m1
    mova       m4, [pw_20]
    pmullw     m4, m0               ; 20 * (two centre rows)
    mova       m0, %4               ; next row; becomes m3 after the SWAP
    mova       m5, %1
    paddw      m5, m0
    psubw      m4, m5               ; minus the outermost pair
    mova       m5, %2
    mova       m6, %3
    paddw      m5, m3
    paddw      m6, m2
    paddw      m6, m6
    psubw      m5, m6
    pmullw     m5, [pw_3]           ; 3*pair - 6*pair
    paddw      m4, [PW_ROUND]
    paddw      m5, m4
    psraw      m5, 5
    packuswb   m5, m5
    OP_MOV     %5, m5, m7
    SWAP 0,1,2,3
%endmacro

; MPEG4_QPEL16_V_LOWPASS prefix
;
; Emits <prefix>_mpeg4_qpel16_v_lowpass: the vertical 8-tap low-pass filter
; for a 16-column block, 16 output rows computed from 17 input rows.
;
; Register roles as used by the body:
;   r0 = destination, r1 = source, r2 = destination stride,
;   r3 = source stride; r4 and r5 are scratch (loop counter / stack cursor)
;
; Phase 1 (.looph): the 17 source rows are widened from bytes to words and
; written to the stack as four planes of 0x88 bytes each (17 rows x 4 words);
; plane k holds columns 4k..4k+3. Total 4 * 0x88 = 544 bytes.
;
; Phase 2 (.loopv): for each plane, 16 QPEL_V_LOW steps write a 4-byte-wide
; column strip of the destination. The first and last steps reuse in-range
; row slots for the taps that would fall above row 0 or below row 16.
;
; PW_ROUND and OP_MOV must be defined at expansion time (see QPEL_V_LOW).
%macro MPEG4_QPEL16_V_LOWPASS 1
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov         r4d, 17             ; input rows to unpack
    mov          r5, rsp
    pxor         m7, m7
.looph:
    mova         m0, [r1]
    mova         m1, [r1]
    mova         m2, [r1+8]
    mova         m3, [r1+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova       [r5], m0             ; plane 0: columns  0..3
    mova  [r5+0x88], m1             ; plane 1: columns  4..7
    mova [r5+0x110], m2             ; plane 2: columns  8..11
    mova [r5+0x198], m3             ; plane 3: columns 12..15
    add          r5, 8
    add          r1, r3
    dec r4d
    jne .looph


    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
    ; Each .loopv pass moves r0 down by 14 rows (seven "lea r0, [r0+r2*2]");
    ; adding r1 afterwards rewinds to the top row and steps 4 bytes right.
    mov         r4d, 4              ; four column strips
    mov          r1, 4
    neg          r2
    lea          r1, [r1+r2*8]
    lea          r1, [r1+r2*4]
    lea          r1, [r1+r2*2]
    neg          r2
    mov          r5, rsp
.loopv:
    pxor         m7, m7
    mova         m0, [r5+ 0x0]
    mova         m1, [r5+ 0x8]
    mova         m2, [r5+0x10]
    mova         m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]

    add    r5, 0x88                 ; next plane
    add    r0, r1                   ; back to the top row, 4 bytes right
    dec r4d
    jne .loopv
    REP_RET
%endmacro

; PUT_OPH dst, src [, tmp]
; Plain store of src to dst using movh (the half-width move) instead of mova.
; The optional third argument exists only so the call shape matches AVG_OPH;
; it is not used.
%macro PUT_OPH 2-3
    movh %1, %2
%endmacro

; AVG_OPH dst, src, tmp
; Averaging store using movh accesses: loads the current contents of dst
; into tmp, averages src with it using pavgb, and writes the result back.
; Clobbers src and tmp. The macro is declared with 2-3 parameters, but the
; body always uses the third one, so callers must pass a scratch register.
%macro AVG_OPH 2-3
    movh  %3, %1
    pavgb %2, %3
    movh  %1, %2
%endmacro

; Three flavours of the 16-wide vertical filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion; the movh-based store macros are used
; because QPEL_V_LOW packs its result with itself before storing.
INIT_MMX mmxext
; put: rounding constant pw_16, plain store
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put
; avg: rounding constant pw_16, result averaged with the destination
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL16_V_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put_no_rnd



; MPEG4_QPEL8_V_LOWPASS prefix
;
; Emits <prefix>_mpeg4_qpel8_v_lowpass: the vertical 8-tap low-pass filter
; for an 8-column block, 8 output rows computed from 9 input rows.
;
; Register roles as used by the body:
;   r0 = destination, r1 = source, r2 = destination stride,
;   r3 = source stride; r4 and r5 are scratch (loop counter / stack cursor)
;
; Phase 1 (.looph): the 9 source rows are widened from bytes to words and
; written to the stack as two planes of 0x48 bytes each (9 rows x 4 words);
; plane 0 holds columns 0..3 and plane 1 holds columns 4..7.
;
; Phase 2 (.loopv): for each plane, 8 QPEL_V_LOW steps write a 4-byte-wide
; column strip of the destination. The first and last steps reuse in-range
; row slots for the taps that would fall above row 0 or below row 8.
;
; PW_ROUND and OP_MOV must be defined at expansion time (see QPEL_V_LOW).
;
; NOTE(review): 288 bytes of stack are reserved, but this body only touches
; the first 2 * 0x48 = 144 of them.
%macro MPEG4_QPEL8_V_LOWPASS 1
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov         r4d, 9              ; input rows to unpack
    mov          r5, rsp
    pxor         m7, m7
.looph:
    mova         m0, [r1]
    mova         m1, [r1]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    mova       [r5], m0             ; plane 0: columns 0..3
    mova  [r5+0x48], m1             ; plane 1: columns 4..7
    add          r5, 8
    add          r1, r3
    dec r4d
    jne .looph


    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
    ; Each .loopv pass moves r0 down by 6 rows (three "lea r0, [r0+r2*2]");
    ; adding r1 afterwards rewinds to the top row and steps 4 bytes right.
    mov         r4d, 2              ; two column strips
    mov          r1, 4
    neg          r2
    lea          r1, [r1+r2*4]
    lea          r1, [r1+r2*2]
    neg          r2
    mov          r5, rsp
.loopv:
    pxor         m7, m7
    mova         m0, [r5+ 0x0]
    mova         m1, [r5+ 0x8]
    mova         m2, [r5+0x10]
    mova         m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
    lea    r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]

    add    r5, 0x48                 ; next plane
    add    r0, r1                   ; back to the top row, 4 bytes right
    dec r4d
    jne .loopv
    REP_RET
%endmacro

; Three flavours of the 8-wide vertical filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to pick the rounding constant and the
; store operation.
INIT_MMX mmxext
; put: rounding constant pw_16, plain store
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put
; avg: rounding constant pw_16, result averaged with the destination
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL8_V_LOWPASS avg
; put_no_rnd: rounding constant pw_15, plain store
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put_no_rnd
