1;*****************************************************************************
2;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
3;*****************************************************************************
4;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2012 Daniel Kang
6;*
7;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8;*
9;* This file is part of FFmpeg.
10;*
11;* FFmpeg is free software; you can redistribute it and/or
12;* modify it under the terms of the GNU Lesser General Public
13;* License as published by the Free Software Foundation; either
14;* version 2.1 of the License, or (at your option) any later version.
15;*
16;* FFmpeg is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19;* Lesser General Public License for more details.
20;*
21;* You should have received a copy of the GNU Lesser General Public
22;* License along with FFmpeg; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;******************************************************************************
25
26%include "libavutil/x86/x86util.asm"
27
28SECTION_RODATA 32
29
30cextern pw_16
31cextern pw_5
32cextern pb_0
33
34SECTION .text
35
36
; op_avgh dst, mem, tmp
; Rounded byte-wise average of dst with a partial (movh-sized) destination
; load, written back to memory.  tmp is a scratch register.
%macro op_avgh 3
    movh   %3, %2               ; tmp = current destination pixels
    pavgb  %1, %3               ; dst = rounded average(dst, tmp)
    movh   %2, %1               ; store result
%endmacro
42
; op_avg dst, mem [, unused]
; Full-register rounded byte-wise average with the destination; the optional
; third argument exists only so op_avg/op_put share a call signature.
%macro op_avg 2-3
    pavgb  %1, %2               ; dst = rounded average(dst, destination)
    mova   %2, %1               ; store result
%endmacro
47
; op_puth src, mem [, unused]
; Plain store of the low half of src; the optional third argument is ignored
; and exists only to match op_avgh's three-argument call sites (op_%1h).
%macro op_puth 2-3
    movh   %2, %1
%endmacro
51
; op_put src, mem [, unused]
; Plain full-register store; optional third argument ignored (matches op_avg).
%macro op_put 2-3
    mova   %2, %1
%endmacro
55
; void %1_h264_qpel4_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; 4x4 horizontal H.264 6-tap lowpass (taps 1,-5,20,20,-5,1):
;   dst[x] = clip8((src[x-2]+src[x+3] - 5*(src[x-1]+src[x+2])
;                   + 20*(src[x]+src[x+1]) + 16) >> 5)
; %1 selects the store op: put (plain store) or avg (average with dst).
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    pxor          m7, m7        ; zero, for byte->word unpacking
    mova          m4, [pw_5]
    mova          m5, [pw_16]   ; rounding bias
    mov          r4d, 4         ; row counter
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0        ; src[x-1] + src[x+2]   (-5 taps)
    paddw         m2, m3        ; src[x]   + src[x+1]   (20 taps)
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3        ; src[x-2] + src[x+3]   (outer taps)
    psllw         m2, 2
    psubw         m2, m1        ; 4*(center) - (-5 taps)
    pmullw        m2, m4        ; 20*(center) - 5*(-5 taps)
    paddw         m0, m5        ; outer taps + 16
    paddw         m0, m2
    psraw         m0, 5
    packuswb      m0, m0        ; clip to unsigned bytes
    op_%1h        m0, [r0], m6
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
98
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; 8x8 horizontal 6-tap lowpass.  Each row's 8 outputs are computed as two
; 4-pixel word halves (punpcklbw/punpckhbw of the 8-byte loads).
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8         ; row counter
    pxor          m7, m7        ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7        ; src[0..3]
    punpckhbw     m1, m7        ; src[4..7]
    punpcklbw     m2, m7        ; src[1..4]
    punpckhbw     m3, m7        ; src[5..8]
    paddw         m0, m2        ; center taps, outputs 0-3
    paddw         m1, m3        ; center taps, outputs 4-7
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7        ; src[-1..2]
    punpckhbw     m3, m7        ; src[3..6]
    punpcklbw     m4, m7        ; src[2..5]
    punpckhbw     m5, m7        ; src[6..9]
    paddw         m2, m4        ; -5 taps, outputs 0-3
    paddw         m5, m3        ; -5 taps, outputs 4-7
    psubw         m0, m2        ; 4*center - (-5 taps)
    psubw         m1, m5
    pmullw        m0, m6        ; 20*center - 5*(-5 taps)
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7        ; src[-2..1]
    punpcklbw     m5, m7        ; src[7..10]
    paddw         m2, m3        ; outer taps, outputs 0-3 (src[x-2]+src[x+3])
    paddw         m4, m5        ; outer taps, outputs 4-7
    mova          m5, [pw_16]
    paddw         m2, m5        ; + rounding bias
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1        ; clip and merge both halves
    op_%1         m0, [r0], m4
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
158
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; SSSE3 8x8 horizontal 6-tap lowpass.  One unaligned 16-byte load covers all
; taps; the shifted views needed by the filter are built with palignr.
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8         ; row counter
    pxor          m7, m7        ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    movu          m1, [r1-2]    ; 16 source bytes src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m7        ; words src[6..13]
    punpcklbw     m0, m7        ; words src[-2..5]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    ; palignr imm is in bytes: 2 bytes = 1 word of lane shift
    palignr       m4, m0, 2     ; src[-1..6]
    palignr       m3, m0, 4     ; src[0..7]
    palignr       m2, m0, 6     ; src[1..8]
    palignr       m1, m0, 8     ; src[2..9]
    palignr       m5, m0, 10    ; src[3..10]
    paddw         m0, m5        ; outer taps: src[x-2]+src[x+3]
    paddw         m2, m3        ; center taps: src[x]+src[x+1]
    paddw         m1, m4        ; -5 taps: src[x-1]+src[x+2]
    psllw         m2, 2
    psubw         m2, m1        ; 4*center - (-5 taps)
    paddw         m0, [pw_16]   ; outer taps + rounding bias
    pmullw        m2, m6        ; 20*center - 5*(-5 taps)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2        ; clip to unsigned bytes
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for SSSE3.
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
201
202
; void %1_h264_qpel4_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2, int dstStride,
;                                 int src2Stride)
; Same 6-tap horizontal filter as %1_h264_qpel4_h_lowpass, but the filtered
; result is additionally averaged with a second prediction (src2) before the
; final put/avg store.  src and dst both advance by dstStride (r3); only
; src2 uses r4.
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    pxor          m7, m7        ; zero, for unpacking
    mova          m4, [pw_5]
    mova          m5, [pw_16]   ; rounding bias
    mov          r5d, 4         ; row counter
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0        ; -5 taps: src[x-1]+src[x+2]
    paddw         m2, m3        ; center taps: src[x]+src[x+1]
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3        ; outer taps: src[x-2]+src[x+3]
    psllw         m2, 2
    psubw         m2, m1        ; 4*center - (-5 taps)
    pmullw        m2, m4        ; 20*center - 5*(-5 taps)
    paddw         m0, m5        ; + rounding bias
    paddw         m0, m2
    movh          m3, [r2]      ; second prediction row
    psraw         m0, 5
    packuswb      m0, m0        ; clip to unsigned bytes
    pavgb         m0, m3        ; average with src2
    op_%1h        m0, [r0], m6
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
248
249
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2, int dstStride,
;                                 int src2Stride)
; 8-wide variant of the _l2 horizontal lowpass: 6-tap filter in two 4-pixel
; word halves, then pavgb with the second prediction (src2).  src and dst
; both advance by dstStride (r3); src2 uses r4.
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8         ; row counter
    pxor          m7, m7        ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7        ; src[0..3]
    punpckhbw     m1, m7        ; src[4..7]
    punpcklbw     m2, m7        ; src[1..4]
    punpckhbw     m3, m7        ; src[5..8]
    paddw         m0, m2        ; center taps, outputs 0-3
    paddw         m1, m3        ; center taps, outputs 4-7
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7        ; src[-1..2]
    punpckhbw     m3, m7        ; src[3..6]
    punpcklbw     m4, m7        ; src[2..5]
    punpckhbw     m5, m7        ; src[6..9]
    paddw         m2, m4        ; -5 taps, outputs 0-3
    paddw         m5, m3        ; -5 taps, outputs 4-7
    psubw         m0, m2        ; 4*center - (-5 taps)
    psubw         m1, m5
    pmullw        m0, m6        ; 20*center - 5*(-5 taps)
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7        ; src[-2..1]
    punpcklbw     m5, m7        ; src[7..10]
    paddw         m2, m3        ; outer taps, outputs 0-3
    paddw         m4, m5        ; outer taps, outputs 4-7
    mova          m5, [pw_16]
    paddw         m2, m5        ; + rounding bias
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    mova          m4, [r2]      ; second prediction row
    packuswb      m0, m1        ; clip and merge both halves
    pavgb         m0, m4        ; average with src2
    op_%1         m0, [r0], m4
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
312
313
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                 const uint8_t *src2, int dstStride,
;                                 int src2Stride)
; SSSE3 8-wide _l2 horizontal lowpass: single unaligned 16-byte load, palignr
; to build the shifted tap views, then pavgb with the second prediction.
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8         ; row counter
    pxor          m7, m7        ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    lddqu         m1, [r1-2]    ; 16 source bytes src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m7        ; words src[6..13]
    punpcklbw     m0, m7        ; words src[-2..5]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    ; palignr imm is in bytes: 2 bytes = 1 word of lane shift
    palignr       m4, m0, 2     ; src[-1..6]
    palignr       m3, m0, 4     ; src[0..7]
    palignr       m2, m0, 6     ; src[1..8]
    palignr       m1, m0, 8     ; src[2..9]
    palignr       m5, m0, 10    ; src[3..10]
    paddw         m0, m5        ; outer taps: src[x-2]+src[x+3]
    paddw         m2, m3        ; center taps: src[x]+src[x+1]
    paddw         m1, m4        ; -5 taps: src[x-1]+src[x+2]
    psllw         m2, 2
    movh          m3, [r2]      ; second prediction row
    psubw         m2, m1        ; 4*center - (-5 taps)
    paddw         m0, [pw_16]   ; + rounding bias
    pmullw        m2, m6        ; 20*center - 5*(-5 taps)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2        ; clip to unsigned bytes
    pavgb         m2, m3        ; average with src2
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for SSSE3.
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
359
360
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride (r0..r3).
;
; FILT_V op: emit one output row of the vertical 6-tap lowpass.
; On entry m0..m4 hold the five preceding source rows a..e as words and
; m7 = 0.  Loads the next row f, computes
;   clip8((a + f - 5*(b + e) + 20*(c + d) + 16) >> 5),
; stores it via op_%1h, advances both pointers, and rotates the row
; pipeline with SWAP so the next invocation sees rows b..f in m0..m4.
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]           ; load next source row (f)
    paddw     m6, m3             ; c + d
    psllw     m6, 2              ; 4*(c + d)
    psubw     m6, m1             ; - b
    psubw     m6, m4             ; - e
    punpcklbw m5, m7
    pmullw    m6, [pw_5]         ; 20*(c+d) - 5*(b+e)
    paddw     m0, [pw_16]        ; a + rounding bias
    add       r1, r3
    paddw     m0, m5             ; + f
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6             ; clip to unsigned bytes
    op_%1h    m6, [r0], m0       ; write one output row
    add       r0, r2
    SWAP       0, 1, 2, 3, 4, 5  ; rotate pipeline: rows b..f become a..e
%endmacro
382
; void %1_h264_qpel4_v_lowpass(uint8_t *dst, const uint8_t *src,
;                              int dstStride, int srcStride)
; 4x4 vertical 6-tap lowpass: rewinds src by two rows, primes the five-row
; word pipeline in m0..m4, then emits 4 output rows via FILT_V.
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3        ; src -= 2*srcStride: start at row -2
    sub           r1, r3
    pxor          m7, m7        ; zero, for unpacking
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7        ; rows -2..2 as words in m0..m4
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
413
414
415
; void %1_h264_qpel8or16_v_lowpass(uint8_t *dst, const uint8_t *src,
;                                  int dstStride, int srcStride, int h)
; Vertical 6-tap lowpass for h = 8 or 16 rows (one movh-wide column strip).
; The sse2 variant rebases src by -2*srcStride itself; the mmxext variant
; (symbol suffixed _op) does not -- NOTE(review): presumably its caller
; pre-adjusts src, confirm against the C wrappers.
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3        ; src -= 2*srcStride: start at row -2
    sub           r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
%endif
    pxor          m7, m7        ; zero, for unpacking
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7        ; prime five-row pipeline in m0..m4
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1            ; 8 unrolled output rows
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    cmp          r4d, 16        ; 8 more rows only when h == 16
    jne         .end
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
.end:
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext and SSE2.
INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
471
472
; All functions that use this are required to have args:
; src, tmp, srcStride (r0..r2).
;
; FILT_HV offset: one row of the vertical pass of the 2D (HV) filter.
; Same five-row word pipeline as FILT_V (m0..m4 = rows a..e, m7 = 0), but
; instead of shifting/clipping it stores the raw 16-bit sum
;   a + f - 5*(b + e) + 20*(c + d) + 16
; to tmp at byte offset %1; the horizontal second pass does the final
; normalization.  Rotates the pipeline with SWAP like FILT_V.
%macro FILT_HV 1 ; offset
    mova           m6, m2
    movh           m5, [r0]        ; load next source row (f)
    paddw          m6, m3          ; c + d
    psllw          m6, 2           ; 4*(c + d)
    paddw          m0, [pw_16]     ; a + rounding bias
    psubw          m6, m1          ; - b
    psubw          m6, m4          ; - e
    punpcklbw      m5, m7
    pmullw         m6, [pw_5]      ; 20*(c+d) - 5*(b+e)
    paddw          m0, m5          ; + f
    add            r0, r2
    paddw          m6, m0
    mova      [r1+%1], m6          ; store word intermediates to tmp
    SWAP            0, 1, 2, 3, 4, 5  ; rotate pipeline
%endmacro
491
; 4x4 2D (HV) 6-tap lowpass, split into a vertical pass (_v) that fills a
; word tmp buffer and a horizontal pass (_h) that reads it back.
; tmp layout: 4 rows of word intermediates, 24 bytes apart (FILT_HV offsets
; 0*24..3*24; the _h pass steps r0 by 24 per row).
%macro QPEL4_HV1_LOWPASS_OP 1
; void %1_h264_qpel4_hv_lowpass_v(const uint8_t *src, int16_t *tmp,
;                                 int srcStride)
; Vertical pass: primes the five-row pipeline and writes 4 rows of
; +16-biased word sums into tmp.
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn  r2, r2d
    pxor          m7, m7        ; zero, for unpacking
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV       0*24
    FILT_HV       1*24
    FILT_HV       2*24
    FILT_HV       3*24
    RET

; void %1_h264_qpel4_hv_lowpass_h(int16_t *tmp, uint8_t *dst, int dstStride)
; Horizontal pass over the word intermediates.  With a = outer-tap pair,
; b = -5-tap pair, c = center pair, the sequence below computes
;   (((a - b) >> 2 - b + c) >> 2 + c) >> 6
; which approximates (a - 5b + 20c) >> 10 without needing multiplies
; (the vertical pass's +16 bias provides the rounding).
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn  r2, r2d
    mov          r3d, 4         ; row counter
.loop:
    mova          m0, [r0]
    paddw         m0, [r0+10]   ; a: tmp[x-2] + tmp[x+3]
    mova          m1, [r0+2]
    paddw         m1, [r0+8]    ; b: tmp[x-1] + tmp[x+2]
    mova          m2, [r0+4]
    paddw         m2, [r0+6]    ; c: tmp[x]   + tmp[x+1]
    psubw         m0, m1
    psraw         m0, 2         ; (a - b) >> 2
    psubw         m0, m1        ; ... - b
    paddsw        m0, m2        ; + c (saturating, guards word overflow)
    psraw         m0, 2
    paddw         m0, m2        ; + c
    psraw         m0, 6
    packuswb      m0, m0        ; clip to unsigned bytes
    op_%1h        m0, [r1], m7
    add           r0, 24        ; next tmp row
    add           r1, r2
    dec          r3d
    jnz        .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
544
; void %1_h264_qpel8or16_hv1_lowpass_op(const uint8_t *src, int16_t *tmp,
;                                       int srcStride, int size)
; Vertical pass of the 8/16-pixel 2D (HV) lowpass: fills tmp with rows of
; +16-biased word sums, 48 bytes apart, for size (8 or 16) rows; one
; movh-wide column strip per call.  The hv2 functions below read tmp back.
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn  r2, r2d
    pxor          m7, m7        ; zero, for unpacking
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7        ; prime five-row pipeline in m0..m4
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV     0*48            ; 8 unrolled tmp rows
    FILT_HV     1*48
    FILT_HV     2*48
    FILT_HV     3*48
    FILT_HV     4*48
    FILT_HV     5*48
    FILT_HV     6*48
    FILT_HV     7*48
    cmp          r3d, 16        ; 8 more rows only when size == 16
    jne         .end
    FILT_HV     8*48
    FILT_HV     9*48
    FILT_HV    10*48
    FILT_HV    11*48
    FILT_HV    12*48
    FILT_HV    13*48
    FILT_HV    14*48
    FILT_HV    15*48
.end:
    REP_RET
%endmacro

; Instantiate for MMX+mmxext (put/avg) and SSE2 (put only).
INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
590
591
592
; void %1_h264_qpel8or16_hv2_lowpass_op(uint8_t *dst, const int16_t *tmp,
;                                       int dstStride, int unused, int h)
; Horizontal second pass of the 2D lowpass: for each of h rows, filters the
; word intermediates (48-byte tmp row stride) into 8 output pixels, using the
; multiply-free (((a-b)>>2 - b + c)>>2 + c) >> 6 form in two 4-word halves.
; unused is to match ssse3 and mmxext args.
%macro QPEL8OR16_HV2_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn  r2, r2d
.loop:
    mova          m0, [r1]      ; low half:  words tmp[0..3]
    mova          m3, [r1+8]    ; high half: words tmp[4..7]
    mova          m1, [r1+2]
    mova          m4, [r1+10]
    paddw         m0, m4        ; a (outer taps), outputs 0-3
    paddw         m1, m3        ; b (-5 taps),    outputs 0-3
    paddw         m3, [r1+18]   ; a, outputs 4-7
    paddw         m4, [r1+16]   ; b, outputs 4-7
    mova          m2, [r1+4]
    mova          m5, [r1+12]
    paddw         m2, [r1+6]    ; c (center taps), outputs 0-3
    paddw         m5, [r1+14]   ; c, outputs 4-7
    psubw         m0, m1        ; (a - b)
    psubw         m3, m4
    psraw         m0, 2         ; >> 2
    psraw         m3, 2
    psubw         m0, m1        ; - b
    psubw         m3, m4
    paddsw        m0, m2        ; + c (saturating, guards word overflow)
    paddsw        m3, m5
    psraw         m0, 2         ; >> 2
    psraw         m3, 2
    paddw         m0, m2        ; + c
    paddw         m3, m5
    psraw         m0, 6         ; final normalization
    psraw         m3, 6
    packuswb      m0, m3        ; clip and merge both halves
    op_%1         m0, [r0], m7
    add           r1, 48        ; next tmp row
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
636
; void %1_h264_qpel8or16_hv2_lowpass(uint8_t *dst, const int16_t *tmp,
;                                    int dstStride, int tmpStride, int size)
; SSSE3 horizontal second pass of the 2D lowpass.  Dispatches on size:
; an 8-wide loop (.loop8) or a 16-wide loop (.op16) that builds the shifted
; tap views from three aligned 16-byte tmp loads with palignr.  tmp row
; stride is 48 bytes; both loops use the multiply-free
; (((a-b)>>2 - b + c)>>2 + c) >> 6 form.
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    cmp          r4d, 16
    je         .op16
.loop8:
    mova          m1, [r1+16]   ; words tmp[8..15]
    mova          m0, [r1]      ; words tmp[0..7]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    ; palignr imm is in bytes: 2 bytes = 1 word of lane shift
    palignr       m5, m0, 10    ; tmp[5..12]
    palignr       m4, m0, 8     ; tmp[4..11]
    palignr       m3, m0, 6     ; tmp[3..10]
    palignr       m2, m0, 4     ; tmp[2..9]
    palignr       m1, m0, 2     ; tmp[1..8]
    paddw         m0, m5        ; a (outer taps)
    paddw         m1, m4        ; b (-5 taps)
    paddw         m2, m3        ; c (center taps)
    psubw         m0, m1        ; (a - b)
    psraw         m0, 2         ; >> 2
    psubw         m0, m1        ; - b
    paddw         m0, m2        ; + c
    psraw         m0, 2         ; >> 2
    paddw         m0, m2        ; + c
    psraw         m0, 6         ; final normalization
    packuswb      m0, m0        ; clip to unsigned bytes
    op_%1h        m0, [r0], m7
    add           r1, 48        ; next tmp row
    add           r0, r2
    dec          r4d
    jne       .loop8
    jmp        .done
.op16:                          ; 16 outputs per row: two 8-wide halves
    mova          m4, [r1+32]   ; words tmp[16..23]
    mova          m5, [r1+16]   ; words tmp[8..15]
    mova          m7, [r1]      ; words tmp[0..7] (m7 is data here, not zero)
    mova          m3, m4
    mova          m2, m4
    mova          m1, m4
    mova          m0, m4
    palignr       m0, m5, 10    ; tmp[13..20]
    palignr       m1, m5, 8     ; tmp[12..19]
    palignr       m2, m5, 6     ; tmp[11..18]
    palignr       m3, m5, 4     ; tmp[10..17]
    palignr       m4, m5, 2     ; tmp[9..16]
    paddw         m0, m5        ; a, high 8 outputs
    paddw         m1, m4        ; b, high 8 outputs
    paddw         m2, m3        ; c, high 8 outputs
    mova          m6, m5
    mova          m4, m5
    mova          m3, m5
    palignr       m4, m7, 8     ; tmp[4..11]
    palignr       m6, m7, 2     ; tmp[1..8]
    palignr       m3, m7, 10    ; tmp[5..12]
    paddw         m4, m6        ; b, low 8 outputs
    mova          m6, m5
    palignr       m5, m7, 6     ; tmp[3..10]
    palignr       m6, m7, 4     ; tmp[2..9]
    paddw         m3, m7        ; a, low 8 outputs
    paddw         m5, m6        ; c, low 8 outputs
    psubw         m0, m1        ; (a - b), both halves
    psubw         m3, m4
    psraw         m0, 2         ; >> 2
    psraw         m3, 2
    psubw         m0, m1        ; - b
    psubw         m3, m4
    paddw         m0, m2        ; + c
    paddw         m3, m5
    psraw         m0, 2         ; >> 2
    psraw         m3, 2
    paddw         m0, m2        ; + c
    paddw         m3, m5
    psraw         m0, 6         ; final normalization
    psraw         m3, 6
    packuswb      m3, m0        ; clip, low half first
    op_%1         m3, [r0], m7
    add           r1, 48        ; next tmp row
    add           r0, r2
    dec          r4d
    jne        .op16
.done:
    REP_RET
%endmacro

; Instantiate put/avg variants for SSSE3.
INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
727
728
; void %1_pixels4_l2_shift5(uint8_t *dst, const int16_t *src16,
;                           const uint8_t *src8, int dstStride,
;                           int src8Stride, int h)
; Normalizes 4 rows of word intermediates (>>5, clip) and averages them with
; the byte prediction src8.  tmp rows are 24 bytes apart (offsets 0/24/48/72).
; Fully unrolled for 4 rows; the h argument (r5) is never read.
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mova          m0, [r1]      ; rows 0 and 1 of word intermediates
    mova          m1, [r1+24]
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0        ; clip to unsigned bytes
    packuswb      m1, m1
    pavgb         m0, [r2]      ; average with byte prediction
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    lea           r2, [r2+r4*2]
    lea           r0, [r0+r3*2]
    mova          m0, [r1+48]   ; rows 2 and 3
    mova          m1, [r1+72]
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
761
762
; void %1_pixels8_l2_shift5(uint8_t *dst, const int16_t *src16,
;                           const uint8_t *src8, int dstStride,
;                           int src8Stride, int h)
; 8-wide variant: per iteration, normalizes two rows of word intermediates
; (>>5, clip; tmp row stride 48 bytes) and averages with src8.  Processes
; two rows per pass, so h is assumed even.
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
.loop:
    mova          m0, [r1]      ; row 0: low and high word halves
    mova          m1, [r1+8]
    mova          m2, [r1+48]   ; row 1
    mova          m3, [r1+48+8]
    psraw         m0, 5
    psraw         m1, 5
    psraw         m2, 5
    psraw         m3, 5
    packuswb      m0, m1        ; clip and merge halves
    packuswb      m2, m3
    pavgb         m0, [r2]      ; average with byte prediction
    pavgb         m2, [r2+r4]
    op_%1         m0, [r0], m4
    op_%1         m2, [r0+r3], m5
    lea           r2, [r2+2*r4]
    add           r1, 48*2      ; advance two tmp rows
    lea           r0, [r0+2*r3]
    sub          r5d, 2         ; two rows done per iteration
    jne        .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for MMX+mmxext.
INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
793
794
%if ARCH_X86_64
; void %1_h264_qpel16_h_lowpass_l2(uint8_t *dst, const uint8_t *src,
;                                  const uint8_t *src2, int dstStride,
;                                  int src2Stride)
; x86-64 only (needs xmm8-15): 16-wide _l2 horizontal 6-tap lowpass.  Each
; row is filtered as two 8-pixel halves in parallel -- the high half built
; from m0/m1 (words src[6..13]/src[14..21]) and the low half from m7/m0
; (words src[-2..5]/src[6..13]) via palignr -- then averaged with src2.
; src and dst both advance by dstStride (r3); src2 uses r4.
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 16        ; row counter
    pxor         m15, m15       ; zero, for unpacking
    mova         m14, [pw_5]
    mova         m13, [pw_16]   ; rounding bias
.loop:
    lddqu         m1, [r1+6]    ; bytes src[6..21]
    lddqu         m7, [r1-2]    ; bytes src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m15       ; words src[14..21]
    punpcklbw     m0, m15       ; words src[6..13]
    punpcklbw     m7, m15       ; words src[-2..5]
    mova          m2, m1
    mova          m6, m0
    mova          m3, m1
    mova          m8, m0
    mova          m4, m1
    mova          m9, m0
    mova         m12, m0
    mova         m11, m1
    ; palignr imm is in bytes: 2 bytes = 1 word of lane shift
    palignr      m11, m0, 10    ; src[11..18]  (high half)
    palignr      m12, m7, 10    ; src[3..10]   (low half)
    palignr       m4, m0, 2     ; src[7..14]
    palignr       m9, m7, 2     ; src[-1..6]
    palignr       m3, m0, 4     ; src[8..15]
    palignr       m8, m7, 4     ; src[0..7]
    palignr       m2, m0, 6     ; src[9..16]
    palignr       m6, m7, 6     ; src[1..8]
    paddw        m11, m0        ; a (outer taps), high half
    palignr       m1, m0, 8     ; src[10..17]
    palignr       m0, m7, 8     ; src[2..9]
    paddw         m7, m12       ; a, low half
    paddw         m2, m3        ; c (center taps), high half
    paddw         m6, m8        ; c, low half
    paddw         m1, m4        ; b (-5 taps), high half
    paddw         m0, m9        ; b, low half
    psllw         m2, 2
    psllw         m6, 2
    psubw         m2, m1        ; 4c - b, high half
    psubw         m6, m0        ; 4c - b, low half
    paddw        m11, m13       ; a + 16
    paddw         m7, m13
    pmullw        m2, m14       ; 20c - 5b, high half
    pmullw        m6, m14       ; 20c - 5b, low half
    lddqu         m3, [r2]      ; second prediction row
    paddw         m2, m11
    paddw         m6, m7
    psraw         m2, 5
    psraw         m6, 5
    packuswb      m6, m2        ; clip; low half in low bytes
    pavgb         m6, m3        ; average with src2
    op_%1         m6, [r0], m11
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

; Instantiate put/avg variants for SSSE3 (x86-64 only).
INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif
863