;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pd_65535
cextern pw_1023
%define pw_pixel_max pw_1023
cextern pw_16
cextern pw_1
cextern pb_0

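; The two-pass 6-tap filters keep their first-pass intermediates in 16 bits:
; subtracting pad20 biases a row of unclipped filter output into signed
; 16-bit range, depad (= 32*20*1023 + rounding) undoes that bias after the
; second pmaddwd pass, and depad2/unpad recover a rounded single-pass
; result straight from the biased buffer.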
pad10:  times 8 dw    10*1023
pad20:  times 8 dw    20*1023
pad30:  times 8 dw    30*1023
depad:  times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad:  times 8 dw 16*1022/32 ; needs to be mod 16

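; 6-tap coefficients (1,-5,20,20,-5,1) interleaved in pairs for pmaddwd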
tap1: times 4 dw  1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5,  1

SECTION .text


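; store the average of the new result and the existing destination pixels;
; substituting this for OP_MOV turns a put_ code path into its avg_ twin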
%macro AVG_MOV 2
    pavgw %2, %1
    mova  %1, %2
%endmacro

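; %1 += %2, through scratch register %3 on SSE2 because a paddw memory
; operand would have to be 16-byte aligned; MMX has no such restriction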
%macro ADDW 3
%if mmsize == 8
    paddw %1, %2
%else
    movu  %3, %2
    paddw %1, %3
%endif
%endmacro

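; core of the 6-tap filter, evaluated without multiplies:
; ((a-b)/4 - b + c)/4 + c == (a - 5*b + 20*c)/16
; %1 = a (overwritten with the result), %2 = b, %3 = c, %4 = rounding bias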
%macro FILT_H 4
    paddw  %1, %4
    psubw  %1, %2  ; a-b
    psraw  %1, 2   ; (a-b)/4
    psubw  %1, %2  ; (a-b)/4-b
    paddw  %1, %3  ; (a-b)/4-b+c
    psraw  %1, 2   ; ((a-b)/4-b+c)/4
    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

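; prime m0..m4 with source rows -2..+2 so the vertical filter only needs
; to fetch one new row per output row; leaves r1 pointing at row +3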
%macro PRELOAD_V 0
    lea      r3, [r2*3]
    sub      r1, r3
    movu     m0, [r1+r2]
    movu     m1, [r1+r2*2]
    add      r1, r3
    movu     m2, [r1]
    movu     m3, [r1+r2]
    movu     m4, [r1+r2*2]
    add      r1, r3
%endmacro

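; one row of the vertical half-pel filter: with a = row(-2)+row(+3),
; b = row(-1)+row(+2), c = row(0)+row(+1), computes (a-5*b+20*c+16)>>5
; and clips the result to [0,1023]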
%macro FILT_V 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H   %1, %7, %8, [pw_16]
    psraw    %1, 1
    CLIPW    %1, [pb_0], [pw_pixel_max]
%endmacro

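; instantiate a motion compensation macro as put/avg for 4-wide (MMX) and
; 8-wide (SSE2) blocks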
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro

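; build the full-width function (8 or 16 pixels) out of four calls to the
; half-width stub, one per quadrant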
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %3*2
    add  r1, %3*2
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3]
    lea  r1, [r1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3+%3*2]
    lea  r1, [r1+r2*%3+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov r%6, r0
%assign p1 %6+1
    mov r %+ p1, r1
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+%3*2]
    lea  r1, [r %+ p1+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3]
    lea  r1, [r %+ p1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3+%3*2]
    lea  r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro

; put/avg, mc00..mc33, 4/8 block size, number of args/gprs/xmm regs
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
    movu          m0, [r1     ]
    OP_MOV [r0     ], m0
    movu          m0, [r1+r2  ]
    OP_MOV [r0+r2  ], m0
    movu          m0, [r1+r2*2]
    OP_MOV [r0+r2*2], m0
    movu          m0, [r1+r3  ]
    OP_MOV [r0+r3  ], m0
%endmacro

%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea           r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea  r3, [r2*3]
    COPY4
    lea  r0, [r0+r2*4]
    lea  r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov r3d, 8
.loop:
    movu           m0, [r1      ]
    movu           m1, [r1   +16]
    OP_MOV [r0      ], m0
    OP_MOV [r0   +16], m1
    movu           m0, [r1+r2   ]
    movu           m1, [r1+r2+16]
    OP_MOV [r0+r2   ], m0
    OP_MOV [r0+r2+16], m1
    lea            r0, [r0+r2*2]
    lea            r1, [r1+r2*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
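; The cache64 variants are presumably for CPUs where unaligned loads that
; cross a cache line are expensive: they assemble the shifted rows with
; PALIGNR from two loads instead of issuing six overlapping movu loads.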
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro

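; mc20: horizontal half-pel. Each output row needs the taps src[-2..+3];
; a = src[-2]+src[+3], b = src[-1]+src[+2], c = src[0]+src[+1] feed FILT_H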
%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov     r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if notcpuflag(cache64)
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP      2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    OP_MOV [r0], m2
    add      r0, r2
    add      r1, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
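; mc30 is mc10 with the rounding source one pixel (2 bytes) to the right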
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
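; mc10: horizontal half-pel averaged with the full-pel pixels at r4
; (r4 = r1 here, r1+2 when entered from mc30)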
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov      r4, r1
.body:
    mov     r3d, %2
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if notcpuflag(cache64)
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP      2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    movu     m3, [r4]
    pavgw    m2, m3
    OP_MOV [r0], m2
    add      r0, r2
    add      r1, r2
    add      r4, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10:
    add    r4, r2
.no_addr4:
    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
    add    r1, r2
    add    r0, r2
    ret
%endmacro

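; Emit callable helpers v_filt{4,8}_<i>_10, one per register rotation:
; RESET_MM_PERMUTATION plus the SWAP after each instance bakes a rotated
; m0..m5 assignment into the code, so callers can keep the six-row
; pipeline in registers across calls.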
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

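; mc02: pure vertical half-pel, one v_filt call per output row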
%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC02

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
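; mc01: vertical half-pel averaged with the source rows at r4
; (r4 = r1 here, r1+r2 when entered from mc03)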
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r4, r2
    sub      r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu     m7, [r4]
    pavgw    m0, m7
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
    movu     m5, [r4-4]
    ADDW     m5, [r4+6], m7
    movu     m6, [r4-2]
    ADDW     m6, [r4+4], m7
    paddw    m5, [pw_16]
    psubw    m5, m6  ; a-b
    psraw    m5, 2   ; (a-b)/4
    psubw    m5, m6  ; (a-b)/4-b
    movu     m6, [r4+0]
    ADDW     m6, [r4+2], m7
    paddw    m5, m6  ; (a-b)/4-b+c
    psraw    m5, 2   ; ((a-b)/4-b+c)/4
    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw    m5, 1
    CLIPW    m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw    m0, m5
%if %0!=3
    movu     m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

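; mc11: vertical half-pel (v_filt, result in m0) averaged with the
; horizontal half-pel of row r4 (h_filt); m1..m5 carry the row pipeline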
%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov      r4, r1
.body:
    PRELOAD_V

    sub      r0, r2
    sub      r4, r2
    mov      r5, r2
    neg      r5
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu     m5, [r1+r5]
%endif
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov r4, r1
    add r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
    lea r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea r4, [r1+r2]
    add r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
    psubw  %1, %2  ; a-b
    psubw  %2, %3  ; b-c
    psllw  %2, 2
    psubw  %1, %2  ; a-5*b+4*c
    psllw  %3, 4
    paddw  %1, %3  ; a-5*b+20*c
%endmacro

%macro FILT_VNRD 8
    movu     %6, [r1]
    paddw    %1, %6
    mova     %7, %2
    paddw    %7, %5
    mova     %8, %3
    paddw    %8, %4
    FILT_H2  %1, %7, %8
%endmacro

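; first pass of the 2D (hv) filters: vertical 6-tap without rounding or
; clipping (FILT_VNRD), biased by -pad20 to fit signed 16 bits and written
; to a stack buffer, one strip of columns per .v_loop iteration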
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg      r2           ; This actually saves instructions
    lea      r1, [r1+r2*2-mmsize+PAD]
    lea      r4, [rsp+PAD+gprsize]
    mov     r3d, COUNT
.v_loop:
    movu     m0, [r1]
    sub      r1, r2
    movu     m1, [r1]
    sub      r1, r2
    movu     m2, [r1]
    sub      r1, r2
    movu     m3, [r1]
    sub      r1, r2
    movu     m4, [r1]
    sub      r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    sub      r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    add      r4, mmsize
    lea      r1, [r1+r2*8+mmsize]
%if %1==8
    lea      r1, [r1+r2*4]
%endif
    dec      r3d
    jg .v_loop
    neg      r2
    ret
%endmacro

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8

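; second pass: horizontal 6-tap over the 16-bit intermediates via pmaddwd
; with the tap1/tap2/tap3 pairs, +depad and >>10 to undo bias and scale,
; then the even (m1) and odd (m2) dword results are merged back into words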
%macro H_LOOP 1
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%1_loop_op:
    movu       m1, [r1+mmsize-4]
    movu       m2, [r1+mmsize-2]
    mova       m3, [r1+mmsize+0]
    movu       m4, [r1+mmsize+2]
    movu       m5, [r1+mmsize+4]
    movu       m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd    m1, s1
    pmaddwd    m2, s1
    pmaddwd    m3, s2
    pmaddwd    m4, s2
    pmaddwd    m5, s3
    pmaddwd    m6, s3
    paddd      m1, d1
    paddd      m2, d1
%else
    mova       m0, s1
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    mova       m0, s2
    pmaddwd    m3, m0
    pmaddwd    m4, m0
    mova       m0, s3
    pmaddwd    m5, m0
    pmaddwd    m6, m0
    mova       m0, d1
    paddd      m1, m0
    paddd      m2, m0
%endif
    paddd      m3, m5
    paddd      m4, m6
    paddd      m1, m3
    paddd      m2, m4
    psrad      m1, 10
    psrad      m2, 10
    pslld      m2, 16
    pand       m1, [pd_65535]
    por        m1, m2
%if num_mmregs <= 8
    pxor       m0, m0
%endif
    CLIPW      m1, m0, m7
    add        r1, mmsize*3
    ret
%endmacro

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8

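; mc22: both passes back to back; put_hv fills the stack buffer and each
; h*_loop_op call returns one clipped output row in m1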
%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack
    sub     rsp, PAD

    call put_hv%2_10

    mov       r3d, %2
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    pxor       m0, m0
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov        r1, rsp
.h_loop:
    call h%2_loop_op

    OP_MOV   [r0], m1
    add        r0, r2
    dec       r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC22

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
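; mc12: 2D half-pel averaged with the vertical half-pel, which is
; recovered from the biased first-pass buffer: +depad2 restores the pad20
; bias and adds an offset plus rounding so psrlw 5 can divide by 32, then
; -unpad removes that offset again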
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
    mov        r6, rsp          ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack
    sub       rsp, PAD

    call put_hv%2_10

    xor       r4d, r4d
.body:
    mov       r3d, %2
    pxor       m0, m0
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov        r1, rsp
.h_loop:
    call h%2_loop_op

    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw      m3, [depad2]
    psrlw      m3, 5
    psubw      m3, [unpad]
    CLIPW      m3, m0, m7
    pavgw      m1, m3

    OP_MOV   [r0], m1
    add        r0, r2
    dec       r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC12

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
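; mc32 is mc12 with the recovered vertical column one pixel to the right
; (r4 = sizeof(pixel) offsets the reads from the first-pass buffer)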
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2  ; SIZE*16*3*sizeof(pixel)
    mov  r6, rsp          ; backup stack pointer
    and rsp, ~(mmsize-1)  ; align stack
    sub rsp, PAD

    call put_hv%2_10

    mov r4d, 2            ; sizeof(pixel)
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
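; mc21/mc23: 2D half-pel averaged with the horizontal half-pel. put_h*_10
; runs the horizontal filter (unrounded, biased like the hv first pass)
; into a second stack buffer above the hv one, which the shared mc12 body
; then reads back through the same depad2/unpad path (r4 = buffer offset)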
%macro H_NRD 1
put_h%1_10:
    add       rsp, gprsize
    mov       r3d, %1
    xor       r4d, r4d
    mova       m6, [pad20]
.nextrow:
    movu       m2, [r5-4]
    movu       m3, [r5-2]
    movu       m4, [r5+0]
    ADDW       m2, [r5+6], m5
    ADDW       m3, [r5+4], m5
    ADDW       m4, [r5+2], m5

    FILT_H2    m2, m3, m4
    psubw      m2, m6
    mova [rsp+r4], m2
    add       r4d, mmsize*3
    add        r5, r2
    dec       r3d
    jg .nextrow
    sub       rsp, gprsize
    ret
%endmacro

INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8

%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
    mov   r5, r1
.body:
%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
    mov   r6, rsp          ; backup stack pointer
    and  rsp, ~(mmsize-1)  ; align stack

    sub  rsp, PAD
    call put_h%2_10

    sub  rsp, PAD
    call put_hv%2_10

    mov r4d, PAD-mmsize    ; H buffer
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
    lea   r5, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro

MC MC23