; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
cextern pw_255
cextern pw_512
cextern pw_2048
cextern pw_8192
cextern pw_1023
cextern pw_1024
cextern pw_4096
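; The pw_* aliases below are pmulhrsw multipliers: pmulhrsw by (1 << n) is a
; rounding right shift by (15 - n).  pw_8/pw_10/pw_12 thus give the
; unidirectional rounding shift of 14 - bitdepth, and the pw_bi_* variants the
; bidirectional shift of 15 - bitdepth (one extra bit for the averaging).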
%define pw_8 pw_512
%define pw_10 pw_2048
%define pw_12 pw_8192
%define pw_bi_10 pw_1024
%define pw_bi_12 pw_4096
%define max_pixels_8 pw_255
%define max_pixels_10 pw_1023
pw_bi_8:                times 16 dw  (1 <<  8)
max_pixels_12:          times 16 dw ((1 << 12)-1)
cextern pd_1
cextern pb_0

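; EPEL_TABLE bitdepth, times, size, isa
; Build the 4-tap epel filter table for one ISA: each of the 7 fractional
; positions is stored as two rows of interleaved coefficient pairs
; (c1,c2)/(c3,c4), ready for pmaddubsw/pmaddwd.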
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro


EPEL_TABLE  8,16, b, avx2
EPEL_TABLE 10, 8, w, avx2

EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4

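; QPEL_TABLE bitdepth, times, size, isa
; Same layout for the 8-tap qpel filters: 3 fractional positions, each stored
; as four rows of interleaved coefficient pairs.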
%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                        times %2 d%3 -10, 58
                        times %2 d%3  17, -5
                        times %2 d%3   1,  0
                        times %2 d%3  -1,  4
                        times %2 d%3 -11, 40
                        times %2 d%3  40,-11
                        times %2 d%3   4, -1
                        times %2 d%3   0,  1
                        times %2 d%3  -5, 17
                        times %2 d%3  58,-10
                        times %2 d%3   4, -1
%endmacro

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4

QPEL_TABLE  8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2

SECTION .text

%define MAX_PB_SIZE  64

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10

%if ARCH_X86_64

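; SIMPLE_BILOAD width, tab, r1, r2
; Load one row of int16_t samples from the second (bi-prediction) source into
; r1, spilling into r2 when the row does not fit in a single register.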
%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
%if %1 <= 4
    movq              %3, [%2]                                              ; load data from source2
%elif %1 <= 8
    movdqa            %3, [%2]                                              ; load data from source2
%elif %1 <= 12
%if cpuflag(avx2)
    mova              %3, [%2]
%else
    movdqa            %3, [%2]                                              ; load data from source2
    movq              %4, [%2+16]                                           ; load data from source2
%endif ;avx
%elif %1 <= 16
%if cpuflag(avx2)
    mova              %3, [%2]
%else
    movdqa            %3, [%2]                                              ; load data from source2
    movdqa            %4, [%2+16]                                           ; load data from source2
%endif ; avx
%else ; %1 = 32
    mova              %3, [%2]
    mova              %4, [%2+32]
%endif
%endmacro

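; SIMPLE_LOAD width, bitdepth, tab, r1
; Load one row of width pixels from the source, using the narrowest
; movd/movq/mov(dq)u form that covers the row at the given bit depth.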
%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd              %4, [%3]                                               ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq              %4, [%3]                                               ; load data from source
%elif notcpuflag(avx)
    movu              %4, [%3]                                               ; load data from source
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
    movdqu           %4, [%3]
%else
    movu              %4, [%3]
%endif
%endmacro


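; EPEL_FILTER: fetch the two coefficient-pair rows of the 4-tap epel filter
; selected by the mx/my index (1..7) into the given registers.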
%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
    lea              %5q, [hevc_epel_filters_avx2_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_avx2_%1
%endif
%else
%assign %%offset 16
%ifdef PIC
    lea              %5q, [hevc_epel_filters_sse4_%1]
    %define FILTER %5q
%else
    %define FILTER hevc_epel_filters_sse4_%1
%endif
%endif ;cpuflag(avx2)
    sub              %2q, 1
%if cpuflag(avx2)
    shl              %2q, 6                      ; multiply by 64
  %else
    shl              %2q, 5                      ; multiply by 32
%endif
    mova           %3, [FILTER + %2q]        ; get 2 first values of filters
    mova           %4, [FILTER + %2q+%%offset]     ; get 2 last values of filters
%endmacro

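; EPEL_HV_FILTER: load the horizontal epel filter selected by mx into m14/m15
; and the vertical filter selected by my into m12/m13 (always taken from the
; 16-bit table, since the vertical pass works on 16-bit intermediates);
; also sets r3srcq = 3*srcstride.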
%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  6
%define %%table  hevc_epel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  5
%define %%table  hevc_epel_filters_sse4_%1
%endif

%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 32
    shl              myq, %%shift                ; multiply by 32
    mova             m14, [FILTER + mxq]        ; get 2 first values of filters
    mova             m15, [FILTER + mxq+%%offset]     ; get 2 last values of filters

%if cpuflag(avx2)
%define %%table  hevc_epel_filters_avx2_10
%else
%define %%table  hevc_epel_filters_sse4_10
%endif
%ifdef PIC
    lea           r3srcq, [%%table]
    %define FILTER r3srcq
%else
    %define FILTER %%table
%endif
    mova             m12, [FILTER + myq]        ; get 2 first values of filters
    mova             m13, [FILTER + myq+%%offset]     ; get 2 last values of filters
    lea           r3srcq, [srcstrideq*3]
%endmacro

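; QPEL_FILTER bitdepth, filter index
; Load the four coefficient-pair rows of the 8-tap qpel filter selected by
; mx/my (index 1..3) into m12..m15.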
%macro QPEL_FILTER 2

%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift  7
%define %%table  hevc_qpel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift  6
%define %%table  hevc_qpel_filters_sse4_%1
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif
    sub              %2q, 1
    shl              %2q, %%shift                        ; multiply by 2^%%shift
    mova             m12, [rfilterq + %2q]               ; get filter taps 1-2
    mova             m13, [rfilterq + %2q +   %%offset]  ; get filter taps 3-4
    mova             m14, [rfilterq + %2q + 2*%%offset]  ; get filter taps 5-6
    mova             m15, [rfilterq + %2q + 3*%%offset]  ; get filter taps 7-8
%endmacro

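; EPEL_LOAD bitdepth, src, stride, width
; Load the four source rows/pixels spaced by stride and interleave them
; pairwise for pmaddubsw/pmaddwd.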
%macro EPEL_LOAD 4
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load            m0, [%2q ]
%ifnum %3
    %%load            m1, [%2q+  %3]
    %%load            m2, [%2q+2*%3]
    %%load            m3, [%2q+3*%3]
%else
    %%load            m1, [%2q+  %3q]
    %%load            m2, [%2q+2*%3q]
    %%load            m3, [%2q+r3srcq]
%endif
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 7
    SBUTTERFLY        bw, 2, 3, 7
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 7
    SBUTTERFLY        wd, 2, 3, 7
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
%endif
%endif
%endmacro


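; QPEL_H_LOAD bitdepth, src, width, scratch
; Load the eight neighbouring source groups src-3 .. src+4 (in pixel units)
; needed by the 8-tap horizontal filter and interleave them pairwise
; (scratch is the SBUTTERFLY temporary register number).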
%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movu
%endif
%endif
    %%load            m0, [%2-3*%%stride]        ;load data from source
    %%load            m1, [%2-2*%%stride]
    %%load            m2, [%2-%%stride  ]
    %%load            m3, [%2           ]
    %%load            m4, [%2+%%stride  ]
    %%load            m5, [%2+2*%%stride]
    %%load            m6, [%2+3*%%stride]
    %%load            m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY        wd, 0, 1, %4
    SBUTTERFLY        wd, 2, 3, %4
    SBUTTERFLY        wd, 4, 5, %4
    SBUTTERFLY        wd, 6, 7, %4
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY        dq, 0, 1, %4
    SBUTTERFLY        dq, 2, 3, %4
    SBUTTERFLY        dq, 4, 5, %4
    SBUTTERFLY        dq, 6, 7, %4
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

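; QPEL_V_LOAD bitdepth, src, srcstride, width, tmp
; Load the eight rows src-3*srcstride .. src+4*srcstride needed by the 8-tap
; vertical filter and interleave them pairwise.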
%macro QPEL_V_LOAD 5
    lea              %5q, [%2]
    sub              %5q, r3srcq
    movu              m0, [%5q            ]      ;load x- 3*srcstride
    movu              m1, [%5q+   %3q     ]      ;load x- 2*srcstride
    movu              m2, [%5q+ 2*%3q     ]      ;load x-srcstride
    movu              m3, [%2       ]      ;load x
    movu              m4, [%2+   %3q]      ;load x+stride
    movu              m5, [%2+ 2*%3q]      ;load x+2*stride
    movu              m6, [%2+r3srcq]      ;load x+3*stride
    movu              m7, [%2+ 4*%3q]      ;load x+4*stride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY        bw, 0, 1, 8
    SBUTTERFLY        bw, 2, 3, 8
    SBUTTERFLY        bw, 4, 5, 8
    SBUTTERFLY        bw, 6, 7, 8
%else
    punpcklbw         m0, m1
    punpcklbw         m2, m3
    punpcklbw         m4, m5
    punpcklbw         m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY        wd, 0, 1, 8
    SBUTTERFLY        wd, 2, 3, 8
    SBUTTERFLY        wd, 4, 5, 8
    SBUTTERFLY        wd, 6, 7, 8
%else
    punpcklwd         m0, m1
    punpcklwd         m2, m3
    punpcklwd         m4, m5
    punpcklwd         m6, m7
%endif
%endif
%endmacro

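; PEL_{8,10,12}STOREn dst, reg1, reg2
; Store n output samples: the 8-bit variants store packed bytes, the 10/12-bit
; variants store words (PEL_10STORE is also what the plain put functions use
; for their int16_t intermediate rows).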
%macro PEL_12STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8      %1, %2, %3
    movdqa       [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd           [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq           [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq           [%1], %2
    psrldq            %2, 8
    movd         [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa         [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa         [%1], %2
    movq        [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
%if cpuflag(avx2)
    movu            [%1], %2
%else
    PEL_10STORE8      %1, %2, %3
    movdqa       [%1+16], %3
%endif
%endmacro

%macro PEL_10STORE32 3
    PEL_10STORE16     %1, %2, %3
    movu         [%1+32], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw          [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd            [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd            [%1], %2
    pextrw        [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq           [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq            [%1], %2
    psrldq            %2, 8
    movd          [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
%if cpuflag(avx2)
    movdqu        [%1], %2
%else
    mova          [%1], %2
%endif ; avx
%endmacro
%macro PEL_8STORE32 3
    movu          [%1], %2
%endmacro

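; LOOP_END dst, src, srcstride
; Advance dst by one intermediate row (2*MAX_PB_SIZE bytes) and src by one
; source line, then loop over height.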
%macro LOOP_END 3
    add              %1q, 2*MAX_PB_SIZE          ; dst += dststride
    add              %2q, %3q                    ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
%endmacro


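; MC_PIXEL_COMPUTE width, bitdepth[, avx2]
; Widen 8-bit pixels to 16 bits (where needed) and shift left by 14-bitdepth,
; producing the pel_pixels intermediate values.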
%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && %0 == 3
%if %1 > 16
    vextracti128 xm1, m0, 1
    pmovzxbw      m1, xm1
    psllw         m1, 14-%2
%endif
    pmovzxbw      m0, xm0
%else ; not avx2
%if %1 > 8
    punpckhbw     m1, m0, m2
    psllw         m1, 14-%2
%endif
    punpcklbw     m0, m2
%endif
%endif ; %2 == 8
    psllw         m0, 14-%2
%endmacro

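; EPEL_COMPUTE bitdepth, width, filter1, filter2[, HV regs]
; Apply the 4-tap filter: multiply-accumulate the interleaved pairs with the
; two coefficient rows; for >8-bit input the dword sums are shifted right by
; bitdepth-8 and packed back to 16 bits.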
%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
%define %%reg0 %5
%define %%reg2 %6
%define %%reg1 %7
%define %%reg3 %8
%else
%define %%reg0 m0
%define %%reg2 m2
%define %%reg1 m1
%define %%reg3 m3
%endif
%if %1 == 8
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
    vperm2i128    m10, m0, m1, q0301
%endif
    vinserti128    m0, m0, xm1, 1
    mova           m1, m10
%if %2 > 16
    vperm2i128    m10, m2, m3, q0301
%endif
    vinserti128    m2, m2, xm3, 1
    mova           m3, m10
%endif
    pmaddubsw      %%reg0, %3   ;x1*c1+x2*c2
    pmaddubsw      %%reg2, %4   ;x3*c3+x4*c4
    paddw          %%reg0, %%reg2
%if %2 > 8
    pmaddubsw      %%reg1, %3
    pmaddubsw      %%reg3, %4
    paddw          %%reg1, %%reg3
%endif
%else
    pmaddwd        %%reg0, %3
    pmaddwd        %%reg2, %4
    paddd          %%reg0, %%reg2
%if %2 > 4
    pmaddwd        %%reg1, %3
    pmaddwd        %%reg3, %4
    paddd          %%reg1, %%reg3
%if %1 != 8
    psrad          %%reg1, %1-8
%endif
%endif
%if %1 != 8
    psrad          %%reg0, %1-8
%endif
    packssdw       %%reg0, %%reg1
%endif
%endmacro

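; QPEL_HV_COMPUTE width, bitdepth, filter idx, pack suffix
; 8-tap multiply-accumulate of m0..m7 with the filter selected by %3; for
; >8-bit input the dword sums are shifted back down and packed with p%4.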
%macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx, pack suffix

%if cpuflag(avx2)
%assign %%offset 32
%define %%table  hevc_qpel_filters_avx2_%2
%else
%assign %%offset 16
%define %%table  hevc_qpel_filters_sse4_%2
%endif

%ifdef PIC
    lea         rfilterq, [%%table]
%else
    %define rfilterq %%table
%endif

%if %2 == 8
    pmaddubsw         m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
    pmaddubsw         m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
    pmaddubsw         m4, [rfilterq + %3q*8+2*%%offset]   ;x5*c5+x6*c6
    pmaddubsw         m6, [rfilterq + %3q*8+3*%%offset]   ;x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%else
    pmaddwd           m0, [rfilterq + %3q*8   ]
    pmaddwd           m2, [rfilterq + %3q*8+%%offset]
    pmaddwd           m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m6, [rfilterq + %3q*8+3*%%offset]
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, [rfilterq + %3q*8   ]
    pmaddwd           m3, [rfilterq + %3q*8+%%offset]
    pmaddwd           m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd           m7, [rfilterq + %3q*8+3*%%offset]
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
    p%4               m0, m1
%endif
%endmacro

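; QPEL_COMPUTE width, bitdepth[, avx2]
; 8-tap multiply-accumulate of m0..m7 with the filter preloaded in m12..m15.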
%macro QPEL_COMPUTE 2-3     ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)

    vperm2i128 m10, m0,  m1, q0301
    vinserti128 m0, m0, xm1, 1
    SWAP 1, 10

    vperm2i128 m10, m2,  m3, q0301
    vinserti128 m2, m2, xm3, 1
    SWAP 3, 10


    vperm2i128 m10, m4,  m5, q0301
    vinserti128 m4, m4, xm5, 1
    SWAP 5, 10

    vperm2i128 m10, m6,  m7, q0301
    vinserti128 m6, m6, xm7, 1
    SWAP 7, 10
%endif

    pmaddubsw         m0, m12   ;x1*c1+x2*c2
    pmaddubsw         m2, m13   ;x3*c3+x4*c4
    pmaddubsw         m4, m14   ;x5*c5+x6*c6
    pmaddubsw         m6, m15   ;x7*c7+x8*c8
    paddw             m0, m2
    paddw             m4, m6
    paddw             m0, m4
%if %1 > 8
    pmaddubsw         m1, m12
    pmaddubsw         m3, m13
    pmaddubsw         m5, m14
    pmaddubsw         m7, m15
    paddw             m1, m3
    paddw             m5, m7
    paddw             m1, m5
%endif
%else
    pmaddwd           m0, m12
    pmaddwd           m2, m13
    pmaddwd           m4, m14
    pmaddwd           m6, m15
    paddd             m0, m2
    paddd             m4, m6
    paddd             m0, m4
%if %2 != 8
    psrad             m0, %2-8
%endif
%if %1 > 4
    pmaddwd           m1, m12
    pmaddwd           m3, m13
    pmaddwd           m5, m14
    pmaddwd           m7, m15
    paddd             m1, m3
    paddd             m5, m7
    paddd             m1, m5
%if %2 != 8
    psrad             m1, %2-8
%endif
%endif
%endif
%endmacro

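; BI_COMPUTE width, bitdepth, src1l, src1h, src2l, src2h, pw[, avx2]
; Add the two prediction sources with signed saturation, then round, shift and
; clip to the output bit depth via UNI_COMPUTE.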
%macro BI_COMPUTE 7-8     ; width, bitd, src1l, src1h, src2l, src2h, pw
    paddsw            %3, %5
%if %1 > 8
    paddsw            %4, %6
%endif
    UNI_COMPUTE       %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
    vpermq            %3, %3, 216
    vpermq            %4, %4, 216
%endif
%endmacro

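; UNI_COMPUTE width, bitdepth, srcl, srch, pw
; pmulhrsw by pw performs the rounding shift back to the pixel range, then the
; result is packed to bytes (8-bit) or clipped to [0, max_pixels] (10/12-bit).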
%macro UNI_COMPUTE 5
    pmulhrsw          %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw          %4, %5
%endif
%if %2 == 8
    packuswb          %3, %4
%else
    CLIPW             %3, [pb_0], [max_pixels_%2]
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    CLIPW             %4, [pb_0], [max_pixels_%2]
%endif
%endif
%endmacro


; ******************************
; void put_hevc_pel_pixels(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS     %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS  %1, %2
%endmacro

%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    pxor               m2, m2
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    MC_PIXEL_COMPUTE  %1, %2, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET
%endmacro

%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor              m2, m2
    movdqa            m5, [pw_bi_%2]
.loop:
    SIMPLE_LOAD       %1, %2, srcq, m0
    SIMPLE_BILOAD     %1, src2q, m3, m4
    MC_PIXEL_COMPUTE  %1, %2, 1
    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width);
; ******************************


%macro HEVC_PUT_HEVC_EPEL 2
%if cpuflag(avx2)
%define XMM_REGS  11
%else
%define XMM_REGS  8
%endif

cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, XMM_REGS, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1      dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, XMM_REGS, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa            m6, [pw_%2]
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5
    UNI_COMPUTE       %1, %2, m0, m1, m6
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa            m6, [pw_bi_%2]
    EPEL_FILTER       %2, mx, m4, m5, rfilter
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, XMM_REGS, dst, src, srcstride, height, r3src, my
    movifnidn        myd, mym
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, XMM_REGS, dst, dststride, src, srcstride, height, r3src, my
    movifnidn        myd, mym
    movdqa            m6, [pw_%2]
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5
    UNI_COMPUTE       %1, %2, m0, m1, m6
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, XMM_REGS, dst, dststride, src, srcstride, src2, height, r3src, my
    movifnidn        myd, mym
    movdqa            m6, [pw_bi_%2]
    sub             srcq, srcstrideq
    EPEL_FILTER       %2, my, m4, m5, r3src
    lea           r3srcq, [srcstrideq*3]
.loop:
    EPEL_LOAD         %2, srcq, srcstride, %1
    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    SWAP              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
    vinserti128       m2, m0, xm4, 1
    vperm2i128        m3, m0, m4, q0301
    PEL_10STORE%1     dstq, m2, m3
%else
    PEL_10STORE%1     dstq, m0, m4
%endif
%else
    PEL_10STORE%1     dstq, m0, m1
%endif
    movdqa            m4, m5
    movdqa            m5, m6
    movdqa            m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    mova              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
%else
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    mova              m4, m5
    mova              m5, m6
    mova              m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
%assign %%stride ((%2 + 7)/8)
    sub             srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m8, m1
%endif
    SWAP              m4, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP              m9, m1
%endif
    SWAP              m5, m0
    add             srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m10, m1
%endif
    SWAP              m6, m0
    add             srcq, srcstrideq
.loop:
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    SWAP             m11, m1
%endif
    SWAP              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
%if %1 > 4
    punpckhwd         m1, m4, m5
    punpckhwd         m3, m6, m7
%endif
    EPEL_COMPUTE      14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd         m4, m8, m9
    punpcklwd         m2, m10, m11
    punpckhwd         m8, m8, m9
    punpckhwd         m3, m10, m11
    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
    SIMPLE_BILOAD     %1, src2q, m8, m3
%if cpuflag(avx2)
    vinserti128       m1, m8, xm3, 1
    vperm2i128        m2, m8, m3, q0301
    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
%else
    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
%endif
%else
    SIMPLE_BILOAD     %1, src2q, m8, m9
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m4
    mova              m4, m5
    mova              m5, m6
    mova              m6, m7
%if (%1 > 8 && (%2 == 8))
    mova              m8, m9
    mova              m9, m10
    mova             m10, m11
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
    mova              m9, [pw_%2]
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2
%if %2 > 8
    packssdw          m0, m1
%endif
    UNI_COMPUTE       %1, %2, m0, m1, m9
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa            m9, [pw_bi_%2]
    QPEL_FILTER       %2, mx
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 10
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET


; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my, int width)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 4, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
    movifnidn        myd, mym
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 5, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movifnidn        myd, mym
    movdqa            m9, [pw_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r8
    QPEL_COMPUTE      %1, %2
%if %2 > 8
    packssdw          m0, m1
%endif
    UNI_COMPUTE       %1, %2, m0, m1, m9
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 6, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movifnidn        myd, mym
    movdqa            m9, [pw_bi_%2]
    lea           r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
.loop:
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE      %1, %2, 1
%if %2 > 8
    packssdw          m0, m1
%endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 2^%%shift
    shl              myq, %%shift                ; multiply by 2^%%shift
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    PEL_10STORE%1     dstq, m0, m1
%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa            m8, m9
    movdqa            m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    LOOP_END         dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 2^%%shift
    shl              myq, %%shift                ; multiply by 2^%%shift
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackusdw
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1   dstq, m0, m1

%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    mova            m8, m9
    mova            m9, m10
    mova           m10, m11
    mova           m11, m12
    mova           m12, m13
    mova           m13, m14
    mova           m14, m15
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%if cpuflag(avx2)
%assign %%shift  4
%else
%assign %%shift  3
%endif
    sub              mxq, 1
    sub              myq, 1
    shl              mxq, %%shift                ; multiply by 2^%%shift
    shl              myq, %%shift                ; multiply by 2^%%shift
    lea           r3srcq, [srcstrideq*3]
    sub             srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m8, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP              m9, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m10, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m11, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m12, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m13, m0
    add             srcq, srcstrideq
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m14, m0
    add             srcq, srcstrideq
.loop:
    QPEL_H_LOAD       %2, srcq, %1, 15
    QPEL_HV_COMPUTE   %1, %2, mx, ackssdw
    SWAP             m15, m0
    punpcklwd         m0, m8, m9
    punpcklwd         m2, m10, m11
    punpcklwd         m4, m12, m13
    punpcklwd         m6, m14, m15
%if %1 > 4
    punpckhwd         m1, m8, m9
    punpckhwd         m3, m10, m11
    punpckhwd         m5, m12, m13
    punpckhwd         m7, m14, m15
%endif
    QPEL_HV_COMPUTE   %1, 14, my, ackssdw
    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1   dstq, m0, m1

%if %1 <= 4
    movq              m8, m9
    movq              m9, m10
    movq             m10, m11
    movq             m11, m12
    movq             m12, m13
    movq             m13, m14
    movq             m14, m15
%else
    movdqa            m8, m9
    movdqa            m9, m10
    movdqa           m10, m11
    movdqa           m11, m12
    movdqa           m12, m13
    movdqa           m13, m14
    movdqa           m14, m15
%endif
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += 2*MAX_PB_SIZE
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

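; ******************************
; Weighted prediction, operating on the int16_t intermediate buffers:
; uni_w: dst = clip(((src * wx + (1 << (shift - 1))) >> shift) + ox)
; bi_w:  dst = clip((src * wx1 + src2 * wx0 +
;                    ((ox0 + ox1 + 1) << shift)) >> (shift + 1))
; with shift = 14 - bitdepth + denom; the ox terms are pre-shifted left by
; bitdepth - 8.
; ******************************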
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox
    mov             r4d, denomm
%define SHIFT  r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox
%define SHIFT  denomd
%endif
    lea           SHIFT, [SHIFT+14-%2]          ; shift = 14 - bitd + denom
%if %1 <= 4
    pxor             m1, m1
%endif
    movd             m2, wxm        ; WX
    movd             m4, SHIFT      ; shift
%if %1 <= 4
    punpcklwd        m2, m1
%else
    punpcklwd        m2, m2
%endif
    dec           SHIFT
    movdqu           m5, [pd_1]
    movd             m6, SHIFT
    pshufd           m2, m2, 0
    mov           SHIFT, oxm
    pslld            m5, m6
%if %2 != 8
    shl           SHIFT, %2-8       ; ox << (bitd - 8)
%endif
    movd             m3, SHIFT      ; OX
    pshufd           m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov           SHIFT, heightm
%endif
.loop:
   SIMPLE_LOAD        %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd         m0, m1
    pmaddwd           m0, m2
    paddd             m0, m5
    psrad             m0, m4
    paddd             m0, m3
%else
    pmulhw            m6, m0, m2
    pmullw            m0, m2
    punpckhwd         m1, m0, m6
    punpcklwd         m0, m6
    paddd             m0, m5
    paddd             m1, m5
    psrad             m0, m4
    psrad             m1, m4
    paddd             m0, m3
    paddd             m1, m3
%endif
    packssdw          m0, m1
%if %2 == 8
    packuswb          m0, m0
%else
    CLIPW             m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1
    movifnidn        r5d, denomm
%if %1 <= 4
    pxor              m1, m1
%endif
    movd              m2, wx0m         ; WX0
    lea              r5d, [r5d+14-%2]  ; shift = 14 - bitd + denom
    movd              m3, wx1m         ; WX1
    movd              m0, r5d          ; shift
%if %1 <= 4
    punpcklwd         m2, m1
    punpcklwd         m3, m1
%else
    punpcklwd         m2, m2
    punpcklwd         m3, m3
%endif
    inc              r5d
    movd              m5, r5d          ; shift+1
    pshufd            m2, m2, 0
    mov              r5d, ox0m
    pshufd            m3, m3, 0
    add              r5d, ox1m
%if %2 != 8
    shl              r5d, %2-8         ; ox << (bitd - 8)
%endif
    inc              r5d
    movd              m4, r5d          ; offset
    pshufd            m4, m4, 0
%if UNIX64
%define h heightd
%else
    mov              r5d, heightm
%define h r5d
%endif
    pslld             m4, m0

.loop:
   SIMPLE_LOAD        %1, 10, srcq,  m0
   SIMPLE_LOAD        %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd         m0, m1
    punpcklwd         m8, m1
    pmaddwd           m0, m3
    pmaddwd           m8, m2
    paddd             m0, m4
    paddd             m0, m8
    psrad             m0, m5
%else
    pmulhw            m6, m0, m3
    pmullw            m0, m3
    pmulhw            m7, m8, m2
    pmullw            m8, m2
    punpckhwd         m1, m0, m6
    punpcklwd         m0, m6
    punpckhwd         m9, m8, m7
    punpcklwd         m8, m7
    paddd             m0, m8
    paddd             m1, m9
    paddd             m0, m4
    paddd             m1, m4
    psrad             m0, m5
    psrad             m1, m5
%endif
    packssdw          m0, m1
%if %2 == 8
    packuswb          m0, m0
%else
     CLIPW            m0, [pb_0], [max_pixels_%2]
%endif
    PEL_%2STORE%1   dstq, m0, m1
    add             dstq, dststrideq             ; dst += dststride
    add             srcq, 2*MAX_PB_SIZE          ; src += srcstride
    add            src2q, 2*MAX_PB_SIZE          ; src2 += srcstride
    dec                h                         ; cmp height
    jnz               .loop                      ; height loop
    RET
%endmacro

INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL 2,  8
HEVC_PUT_HEVC_EPEL 4,  8
HEVC_PUT_HEVC_EPEL 6,  8
HEVC_PUT_HEVC_EPEL 8,  8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2,  8
HEVC_PUT_HEVC_EPEL_HV 4,  8
HEVC_PUT_HEVC_EPEL_HV 6,  8
HEVC_PUT_HEVC_EPEL_HV 8,  8
HEVC_PUT_HEVC_EPEL_HV 16, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL 4,  8
HEVC_PUT_HEVC_QPEL 8,  8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0

HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10

HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8

HEVC_PUT_HEVC_QPEL 32, 8

HEVC_PUT_HEVC_QPEL 16, 10

HEVC_PUT_HEVC_QPEL_HV 16, 10

%endif ;AVX2
%endif ; ARCH_X86_64