;******************************************************************************
;* x86-optimized functions for the CFHD decoder
;* Copyright (c) 2020 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Interleaved 16-bit coefficient pairs. The filters below interleave two rows
; or columns of samples with punpck{l,h}wd and feed the result to pmaddwd, so
; each (a, b) pair yields a*first + b*second per 32-bit lane.
factor_p1_n1:  times 4 dw  1, -1
factor_n1_p1:  times 4 dw -1,  1
factor_p11_n4: times 4 dw 11, -4
factor_p5_p4:  times 4 dw  5,  4

pd_4:          times 4 dd 4        ; rounding term added before the >> 3
pw_1:          times 8 dw 1        ; pmaddwd with all ones = sum of each pair
pw_0:          times 8 dw 0        ; lower clip bound
pw_1023:       times 8 dw 1023     ; upper clip bound, cfhd_horiz_filter_clip10
pw_4095:       times 8 dw 4095     ; upper clip bound, cfhd_horiz_filter_clip12

SECTION .text

;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clip_max
;
; Emits one horizontal inverse-wavelet filter function.
;
;   clip_max == 0    -> cfhd_horiz_filter
;                       (output, ostride, low, lwidth, high, hwidth, width,
;                        height); loops over `height` rows, no clipping.
;   clip_max == 1023 -> cfhd_horiz_filter_clip10
;   clip_max == 4095 -> cfhd_horiz_filter_clip12
;                       (output, low, high, width, + one more argument whose
;                        register is reused as the scratch/index `x`);
;                       processes a single row and clamps every output to
;                       [0, clip_max].
;
; Samples are 16-bit words; width and the stride arguments are doubled on
; entry so they can be used directly as byte offsets.
;
; Per row, with w = width in samples:
;   out[0]     = (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
;   out[1]     = ((( 5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
;   out[2i]    = (((low[i-1] - low[i+1] + 4) >> 3) + low[i] + high[i]) >> 1
;   out[2i+1]  = (((low[i+1] - low[i-1] + 4) >> 3) + low[i] - high[i]) >> 1
;   out[2w-2]  = ((( 5*low[w-1] + 4*low[w-2] - low[w-3] + 4) >> 3) + high[w-1]) >> 1
;   out[2w-1]  = (((11*low[w-1] - 4*low[w-2] + low[w-3] + 4) >> 3) - high[w-1]) >> 1
;
; NOTE(review): the vector loop always works on whole mmsize chunks and starts
; at i = 1, so its last iteration can read low/high and write output past the
; end of the row before the scalar tail rewrites the final two outputs.
; Presumably callers provide padded buffers -- confirm against the C caller.
;
; Clobbers: x, temp (and y in the generic variant), m0-m7, flags.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
; Single-row variants have no strides; alias them to width so the shared
; body below still assembles (the aliases are only used when clip_max == 0).
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    shl  ostrided, 1
    shl   lwidthd, 1
    shl   hwidthd, 1
    shl    widthd, 1

    ; y counts rows upwards from -height to 0
    mov        yd, heightd
    neg        yq
%else
    ; x86-32: only 7 GPRs. The three stride arguments arrive in the registers
    ; named x, y and temp; they are doubled, written back to their own stack
    ; slots, and the registers are then reused as scratch.
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    shl        xd, 1
    shl        yd, 1
    shl     tempd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov    tempmp, tempq

    ; y counts rows upwards from -height to 0 (height is the 8th argument)
    mov        yd, r7m
    neg        yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; enough vector registers to keep the constants resident
    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; ---- left edge, out[0] ----
    movsx          xq, word [lowq]
    imul           xq, 11

    movsx       tempq, word [lowq + 2]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    ; CLIPW compares signed words: saturate the 32-bit value to int16 first,
    ; otherwise a result above 32767 would wrap negative and clamp to 0.
    packssdw       m0, m0
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq], tempw

    ; ---- left edge, out[1] ----
    movsx          xq, word [lowq]
    imul           xq, 5

    movsx       tempq, word [lowq + 2]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    packssdw       m0, m0              ; saturate before the signed-word clip
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq + 2], tempw

    ; x = byte offset into low/high for the vector loop
    xor            xd, xd

.loop:
    ; m4/m5 = interleaved (low[i-1], low[i+1]) pairs, low/high halves
    movu           m4, [lowq + xq]
    movu           m1, [lowq + xq + 4]

    mova           m5, m4
    punpcklwd      m4, m1
    punpckhwd      m5, m1

    mova           m6, m4
    mova           m7, m5

    ; m4/m5 = low[i-1] - low[i+1] + 4, m6/m7 = low[i+1] - low[i-1] + 4
%if ARCH_X86_64
    pmaddwd        m4, m8
    pmaddwd        m5, m8
    pmaddwd        m6, m9
    pmaddwd        m7, m9

    paddd          m4, m11
    paddd          m5, m11
    paddd          m6, m11
    paddd          m7, m11
%else
    pmaddwd        m4, [factor_p1_n1]
    pmaddwd        m5, [factor_p1_n1]
    pmaddwd        m6, [factor_n1_p1]
    pmaddwd        m7, [factor_n1_p1]

    paddd          m4, [pd_4]
    paddd          m5, [pd_4]
    paddd          m6, [pd_4]
    paddd          m7, [pd_4]
%endif

    psrad          m4, 3
    psrad          m5, 3
    psrad          m6, 3
    psrad          m7, 3

    ; interleave (low[i], high[i]) pairs
    movu           m2, [lowq + xq + 2]
    movu           m3, [highq + xq + 2]

    mova           m0, m2
    punpcklwd      m2, m3
    punpckhwd      m0, m3

    mova           m1, m2
    mova           m3, m0

    ; m2/m0 = low[i] + high[i], m1/m3 = low[i] - high[i]
%if ARCH_X86_64
    pmaddwd        m2, m10
    pmaddwd        m0, m10
    pmaddwd        m1, m8
    pmaddwd        m3, m8
%else
    pmaddwd        m2, [pw_1]
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [factor_p1_n1]
    pmaddwd        m3, [factor_p1_n1]
%endif

    paddd          m2, m4
    paddd          m0, m5
    paddd          m1, m6
    paddd          m3, m7

    psrad          m2, 1
    psrad          m0, 1
    psrad          m1, 1
    psrad          m3, 1

    ; back to words (signed saturation): m2 = even outputs, m1 = odd outputs
    packssdw       m2, m0
    packssdw       m1, m3

    ; interleave even/odd into output order
    mova           m0, m2
    punpcklwd      m2, m1
    punpckhwd      m0, m1

%if %1
    CLIPW          m2, [pw_0], [pw_%1]
    CLIPW          m0, [pw_0], [pw_%1]
%endif

    ; two output samples per input sample, starting at out[2]
    movu  [outputq + xq * 2 + 4], m2
    movu  [outputq + xq * 2 + mmsize + 4], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .loop

    ; move the pointers to the end of the row so the right edge can be
    ; addressed with negative offsets
    add          lowq, widthq
    add         highq, widthq
    add       outputq, widthq
    add       outputq, widthq

    ; ---- right edge, out[2w-2] ----
    movsx          xq, word [lowq - 2]
    imul           xq, 5

    movsx       tempq, word [lowq - 4]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    packssdw       m0, m0              ; saturate before the signed-word clip
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq - 4], tempw

    ; ---- right edge, out[2w-1] ----
    movsx          xq, word [lowq - 2]
    imul           xq, 11

    movsx       tempq, word [lowq - 4]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    packssdw       m0, m0              ; saturate before the signed-word clip
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq - 2], tempw

%if %1 == 0
    ; rewind to the start of this row, then step every pointer to the next
    ; row (the output advances by two strides per input row)
    sub          lowq, widthq
    sub         highq, widthq
    sub       outputq, widthq
    sub       outputq, widthq

    add          lowq, lwidthq
    add         highq, hwidthq
    add       outputq, ostrideq
    add       outputq, ostrideq
    add            yq, 1
    jl .looph
%endif

    RET
%endmacro

; generic multi-row version, no clipping
INIT_XMM sse2
CFHD_HORIZ_FILTER 0

; single-row version, output clamped to [0, 1023]
INIT_XMM sse2
CFHD_HORIZ_FILTER 1023

; single-row version, output clamped to [0, 4095]
INIT_XMM sse2
CFHD_HORIZ_FILTER 4095

;------------------------------------------------------------------------------
; cfhd_vert_filter(output, ostride, low, lwidth, high, hwidth, width, height)
;
; Vertical inverse-wavelet filter. Works on column strips of mmsize bytes
; (8 16-bit samples): for every strip, each low/high row y produces output
; rows 2y and 2y+1.
;
;   rows 0, 1 (y = 0):
;     out[0] = (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
;     out[1] = ((( 5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
;   interior (1 <= y < height - 1):
;     out[2y]   = (((low[y-1] - low[y+1] + 4) >> 3) + low[y] + high[y]) >> 1
;     out[2y+1] = (((low[y+1] - low[y-1] + 4) >> 3) + low[y] - high[y]) >> 1
;   last rows (y = height - 1):
;     out[2y]   = ((( 5*low[y] + 4*low[y-1] - low[y-2] + 4) >> 3) + high[y]) >> 1
;     out[2y+1] = (((11*low[y] - 4*low[y-1] + low[y-2] + 4) >> 3) - high[y]) >> 1
;
; (low[k] / high[k] / out[k] denote row k of the current strip.)
;
; width and the three strides are doubled on entry so they can be used as
; byte offsets for the 16-bit samples.
;
; NOTE(review): the interior loop is bottom-tested, so it runs at least once;
; the code appears to assume height >= 3 -- confirm against callers. Strips
; are always a full mmsize wide, so the last strip may extend past `width`.
;
; Clobbers: x, y, pos, m0-m7, flags.
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    shl        ostrided, 1
    shl         lwidthd, 1
    shl         hwidthd, 1
    shl          widthd, 1

    ; the interior loop stops one row short of the end
    dec   heightd

    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
    mova      m12, [factor_p11_n4]
    mova      m13, [factor_p5_p4]
%else
    ; x86-32: only 7 GPRs. The three stride arguments arrive in the registers
    ; named x, y and pos; they are doubled, written back to their own stack
    ; slots, and the registers are then reused as scratch.
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    shl        xd, 1
    shl        yd, 1
    shl      posd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov     posmp, posq

    ; height - 1 is kept in the width argument's stack slot; the doubled
    ; width itself stays in its register
    mov        xq, r7m
    dec        xq
    mov   widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    ; x = byte offset of the current column strip
    xor        xq, xq
.loopw:
    xor        yq, yq

    ; ---- output row 0 ----
    ; 11*low[0] - 4*low[1]
    mov      posq, xq
    movu       m0, [lowq + posq]
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    ; sign-extend low[2] to dwords: place each word in the upper half of a
    ; zeroed dword, then shift right arithmetically by 16
    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; + high[0] (sign-extended the same way)
    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    movu    [outputq + posq], m0

    ; ---- output row 1 ----
    ; 5*low[0] + 4*low[1]
    movu       m0, [lowq + posq]
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    ; - low[2]
    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; - high[0]
    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    add      posq, ostrideq
    movu    [outputq + posq], m0

    ; ---- interior rows: y = 1 .. height - 2 ----
    add        yq, 1
.looph:
    ; pos -> low[y-1]
    mov      posq, lwidthq
    imul     posq, yq
    sub      posq, lwidthq
    add      posq, xq

    movu       m4, [lowq + posq]

    ; pos -> low[y+1]
    add      posq, lwidthq
    add      posq, lwidthq
    movu       m1, [lowq + posq]

    mova       m5, m4
    punpcklwd  m4, m1
    punpckhwd  m5, m1

    mova       m6, m4
    mova       m7, m5

    ; m4/m5 = low[y-1] - low[y+1] + 4, m6/m7 = low[y+1] - low[y-1] + 4
%if ARCH_X86_64
    pmaddwd    m4, m8
    pmaddwd    m5, m8
    pmaddwd    m6, m9
    pmaddwd    m7, m9

    paddd      m4, m11
    paddd      m5, m11
    paddd      m6, m11
    paddd      m7, m11
%else
    pmaddwd    m4, [factor_p1_n1]
    pmaddwd    m5, [factor_p1_n1]
    pmaddwd    m6, [factor_n1_p1]
    pmaddwd    m7, [factor_n1_p1]

    paddd      m4, [pd_4]
    paddd      m5, [pd_4]
    paddd      m6, [pd_4]
    paddd      m7, [pd_4]
%endif

    psrad      m4, 3
    psrad      m5, 3
    psrad      m6, 3
    psrad      m7, 3

    ; pos -> low[y]
    sub      posq, lwidthq
    movu       m0, [lowq + posq]

    ; pos -> high[y]
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    movu       m1, [highq + posq]

    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    mova       m1, m0
    mova       m3, m2

    ; m0/m2 = low[y] + high[y], m1/m3 = low[y] - high[y]
%if ARCH_X86_64
    pmaddwd    m0, m10
    pmaddwd    m2, m10
    pmaddwd    m1, m8
    pmaddwd    m3, m8
%else
    pmaddwd    m0, [pw_1]
    pmaddwd    m2, [pw_1]
    pmaddwd    m1, [factor_p1_n1]
    pmaddwd    m3, [factor_p1_n1]
%endif

    paddd      m0, m4
    paddd      m2, m5
    paddd      m1, m6
    paddd      m3, m7

    psrad      m0, 1
    psrad      m2, 1
    psrad      m1, 1
    psrad      m3, 1

    packssdw   m0, m2
    packssdw   m1, m3

    ; pos -> output row 2y, then 2y + 1
    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq

    movu    [outputq + posq], m0
    add      posq, ostrideq
    movu    [outputq + posq], m1

    add        yq, 1
    cmp        yq, heightq
    jl .looph

    ; ---- last input row (y = height - 1), output row 2y ----
    ; 5*low[y] + 4*low[y-1]
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    ; - low[y-2]
    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; + high[y]
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq
    movu    [outputq + posq], m0

    ; ---- last input row, output row 2y + 1 ----
    ; 11*low[y] - 4*low[y-1]
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    ; + low[y-2]
    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; - high[y]
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, ostrideq
    add      posq, xq
    movu    [outputq + posq], m0

    ; next column strip
    add        xq, mmsize
    cmp        xq, widthq
    jl .loopw
    RET
698