1;******************************************************************************
2;* V210 SIMD pack
3;* Copyright (c) 2014 Kieran Kunhya <kierank@obe.tv>
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
SECTION_RODATA 32

; Clamp bounds that live in other objects; only aliased here.
; NOTE(review): the names suggest pw_4 = 4 in every word, pb_1 = 1 and
; pb_FE = 0xFE in every byte -- confirm against their definitions.
cextern pw_4
cextern pb_1
cextern pb_FE
%define v210_enc_min_10 pw_4
%define v210_enc_min_8  pb_1
%define v210_enc_max_8  pb_FE

; Upper clamp for 10-bit samples (one value per 16-bit lane, 32 bytes).
v210_enc_max_10:         times 16 dw 0x3fb

; ---- 10-bit packing -------------------------------------------------------
; Every table below is written once per 128-bit lane ("times 2") so the same
; label serves both the XMM and the YMM code paths.
;
; Luma: the word multipliers are powers of two, i.e. per-lane left shifts
; (<<2, <<0, <<4, ...); the byte shuffle then drops each shifted sample at its
; final position inside the four output dwords (-1 selects a zero byte).
v210_enc_luma_mult_10:   times 2 dw 4,1,16,4,1,16,0,0
v210_enc_luma_shuf_10:   times 2 db -1,0,1,-1,2,3,4,5,-1,6,7,-1,8,9,10,11

; Chroma: input lanes are four U words followed by four V words; the fourth
; word of each group is multiplied by zero and therefore discarded.
v210_enc_chroma_mult_10: times 2 dw 1,4,16,0,16,1,4,0
v210_enc_chroma_shuf_10: times 2 db 0,1,8,9,-1,2,3,-1,10,11,4,5,-1,12,13,-1

; ---- 8-bit packing --------------------------------------------------------
; Luma bytes 6..11 are zero-extended to words 0..5 (words 6,7 become zero).
v210_enc_luma_shuf_8:    times 2 db 6,-1,7,-1,8,-1,9,-1,10,-1,11,-1,-1,-1,-1,-1
; Same shifts as the 10-bit luma table with an extra <<2 (8 -> 10 bits).
v210_enc_luma_mult_8:    times 2 dw 16,4,64,16,4,64,0,0

; Zero-extend chroma bytes to words: shuf1 picks U/V bytes 0..3 (first output
; vector), shuf2 picks U/V bytes 3,4,5,7 (second output vector).
v210_enc_chroma_shuf1_8: times 2 db 0,-1,1,-1,2,-1,3,-1,8,-1,9,-1,10,-1,11,-1
v210_enc_chroma_shuf2_8: times 2 db 3,-1,4,-1,5,-1,7,-1,11,-1,12,-1,13,-1,15,-1

; Same shifts as the 10-bit chroma table with an extra <<2 (8 -> 10 bits).
v210_enc_chroma_mult_8:  times 2 dw 4,16,64,0,64,4,16,0
48
49SECTION .text
50
%macro v210_planar_pack_10 0

;------------------------------------------------------------------------------
; void v210_planar_pack_10(const uint16_t *y, const uint16_t *u,
;                          const uint16_t *v, uint8_t *dst, ptrdiff_t width)
;
; Packs planar 10-bit 4:2:2 samples into 32-bit v210 words.
;
; Per iteration and per 128-bit lane: 6 luma + 3 U + 3 V samples are clamped
; to [v210_enc_min_10, v210_enc_max_10] and emitted as four dwords:
;     dword0 = U0 | Y0 << 10 | V0 << 20
;     dword1 = Y1 | U1 << 10 | Y2 << 20
;     dword2 = V1 | Y3 << 10 | U2 << 20
;     dword3 = Y4 | V2 << 10 | Y5 << 20
;
; width : number of luma samples. The loop consumes (mmsize*3)/8 luma samples
;         per pass (6 for XMM, 12 for YMM), so width is expected to be a
;         multiple of that; a partial trailing group is processed as a full
;         one. width <= 0 returns without touching memory.
; Loads are 16 bytes wide for luma (only 12 are used) and 8 bytes for each
; chroma plane (only 6 are used), so the sources are read slightly past the
; samples actually packed.
;
; Registers: m0 = luma / result, m1 = chroma, m2 = min, m3 = max,
;            m4 = scratch for the upper chroma lane (AVX2 only).
;------------------------------------------------------------------------------
cglobal v210_planar_pack_10, 5, 5, 4+cpuflag(avx2), y, u, v, dst, width
    ; The loop below is bottom-tested and would otherwise run once (and write
    ; mmsize bytes to dst) even when there is nothing to pack.
    test    widthq, widthq
    jle .end

    ; Point y/u/v at the end of the data and count a negative index up to
    ; zero. widthq doubles as the byte offset into the chroma planes
    ; (width/2 samples * 2 bytes) and, scaled by 2, into the luma plane.
    lea     yq, [yq+2*widthq]
    add     uq, widthq
    add     vq, widthq
    neg     widthq

    mova    m2, [v210_enc_min_10]
    mova    m3, [v210_enc_max_10]

.loop:
    ; luma: 6 samples per lane (second lane starts 12 bytes further on)
    movu        xm0, [yq+2*widthq]
%if cpuflag(avx2)
    vinserti128 m0,   m0, [yq+widthq*2+12], 1
%endif
    CLIPW   m0, m2, m3

    ; chroma: low qword = U samples, high qword = V samples
    movq         xm1, [uq+widthq]
    movhps       xm1, [vq+widthq]
%if cpuflag(avx2)
    movq         xm4, [uq+widthq+6]
    movhps       xm4, [vq+widthq+6]
    vinserti128  m1,   m1, xm4, 1
%endif
    CLIPW   m1, m2, m3

    ; shift each sample to its bit offset (multiply by a power of two), then
    ; move the bytes to their place in the output dwords
    pmullw  m0, [v210_enc_luma_mult_10]
    pshufb  m0, [v210_enc_luma_shuf_10]

    pmullw  m1, [v210_enc_chroma_mult_10]
    pshufb  m1, [v210_enc_chroma_shuf_10]

    ; luma and chroma occupy disjoint bits, so OR merges them
    por     m0, m1

    movu    [dstq], m0

    add     dstq, mmsize
    add     widthq, (mmsize*3)/8
    jl .loop

.end:
    RET
%endmacro

%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
v210_planar_pack_10
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_10
%endif
105
%macro v210_planar_pack_8 0

;------------------------------------------------------------------------------
; void v210_planar_pack_8(const uint8_t *y, const uint8_t *u,
;                         const uint8_t *v, uint8_t *dst, ptrdiff_t width)
;
; Packs planar 8-bit 4:2:2 samples into 32-bit v210 words. Samples are clamped
; to [v210_enc_min_8, v210_enc_max_8], widened to 16 bits, shifted left by two
; extra bits (folded into the *_mult_8 tables) and then packed with the same
; shuffles as the 10-bit version.
;
; width : number of luma samples. Each pass handles 12 luma samples per
;         128-bit lane (12 for XMM, 24 for YMM) and writes 2*mmsize bytes, so
;         width is expected to be a multiple of that; a partial trailing group
;         is processed as a full one. width <= 0 returns without touching
;         memory.
; Loads are 16 bytes wide for luma (only 12 are used) and 8 bytes for each
; chroma plane (only 6 are used), so the sources are read slightly past the
; samples actually packed.
;
; Registers: m0/m1 = first/second group of 6 luma per lane, then the results
;            m2/m3 = chroma for the first/second output vector
;            m4 = min, m5 = max, m6 = zero
;------------------------------------------------------------------------------
cglobal v210_planar_pack_8, 5, 5, 7, y, u, v, dst, width
    ; The loop below is bottom-tested and would otherwise run once (and write
    ; 2*mmsize bytes to dst) even when there is nothing to pack.
    test    widthq, widthq
    jle .end

    ; Point y/u/v at the end of the data; widthq becomes the negative chroma
    ; index (width/2) and is scaled by 2 when addressing luma.
    add     yq, widthq
    shr     widthq, 1
    add     uq, widthq
    add     vq, widthq
    neg     widthq

    mova    m4, [v210_enc_min_8]
    mova    m5, [v210_enc_max_8]
    pxor    m6, m6

.loop:
    ; luma: 12 samples per lane (second lane starts 12 bytes further on)
    movu        xm1, [yq+widthq*2]
%if cpuflag(avx2)
    vinserti128 m1,   m1, [yq+widthq*2+12], 1
%endif
    CLIPUB  m1, m4, m5

    ; m0 = luma bytes 0..7 widened to words (words 6,7 are zeroed by the
    ; multiply below)
    punpcklbw m0, m1, m6
    ; can't unpack high bytes in the same way because we process
    ; only six bytes at a time
    pshufb  m1, [v210_enc_luma_shuf_8]

    pmullw  m0, [v210_enc_luma_mult_8]
    pmullw  m1, [v210_enc_luma_mult_8]
    pshufb  m0, [v210_enc_luma_shuf_10]
    pshufb  m1, [v210_enc_luma_shuf_10]

    ; chroma: low qword = U bytes, high qword = V bytes
    movq         xm3, [uq+widthq]
    movhps       xm3, [vq+widthq]
%if cpuflag(avx2)
    movq         xm2, [uq+widthq+6]
    movhps       xm2, [vq+widthq+6]
    vinserti128  m3,   m3, xm2, 1
%endif
    CLIPUB  m3, m4, m5

    ; shuffle and multiply to get the same packing as in 10-bit
    pshufb  m2, m3, [v210_enc_chroma_shuf1_8]
    pshufb  m3, [v210_enc_chroma_shuf2_8]

    pmullw  m2, [v210_enc_chroma_mult_8]
    pmullw  m3, [v210_enc_chroma_mult_8]
    pshufb  m2, [v210_enc_chroma_shuf_10]
    pshufb  m3, [v210_enc_chroma_shuf_10]

    ; luma and chroma occupy disjoint bits, so OR merges them
    por     m0, m2
    por     m1, m3

    ; Output order is lane-interleaved: low lanes of m0/m1 first, then (AVX2)
    ; their high lanes.
    movu         [dstq],    xm0
    movu         [dstq+16], xm1
%if cpuflag(avx2)
    vextracti128 [dstq+32], m0, 1
    vextracti128 [dstq+48], m1, 1
%endif

    add     dstq, 2*mmsize
    add     widthq, (mmsize*3)/8
    jl .loop

.end:
    RET
%endmacro

%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
v210_planar_pack_8
%endif
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
v210_planar_pack_8
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
v210_planar_pack_8
%endif
185