1 /*****************************************************************************
2  * quant.c: ppc quantization
3  *****************************************************************************
4  * Copyright (C) 2007-2014 x264 project
5  *
6  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21  *
22  * This program is also available under a commercial proprietary license.
23  * For more information, contact us at licensing@x264.com.
24  *****************************************************************************/
25 
26 #include "common/common.h"
27 #include "ppccommon.h"
28 #include "quant.h"
29 
30 #if !HIGH_BIT_DEPTH
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
//
// QUANT_16_U( idx0, idx1 ): quantize the two 8-coefficient vectors of `dct`
// at byte offsets idx0 and idx1, in place.  Per coefficient this computes
//   q = ((|coef| + bias) * mf) >> i_qbits
// with saturating bias add, then restores the original sign (xor with the
// sign mask plus add of mask&1 == two's-complement negate for negatives).
// Nonzero status of all results is OR-accumulated into `nz`.
// Expects the caller to declare: temp1v/temp2v, mfvA/B, biasvA/B, mskA/B,
// coefvA/B, multEven/OddvA/B, i_qbitsv, one, nz, and LOAD_ZERO's zero_s16v.
#define QUANT_16_U( idx0, idx1 )                                    \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mfvA = vec_ld((idx0), mf);                                      \
    mfvB = vec_ld((idx1), mf);                                      \
    biasvA = vec_ld((idx0), bias);                                  \
    biasvB = vec_ld((idx1), bias);                                  \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_adds(coefvA, biasvA);                              \
    coefvB = vec_adds(coefvB, biasvB);                              \
    multEvenvA = vec_mule(coefvA, mfvA);                            \
    multOddvA = vec_mulo(coefvA, mfvA);                             \
    multEvenvB = vec_mule(coefvB, mfvB);                            \
    multOddvB = vec_mulo(coefvB, mfvB);                             \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                  \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                  \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}
64 
x264_quant_4x4_altivec(int16_t dct[16],uint16_t mf[16],uint16_t bias[16])65 int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
66 {
67     LOAD_ZERO;
68     vector bool short mskA;
69     vec_u32_t i_qbitsv;
70     vec_u16_t coefvA;
71     vec_u32_t multEvenvA, multOddvA;
72     vec_u16_t mfvA;
73     vec_u16_t biasvA;
74     vec_s16_t one = vec_splat_s16(1);;
75     vec_s16_t nz = zero_s16v;
76 
77     vector bool short mskB;
78     vec_u16_t coefvB;
79     vec_u32_t multEvenvB, multOddvB;
80     vec_u16_t mfvB;
81     vec_u16_t biasvB;
82 
83     vec_s16_t temp1v, temp2v;
84 
85     vec_u32_u qbits_u;
86     qbits_u.s[0]=16;
87     i_qbitsv = vec_splat(qbits_u.v, 0);
88 
89     QUANT_16_U( 0, 16 );
90     return vec_any_ne(nz, zero_s16v);
91 }
92 
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
//
// QUANT_16_U_DC( idx0, idx1 ): same scheme as QUANT_16_U, but the multiply
// factor and bias are single splatted scalars (mfv, biasv) rather than
// per-coefficient tables, as all DC coefficients share one quantizer.
// NOTE(review): bias and sign-restore use modular vec_add here, vs. the
// saturating vec_adds in QUANT_16_U -- presumably safe because DC inputs are
// bounded, but confirm against upstream x264.
#define QUANT_16_U_DC( idx0, idx1 )                                 \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_add(coefvA, biasv);                                \
    coefvB = vec_add(coefvB, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvB = vec_mule(coefvB, mfv);                             \
    multOddvB = vec_mulo(coefvB, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}
122 
x264_quant_4x4_dc_altivec(int16_t dct[16],int mf,int bias)123 int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
124 {
125     LOAD_ZERO;
126     vector bool short mskA;
127     vec_u32_t i_qbitsv;
128     vec_u16_t coefvA;
129     vec_u32_t multEvenvA, multOddvA;
130     vec_s16_t one = vec_splat_s16(1);
131     vec_s16_t nz = zero_s16v;
132 
133     vector bool short mskB;
134     vec_u16_t coefvB;
135     vec_u32_t multEvenvB, multOddvB;
136 
137     vec_s16_t temp1v, temp2v;
138 
139     vec_u16_t mfv;
140     vec_u16_t biasv;
141 
142     vec_u16_u mf_u;
143     mf_u.s[0]=mf;
144     mfv = vec_splat( mf_u.v, 0 );
145 
146     vec_u32_u qbits_u;
147     qbits_u.s[0]=16;
148     i_qbitsv = vec_splat(qbits_u.v, 0);
149 
150     vec_u16_u bias_u;
151     bias_u.s[0]=bias;
152     biasv = vec_splat(bias_u.v, 0);
153 
154     QUANT_16_U_DC( 0, 16 );
155     return vec_any_ne(nz, zero_s16v);
156 }
157 
// DC quant of a whole 2x2 block
//
// QUANT_4_U_DC( idx0 ): quantize only the first 4 halfwords of the vector at
// byte offset idx0 of `dct`, using splatted scalar mfv/biasv (same formula as
// QUANT_16_U_DC).  The `sel` mask merges the 4 quantized values back over the
// original vector so the upper 4 halfwords are stored unchanged.  Note that
// `nz` accumulates the whole merged vector; the caller must mask out the
// upper lanes before testing (see x264_quant_2x2_dc_altivec's mask2).
#define QUANT_4_U_DC( idx0 )                                        \
{                                                                   \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
    temp1v = vec_ld((idx0), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvA = vec_add(coefvA, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                                 \
    temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
    temp1v = vec_sel(temp1v, temp2v, sel);                          \
    nz = vec_or(nz, temp1v);                                        \
    vec_st(temp1v, (idx0), dct);                                    \
}
177 
x264_quant_2x2_dc_altivec(int16_t dct[4],int mf,int bias)178 int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
179 {
180     LOAD_ZERO;
181     vector bool short mskA;
182     vec_u32_t i_qbitsv;
183     vec_u16_t coefvA;
184     vec_u32_t multEvenvA, multOddvA;
185     vec_s16_t one = vec_splat_s16(1);
186     vec_s16_t nz = zero_s16v;
187 
188     vec_s16_t temp1v, temp2v;
189 
190     vec_u16_t mfv;
191     vec_u16_t biasv;
192 
193     vec_u16_u mf_u;
194     mf_u.s[0]=mf;
195     mfv = vec_splat( mf_u.v, 0 );
196 
197     vec_u32_u qbits_u;
198     qbits_u.s[0]=16;
199     i_qbitsv = vec_splat(qbits_u.v, 0);
200 
201     vec_u16_u bias_u;
202     bias_u.s[0]=bias;
203     biasv = vec_splat(bias_u.v, 0);
204 
205     static const vec_s16_t mask2 = CV(-1, -1, -1, -1,  0, 0, 0, 0);
206     QUANT_4_U_DC(0);
207     return vec_any_ne(vec_and(nz, mask2), zero_s16v);
208 }
209 
x264_quant_8x8_altivec(int16_t dct[64],uint16_t mf[64],uint16_t bias[64])210 int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
211 {
212     LOAD_ZERO;
213     vector bool short mskA;
214     vec_u32_t i_qbitsv;
215     vec_u16_t coefvA;
216     vec_u32_t multEvenvA, multOddvA;
217     vec_u16_t mfvA;
218     vec_u16_t biasvA;
219     vec_s16_t one = vec_splat_s16(1);;
220     vec_s16_t nz = zero_s16v;
221 
222     vector bool short mskB;
223     vec_u16_t coefvB;
224     vec_u32_t multEvenvB, multOddvB;
225     vec_u16_t mfvB;
226     vec_u16_t biasvB;
227 
228     vec_s16_t temp1v, temp2v;
229 
230     vec_u32_u qbits_u;
231     qbits_u.s[0]=16;
232     i_qbitsv = vec_splat(qbits_u.v, 0);
233 
234     for( int i = 0; i < 4; i++ )
235         QUANT_16_U( i*2*16, i*2*16+16 );
236     return vec_any_ne(nz, zero_s16v);
237 }
238 
// DEQUANT_SHL: dequant by left shift,
//   dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits   (i_qbits >= 0)
// Loads 8 s16 coefficients and the two corresponding s32 mf vectors, packs
// the mf values to s16 with saturation, multiplies via mule/mulo, repacks
// the 32-bit products to s16 with saturation, then applies the left shift.
// Expects in scope: dct, dequant_mf, i_mf, y, dctv, mf1v, mf2v, mfv,
// multEvenvA, multOddvA, i_qbitsv.
#define DEQUANT_SHL()                                                \
{                                                                    \
    dctv = vec_ld(8*y, dct);                                         \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                           \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                        \
    mfv  = vec_packs(mf1v, mf2v);                                    \
                                                                     \
    multEvenvA = vec_mule(dctv, mfv);                                \
    multOddvA = vec_mulo(dctv, mfv);                                 \
    dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA),  \
                                 vec_mergel(multEvenvA, multOddvA)); \
    dctv = vec_sl(dctv, i_qbitsv);                                   \
    vec_st(dctv, 8*y, dct);                                          \
}
253 
// DEQUANT_SHR: dequant by right shift with rounding,
//   dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> i_qbits   (shift > 0)
// Each s16 coefficient is duplicated (mergeh/mergel) so that mule/mulo
// against the s32 mf vector reinterpreted as s16 yield the two 16x16 partial
// products; (even << 16) + odd combines them into the full 32-bit product
// dct*mf (presumably relying on mf fitting the required range -- confirm
// against upstream).  The rounding constant fv is added and the sum is
// arithmetically shifted right by i_qbitsv before packing back to s16 with
// saturation.  Expects in scope: dct, dequant_mf, i_mf, y, dctv, dct1v,
// dct2v, mf1v, mf2v, multEvenvA, multOddvA, temp1v, temp2v, fv, sixteenv,
// i_qbitsv.
#define DEQUANT_SHR()                                          \
{                                                              \
    dctv = vec_ld(8*y, dct);                                   \
    dct1v = vec_mergeh(dctv, dctv);                            \
    dct2v = vec_mergel(dctv, dctv);                            \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                     \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                  \
                                                               \
    multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v);             \
    multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v);              \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
    temp1v = vec_add(temp1v, fv);                              \
    temp1v = vec_sra(temp1v, i_qbitsv);                        \
                                                               \
    multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v);             \
    multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v);              \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
    temp2v = vec_add(temp2v, fv);                              \
    temp2v = vec_sra(temp2v, i_qbitsv);                        \
                                                               \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);               \
    vec_st(dctv, y*8, dct);                                    \
}
277 
x264_dequant_4x4_altivec(int16_t dct[16],int dequant_mf[6][16],int i_qp)278 void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
279 {
280     int i_mf = i_qp%6;
281     int i_qbits = i_qp/6 - 4;
282 
283     vec_s16_t dctv;
284     vec_s16_t dct1v, dct2v;
285     vec_s32_t mf1v, mf2v;
286     vec_s16_t mfv;
287     vec_s32_t multEvenvA, multOddvA;
288     vec_s32_t temp1v, temp2v;
289 
290     if( i_qbits >= 0 )
291     {
292         vec_u16_t i_qbitsv;
293         vec_u16_u qbits_u;
294         qbits_u.s[0]=i_qbits;
295         i_qbitsv = vec_splat(qbits_u.v, 0);
296 
297         for( int y = 0; y < 4; y+=2 )
298             DEQUANT_SHL();
299     }
300     else
301     {
302         const int f = 1 << (-i_qbits-1);
303 
304         vec_s32_t fv;
305         vec_u32_u f_u;
306         f_u.s[0]=f;
307         fv = (vec_s32_t)vec_splat(f_u.v, 0);
308 
309         vec_u32_t i_qbitsv;
310         vec_u32_u qbits_u;
311         qbits_u.s[0]=-i_qbits;
312         i_qbitsv = vec_splat(qbits_u.v, 0);
313 
314         vec_u32_t sixteenv;
315         vec_u32_u sixteen_u;
316         sixteen_u.s[0]=16;
317         sixteenv = vec_splat(sixteen_u.v, 0);
318 
319         for( int y = 0; y < 4; y+=2 )
320             DEQUANT_SHR();
321     }
322 }
323 
x264_dequant_8x8_altivec(int16_t dct[64],int dequant_mf[6][64],int i_qp)324 void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
325 {
326     int i_mf = i_qp%6;
327     int i_qbits = i_qp/6 - 6;
328 
329     vec_s16_t dctv;
330     vec_s16_t dct1v, dct2v;
331     vec_s32_t mf1v, mf2v;
332     vec_s16_t mfv;
333     vec_s32_t multEvenvA, multOddvA;
334     vec_s32_t temp1v, temp2v;
335 
336     if( i_qbits >= 0 )
337     {
338         vec_u16_t i_qbitsv;
339         vec_u16_u qbits_u;
340         qbits_u.s[0]=i_qbits;
341         i_qbitsv = vec_splat(qbits_u.v, 0);
342 
343         for( int y = 0; y < 16; y+=2 )
344             DEQUANT_SHL();
345     }
346     else
347     {
348         const int f = 1 << (-i_qbits-1);
349 
350         vec_s32_t fv;
351         vec_u32_u f_u;
352         f_u.s[0]=f;
353         fv = (vec_s32_t)vec_splat(f_u.v, 0);
354 
355         vec_u32_t i_qbitsv;
356         vec_u32_u qbits_u;
357         qbits_u.s[0]=-i_qbits;
358         i_qbitsv = vec_splat(qbits_u.v, 0);
359 
360         vec_u32_t sixteenv;
361         vec_u32_u sixteen_u;
362         sixteen_u.s[0]=16;
363         sixteenv = vec_splat(sixteen_u.v, 0);
364 
365         for( int y = 0; y < 16; y+=2 )
366             DEQUANT_SHR();
367     }
368 }
369 #endif // !HIGH_BIT_DEPTH
370 
371