/*****************************************************************************
 * quant.c: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2014 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"

#if !HIGH_BIT_DEPTH
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U( idx0, idx1 )                                    \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mfvA = vec_ld((idx0), mf);                                      \
    mfvB = vec_ld((idx1), mf);                                      \
    biasvA = vec_ld((idx0), bias);                                  \
    biasvB = vec_ld((idx1), bias);                                  \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_adds(coefvA, biasvA);                              \
    coefvB = vec_adds(coefvB, biasvB);                              \
    multEvenvA = vec_mule(coefvA, mfvA);                            \
    multOddvA = vec_mulo(coefvA, mfvA);                             \
    multEvenvB = vec_mule(coefvB, mfvB);                            \
    multOddvB = vec_mulo(coefvB, mfvB);                             \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                  \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                  \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

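/* For reference, each 16-bit lane above computes the same thing as x264's
 * scalar quant; a minimal sketch (the vector path additionally saturates in
 * vec_adds/vec_packs):
 *
 *     for( int i = 0; i < 16; i++ )
 *     {
 *         int sign = dct[i] < 0 ? -1 : 1;
 *         dct[i] = sign * (((abs(dct[i]) + bias[i]) * mf[i]) >> 16);
 *     }
 *
 * The sign is restored branchlessly: xor with the comparison mask flips the
 * bits of negative lanes, and adding (mask & 1) completes the two's-complement
 * negation. */
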
int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v;

    vec_u32_u qbits_u;
    qbits_u.s[0] = 16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    QUANT_16_U( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

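/* The i_qbitsv setup above is the scalar-broadcast idiom used throughout this
 * file: write the value into lane 0 through one of ppccommon.h's vector/array
 * unions (vec_u32_u etc.), then vec_splat it across all lanes.  The same
 * pattern recurs below for mf, bias, f and the shift counts. */
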
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 )                                 \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_add(coefvA, biasv);                                \
    coefvB = vec_add(coefvB, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvB = vec_mule(coefvB, mfv);                             \
    multOddvB = vec_mulo(coefvB, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

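/* Same flow as QUANT_16_U, but the DC path applies a single quantization
 * scale and bias to every lane, so mfv and biasv are splatted scalars rather
 * than per-coefficient tables.  A scalar sketch of one lane:
 *
 *     int sign = dct[i] < 0 ? -1 : 1;
 *     dct[i] = sign * (((abs(dct[i]) + bias) * mf) >> 16);
 */
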
int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    vec_u16_u mf_u;
    mf_u.s[0] = mf;
    mfv = vec_splat( mf_u.v, 0 );

    vec_u32_u qbits_u;
    qbits_u.s[0] = 16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    vec_u16_u bias_u;
    bias_u.s[0] = bias;
    biasv = vec_splat(bias_u.v, 0);

    QUANT_16_U_DC( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

// DC quant of a whole 2x2 block
#define QUANT_4_U_DC( idx0 )                                        \
{                                                                   \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
    temp1v = vec_ld((idx0), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvA = vec_add(coefvA, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                                 \
    temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
    temp1v = vec_sel(temp1v, temp2v, sel);                          \
    nz = vec_or(nz, temp1v);                                        \
    vec_st(temp1v, (idx0), dct);                                    \
}

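/* A 2x2 DC block only occupies the first four int16_t slots, but vector
 * loads and stores always move a full 16 bytes.  The sel mask therefore
 * merges the four quantized lanes back over the untouched tail of the
 * original vector before the store, so memory beyond dct[3] keeps its
 * original contents. */
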
int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    vec_u16_u mf_u;
    mf_u.s[0] = mf;
    mfv = vec_splat( mf_u.v, 0 );

    vec_u32_u qbits_u;
    qbits_u.s[0] = 16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    vec_u16_u bias_u;
    bias_u.s[0] = bias;
    biasv = vec_splat(bias_u.v, 0);

    static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);
    QUANT_4_U_DC(0);
    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}

int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v;

    vec_u32_u qbits_u;
    qbits_u.s[0] = 16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    for( int i = 0; i < 4; i++ )
        QUANT_16_U( i*2*16, i*2*16+16 );
    return vec_any_ne(nz, zero_s16v);
}

#define DEQUANT_SHL()                                               \
{                                                                   \
    dctv = vec_ld(8*y, dct);                                        \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                          \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                       \
    mfv = vec_packs(mf1v, mf2v);                                    \
                                                                    \
    multEvenvA = vec_mule(dctv, mfv);                               \
    multOddvA = vec_mulo(dctv, mfv);                                \
    dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), \
                                 vec_mergel(multEvenvA, multOddvA));\
    dctv = vec_sl(dctv, i_qbitsv);                                  \
    vec_st(dctv, 8*y, dct);                                         \
}

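/* Used when i_qbits >= 0: the dequant scale tables are 32-bit, so two vectors
 * of them are saturation-packed down to 16 bits before the multiply.  Per
 * coefficient this is equivalent to the scalar
 *
 *     dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits;
 */
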
#define DEQUANT_SHR()                                               \
{                                                                   \
    dctv = vec_ld(8*y, dct);                                        \
    dct1v = vec_mergeh(dctv, dctv);                                 \
    dct2v = vec_mergel(dctv, dctv);                                 \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                          \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                       \
                                                                    \
    multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v);                  \
    multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v);                   \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);      \
    temp1v = vec_add(temp1v, fv);                                   \
    temp1v = vec_sra(temp1v, i_qbitsv);                             \
                                                                    \
    multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v);                  \
    multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v);                   \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);      \
    temp2v = vec_add(temp2v, fv);                                   \
    temp2v = vec_sra(temp2v, i_qbitsv);                             \
                                                                    \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);                    \
    vec_st(dctv, 8*y, dct);                                         \
}

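/* Used when i_qbits < 0: here a full 32-bit product is needed, so each
 * coefficient is duplicated (mergeh/mergel) and multiplied against the high
 * and low 16-bit halves of the 32-bit scale; (even << 16) + odd then rebuilds
 * dct[i] * dequant_mf[i_mf][i] for the value ranges used.  Per coefficient
 * this matches the scalar
 *
 *     dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits);
 *
 * with f = 1 << (-i_qbits-1) providing the rounding term. */
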
void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 4;

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        vec_u16_u qbits_u;
        qbits_u.s[0] = i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        vec_u32_u f_u;
        f_u.s[0] = f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
        vec_u32_u qbits_u;
        qbits_u.s[0] = -i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
        vec_u32_u sixteen_u;
        sixteen_u.s[0] = 16;
        sixteenv = vec_splat(sixteen_u.v, 0);

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHR();
    }
}

void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 6;

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        vec_u16_u qbits_u;
        qbits_u.s[0] = i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        vec_u32_u f_u;
        f_u.s[0] = f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
        vec_u32_u qbits_u;
        qbits_u.s[0] = -i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
        vec_u32_u sixteen_u;
        sixteen_u.s[0] = 16;
        sixteenv = vec_splat(sixteen_u.v, 0);

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHR();
    }
}
#endif // !HIGH_BIT_DEPTH