1 /*****************************************************************************
2 * Copyright (C) 2013-2020 MulticoreWare, Inc
3 *
4 * Authors: Roger Moussalli <rmoussal@us.ibm.com>
5 * Min Chen <min.chen@multicorewareinc.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 *
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
24
25 #include <iostream>
26 #include "common.h"
27 #include "primitives.h"
28 #include "ppccommon.h"
29
30 using namespace X265_NS;
31
// ORIGINAL : for(col=0; col<16; col++) {sum[col] = src[ocol+col + 0 * srcStride] * c[0];}
// Loads 16 unsigned 8-bit pixels from src + src_offset, widens them to 16-bit,
// and multiplies them by the replicated 16-bit coefficient v_coeff, producing
// four vectors of 32-bit products (v_sum_0..v_sum_3).  vec_mule/vec_mulo emit
// even/odd lanes separately, so the products are NOT in pixel order; the
// caller compensates with a final vec_perm before storing.
#define multiply_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector unsigned char v_pixel ; \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
    /* mask that zeroes the high byte of each 16-bit lane after the signed unpack */ \
    const vector signed short v_mask_unsigned_8_to_16 = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
\
    /* load the pixels */ \
    v_pixel = vec_xl(src_offset, src) ; \
\
    /* unpack the 8-bit pixels to 16-bit values (and undo the sign extension) */ \
    v_pixel_16_h = vec_unpackh((vector signed char)v_pixel) ; \
    v_pixel_16_l = vec_unpackl((vector signed char)v_pixel) ; \
    v_pixel_16_h = vec_and(v_pixel_16_h, v_mask_unsigned_8_to_16) ; \
    v_pixel_16_l = vec_and(v_pixel_16_l, v_mask_unsigned_8_to_16) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_sum_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_sum_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_sum_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_sum_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
} // end multiply_pixel_coeff()
54
55
// ORIGINAL : for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
// Same as multiply_pixel_coeff(), except that the four 32-bit product vectors
// are ACCUMULATED into v_sum_0..v_sum_3 instead of overwriting them.
#define multiply_accumulate_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector unsigned char v_pixel ; \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
    vector int v_product_int_0, v_product_int_1, v_product_int_2, v_product_int_3 ; \
    /* mask that zeroes the high byte of each 16-bit lane after the signed unpack */ \
    const vector signed short v_mask_unsigned_8_to_16 = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
\
    /* load the pixels */ \
    v_pixel = vec_xl(src_offset, src) ; \
\
    /* unpack the 8-bit pixels to 16-bit values (and undo the sign extension) */ \
    v_pixel_16_h = vec_unpackh((vector signed char)v_pixel) ; \
    v_pixel_16_l = vec_unpackl((vector signed char)v_pixel) ; \
    v_pixel_16_h = vec_and(v_pixel_16_h, v_mask_unsigned_8_to_16) ; \
    v_pixel_16_l = vec_and(v_pixel_16_l, v_mask_unsigned_8_to_16) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_product_int_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_product_int_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_product_int_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_product_int_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
\
    /* accumulate the results with the sum vectors */ \
    v_sum_0 = vec_add(v_sum_0, v_product_int_0) ; \
    v_sum_1 = vec_add(v_sum_1, v_product_int_1) ; \
    v_sum_2 = vec_add(v_sum_2, v_product_int_2) ; \
    v_sum_3 = vec_add(v_sum_3, v_product_int_3) ; \
} // end multiply_accumulate_pixel_coeff()
86
87
88
89 #if 0
//ORIGINAL
// NOTE(review): this is the original reference implementation; it is compiled
// out by the enclosing "#if 0" in favour of the version in the #else branch.
// Kept for readability / comparison.
// Works with the following values:
// N = 8
// width >= 16 (multiple of 16)
// any height
template<int N, int width, int height>
void interp_vert_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{


    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    const int shift = IF_FILTER_PREC;
    const int offset = 1 << (shift - 1);
    const uint16_t maxVal = (1 << X265_DEPTH) - 1;

    // back the source pointer up so the filter is centred on the output row
    src -= (N / 2 - 1) * srcStride;


    // Vector to hold replicated shift amount
    const vector unsigned int v_shift = {shift, shift, shift, shift} ;

    // Vector to hold replicated offset
    const vector int v_offset = {offset, offset, offset, offset} ;

    // Vector to hold replicated maxVal
    const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;


    // Vector to hold replicated coefficients (one coefficient replicated per vector)
    vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
    vector signed short v_coefficients = vec_xl(0, c) ; // load all coefficients into one vector

    // Replicate the coefficients into respective vectors
    v_coeff_0 = vec_splat(v_coefficients, 0) ;
    v_coeff_1 = vec_splat(v_coefficients, 1) ;
    v_coeff_2 = vec_splat(v_coefficients, 2) ;
    v_coeff_3 = vec_splat(v_coefficients, 3) ;
    v_coeff_4 = vec_splat(v_coefficients, 4) ;
    v_coeff_5 = vec_splat(v_coefficients, 5) ;
    v_coeff_6 = vec_splat(v_coefficients, 6) ;
    v_coeff_7 = vec_splat(v_coefficients, 7) ;



    int row, ocol, col;
    for (row = 0; row < height; row++)
    {
        // 16 output pixels per inner iteration
        for (ocol = 0; ocol < width; ocol+=16)
        {


            // int sum[16] ;
            // int16_t val[16] ;

            // --> for(col=0; col<16; col++) {sum[col] = src[ocol+col + 1 * srcStride] * c[0];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}


            vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
            vector signed short v_val_0, v_val_1 ;


            // 8-tap vertical MAC: one macro invocation per filter tap
            multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol, v_coeff_0) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 1 * srcStride, v_coeff_1) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 2 * srcStride, v_coeff_2) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 3 * srcStride, v_coeff_3) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 4 * srcStride, v_coeff_4) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 5 * srcStride, v_coeff_5) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 6 * srcStride, v_coeff_6) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 7 * srcStride, v_coeff_7) ;


            // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
            // Add offset
            v_sum_0 = vec_add(v_sum_0, v_offset) ;
            v_sum_1 = vec_add(v_sum_1, v_offset) ;
            v_sum_2 = vec_add(v_sum_2, v_offset) ;
            v_sum_3 = vec_add(v_sum_3, v_offset) ;
            // Shift right by "shift"
            v_sum_0 = vec_sra(v_sum_0, v_shift) ;
            v_sum_1 = vec_sra(v_sum_1, v_shift) ;
            v_sum_2 = vec_sra(v_sum_2, v_shift) ;
            v_sum_3 = vec_sra(v_sum_3, v_shift) ;

            // Pack into 16-bit numbers
            v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
            v_val_1 = vec_pack(v_sum_1, v_sum_3) ;


            // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
            vector bool short v_comp_zero_0, v_comp_zero_1 ;
            vector signed short v_max_masked_0, v_max_masked_1 ;
            vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
            // Compute less than 0
            v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
            v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
            // Keep values that are greater or equal to 0
            v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
            v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;


            // --> for(col=0; col<16; col++) {val[col] = (val[col] > maxVal) ? maxVal : val[col];}
            vector bool short v_comp_max_0, v_comp_max_1 ;
            // Compute greater than max
            v_comp_max_0 = vec_cmpgt(v_val_0, v_maxVal) ;
            v_comp_max_1 = vec_cmpgt(v_val_1, v_maxVal) ;
            // Replace values greater than maxVal with maxVal
            v_val_0 = vec_sel(v_val_0, v_maxVal, v_comp_max_0) ;
            v_val_1 = vec_sel(v_val_1, v_maxVal, v_comp_max_1) ;


            // --> for(col=0; col<16; col++) {dst[ocol+col] = (pixel)val[col];}
            // Pack the vals into 8-bit numbers
            // but also re-ordering them - side effect of mule and mulo
            vector unsigned char v_result ;
            vector unsigned char v_perm_index = {0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 0x08 ,0x18, 0x0A, 0x1A, 0x0C, 0x1C, 0x0E, 0x1E} ;
            v_result = (vector unsigned char)vec_perm(v_val_0, v_val_1, v_perm_index) ;
            // Store the results back to dst[]
            vec_xst(v_result, ocol, (unsigned char *)dst) ;
        }

        src += srcStride;
        dst += dstStride;
    }
} // end interp_vert_pp_altivec()
228 #else
229 // Works with the following values:
230 // N = 8
231 // width >= 16 (multiple of 16)
232 // any height
233 template<int N, int width, int height>
interp_vert_pp_altivec(const pixel * __restrict__ src,intptr_t srcStride,pixel * __restrict__ dst,intptr_t dstStride,int coeffIdx)234 void interp_vert_pp_altivec(const pixel* __restrict__ src, intptr_t srcStride, pixel* __restrict__ dst, intptr_t dstStride, int coeffIdx)
235 {
236 const int16_t* __restrict__ c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
237 int shift = IF_FILTER_PREC;
238 int offset = 1 << (shift - 1);
239 uint16_t maxVal = (1 << X265_DEPTH) - 1;
240
241 src -= (N / 2 - 1) * srcStride;
242
243 vector signed short vcoeff0 = vec_splats(c[0]);
244 vector signed short vcoeff1 = vec_splats(c[1]);
245 vector signed short vcoeff2 = vec_splats(c[2]);
246 vector signed short vcoeff3 = vec_splats(c[3]);
247 vector signed short vcoeff4 = vec_splats(c[4]);
248 vector signed short vcoeff5 = vec_splats(c[5]);
249 vector signed short vcoeff6 = vec_splats(c[6]);
250 vector signed short vcoeff7 = vec_splats(c[7]);
251 vector signed short voffset = vec_splats((short)offset);
252 vector signed short vshift = vec_splats((short)shift);
253 vector signed short vmaxVal = vec_splats((short)maxVal);
254 vector signed short vzero_s16 = vec_splats( (signed short)0u);;
255 vector signed int vzero_s32 = vec_splats( (signed int)0u);
256 vector unsigned char vzero_u8 = vec_splats( (unsigned char)0u );
257 vector unsigned char vchar_to_short_maskH = {24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0};
258 vector unsigned char vchar_to_short_maskL = {16, 0, 17, 0 ,18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0};
259
260 vector signed short vsrcH, vsrcL, vsumH, vsumL;
261 vector unsigned char vsrc;
262
263 vector signed short vsrc2H, vsrc2L, vsum2H, vsum2L;
264 vector unsigned char vsrc2;
265
266 const pixel* __restrict__ src2 = src+srcStride;
267 pixel* __restrict__ dst2 = dst+dstStride;
268
269 int row, col;
270 for (row = 0; row < height; row+=2)
271 {
272 for (col = 0; col < width; col+=16)
273 {
274 vsrc = vec_xl(0, (unsigned char*)&src[col + 0*srcStride]);
275 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
276 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
277 vsumH = vsrcH * vcoeff0;
278 vsumL = vsrcL * vcoeff0;
279
280 vsrc = vec_xl(0, (unsigned char*)&src[col + 1*srcStride]);
281 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
282 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
283 vsumH += vsrcH * vcoeff1;
284 vsumL += vsrcL * vcoeff1;
285
286 vsrc = vec_xl(0, (unsigned char*)&src[col + 2*srcStride]);
287 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
288 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
289 vsumH += vsrcH * vcoeff2;
290 vsumL += vsrcL * vcoeff2;
291
292 vsrc = vec_xl(0, (unsigned char*)&src[col + 3*srcStride]);
293 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
294 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
295 vsumH += vsrcH * vcoeff3;
296 vsumL += vsrcL * vcoeff3;
297
298 vsrc = vec_xl(0, (unsigned char*)&src[col + 4*srcStride]);
299 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
300 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
301 vsumH += vsrcH * vcoeff4;
302 vsumL += vsrcL * vcoeff4;
303
304 vsrc = vec_xl(0, (unsigned char*)&src[col + 5*srcStride]);
305 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
306 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
307 vsumH += vsrcH * vcoeff5;
308 vsumL += vsrcL * vcoeff5;
309
310 vsrc = vec_xl(0, (unsigned char*)&src[col + 6*srcStride]);
311 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
312 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
313 vsumH += vsrcH * vcoeff6;
314 vsumL += vsrcL * vcoeff6;
315
316 vsrc = vec_xl(0, (unsigned char*)&src[col + 7*srcStride]);
317 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
318 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
319 vsumH += vsrcH * vcoeff7;
320 vsumL += vsrcL * vcoeff7;
321
322 vector short vvalH = (vsumH + voffset) >> vshift;
323 vvalH = vec_max( vvalH, vzero_s16 );
324 vvalH = vec_min( vvalH, vmaxVal );
325
326 vector short vvalL = (vsumL + voffset) >> vshift;
327 vvalL = vec_max( vvalL, vzero_s16 );
328 vvalL = vec_min( vvalL, vmaxVal );
329
330 vector signed char vdst = vec_pack( vvalL, vvalH );
331 vec_xst( vdst, 0, (signed char*)&dst[col] );
332
333 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 0*srcStride]);
334 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
335 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
336 vsum2H = vsrc2H * vcoeff0;
337 vsum2L = vsrc2L * vcoeff0;
338
339 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 1*srcStride]);
340 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
341 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
342 vsum2H += vsrc2H * vcoeff1;
343 vsum2L += vsrc2L * vcoeff1;
344
345 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 2*srcStride]);
346 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
347 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
348 vsum2H += vsrc2H * vcoeff2;
349 vsum2L += vsrc2L * vcoeff2;
350
351 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 3*srcStride]);
352 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
353 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
354 vsum2H += vsrc2H * vcoeff3;
355 vsum2L += vsrc2L * vcoeff3;
356
357 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 4*srcStride]);
358 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
359 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
360 vsum2H += vsrc2H * vcoeff4;
361 vsum2L += vsrc2L * vcoeff4;
362
363 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 5*srcStride]);
364 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
365 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
366 vsum2H += vsrc2H * vcoeff5;
367 vsum2L += vsrc2L * vcoeff5;
368
369 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 6*srcStride]);
370 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
371 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
372 vsum2H += vsrc2H * vcoeff6;
373 vsum2L += vsrc2L * vcoeff6;
374
375 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 7*srcStride]);
376 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
377 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
378 vsum2H += vsrc2H * vcoeff7;
379 vsum2L += vsrc2L * vcoeff7;
380
381 vector short vval2H = (vsum2H + voffset) >> vshift;
382 vval2H = vec_max( vval2H, vzero_s16 );
383 vval2H = vec_min( vval2H, vmaxVal );
384
385 vector short vval2L = (vsum2L + voffset) >> vshift;
386 vval2L = vec_max( vval2L, vzero_s16 );
387 vval2L = vec_min( vval2L, vmaxVal );
388
389 vector signed char vdst2 = vec_pack( vval2L, vval2H );
390 vec_xst( vdst2, 0, (signed char*)&dst2[col] );
391 }
392
393 src += 2*srcStride;
394 dst += 2*dstStride;
395 src2 += 2*srcStride;
396 dst2 += 2*dstStride;
397 }
398 }
399 #endif
400
401
// ORIGINAL : for(col=0; col<16; col++) {sum[col] = src[ocol+col + 0 * srcStride] * c[0];}
// 16-bit-input ("sp") variant of multiply_pixel_coeff(): loads 16 int16_t
// samples (two vectors; src_offset is a BYTE offset) and multiplies them by
// the replicated coefficient, producing four vectors of 32-bit products in
// the interleaved even/odd order produced by vec_mule/vec_mulo.
#define multiply_sp_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const int16_t * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
\
    /* load the pixels (16 shorts = 32 bytes = two vector loads) */ \
    v_pixel_16_h = vec_xl(src_offset, src) ; \
    v_pixel_16_l = vec_xl(src_offset + 16, src) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_sum_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_sum_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_sum_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_sum_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
\
} // end multiply_sp_pixel_coeff()
418
419
// ORIGINAL : for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
// Same as multiply_sp_pixel_coeff(), except that the four 32-bit product
// vectors are ACCUMULATED into v_sum_0..v_sum_3 instead of overwriting them.
#define multiply_accumulate_sp_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
    vector int v_product_int_0, v_product_int_1, v_product_int_2, v_product_int_3 ; \
\
    /* load the pixels (16 shorts; src_offset is a byte offset) */ \
\
    v_pixel_16_h = vec_xl(src_offset, src) ; \
    v_pixel_16_l = vec_xl(src_offset + 16, src) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_product_int_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_product_int_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_product_int_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_product_int_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
\
    /* accumulate the results with the sum vectors */ \
    v_sum_0 = vec_add(v_sum_0, v_product_int_0) ; \
    v_sum_1 = vec_add(v_sum_1, v_product_int_1) ; \
    v_sum_2 = vec_add(v_sum_2, v_product_int_2) ; \
    v_sum_3 = vec_add(v_sum_3, v_product_int_3) ; \
\
} // end multiply_accumulate_sp_pixel_coeff()
445
446
447 // Works with the following values:
448 // N = 8
449 // width >= 16 (multiple of 16)
450 // any height
451 template <int N, int width, int height>
filterVertical_sp_altivec(const int16_t * src,intptr_t srcStride,pixel * dst,intptr_t dstStride,int coeffIdx)452 void filterVertical_sp_altivec(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
453 {
454 int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
455 unsigned int shift = IF_FILTER_PREC + headRoom;
456 int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
457 const uint16_t maxVal = (1 << X265_DEPTH) - 1;
458 const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
459
460 src -= (N / 2 - 1) * srcStride;
461
462
463 // Vector to hold replicated shift amount
464 const vector unsigned int v_shift = {shift, shift, shift, shift} ;
465
466 // Vector to hold replicated offset
467 const vector int v_offset = {offset, offset, offset, offset} ;
468
469 // Vector to hold replicated maxVal
470 const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
471
472
473 // Vector to hold replicated coefficients (one coefficient replicated per vector)
474 vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
475 vector signed short v_coefficients = vec_xl(0, coeff) ; // load all coefficients into one vector
476
477 // Replicate the coefficients into respective vectors
478 v_coeff_0 = vec_splat(v_coefficients, 0) ;
479 v_coeff_1 = vec_splat(v_coefficients, 1) ;
480 v_coeff_2 = vec_splat(v_coefficients, 2) ;
481 v_coeff_3 = vec_splat(v_coefficients, 3) ;
482 v_coeff_4 = vec_splat(v_coefficients, 4) ;
483 v_coeff_5 = vec_splat(v_coefficients, 5) ;
484 v_coeff_6 = vec_splat(v_coefficients, 6) ;
485 v_coeff_7 = vec_splat(v_coefficients, 7) ;
486
487
488
489 int row, ocol, col;
490 for (row = 0; row < height; row++)
491 {
492 for (ocol = 0; ocol < width; ocol+= 16 )
493 {
494
495 // int sum[16] ;
496 // int16_t val[16] ;
497
498 // --> for(col=0; col<16; col++) {sum[col] = src[ocol+col + 1 * srcStride] * c[0];}
499 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
500 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
501 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
502 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
503 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
504 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
505 // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}
506
507
508 vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
509 vector signed short v_val_0, v_val_1 ;
510
511
512 // Added a factor of 2 to the offset since this is a BYTE offset, and each input pixel is of size 2Bytes
513 multiply_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol * 2, v_coeff_0) ;
514 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 1 * srcStride) * 2, v_coeff_1) ;
515 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 2 * srcStride) * 2, v_coeff_2) ;
516 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 3 * srcStride) * 2, v_coeff_3) ;
517 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 4 * srcStride) * 2, v_coeff_4) ;
518 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 5 * srcStride) * 2, v_coeff_5) ;
519 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 6 * srcStride) * 2, v_coeff_6) ;
520 multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 7 * srcStride) * 2, v_coeff_7) ;
521
522
523
524
525
526 // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
527 // Add offset
528 v_sum_0 = vec_add(v_sum_0, v_offset) ;
529 v_sum_1 = vec_add(v_sum_1, v_offset) ;
530 v_sum_2 = vec_add(v_sum_2, v_offset) ;
531 v_sum_3 = vec_add(v_sum_3, v_offset) ;
532 // Shift right by "shift"
533 v_sum_0 = vec_sra(v_sum_0, v_shift) ;
534 v_sum_1 = vec_sra(v_sum_1, v_shift) ;
535 v_sum_2 = vec_sra(v_sum_2, v_shift) ;
536 v_sum_3 = vec_sra(v_sum_3, v_shift) ;
537
538 // Pack into 16-bit numbers
539 v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
540 v_val_1 = vec_pack(v_sum_1, v_sum_3) ;
541
542
543
544 // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
545 vector bool short v_comp_zero_0, v_comp_zero_1 ;
546 vector signed short v_max_masked_0, v_max_masked_1 ;
547 vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
548 // Compute less than 0
549 v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
550 v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
551 // Keep values that are greater or equal to 0
552 v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
553 v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;
554
555
556
557 // --> for(col=0; col<16; col++) {val[col] = (val[col] > maxVal) ? maxVal : val[col];}
558 vector bool short v_comp_max_0, v_comp_max_1 ;
559 // Compute greater than max
560 v_comp_max_0 = vec_cmpgt(v_val_0, v_maxVal) ;
561 v_comp_max_1 = vec_cmpgt(v_val_1, v_maxVal) ;
562 // Replace values greater than maxVal with maxVal
563 v_val_0 = vec_sel(v_val_0, v_maxVal, v_comp_max_0) ;
564 v_val_1 = vec_sel(v_val_1, v_maxVal, v_comp_max_1) ;
565
566
567
568 // --> for(col=0; col<16; col++) {dst[ocol+col] = (pixel)val[col];}
569 // Pack the vals into 8-bit numbers
570 // but also re-ordering them - side effect of mule and mulo
571 vector unsigned char v_result ;
572 vector unsigned char v_perm_index = {0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 0x08 ,0x18, 0x0A, 0x1A, 0x0C, 0x1C, 0x0E, 0x1E} ;
573 v_result = (vector unsigned char)vec_perm(v_val_0, v_val_1, v_perm_index) ;
574 // Store the results back to dst[]
575 vec_xst(v_result, ocol, (unsigned char *)dst) ;
576 }
577
578 src += srcStride;
579 dst += dstStride;
580 }
581 } // end filterVertical_sp_altivec()
582
583
584
585
586
587 // Works with the following values:
588 // N = 8
589 // width >= 32 (multiple of 32)
590 // any height
591 template <int N, int width, int height>
interp_horiz_ps_altivec(const pixel * src,intptr_t srcStride,int16_t * dst,intptr_t dstStride,int coeffIdx,int isRowExt)592 void interp_horiz_ps_altivec(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
593 {
594
595 const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
596 int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
597 unsigned int shift = IF_FILTER_PREC - headRoom;
598 int offset = -IF_INTERNAL_OFFS << shift;
599 int blkheight = height;
600
601 src -= N / 2 - 1;
602
603 if (isRowExt)
604 {
605 src -= (N / 2 - 1) * srcStride;
606 blkheight += N - 1;
607 }
608
609
610 vector signed short v_coeff ;
611 v_coeff = vec_xl(0, coeff) ;
612
613
614 vector unsigned char v_pixel_char_0, v_pixel_char_1, v_pixel_char_2 ;
615 vector signed short v_pixel_short_0, v_pixel_short_1, v_pixel_short_2, v_pixel_short_3, v_pixel_short_4 ;
616 const vector signed short v_mask_unisgned_char_to_short = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
617 const vector signed int v_zeros_int = {0, 0, 0, 0} ;
618 const vector signed short v_zeros_short = {0, 0, 0, 0, 0, 0, 0, 0} ;
619
620 vector signed int v_product_0_0, v_product_0_1 ;
621 vector signed int v_product_1_0, v_product_1_1 ;
622 vector signed int v_product_2_0, v_product_2_1 ;
623 vector signed int v_product_3_0, v_product_3_1 ;
624
625 vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
626
627 vector signed int v_sums_temp_col0, v_sums_temp_col1, v_sums_temp_col2, v_sums_temp_col3 ;
628 vector signed int v_sums_col0_0, v_sums_col0_1 ;
629 vector signed int v_sums_col1_0, v_sums_col1_1 ;
630 vector signed int v_sums_col2_0, v_sums_col2_1 ;
631 vector signed int v_sums_col3_0, v_sums_col3_1 ;
632
633
634 const vector signed int v_offset = {offset, offset, offset, offset};
635 const vector unsigned int v_shift = {shift, shift, shift, shift} ;
636
637
638 vector unsigned char v_sums_shamt = {0x20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
639
640
641
642 pixel *next_src ;
643 int16_t *next_dst ;
644
645 int row, col;
646 for (row = 0; row < blkheight; row++)
647 {
648 next_src = (pixel *)src + srcStride ;
649 next_dst = (int16_t *)dst + dstStride ;
650
651 for(int col_iter=0; col_iter<width; col_iter+=32)
652 {
653 // Load a full row of pixels (32 + 7)
654 v_pixel_char_0 = vec_xl(0, src) ;
655 v_pixel_char_1 = vec_xl(16, src) ;
656 v_pixel_char_2 = vec_xl(32, src) ;
657
658
659 v_sums_temp_col0 = v_zeros_int ;
660 v_sums_temp_col1 = v_zeros_int ;
661 v_sums_temp_col2 = v_zeros_int ;
662 v_sums_temp_col3 = v_zeros_int ;
663
664
665 // Expand the loaded pixels into shorts
666 v_pixel_short_0 = vec_unpackh((vector signed char)v_pixel_char_0) ;
667 v_pixel_short_1 = vec_unpackl((vector signed char)v_pixel_char_0) ;
668 v_pixel_short_2 = vec_unpackh((vector signed char)v_pixel_char_1) ;
669 v_pixel_short_3 = vec_unpackl((vector signed char)v_pixel_char_1) ;
670 v_pixel_short_4 = vec_unpackh((vector signed char)v_pixel_char_2) ;
671
672 v_pixel_short_0 = vec_and(v_pixel_short_0, v_mask_unisgned_char_to_short) ;
673 v_pixel_short_1 = vec_and(v_pixel_short_1, v_mask_unisgned_char_to_short) ;
674 v_pixel_short_2 = vec_and(v_pixel_short_2, v_mask_unisgned_char_to_short) ;
675 v_pixel_short_3 = vec_and(v_pixel_short_3, v_mask_unisgned_char_to_short) ;
676 v_pixel_short_4 = vec_and(v_pixel_short_4, v_mask_unisgned_char_to_short) ;
677
678
679
680 // Four colum sets are processed below
681 // One colum per set per iteration
682 for(col=0; col < 8; col++)
683 {
684
685 // Multiply the pixels by the coefficients
686 v_product_0_0 = vec_mule(v_pixel_short_0, v_coeff) ;
687 v_product_0_1 = vec_mulo(v_pixel_short_0, v_coeff) ;
688
689 v_product_1_0 = vec_mule(v_pixel_short_1, v_coeff) ;
690 v_product_1_1 = vec_mulo(v_pixel_short_1, v_coeff) ;
691
692 v_product_2_0 = vec_mule(v_pixel_short_2, v_coeff) ;
693 v_product_2_1 = vec_mulo(v_pixel_short_2, v_coeff) ;
694
695 v_product_3_0 = vec_mule(v_pixel_short_3, v_coeff) ;
696 v_product_3_1 = vec_mulo(v_pixel_short_3, v_coeff) ;
697
698
699 // Sum up the multiplication results
700 v_sum_0 = vec_add(v_product_0_0, v_product_0_1) ;
701 v_sum_0 = vec_sums(v_sum_0, v_zeros_int) ;
702
703 v_sum_1 = vec_add(v_product_1_0, v_product_1_1) ;
704 v_sum_1 = vec_sums(v_sum_1, v_zeros_int) ;
705
706 v_sum_2 = vec_add(v_product_2_0, v_product_2_1) ;
707 v_sum_2 = vec_sums(v_sum_2, v_zeros_int) ;
708
709 v_sum_3 = vec_add(v_product_3_0, v_product_3_1) ;
710 v_sum_3 = vec_sums(v_sum_3, v_zeros_int) ;
711
712
713 // Insert the sum results into respective vectors
714 v_sums_temp_col0 = vec_sro(v_sums_temp_col0, v_sums_shamt) ;
715 v_sums_temp_col0 = vec_or(v_sum_0, v_sums_temp_col0) ;
716
717 v_sums_temp_col1 = vec_sro(v_sums_temp_col1, v_sums_shamt) ;
718 v_sums_temp_col1 = vec_or(v_sum_1, v_sums_temp_col1) ;
719
720 v_sums_temp_col2 = vec_sro(v_sums_temp_col2, v_sums_shamt) ;
721 v_sums_temp_col2 = vec_or(v_sum_2, v_sums_temp_col2) ;
722
723 v_sums_temp_col3 = vec_sro(v_sums_temp_col3, v_sums_shamt) ;
724 v_sums_temp_col3 = vec_or(v_sum_3, v_sums_temp_col3) ;
725
726
727 if(col == 3)
728 {
729 v_sums_col0_0 = v_sums_temp_col0 ;
730 v_sums_col1_0 = v_sums_temp_col1 ;
731 v_sums_col2_0 = v_sums_temp_col2 ;
732 v_sums_col3_0 = v_sums_temp_col3 ;
733
734 v_sums_temp_col0 = v_zeros_int ;
735 v_sums_temp_col1 = v_zeros_int ;
736 v_sums_temp_col2 = v_zeros_int ;
737 v_sums_temp_col3 = v_zeros_int ;
738 }
739
740
741 // Shift the pixels by 1 (short pixel)
742 v_pixel_short_0 = vec_sld(v_pixel_short_1, v_pixel_short_0, 14) ;
743 v_pixel_short_1 = vec_sld(v_pixel_short_2, v_pixel_short_1, 14) ;
744 v_pixel_short_2 = vec_sld(v_pixel_short_3, v_pixel_short_2, 14) ;
745 v_pixel_short_3 = vec_sld(v_pixel_short_4, v_pixel_short_3, 14) ;
746 const vector unsigned char v_shift_right_two_bytes_shamt = {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
747 v_pixel_short_4 = vec_sro(v_pixel_short_4, v_shift_right_two_bytes_shamt) ;
748 }
749
750 // Copy the sums result to the second vector (per colum)
751 v_sums_col0_1 = v_sums_temp_col0 ;
752 v_sums_col1_1 = v_sums_temp_col1 ;
753 v_sums_col2_1 = v_sums_temp_col2 ;
754 v_sums_col3_1 = v_sums_temp_col3 ;
755
756
757
758 // Post processing and eventually 2 stores
759 // Original code:
760 // int16_t val = (int16_t)((sum + offset) >> shift);
761 // dst[col] = val;
762
763
764 v_sums_col0_0 = vec_sra(vec_add(v_sums_col0_0, v_offset), v_shift) ;
765 v_sums_col0_1 = vec_sra(vec_add(v_sums_col0_1, v_offset), v_shift) ;
766 v_sums_col1_0 = vec_sra(vec_add(v_sums_col1_0, v_offset), v_shift) ;
767 v_sums_col1_1 = vec_sra(vec_add(v_sums_col1_1, v_offset), v_shift) ;
768 v_sums_col2_0 = vec_sra(vec_add(v_sums_col2_0, v_offset), v_shift) ;
769 v_sums_col2_1 = vec_sra(vec_add(v_sums_col2_1, v_offset), v_shift) ;
770 v_sums_col3_0 = vec_sra(vec_add(v_sums_col3_0, v_offset), v_shift) ;
771 v_sums_col3_1 = vec_sra(vec_add(v_sums_col3_1, v_offset), v_shift) ;
772
773
774 vector signed short v_val_col0, v_val_col1, v_val_col2, v_val_col3 ;
775 v_val_col0 = vec_pack(v_sums_col0_0, v_sums_col0_1) ;
776 v_val_col1 = vec_pack(v_sums_col1_0, v_sums_col1_1) ;
777 v_val_col2 = vec_pack(v_sums_col2_0, v_sums_col2_1) ;
778 v_val_col3 = vec_pack(v_sums_col3_0, v_sums_col3_1) ;
779
780
781
782 // Store results
783 vec_xst(v_val_col0, 0, dst) ;
784 vec_xst(v_val_col1, 16, dst) ;
785 vec_xst(v_val_col2, 32, dst) ;
786 vec_xst(v_val_col3, 48, dst) ;
787
788 src += 32 ;
789 dst += 32 ;
790
791 } // end for col_iter
792
793 src = next_src ;
794 dst = next_dst ;
795 }
796 } // interp_horiz_ps_altivec ()
797
798
799
800 // Works with the following values:
801 // N = 8
802 // width >= 32 (multiple of 32)
803 // any height
804 template <int N, int width, int height>
interp_hv_pp_altivec(const pixel * src,intptr_t srcStride,pixel * dst,intptr_t dstStride,int idxX,int idxY)805 void interp_hv_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
806 {
807
808 short immedVals[(64 + 8) * (64 + 8)];
809
810 interp_horiz_ps_altivec<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
811
812 //!!filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
813 filterVertical_sp_altivec<N,width,height>(immedVals + 3 * width, width, dst, dstStride, idxY);
814 }
815
816 //ORIGINAL
817 #if 0
818 // Works with the following values:
819 // N = 8
820 // width >= 32 (multiple of 32)
821 // any height
// NOTE(review): this is the original (disabled, see the surrounding #if 0)
// reference vectorization of the horizontal pp filter, kept for comparison
// with the active implementation in the #else branch.  It computes one
// output pixel per inner-loop iteration via mule/mulo + vec_sums, which is
// why it was superseded by the wider splat-multiply version.
template <int N, int width, int height>
void interp_horiz_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{

    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    int headRoom = IF_FILTER_PREC;
    int offset = (1 << (headRoom - 1));
    uint16_t maxVal = (1 << X265_DEPTH) - 1;
    int cStride = 1;

    // Center the N-tap window: step back (N/2 - 1) input columns
    src -= (N / 2 - 1) * cStride;


    // All 8 filter taps in one halfword vector (used whole by mule/mulo below)
    vector signed short v_coeff ;
    v_coeff = vec_xl(0, coeff) ;


    vector unsigned char v_pixel_char_0, v_pixel_char_1, v_pixel_char_2 ;
    vector signed short v_pixel_short_0, v_pixel_short_1, v_pixel_short_2, v_pixel_short_3, v_pixel_short_4 ;
    // NOTE(review): the stray trailing backslash on the next line splices it
    // with the following declaration; harmless only because the result is
    // still two valid statements on one logical line -- remove if re-enabled.
    const vector signed short v_mask_unisgned_char_to_short = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
    const vector signed int v_zeros_int = {0, 0, 0, 0} ;
    const vector signed short v_zeros_short = {0, 0, 0, 0, 0, 0, 0, 0} ;

    vector signed int v_product_0_0, v_product_0_1 ;
    vector signed int v_product_1_0, v_product_1_1 ;
    vector signed int v_product_2_0, v_product_2_1 ;
    vector signed int v_product_3_0, v_product_3_1 ;

    vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;

    vector signed int v_sums_temp_col0, v_sums_temp_col1, v_sums_temp_col2, v_sums_temp_col3 ;
    vector signed int v_sums_col0_0, v_sums_col0_1 ;
    vector signed int v_sums_col1_0, v_sums_col1_1 ;
    vector signed int v_sums_col2_0, v_sums_col2_1 ;
    vector signed int v_sums_col3_0, v_sums_col3_1 ;


    const vector signed int v_offset = {offset, offset, offset, offset};
    const vector unsigned int v_headRoom = {headRoom, headRoom, headRoom, headRoom} ;


    // Shift amount (in bits, element 0 of the vector) used by vec_sro to make
    // room for the next 32-bit sum inserted into a v_sums_temp_col* vector
    vector unsigned char v_sums_shamt = {0x20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;


    pixel *next_src ;
    pixel *next_dst ;

    int row, col;
    for (row = 0; row < height; row++)
    {
        next_src = (pixel *)src + srcStride ;
        next_dst = (pixel *)dst + dstStride ;

        for(int col_iter=0; col_iter<width; col_iter+=32)
        {

            // Load a full row of pixels (32 + 7)
            v_pixel_char_0 = vec_xl(0, src) ;
            v_pixel_char_1 = vec_xl(16, src) ;
            v_pixel_char_2 = vec_xl(32, src) ;


            v_sums_temp_col0 = v_zeros_int ;
            v_sums_temp_col1 = v_zeros_int ;
            v_sums_temp_col2 = v_zeros_int ;
            v_sums_temp_col3 = v_zeros_int ;


            // Expand the loaded pixels into shorts
            v_pixel_short_0 = vec_unpackh((vector signed char)v_pixel_char_0) ;
            v_pixel_short_1 = vec_unpackl((vector signed char)v_pixel_char_0) ;
            v_pixel_short_2 = vec_unpackh((vector signed char)v_pixel_char_1) ;
            v_pixel_short_3 = vec_unpackl((vector signed char)v_pixel_char_1) ;
            v_pixel_short_4 = vec_unpackh((vector signed char)v_pixel_char_2) ;

            // Mask off the sign-extension bits so the 8-bit pixels read as unsigned
            v_pixel_short_0 = vec_and(v_pixel_short_0, v_mask_unisgned_char_to_short) ;
            v_pixel_short_1 = vec_and(v_pixel_short_1, v_mask_unisgned_char_to_short) ;
            v_pixel_short_2 = vec_and(v_pixel_short_2, v_mask_unisgned_char_to_short) ;
            v_pixel_short_3 = vec_and(v_pixel_short_3, v_mask_unisgned_char_to_short) ;
            v_pixel_short_4 = vec_and(v_pixel_short_4, v_mask_unisgned_char_to_short) ;



            // Four colum sets are processed below
            // One colum per set per iteration
            for(col=0; col < 8; col++)
            {

                // Multiply the pixels by the coefficients
                // (even/odd lanes separately; their sum covers all 8 taps)
                v_product_0_0 = vec_mule(v_pixel_short_0, v_coeff) ;
                v_product_0_1 = vec_mulo(v_pixel_short_0, v_coeff) ;

                v_product_1_0 = vec_mule(v_pixel_short_1, v_coeff) ;
                v_product_1_1 = vec_mulo(v_pixel_short_1, v_coeff) ;

                v_product_2_0 = vec_mule(v_pixel_short_2, v_coeff) ;
                v_product_2_1 = vec_mulo(v_pixel_short_2, v_coeff) ;

                v_product_3_0 = vec_mule(v_pixel_short_3, v_coeff) ;
                v_product_3_1 = vec_mulo(v_pixel_short_3, v_coeff) ;


                // Sum up the multiplication results
                // (vec_sums leaves the across-vector total in the last element)
                v_sum_0 = vec_add(v_product_0_0, v_product_0_1) ;
                v_sum_0 = vec_sums(v_sum_0, v_zeros_int) ;

                v_sum_1 = vec_add(v_product_1_0, v_product_1_1) ;
                v_sum_1 = vec_sums(v_sum_1, v_zeros_int) ;

                v_sum_2 = vec_add(v_product_2_0, v_product_2_1) ;
                v_sum_2 = vec_sums(v_sum_2, v_zeros_int) ;

                v_sum_3 = vec_add(v_product_3_0, v_product_3_1) ;
                v_sum_3 = vec_sums(v_sum_3, v_zeros_int) ;


                // Insert the sum results into respective vectors
                v_sums_temp_col0 = vec_sro(v_sums_temp_col0, v_sums_shamt) ;
                v_sums_temp_col0 = vec_or(v_sum_0, v_sums_temp_col0) ;

                v_sums_temp_col1 = vec_sro(v_sums_temp_col1, v_sums_shamt) ;
                v_sums_temp_col1 = vec_or(v_sum_1, v_sums_temp_col1) ;

                v_sums_temp_col2 = vec_sro(v_sums_temp_col2, v_sums_shamt) ;
                v_sums_temp_col2 = vec_or(v_sum_2, v_sums_temp_col2) ;

                v_sums_temp_col3 = vec_sro(v_sums_temp_col3, v_sums_shamt) ;
                v_sums_temp_col3 = vec_or(v_sum_3, v_sums_temp_col3) ;


                // After 4 iterations the temp vectors are full; bank them and restart
                if(col == 3)
                {
                    v_sums_col0_0 = v_sums_temp_col0 ;
                    v_sums_col1_0 = v_sums_temp_col1 ;
                    v_sums_col2_0 = v_sums_temp_col2 ;
                    v_sums_col3_0 = v_sums_temp_col3 ;

                    v_sums_temp_col0 = v_zeros_int ;
                    v_sums_temp_col1 = v_zeros_int ;
                    v_sums_temp_col2 = v_zeros_int ;
                    v_sums_temp_col3 = v_zeros_int ;
                }


                // Shift the pixels by 1 (short pixel)
                // (slide the 39-pixel window one column to the right)
                v_pixel_short_0 = vec_sld(v_pixel_short_1, v_pixel_short_0, 14) ;
                v_pixel_short_1 = vec_sld(v_pixel_short_2, v_pixel_short_1, 14) ;
                v_pixel_short_2 = vec_sld(v_pixel_short_3, v_pixel_short_2, 14) ;
                v_pixel_short_3 = vec_sld(v_pixel_short_4, v_pixel_short_3, 14) ;
                const vector unsigned char v_shift_right_two_bytes_shamt = {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
                v_pixel_short_4 = vec_sro(v_pixel_short_4, v_shift_right_two_bytes_shamt) ;
            }

            // Copy the sums result to the second vector (per colum)
            v_sums_col0_1 = v_sums_temp_col0 ;
            v_sums_col1_1 = v_sums_temp_col1 ;
            v_sums_col2_1 = v_sums_temp_col2 ;
            v_sums_col3_1 = v_sums_temp_col3 ;



            // Post processing and eventually 2 stores
            // Original code:
            // int16_t val = (int16_t)((sum + offset) >> headRoom);
            // if (val < 0) val = 0;
            // if (val > maxVal) val = maxVal;
            // dst[col] = (pixel)val;


            v_sums_col0_0 = vec_sra(vec_add(v_sums_col0_0, v_offset), v_headRoom) ;
            v_sums_col0_1 = vec_sra(vec_add(v_sums_col0_1, v_offset), v_headRoom) ;
            v_sums_col1_0 = vec_sra(vec_add(v_sums_col1_0, v_offset), v_headRoom) ;
            v_sums_col1_1 = vec_sra(vec_add(v_sums_col1_1, v_offset), v_headRoom) ;
            v_sums_col2_0 = vec_sra(vec_add(v_sums_col2_0, v_offset), v_headRoom) ;
            v_sums_col2_1 = vec_sra(vec_add(v_sums_col2_1, v_offset), v_headRoom) ;
            v_sums_col3_0 = vec_sra(vec_add(v_sums_col3_0, v_offset), v_headRoom) ;
            v_sums_col3_1 = vec_sra(vec_add(v_sums_col3_1, v_offset), v_headRoom) ;


            vector signed short v_val_col0, v_val_col1, v_val_col2, v_val_col3 ;
            v_val_col0 = vec_pack(v_sums_col0_0, v_sums_col0_1) ;
            v_val_col1 = vec_pack(v_sums_col1_0, v_sums_col1_1) ;
            v_val_col2 = vec_pack(v_sums_col2_0, v_sums_col2_1) ;
            v_val_col3 = vec_pack(v_sums_col3_0, v_sums_col3_1) ;


            // if (val < 0) val = 0;
            vector bool short v_comp_zero_col0, v_comp_zero_col1, v_comp_zero_col2, v_comp_zero_col3 ;
            // Compute less than 0
            v_comp_zero_col0 = vec_cmplt(v_val_col0, v_zeros_short) ;
            v_comp_zero_col1 = vec_cmplt(v_val_col1, v_zeros_short) ;
            v_comp_zero_col2 = vec_cmplt(v_val_col2, v_zeros_short) ;
            v_comp_zero_col3 = vec_cmplt(v_val_col3, v_zeros_short) ;
            // Keep values that are greater or equal to 0
            v_val_col0 = vec_andc(v_val_col0, v_comp_zero_col0) ;
            v_val_col1 = vec_andc(v_val_col1, v_comp_zero_col1) ;
            v_val_col2 = vec_andc(v_val_col2, v_comp_zero_col2) ;
            v_val_col3 = vec_andc(v_val_col3, v_comp_zero_col3) ;


            // if (val > maxVal) val = maxVal;
            vector bool short v_comp_max_col0, v_comp_max_col1, v_comp_max_col2, v_comp_max_col3 ;
            const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
            // Compute greater than max
            v_comp_max_col0 = vec_cmpgt(v_val_col0, v_maxVal) ;
            v_comp_max_col1 = vec_cmpgt(v_val_col1, v_maxVal) ;
            v_comp_max_col2 = vec_cmpgt(v_val_col2, v_maxVal) ;
            v_comp_max_col3 = vec_cmpgt(v_val_col3, v_maxVal) ;
            // Replace values greater than maxVal with maxVal
            v_val_col0 = vec_sel(v_val_col0, v_maxVal, v_comp_max_col0) ;
            v_val_col1 = vec_sel(v_val_col1, v_maxVal, v_comp_max_col1) ;
            v_val_col2 = vec_sel(v_val_col2, v_maxVal, v_comp_max_col2) ;
            v_val_col3 = vec_sel(v_val_col3, v_maxVal, v_comp_max_col3) ;

            // (pixel)val
            vector unsigned char v_final_result_0, v_final_result_1 ;
            v_final_result_0 = vec_pack((vector unsigned short)v_val_col0, (vector unsigned short)v_val_col1) ;
            v_final_result_1 = vec_pack((vector unsigned short)v_val_col2, (vector unsigned short)v_val_col3) ;



            // Store results
            vec_xst(v_final_result_0, 0, dst) ;
            vec_xst(v_final_result_1, 16, dst) ;


            src += 32 ;
            dst += 32 ;

        } // end for col_iter


        src = next_src ;
        dst = next_dst ;
    }
} // interp_horiz_pp_altivec()
1058 #else
1059 template<int N, int width, int height>
interp_horiz_pp_altivec(const pixel * __restrict__ src,intptr_t srcStride,pixel * __restrict__ dst,intptr_t dstStride,int coeffIdx)1060 void interp_horiz_pp_altivec(const pixel* __restrict__ src, intptr_t srcStride, pixel* __restrict__ dst, intptr_t dstStride, int coeffIdx)
1061 {
1062 const int16_t* __restrict__ coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1063 int headRoom = IF_FILTER_PREC;
1064 int offset = (1 << (headRoom - 1));
1065 uint16_t maxVal = (1 << X265_DEPTH) - 1;
1066 int cStride = 1;
1067
1068 src -= (N / 2 - 1) * cStride;
1069
1070 vector signed short vcoeff0 = vec_splats(coeff[0]);
1071 vector signed short vcoeff1 = vec_splats(coeff[1]);
1072 vector signed short vcoeff2 = vec_splats(coeff[2]);
1073 vector signed short vcoeff3 = vec_splats(coeff[3]);
1074 vector signed short vcoeff4 = vec_splats(coeff[4]);
1075 vector signed short vcoeff5 = vec_splats(coeff[5]);
1076 vector signed short vcoeff6 = vec_splats(coeff[6]);
1077 vector signed short vcoeff7 = vec_splats(coeff[7]);
1078 vector signed short voffset = vec_splats((short)offset);
1079 vector signed short vheadRoom = vec_splats((short)headRoom);
1080 vector signed short vmaxVal = vec_splats((short)maxVal);
1081 vector signed short vzero_s16 = vec_splats( (signed short)0u);;
1082 vector signed int vzero_s32 = vec_splats( (signed int)0u);
1083 vector unsigned char vzero_u8 = vec_splats( (unsigned char)0u );
1084
1085 vector signed short vsrcH, vsrcL, vsumH, vsumL;
1086 vector unsigned char vsrc;
1087
1088 vector signed short vsrc2H, vsrc2L, vsum2H, vsum2L;
1089 vector unsigned char vsrc2;
1090
1091 vector unsigned char vchar_to_short_maskH = {24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0};
1092 vector unsigned char vchar_to_short_maskL = {16, 0, 17, 0 ,18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0};
1093
1094 const pixel* __restrict__ src2 = src+srcStride;
1095 pixel* __restrict__ dst2 = dst+dstStride;
1096
1097 int row, col;
1098 for (row = 0; row < height; row+=2)
1099 {
1100 for (col = 0; col < width; col+=16)
1101 {
1102 vsrc = vec_xl(0, (unsigned char*)&src[col + 0*cStride]);
1103 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1104 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1105
1106 vsumH = vsrcH * vcoeff0;
1107 vsumL = vsrcL * vcoeff0;
1108
1109 vsrc = vec_xl(0, (unsigned char*)&src[col + 1*cStride]);
1110 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1111 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1112 vsumH += vsrcH * vcoeff1;
1113 vsumL += vsrcL * vcoeff1;
1114
1115 vsrc = vec_xl(0, (unsigned char*)&src[col + 2*cStride]);
1116 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1117 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1118 vsumH += vsrcH * vcoeff2;
1119 vsumL += vsrcL * vcoeff2;
1120
1121 vsrc = vec_xl(0, (unsigned char*)&src[col + 3*cStride]);
1122 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1123 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1124 vsumH += vsrcH * vcoeff3;
1125 vsumL += vsrcL * vcoeff3;
1126
1127 vsrc = vec_xl(0, (unsigned char*)&src[col + 4*cStride]);
1128 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1129 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1130 vsumH += vsrcH * vcoeff4;
1131 vsumL += vsrcL * vcoeff4;
1132
1133 vsrc = vec_xl(0, (unsigned char*)&src[col + 5*cStride]);
1134 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1135 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1136 vsumH += vsrcH * vcoeff5;
1137 vsumL += vsrcL * vcoeff5;
1138
1139 vsrc = vec_xl(0, (unsigned char*)&src[col + 6*cStride]);
1140 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1141 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1142 vsumH += vsrcH * vcoeff6;
1143 vsumL += vsrcL * vcoeff6;
1144
1145 vsrc = vec_xl(0, (unsigned char*)&src[col + 7*cStride]);
1146 vsrcH = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1147 vsrcL = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1148 vsumH += vsrcH * vcoeff7;
1149 vsumL += vsrcL * vcoeff7;
1150
1151 vector short vvalH = (vsumH + voffset) >> vheadRoom;
1152 vvalH = vec_max( vvalH, vzero_s16 );
1153 vvalH = vec_min( vvalH, vmaxVal );
1154
1155 vector short vvalL = (vsumL + voffset) >> vheadRoom;
1156 vvalL = vec_max( vvalL, vzero_s16 );
1157 vvalL = vec_min( vvalL, vmaxVal );
1158
1159 vector signed char vdst = vec_pack( vvalL, vvalH );
1160 vec_xst( vdst, 0, (signed char*)&dst[col] );
1161
1162
1163
1164 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 0*cStride]);
1165 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1166 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1167
1168 vsum2H = vsrc2H * vcoeff0;
1169 vsum2L = vsrc2L * vcoeff0;
1170
1171 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 1*cStride]);
1172 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1173 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1174 vsum2H += vsrc2H * vcoeff1;
1175 vsum2L += vsrc2L * vcoeff1;
1176
1177 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 2*cStride]);
1178 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1179 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1180 vsum2H += vsrc2H * vcoeff2;
1181 vsum2L += vsrc2L * vcoeff2;
1182
1183 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 3*cStride]);
1184 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1185 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1186 vsum2H += vsrc2H * vcoeff3;
1187 vsum2L += vsrc2L * vcoeff3;
1188
1189 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 4*cStride]);
1190 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1191 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1192 vsum2H += vsrc2H * vcoeff4;
1193 vsum2L += vsrc2L * vcoeff4;
1194
1195 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 5*cStride]);
1196 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1197 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1198 vsum2H += vsrc2H * vcoeff5;
1199 vsum2L += vsrc2L * vcoeff5;
1200
1201 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 6*cStride]);
1202 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1203 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1204 vsum2H += vsrc2H * vcoeff6;
1205 vsum2L += vsrc2L * vcoeff6;
1206
1207 vsrc2 = vec_xl(0, (unsigned char*)&src2[col + 7*cStride]);
1208 vsrc2H = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
1209 vsrc2L = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
1210 vsum2H += vsrc2H * vcoeff7;
1211 vsum2L += vsrc2L * vcoeff7;
1212
1213 vector short vval2H = (vsum2H + voffset) >> vheadRoom;
1214 vval2H = vec_max( vval2H, vzero_s16 );
1215 vval2H = vec_min( vval2H, vmaxVal );
1216
1217 vector short vval2L = (vsum2L + voffset) >> vheadRoom;
1218 vval2L = vec_max( vval2L, vzero_s16 );
1219 vval2L = vec_min( vval2L, vmaxVal );
1220
1221 vector signed char vdst2 = vec_pack( vval2L, vval2H );
1222 vec_xst( vdst2, 0, (signed char*)&dst2[col] );
1223 }
1224
1225 src += 2*srcStride;
1226 dst += 2*dstStride;
1227
1228 src2 += 2*srcStride;
1229 dst2 += 2*dstStride;
1230 }
1231 }
1232 #endif
1233
1234
1235 // Works with the following values:
1236 // N = 8
1237 // width >= 32 (multiple of 32)
1238 // any height
1239 //template <int N, int width, int height>
1240 //void interp_horiz_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
1241 //{
1242 //
1243 // const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
1244 // int headRoom = IF_FILTER_PREC;
1245 // int offset = (1 << (headRoom - 1));
1246 // uint16_t maxVal = (1 << X265_DEPTH) - 1;
1247 // int cStride = 1;
1248 //
1249 // src -= (N / 2 - 1) * cStride;
1250 //
1251 //
1252 // vector signed short v_coeff ;
1253 // v_coeff = vec_xl(0, coeff) ;
1254 //
1255 //
1256 // vector unsigned char v_pixel_char_0, v_pixel_char_1, v_pixel_char_2 ;
1257 // vector signed short v_pixel_short_0, v_pixel_short_1, v_pixel_short_2, v_pixel_short_3, v_pixel_short_4 ;
1258 // const vector signed short v_mask_unisgned_char_to_short = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
1259 // const vector signed int v_zeros_int = {0, 0, 0, 0} ;
1260 // const vector signed short v_zeros_short = {0, 0, 0, 0, 0, 0, 0, 0} ;
1261 //
1262 // vector signed int v_product_0_0, v_product_0_1 ;
1263 // vector signed int v_product_1_0, v_product_1_1 ;
1264 // vector signed int v_product_2_0, v_product_2_1 ;
1265 // vector signed int v_product_3_0, v_product_3_1 ;
1266 //
1267 // vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
1268 //
1269 // vector signed int v_sums_temp_col0, v_sums_temp_col1, v_sums_temp_col2, v_sums_temp_col3 ;
1270 // vector signed int v_sums_col0_0, v_sums_col0_1 ;
1271 // vector signed int v_sums_col1_0, v_sums_col1_1 ;
1272 // vector signed int v_sums_col2_0, v_sums_col2_1 ;
1273 // vector signed int v_sums_col3_0, v_sums_col3_1 ;
1274 //
1275 //
1276 // const vector signed int v_offset = {offset, offset, offset, offset};
1277 // const vector unsigned int v_headRoom = {headRoom, headRoom, headRoom, headRoom} ;
1278 //
1279 //
1280 // vector unsigned char v_sums_shamt = {0x20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
1281 //
1282 //
1283 // pixel *next_src ;
1284 // pixel *next_dst ;
1285 //
1286 // int row, col;
1287 // for (row = 0; row < height; row++)
1288 // {
1289 // next_src = (pixel *)src + srcStride ;
1290 // next_dst = (pixel *)dst + dstStride ;
1291 //
1292 // for(int col_iter=0; col_iter<width; col_iter+=32)
1293 // {
1294 //
1295 // // Load a full row of pixels (32 + 7)
1296 // v_pixel_char_0 = vec_xl(0, src) ;
1297 // v_pixel_char_1 = vec_xl(16, src) ;
1298 // v_pixel_char_2 = vec_xl(32, src) ;
1299 //
1300 //
1301 // v_sums_temp_col0 = v_zeros_int ;
1302 // v_sums_temp_col1 = v_zeros_int ;
1303 // v_sums_temp_col2 = v_zeros_int ;
1304 // v_sums_temp_col3 = v_zeros_int ;
1305 //
1306 //
1307 // // Expand the loaded pixels into shorts
1308 // v_pixel_short_0 = vec_unpackh((vector signed char)v_pixel_char_0) ;
1309 // v_pixel_short_1 = vec_unpackl((vector signed char)v_pixel_char_0) ;
1310 // v_pixel_short_2 = vec_unpackh((vector signed char)v_pixel_char_1) ;
1311 // v_pixel_short_3 = vec_unpackl((vector signed char)v_pixel_char_1) ;
1312 // v_pixel_short_4 = vec_unpackh((vector signed char)v_pixel_char_2) ;
1313 //
1314 // v_pixel_short_0 = vec_and(v_pixel_short_0, v_mask_unisgned_char_to_short) ;
1315 // v_pixel_short_1 = vec_and(v_pixel_short_1, v_mask_unisgned_char_to_short) ;
1316 // v_pixel_short_2 = vec_and(v_pixel_short_2, v_mask_unisgned_char_to_short) ;
1317 // v_pixel_short_3 = vec_and(v_pixel_short_3, v_mask_unisgned_char_to_short) ;
1318 // v_pixel_short_4 = vec_and(v_pixel_short_4, v_mask_unisgned_char_to_short) ;
1319 //
1320 //
1321 //
1322 // // Four colum sets are processed below
1323 // // One colum per set per iteration
1324 // for(col=0; col < 8; col++)
1325 // {
1326 //
1327 // // Multiply the pixels by the coefficients
1328 // v_product_0_0 = vec_mule(v_pixel_short_0, v_coeff) ;
1329 // v_product_0_1 = vec_mulo(v_pixel_short_0, v_coeff) ;
1330 //
1331 // v_product_1_0 = vec_mule(v_pixel_short_1, v_coeff) ;
1332 // v_product_1_1 = vec_mulo(v_pixel_short_1, v_coeff) ;
1333 //
1334 // v_product_2_0 = vec_mule(v_pixel_short_2, v_coeff) ;
1335 // v_product_2_1 = vec_mulo(v_pixel_short_2, v_coeff) ;
1336 //
1337 // v_product_3_0 = vec_mule(v_pixel_short_3, v_coeff) ;
1338 // v_product_3_1 = vec_mulo(v_pixel_short_3, v_coeff) ;
1339 //
1340 //
1341 // // Sum up the multiplication results
1342 // v_sum_0 = vec_add(v_product_0_0, v_product_0_1) ;
1343 // v_sum_0 = vec_sums(v_sum_0, v_zeros_int) ;
1344 //
1345 // v_sum_1 = vec_add(v_product_1_0, v_product_1_1) ;
1346 // v_sum_1 = vec_sums(v_sum_1, v_zeros_int) ;
1347 //
1348 // v_sum_2 = vec_add(v_product_2_0, v_product_2_1) ;
1349 // v_sum_2 = vec_sums(v_sum_2, v_zeros_int) ;
1350 //
1351 // v_sum_3 = vec_add(v_product_3_0, v_product_3_1) ;
1352 // v_sum_3 = vec_sums(v_sum_3, v_zeros_int) ;
1353 //
1354 //
1355 // // Insert the sum results into respective vectors
1356 // v_sums_temp_col0 = vec_sro(v_sums_temp_col0, v_sums_shamt) ;
1357 // v_sums_temp_col0 = vec_or(v_sum_0, v_sums_temp_col0) ;
1358 //
1359 // v_sums_temp_col1 = vec_sro(v_sums_temp_col1, v_sums_shamt) ;
1360 // v_sums_temp_col1 = vec_or(v_sum_1, v_sums_temp_col1) ;
1361 //
1362 // v_sums_temp_col2 = vec_sro(v_sums_temp_col2, v_sums_shamt) ;
1363 // v_sums_temp_col2 = vec_or(v_sum_2, v_sums_temp_col2) ;
1364 //
1365 // v_sums_temp_col3 = vec_sro(v_sums_temp_col3, v_sums_shamt) ;
1366 // v_sums_temp_col3 = vec_or(v_sum_3, v_sums_temp_col3) ;
1367 //
1368 //
1369 // if(col == 3)
1370 // {
1371 // v_sums_col0_0 = v_sums_temp_col0 ;
1372 // v_sums_col1_0 = v_sums_temp_col1 ;
1373 // v_sums_col2_0 = v_sums_temp_col2 ;
1374 // v_sums_col3_0 = v_sums_temp_col3 ;
1375 //
1376 // v_sums_temp_col0 = v_zeros_int ;
1377 // v_sums_temp_col1 = v_zeros_int ;
1378 // v_sums_temp_col2 = v_zeros_int ;
1379 // v_sums_temp_col3 = v_zeros_int ;
1380 // }
1381 //
1382 //
1383 // // Shift the pixels by 1 (short pixel)
1384 // v_pixel_short_0 = vec_sld(v_pixel_short_1, v_pixel_short_0, 14) ;
1385 // v_pixel_short_1 = vec_sld(v_pixel_short_2, v_pixel_short_1, 14) ;
1386 // v_pixel_short_2 = vec_sld(v_pixel_short_3, v_pixel_short_2, 14) ;
1387 // v_pixel_short_3 = vec_sld(v_pixel_short_4, v_pixel_short_3, 14) ;
1388 // const vector unsigned char v_shift_right_two_bytes_shamt = {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
1389 // v_pixel_short_4 = vec_sro(v_pixel_short_4, v_shift_right_two_bytes_shamt) ;
1390 // }
1391 //
1392 // // Copy the sums result to the second vector (per colum)
1393 // v_sums_col0_1 = v_sums_temp_col0 ;
1394 // v_sums_col1_1 = v_sums_temp_col1 ;
1395 // v_sums_col2_1 = v_sums_temp_col2 ;
1396 // v_sums_col3_1 = v_sums_temp_col3 ;
1397 //
1398 //
1399 //
1400 // // Post processing and eventually 2 stores
1401 // // Original code:
1402 // // int16_t val = (int16_t)((sum + offset) >> headRoom);
1403 // // if (val < 0) val = 0;
1404 // // if (val > maxVal) val = maxVal;
1405 // // dst[col] = (pixel)val;
1406 //
1407 //
1408 // v_sums_col0_0 = vec_sra(vec_add(v_sums_col0_0, v_offset), v_headRoom) ;
1409 // v_sums_col0_1 = vec_sra(vec_add(v_sums_col0_1, v_offset), v_headRoom) ;
1410 // v_sums_col1_0 = vec_sra(vec_add(v_sums_col1_0, v_offset), v_headRoom) ;
1411 // v_sums_col1_1 = vec_sra(vec_add(v_sums_col1_1, v_offset), v_headRoom) ;
1412 // v_sums_col2_0 = vec_sra(vec_add(v_sums_col2_0, v_offset), v_headRoom) ;
1413 // v_sums_col2_1 = vec_sra(vec_add(v_sums_col2_1, v_offset), v_headRoom) ;
1414 // v_sums_col3_0 = vec_sra(vec_add(v_sums_col3_0, v_offset), v_headRoom) ;
1415 // v_sums_col3_1 = vec_sra(vec_add(v_sums_col3_1, v_offset), v_headRoom) ;
1416 //
1417 //
1418 // vector signed short v_val_col0, v_val_col1, v_val_col2, v_val_col3 ;
1419 // v_val_col0 = vec_pack(v_sums_col0_0, v_sums_col0_1) ;
1420 // v_val_col1 = vec_pack(v_sums_col1_0, v_sums_col1_1) ;
1421 // v_val_col2 = vec_pack(v_sums_col2_0, v_sums_col2_1) ;
1422 // v_val_col3 = vec_pack(v_sums_col3_0, v_sums_col3_1) ;
1423 //
1424 //
1425 // // if (val < 0) val = 0;
1426 // vector bool short v_comp_zero_col0, v_comp_zero_col1, v_comp_zero_col2, v_comp_zero_col3 ;
1427 // // Compute less than 0
1428 // v_comp_zero_col0 = vec_cmplt(v_val_col0, v_zeros_short) ;
1429 // v_comp_zero_col1 = vec_cmplt(v_val_col1, v_zeros_short) ;
1430 // v_comp_zero_col2 = vec_cmplt(v_val_col2, v_zeros_short) ;
1431 // v_comp_zero_col3 = vec_cmplt(v_val_col3, v_zeros_short) ;
1432 // // Keep values that are greater or equal to 0
1433 // v_val_col0 = vec_andc(v_val_col0, v_comp_zero_col0) ;
1434 // v_val_col1 = vec_andc(v_val_col1, v_comp_zero_col1) ;
1435 // v_val_col2 = vec_andc(v_val_col2, v_comp_zero_col2) ;
1436 // v_val_col3 = vec_andc(v_val_col3, v_comp_zero_col3) ;
1437 //
1438 //
1439 // // if (val > maxVal) val = maxVal;
1440 // vector bool short v_comp_max_col0, v_comp_max_col1, v_comp_max_col2, v_comp_max_col3 ;
1441 // const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
1442 // // Compute greater than max
1443 // v_comp_max_col0 = vec_cmpgt(v_val_col0, v_maxVal) ;
1444 // v_comp_max_col1 = vec_cmpgt(v_val_col1, v_maxVal) ;
1445 // v_comp_max_col2 = vec_cmpgt(v_val_col2, v_maxVal) ;
1446 // v_comp_max_col3 = vec_cmpgt(v_val_col3, v_maxVal) ;
1447 // // Replace values greater than maxVal with maxVal
1448 // v_val_col0 = vec_sel(v_val_col0, v_maxVal, v_comp_max_col0) ;
1449 // v_val_col1 = vec_sel(v_val_col1, v_maxVal, v_comp_max_col1) ;
1450 // v_val_col2 = vec_sel(v_val_col2, v_maxVal, v_comp_max_col2) ;
1451 // v_val_col3 = vec_sel(v_val_col3, v_maxVal, v_comp_max_col3) ;
1452 //
1453 // // (pixel)val
1454 // vector unsigned char v_final_result_0, v_final_result_1 ;
1455 // v_final_result_0 = vec_pack((vector unsigned short)v_val_col0, (vector unsigned short)v_val_col1) ;
1456 // v_final_result_1 = vec_pack((vector unsigned short)v_val_col2, (vector unsigned short)v_val_col3) ;
1457 //
1458 //
1459 //
1460 // // Store results
1461 // vec_xst(v_final_result_0, 0, dst) ;
1462 // vec_xst(v_final_result_1, 16, dst) ;
1463 //
1464 //
1465 // src += 32 ;
1466 // dst += 32 ;
1467 //
1468 // } // end for col_iter
1469 //
1470 //
1471 // src = next_src ;
1472 // dst = next_dst ;
1473 // }
1474 //} // interp_horiz_pp_altivec()
1475
1476
1477 namespace X265_NS {
1478
setupFilterPrimitives_altivec(EncoderPrimitives & p)1479 void setupFilterPrimitives_altivec(EncoderPrimitives& p)
1480 {
1481 // interp_vert_pp_c
1482 p.pu[LUMA_16x16].luma_vpp = interp_vert_pp_altivec<8, 16, 16> ;
1483 p.pu[LUMA_32x8].luma_vpp = interp_vert_pp_altivec<8, 32, 8> ;
1484 p.pu[LUMA_16x12].luma_vpp = interp_vert_pp_altivec<8, 16, 12> ;
1485 p.pu[LUMA_16x4].luma_vpp = interp_vert_pp_altivec<8, 16, 4> ;
1486 p.pu[LUMA_32x32].luma_vpp = interp_vert_pp_altivec<8, 32, 32> ;
1487 p.pu[LUMA_32x16].luma_vpp = interp_vert_pp_altivec<8, 32, 16> ;
1488 p.pu[LUMA_16x32].luma_vpp = interp_vert_pp_altivec<8, 16, 32> ;
1489 p.pu[LUMA_32x24].luma_vpp = interp_vert_pp_altivec<8, 32, 24> ;
1490 p.pu[LUMA_32x8].luma_vpp = interp_vert_pp_altivec<8, 32, 8> ;
1491 p.pu[LUMA_64x64].luma_vpp = interp_vert_pp_altivec<8, 64, 64> ;
1492 p.pu[LUMA_64x32].luma_vpp = interp_vert_pp_altivec<8, 64, 32> ;
1493 p.pu[LUMA_32x64].luma_vpp = interp_vert_pp_altivec<8, 32, 64> ;
1494 p.pu[LUMA_64x48].luma_vpp = interp_vert_pp_altivec<8, 64, 48> ;
1495 p.pu[LUMA_48x64].luma_vpp = interp_vert_pp_altivec<8, 48, 64> ;
1496 p.pu[LUMA_64x16].luma_vpp = interp_vert_pp_altivec<8, 64, 16> ;
1497 p.pu[LUMA_16x64].luma_vpp = interp_vert_pp_altivec<8, 16, 64> ;
1498
1499 // interp_hv_pp_c
1500 p.pu[LUMA_32x32].luma_hvpp = interp_hv_pp_altivec<8, 32, 32> ;
1501 p.pu[LUMA_32x16].luma_hvpp = interp_hv_pp_altivec<8, 32, 16> ;
1502 p.pu[LUMA_32x24].luma_hvpp = interp_hv_pp_altivec<8, 32, 24> ;
1503 p.pu[LUMA_32x8].luma_hvpp = interp_hv_pp_altivec<8, 32, 8> ;
1504 p.pu[LUMA_64x64].luma_hvpp = interp_hv_pp_altivec<8, 64, 64> ;
1505 p.pu[LUMA_64x32].luma_hvpp = interp_hv_pp_altivec<8, 64, 32> ;
1506 p.pu[LUMA_32x64].luma_hvpp = interp_hv_pp_altivec<8, 32, 64> ;
1507 p.pu[LUMA_64x48].luma_hvpp = interp_hv_pp_altivec<8, 64, 48> ;
1508 p.pu[LUMA_64x16].luma_hvpp = interp_hv_pp_altivec<8, 64, 16> ;
1509
1510 // interp_horiz_pp_c
1511 p.pu[LUMA_32x32].luma_hpp = interp_horiz_pp_altivec<8, 32, 32> ;
1512 p.pu[LUMA_32x16].luma_hpp = interp_horiz_pp_altivec<8, 32, 16> ;
1513 p.pu[LUMA_32x24].luma_hpp = interp_horiz_pp_altivec<8, 32, 24> ;
1514 p.pu[LUMA_32x8].luma_hpp = interp_horiz_pp_altivec<8, 32, 8> ;
1515 p.pu[LUMA_64x64].luma_hpp = interp_horiz_pp_altivec<8, 64, 64> ;
1516 p.pu[LUMA_64x32].luma_hpp = interp_horiz_pp_altivec<8, 64, 32> ;
1517 p.pu[LUMA_32x64].luma_hpp = interp_horiz_pp_altivec<8, 32, 64> ;
1518 p.pu[LUMA_64x48].luma_hpp = interp_horiz_pp_altivec<8, 64, 48> ;
1519 p.pu[LUMA_64x16].luma_hpp = interp_horiz_pp_altivec<8, 64, 16> ;
1520 }
1521
1522 } // end namespace X265_NS
1523