/*****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Authors: Roger Moussalli <rmoussal@us.ibm.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include <iostream>
#include "common.h"
#include "primitives.h"
#include "ppccommon.h"

using namespace X265_NS;
// ORIGINAL : for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 0 * srcStride] * c[0];}
#define multiply_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector unsigned char v_pixel ; \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
    const vector signed short v_mask_unsigned_8_to_16 = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
\
    /* load the pixels */ \
    v_pixel = vec_xl(src_offset, src) ; \
\
    /* unpack the 8-bit pixels to 16-bit values (and undo the sign extension) */ \
    v_pixel_16_h = vec_unpackh((vector signed char)v_pixel) ; \
    v_pixel_16_l = vec_unpackl((vector signed char)v_pixel) ; \
    v_pixel_16_h = vec_and(v_pixel_16_h, v_mask_unsigned_8_to_16) ; \
    v_pixel_16_l = vec_and(v_pixel_16_l, v_mask_unsigned_8_to_16) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_sum_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_sum_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_sum_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_sum_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
} // end multiply_pixel_coeff()
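
// Reference only (not compiled): a minimal scalar sketch of what
// multiply_pixel_coeff() computes for a single coefficient.  The helper name
// and plain-C types below are illustrative, not part of the x265 API.  Note
// the lane layout of the vector version: vec_mule/vec_mulo return products of
// the even/odd 16-bit lanes, so v_sum_0/v_sum_1 hold the even/odd lanes of
// the high half and v_sum_2/v_sum_3 those of the low half; this interleaving
// is undone only by the final vec_perm in the filter loops below.
#if 0
static void multiply_pixel_coeff_ref(int32_t sum[16], const uint8_t* src,
                                     int src_offset, int16_t coeff)
{
    for (int col = 0; col < 16; col++)
        sum[col] = src[src_offset + col] * coeff; // one 32-bit product per pixel
}
#endif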


// ORIGINAL : for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
#define multiply_accumulate_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector unsigned char v_pixel ; \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
    vector int v_product_int_0, v_product_int_1, v_product_int_2, v_product_int_3 ; \
    const vector signed short v_mask_unsigned_8_to_16 = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
\
    /* load the pixels */ \
    v_pixel = vec_xl(src_offset, src) ; \
\
    /* unpack the 8-bit pixels to 16-bit values (and undo the sign extension) */ \
    v_pixel_16_h = vec_unpackh((vector signed char)v_pixel) ; \
    v_pixel_16_l = vec_unpackl((vector signed char)v_pixel) ; \
    v_pixel_16_h = vec_and(v_pixel_16_h, v_mask_unsigned_8_to_16) ; \
    v_pixel_16_l = vec_and(v_pixel_16_l, v_mask_unsigned_8_to_16) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_product_int_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_product_int_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_product_int_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_product_int_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
\
    /* accumulate the results with the sum vectors */ \
    v_sum_0 = vec_add(v_sum_0, v_product_int_0) ; \
    v_sum_1 = vec_add(v_sum_1, v_product_int_1) ; \
    v_sum_2 = vec_add(v_sum_2, v_product_int_2) ; \
    v_sum_3 = vec_add(v_sum_3, v_product_int_3) ; \
} // end multiply_accumulate_pixel_coeff()
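
// Note: the products are accumulated in 32-bit lanes, so eight taps of
// (8-bit pixel) * (16-bit coefficient) cannot overflow; narrowing back to
// 16 bits happens only after the offset/shift step in the callers below.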


#if 0
//ORIGINAL
// Works with the following values:
// N = 8
// width >= 16 (multiple of 16)
// any height
template<int N, int width, int height>
void interp_vert_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    const int shift = IF_FILTER_PREC;
    const int offset = 1 << (shift - 1);
    const uint16_t maxVal = (1 << X265_DEPTH) - 1;

    src -= (N / 2 - 1) * srcStride;

    // Vector to hold replicated shift amount
    const vector unsigned int v_shift = {shift, shift, shift, shift} ;

    // Vector to hold replicated offset
    const vector int v_offset = {offset, offset, offset, offset} ;

    // Vector to hold replicated maxVal
    const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;

    // Vectors to hold replicated coefficients (one coefficient replicated per vector)
    vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
    vector signed short v_coefficients = vec_xl(0, c) ; // load all coefficients into one vector

    // Replicate the coefficients into respective vectors
    v_coeff_0 = vec_splat(v_coefficients, 0) ;
    v_coeff_1 = vec_splat(v_coefficients, 1) ;
    v_coeff_2 = vec_splat(v_coefficients, 2) ;
    v_coeff_3 = vec_splat(v_coefficients, 3) ;
    v_coeff_4 = vec_splat(v_coefficients, 4) ;
    v_coeff_5 = vec_splat(v_coefficients, 5) ;
    v_coeff_6 = vec_splat(v_coefficients, 6) ;
    v_coeff_7 = vec_splat(v_coefficients, 7) ;

    int row, ocol;
    for (row = 0; row < height; row++)
    {
        for (ocol = 0; ocol < width; ocol += 16)
        {
            // int sum[16] ;
            // int16_t val[16] ;

            // --> for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 0 * srcStride] * c[0];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}

            vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
            vector signed short v_val_0, v_val_1 ;

            multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol, v_coeff_0) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 1 * srcStride, v_coeff_1) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 2 * srcStride, v_coeff_2) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 3 * srcStride, v_coeff_3) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 4 * srcStride, v_coeff_4) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 5 * srcStride, v_coeff_5) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 6 * srcStride, v_coeff_6) ;
            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 7 * srcStride, v_coeff_7) ;

            // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
            // Add offset
            v_sum_0 = vec_add(v_sum_0, v_offset) ;
            v_sum_1 = vec_add(v_sum_1, v_offset) ;
            v_sum_2 = vec_add(v_sum_2, v_offset) ;
            v_sum_3 = vec_add(v_sum_3, v_offset) ;
            // Shift right by "shift"
            v_sum_0 = vec_sra(v_sum_0, v_shift) ;
            v_sum_1 = vec_sra(v_sum_1, v_shift) ;
            v_sum_2 = vec_sra(v_sum_2, v_shift) ;
            v_sum_3 = vec_sra(v_sum_3, v_shift) ;

            // Pack into 16-bit numbers
            v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
            v_val_1 = vec_pack(v_sum_1, v_sum_3) ;

            // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
            vector bool short v_comp_zero_0, v_comp_zero_1 ;
            vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
            // Compute less than 0
            v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
            v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
            // Keep values that are greater than or equal to 0
            v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
            v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;

            // --> for(col=0; col<16; col++) {val[col] = (val[col] > maxVal) ? maxVal : val[col];}
            vector bool short v_comp_max_0, v_comp_max_1 ;
            // Compute greater than max
            v_comp_max_0 = vec_cmpgt(v_val_0, v_maxVal) ;
            v_comp_max_1 = vec_cmpgt(v_val_1, v_maxVal) ;
            // Replace values greater than maxVal with maxVal
            v_val_0 = vec_sel(v_val_0, v_maxVal, v_comp_max_0) ;
            v_val_1 = vec_sel(v_val_1, v_maxVal, v_comp_max_1) ;

            // --> for(col=0; col<16; col++) {dst[ocol+col] = (pixel)val[col];}
            // Pack the vals into 8-bit numbers,
            // also re-ordering them (side effect of vec_mule and vec_mulo)
            vector unsigned char v_result ;
            vector unsigned char v_perm_index = {0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 0x08, 0x18, 0x0A, 0x1A, 0x0C, 0x1C, 0x0E, 0x1E} ;
            v_result = (vector unsigned char)vec_perm(v_val_0, v_val_1, v_perm_index) ;
            // Store the results back to dst[]
            vec_xst(v_result, ocol, (unsigned char *)dst) ;
        }

        src += srcStride;
        dst += dstStride;
    }
} // end interp_vert_pp_altivec()
#else
// Works with the following values:
// N = 8
// width >= 16 (multiple of 16)
// any height
template<int N, int width, int height>
void interp_vert_pp_altivec(const pixel* __restrict__ src, intptr_t srcStride, pixel* __restrict__ dst, intptr_t dstStride, int coeffIdx)
{
    const int16_t* __restrict__ c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    int shift = IF_FILTER_PREC;
    int offset = 1 << (shift - 1);
    uint16_t maxVal = (1 << X265_DEPTH) - 1;

    src -= (N / 2 - 1) * srcStride;

    vector signed short vcoeff0     = vec_splats(c[0]);
    vector signed short vcoeff1     = vec_splats(c[1]);
    vector signed short vcoeff2     = vec_splats(c[2]);
    vector signed short vcoeff3     = vec_splats(c[3]);
    vector signed short vcoeff4     = vec_splats(c[4]);
    vector signed short vcoeff5     = vec_splats(c[5]);
    vector signed short vcoeff6     = vec_splats(c[6]);
    vector signed short vcoeff7     = vec_splats(c[7]);
    vector signed short voffset     = vec_splats((short)offset);
    vector signed short vshift      = vec_splats((short)shift);
    vector signed short vmaxVal     = vec_splats((short)maxVal);
    vector signed short  vzero_s16  = vec_splats( (signed short)0u);
    vector unsigned char vzero_u8   = vec_splats( (unsigned char)0u );
    vector unsigned char vchar_to_short_maskH = {24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0};
    vector unsigned char vchar_to_short_maskL = {16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0};
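    // The two masks above zero-extend bytes to shorts in a single vec_perm:
    // mask indices 16..31 select bytes of the second operand (the loaded
    // pixels), while index 0 selects a zero byte from vzero_u8 for the other
    // half of each 16-bit lane.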

    vector signed short vsrcH, vsrcL, vsumH, vsumL;
    vector unsigned char vsrc;

    vector signed short vsrc2H, vsrc2L, vsum2H, vsum2L;
    vector unsigned char vsrc2;

    const pixel* __restrict__ src2 = src+srcStride;
    pixel* __restrict__ dst2       = dst+dstStride;

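    // Note: the row loop below is unrolled two output rows at a time (src/dst
    // and src2/dst2), which appears to assume an even height despite the
    // "any height" note above.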
    int row, col;
    for (row = 0; row < height; row+=2)
    {
        for (col = 0; col < width; col+=16)
        {
            vsrc   = vec_xl(0, (unsigned char*)&src[col + 0*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH  = vsrcH * vcoeff0;
            vsumL  = vsrcL * vcoeff0;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 1*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff1;
            vsumL += vsrcL * vcoeff1;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 2*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff2;
            vsumL += vsrcL * vcoeff2;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 3*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff3;
            vsumL += vsrcL * vcoeff3;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 4*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff4;
            vsumL += vsrcL * vcoeff4;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 5*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff5;
            vsumL += vsrcL * vcoeff5;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 6*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff6;
            vsumL += vsrcL * vcoeff6;

            vsrc   = vec_xl(0, (unsigned char*)&src[col + 7*srcStride]);
            vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
            vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
            vsumH += vsrcH * vcoeff7;
            vsumL += vsrcL * vcoeff7;

            vector short vvalH = (vsumH + voffset) >> vshift;
            vvalH = vec_max( vvalH, vzero_s16 );
            vvalH = vec_min( vvalH, vmaxVal   );

            vector short vvalL = (vsumL + voffset) >> vshift;
            vvalL = vec_max( vvalL, vzero_s16 );
            vvalL = vec_min( vvalL, vmaxVal   );

            vector signed char vdst = vec_pack( vvalL, vvalH );
            vec_xst( vdst, 0, (signed char*)&dst[col] );

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 0*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H  = vsrc2H * vcoeff0;
            vsum2L  = vsrc2L * vcoeff0;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 1*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff1;
            vsum2L += vsrc2L * vcoeff1;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 2*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff2;
            vsum2L += vsrc2L * vcoeff2;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 3*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff3;
            vsum2L += vsrc2L * vcoeff3;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 4*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff4;
            vsum2L += vsrc2L * vcoeff4;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 5*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff5;
            vsum2L += vsrc2L * vcoeff5;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 6*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff6;
            vsum2L += vsrc2L * vcoeff6;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 7*srcStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff7;
            vsum2L += vsrc2L * vcoeff7;

            vector short vval2H = (vsum2H + voffset) >> vshift;
            vval2H = vec_max( vval2H, vzero_s16 );
            vval2H = vec_min( vval2H, vmaxVal   );

            vector short vval2L = (vsum2L + voffset) >> vshift;
            vval2L = vec_max( vval2L, vzero_s16 );
            vval2L = vec_min( vval2L, vmaxVal   );

            vector signed char vdst2 = vec_pack( vval2L, vval2H );
            vec_xst( vdst2, 0, (signed char*)&dst2[col] );
        }

        src  += 2*srcStride;
        dst  += 2*dstStride;
        src2 += 2*srcStride;
        dst2 += 2*dstStride;
    }
}
#endif


// ORIGINAL : for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 0 * srcStride] * c[0];}
#define multiply_sp_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const int16_t * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
\
    /* load the pixels */ \
    v_pixel_16_h = vec_xl(src_offset, src) ; \
    v_pixel_16_l = vec_xl(src_offset + 16, src) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_sum_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_sum_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_sum_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_sum_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
} // end multiply_sp_pixel_coeff()


// ORIGINAL : for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
#define multiply_accumulate_sp_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const int16_t * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
{ \
    vector signed short v_pixel_16_h, v_pixel_16_l ; \
    vector int v_product_int_0, v_product_int_1, v_product_int_2, v_product_int_3 ; \
\
    /* load the pixels */ \
    v_pixel_16_h = vec_xl(src_offset, src) ; \
    v_pixel_16_l = vec_xl(src_offset + 16, src) ; \
\
    /* multiply the pixels by the coefficient */ \
    v_product_int_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
    v_product_int_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
    v_product_int_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
    v_product_int_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
\
    /* accumulate the results with the sum vectors */ \
    v_sum_0 = vec_add(v_sum_0, v_product_int_0) ; \
    v_sum_1 = vec_add(v_sum_1, v_product_int_1) ; \
    v_sum_2 = vec_add(v_sum_2, v_product_int_2) ; \
    v_sum_3 = vec_add(v_sum_3, v_product_int_3) ; \
} // end multiply_accumulate_sp_pixel_coeff()
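
// Note: the *_sp_* variants read 16-bit intermediates (int16_t) rather than
// 8-bit pixels, so no unpack/mask step is needed, but one 16-element column
// now spans two vector loads, and the src_offset argument is a byte offset
// (hence the "* 2" at the call sites below).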

// Works with the following values:
// N = 8
// width >= 16 (multiple of 16)
// any height
template <int N, int width, int height>
void filterVertical_sp_altivec(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
    unsigned int shift = IF_FILTER_PREC + headRoom;
    int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
    const int16_t* coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);

    src -= (N / 2 - 1) * srcStride;

    // Vector to hold replicated shift amount
    const vector unsigned int v_shift = {shift, shift, shift, shift} ;

    // Vector to hold replicated offset
    const vector int v_offset = {offset, offset, offset, offset} ;

    // Vector to hold replicated maxVal
    const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;

    // Vectors to hold replicated coefficients (one coefficient replicated per vector)
    vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
    vector signed short v_coefficients = vec_xl(0, coeff) ; // load all coefficients into one vector

    // Replicate the coefficients into respective vectors
    v_coeff_0 = vec_splat(v_coefficients, 0) ;
    v_coeff_1 = vec_splat(v_coefficients, 1) ;
    v_coeff_2 = vec_splat(v_coefficients, 2) ;
    v_coeff_3 = vec_splat(v_coefficients, 3) ;
    v_coeff_4 = vec_splat(v_coefficients, 4) ;
    v_coeff_5 = vec_splat(v_coefficients, 5) ;
    v_coeff_6 = vec_splat(v_coefficients, 6) ;
    v_coeff_7 = vec_splat(v_coefficients, 7) ;

    int row, ocol;
    for (row = 0; row < height; row++)
    {
        for (ocol = 0; ocol < width; ocol += 16)
        {
            // int sum[16] ;
            // int16_t val[16] ;

            // --> for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 0 * srcStride] * c[0];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
            // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}

            vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
            vector signed short v_val_0, v_val_1 ;

            // The offsets below carry a factor of 2 since they are byte
            // offsets and each input element is 2 bytes wide (int16_t)
            multiply_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol * 2, v_coeff_0) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 1 * srcStride) * 2, v_coeff_1) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 2 * srcStride) * 2, v_coeff_2) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 3 * srcStride) * 2, v_coeff_3) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 4 * srcStride) * 2, v_coeff_4) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 5 * srcStride) * 2, v_coeff_5) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 6 * srcStride) * 2, v_coeff_6) ;
            multiply_accumulate_sp_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, (ocol + 7 * srcStride) * 2, v_coeff_7) ;

            // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
            // Add offset
            v_sum_0 = vec_add(v_sum_0, v_offset) ;
            v_sum_1 = vec_add(v_sum_1, v_offset) ;
            v_sum_2 = vec_add(v_sum_2, v_offset) ;
            v_sum_3 = vec_add(v_sum_3, v_offset) ;
            // Shift right by "shift"
            v_sum_0 = vec_sra(v_sum_0, v_shift) ;
            v_sum_1 = vec_sra(v_sum_1, v_shift) ;
            v_sum_2 = vec_sra(v_sum_2, v_shift) ;
            v_sum_3 = vec_sra(v_sum_3, v_shift) ;

            // Pack into 16-bit numbers
            v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
            v_val_1 = vec_pack(v_sum_1, v_sum_3) ;

            // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
            vector bool short v_comp_zero_0, v_comp_zero_1 ;
            vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
            // Compute less than 0
            v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
            v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
            // Keep values that are greater than or equal to 0
            v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
            v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;

            // --> for(col=0; col<16; col++) {val[col] = (val[col] > maxVal) ? maxVal : val[col];}
            vector bool short v_comp_max_0, v_comp_max_1 ;
            // Compute greater than max
            v_comp_max_0 = vec_cmpgt(v_val_0, v_maxVal) ;
            v_comp_max_1 = vec_cmpgt(v_val_1, v_maxVal) ;
            // Replace values greater than maxVal with maxVal
            v_val_0 = vec_sel(v_val_0, v_maxVal, v_comp_max_0) ;
            v_val_1 = vec_sel(v_val_1, v_maxVal, v_comp_max_1) ;

            // --> for(col=0; col<16; col++) {dst[ocol+col] = (pixel)val[col];}
            // Pack the vals into 8-bit numbers,
            // also re-ordering them (side effect of vec_mule and vec_mulo)
            vector unsigned char v_result ;
            vector unsigned char v_perm_index = {0x00, 0x10, 0x02, 0x12, 0x04, 0x14, 0x06, 0x16, 0x08, 0x18, 0x0A, 0x1A, 0x0C, 0x1C, 0x0E, 0x1E} ;
            v_result = (vector unsigned char)vec_perm(v_val_0, v_val_1, v_perm_index) ;
            // Store the results back to dst[]
            vec_xst(v_result, ocol, (unsigned char *)dst) ;
        }

        src += srcStride;
        dst += dstStride;
    }
} // end filterVertical_sp_altivec()


// Works with the following values:
// N = 8
// width >= 32 (multiple of 32)
// any height
template <int N, int width, int height>
void interp_horiz_ps_altivec(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
{
    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
    unsigned int shift = IF_FILTER_PREC - headRoom;
    int offset = -IF_INTERNAL_OFFS << shift;
    int blkheight = height;

    src -= N / 2 - 1;

    if (isRowExt)
    {
        src -= (N / 2 - 1) * srcStride;
        blkheight += N - 1;
    }

    vector signed short v_coeff ;
    v_coeff = vec_xl(0, coeff) ;

    vector unsigned char v_pixel_char_0, v_pixel_char_1, v_pixel_char_2 ;
    vector signed short v_pixel_short_0, v_pixel_short_1, v_pixel_short_2, v_pixel_short_3, v_pixel_short_4 ;
    const vector signed short v_mask_unsigned_char_to_short = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
    const vector signed int v_zeros_int = {0, 0, 0, 0} ;

    vector signed int v_product_0_0, v_product_0_1 ;
    vector signed int v_product_1_0, v_product_1_1 ;
    vector signed int v_product_2_0, v_product_2_1 ;
    vector signed int v_product_3_0, v_product_3_1 ;

    vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;

    vector signed int v_sums_temp_col0, v_sums_temp_col1, v_sums_temp_col2, v_sums_temp_col3 ;
    vector signed int v_sums_col0_0, v_sums_col0_1 ;
    vector signed int v_sums_col1_0, v_sums_col1_1 ;
    vector signed int v_sums_col2_0, v_sums_col2_1 ;
    vector signed int v_sums_col3_0, v_sums_col3_1 ;

    const vector signed int v_offset = {offset, offset, offset, offset};
    const vector unsigned int v_shift = {shift, shift, shift, shift} ;

    vector unsigned char v_sums_shamt = {0x20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
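
    // Note on v_sums_shamt above: the 0x20 (= 32) encodes a shift of 32 bits
    // (4 bytes, i.e. one whole int lane) for vec_sro.  vec_sums() leaves each
    // column total in a single lane, and the repeated vec_sro + vec_or in the
    // inner loop below funnels four such scalar sums into one vector.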

    pixel *next_src ;
    int16_t *next_dst ;

    int row, col;
    for (row = 0; row < blkheight; row++)
    {
        next_src = (pixel *)src + srcStride ;
        next_dst = (int16_t *)dst + dstStride ;

        for(int col_iter=0; col_iter<width; col_iter+=32)
        {
            // Load a full row of pixels (32 + 7)
            v_pixel_char_0 = vec_xl(0, src) ;
            v_pixel_char_1 = vec_xl(16, src) ;
            v_pixel_char_2 = vec_xl(32, src) ;

            v_sums_temp_col0 = v_zeros_int ;
            v_sums_temp_col1 = v_zeros_int ;
            v_sums_temp_col2 = v_zeros_int ;
            v_sums_temp_col3 = v_zeros_int ;

            // Expand the loaded pixels into shorts
            v_pixel_short_0 = vec_unpackh((vector signed char)v_pixel_char_0) ;
            v_pixel_short_1 = vec_unpackl((vector signed char)v_pixel_char_0) ;
            v_pixel_short_2 = vec_unpackh((vector signed char)v_pixel_char_1) ;
            v_pixel_short_3 = vec_unpackl((vector signed char)v_pixel_char_1) ;
            v_pixel_short_4 = vec_unpackh((vector signed char)v_pixel_char_2) ;

            v_pixel_short_0 = vec_and(v_pixel_short_0, v_mask_unsigned_char_to_short) ;
            v_pixel_short_1 = vec_and(v_pixel_short_1, v_mask_unsigned_char_to_short) ;
            v_pixel_short_2 = vec_and(v_pixel_short_2, v_mask_unsigned_char_to_short) ;
            v_pixel_short_3 = vec_and(v_pixel_short_3, v_mask_unsigned_char_to_short) ;
            v_pixel_short_4 = vec_and(v_pixel_short_4, v_mask_unsigned_char_to_short) ;

            // Four column sets are processed below,
            // one column per set per iteration
            for(col=0; col < 8; col++)
            {
                // Multiply the pixels by the coefficients
                v_product_0_0 = vec_mule(v_pixel_short_0, v_coeff) ;
                v_product_0_1 = vec_mulo(v_pixel_short_0, v_coeff) ;

                v_product_1_0 = vec_mule(v_pixel_short_1, v_coeff) ;
                v_product_1_1 = vec_mulo(v_pixel_short_1, v_coeff) ;

                v_product_2_0 = vec_mule(v_pixel_short_2, v_coeff) ;
                v_product_2_1 = vec_mulo(v_pixel_short_2, v_coeff) ;

                v_product_3_0 = vec_mule(v_pixel_short_3, v_coeff) ;
                v_product_3_1 = vec_mulo(v_pixel_short_3, v_coeff) ;

                // Sum up the multiplication results
                v_sum_0 = vec_add(v_product_0_0, v_product_0_1) ;
                v_sum_0 = vec_sums(v_sum_0, v_zeros_int) ;

                v_sum_1 = vec_add(v_product_1_0, v_product_1_1) ;
                v_sum_1 = vec_sums(v_sum_1, v_zeros_int) ;

                v_sum_2 = vec_add(v_product_2_0, v_product_2_1) ;
                v_sum_2 = vec_sums(v_sum_2, v_zeros_int) ;

                v_sum_3 = vec_add(v_product_3_0, v_product_3_1) ;
                v_sum_3 = vec_sums(v_sum_3, v_zeros_int) ;

                // Insert the sum results into respective vectors
                v_sums_temp_col0 = vec_sro(v_sums_temp_col0, v_sums_shamt) ;
                v_sums_temp_col0 = vec_or(v_sum_0, v_sums_temp_col0) ;

                v_sums_temp_col1 = vec_sro(v_sums_temp_col1, v_sums_shamt) ;
                v_sums_temp_col1 = vec_or(v_sum_1, v_sums_temp_col1) ;

                v_sums_temp_col2 = vec_sro(v_sums_temp_col2, v_sums_shamt) ;
                v_sums_temp_col2 = vec_or(v_sum_2, v_sums_temp_col2) ;

                v_sums_temp_col3 = vec_sro(v_sums_temp_col3, v_sums_shamt) ;
                v_sums_temp_col3 = vec_or(v_sum_3, v_sums_temp_col3) ;

                if(col == 3)
                {
                    v_sums_col0_0 = v_sums_temp_col0 ;
                    v_sums_col1_0 = v_sums_temp_col1 ;
                    v_sums_col2_0 = v_sums_temp_col2 ;
                    v_sums_col3_0 = v_sums_temp_col3 ;

                    v_sums_temp_col0 = v_zeros_int ;
                    v_sums_temp_col1 = v_zeros_int ;
                    v_sums_temp_col2 = v_zeros_int ;
                    v_sums_temp_col3 = v_zeros_int ;
                }

                // Shift the pixels by 1 (short pixel)
                v_pixel_short_0 = vec_sld(v_pixel_short_1, v_pixel_short_0, 14) ;
                v_pixel_short_1 = vec_sld(v_pixel_short_2, v_pixel_short_1, 14) ;
                v_pixel_short_2 = vec_sld(v_pixel_short_3, v_pixel_short_2, 14) ;
                v_pixel_short_3 = vec_sld(v_pixel_short_4, v_pixel_short_3, 14) ;
                const vector unsigned char v_shift_right_two_bytes_shamt = {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
                v_pixel_short_4 = vec_sro(v_pixel_short_4, v_shift_right_two_bytes_shamt) ;
            }

            // Copy the sums result to the second vector (per column)
            v_sums_col0_1 = v_sums_temp_col0 ;
            v_sums_col1_1 = v_sums_temp_col1 ;
            v_sums_col2_1 = v_sums_temp_col2 ;
            v_sums_col3_1 = v_sums_temp_col3 ;

            // Post-processing and finally two stores
            // Original code:
            // int16_t val = (int16_t)((sum + offset) >> shift);
            // dst[col] = val;

            v_sums_col0_0 = vec_sra(vec_add(v_sums_col0_0, v_offset), v_shift) ;
            v_sums_col0_1 = vec_sra(vec_add(v_sums_col0_1, v_offset), v_shift) ;
            v_sums_col1_0 = vec_sra(vec_add(v_sums_col1_0, v_offset), v_shift) ;
            v_sums_col1_1 = vec_sra(vec_add(v_sums_col1_1, v_offset), v_shift) ;
            v_sums_col2_0 = vec_sra(vec_add(v_sums_col2_0, v_offset), v_shift) ;
            v_sums_col2_1 = vec_sra(vec_add(v_sums_col2_1, v_offset), v_shift) ;
            v_sums_col3_0 = vec_sra(vec_add(v_sums_col3_0, v_offset), v_shift) ;
            v_sums_col3_1 = vec_sra(vec_add(v_sums_col3_1, v_offset), v_shift) ;

            vector signed short v_val_col0, v_val_col1, v_val_col2, v_val_col3 ;
            v_val_col0 = vec_pack(v_sums_col0_0, v_sums_col0_1) ;
            v_val_col1 = vec_pack(v_sums_col1_0, v_sums_col1_1) ;
            v_val_col2 = vec_pack(v_sums_col2_0, v_sums_col2_1) ;
            v_val_col3 = vec_pack(v_sums_col3_0, v_sums_col3_1) ;

            // Store results
            vec_xst(v_val_col0, 0, dst) ;
            vec_xst(v_val_col1, 16, dst) ;
            vec_xst(v_val_col2, 32, dst) ;
            vec_xst(v_val_col3, 48, dst) ;

            src += 32 ;
            dst += 32 ;
        } // end for col_iter

        src = next_src ;
        dst = next_dst ;
    }
} // interp_horiz_ps_altivec()


// Works with the following values:
// N = 8
// width >= 32 (multiple of 32)
// any height
template <int N, int width, int height>
void interp_hv_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
{
    short immedVals[(64 + 8) * (64 + 8)];

    interp_horiz_ps_altivec<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
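
    // interp_horiz_ps_altivec() above ran with isRowExt = 1, so immedVals
    // holds height + N - 1 filtered rows starting (N / 2 - 1) rows above the
    // block.  The vertical pass backs its src pointer up by (N / 2 - 1) rows
    // internally, so it is handed row N / 2 - 1 (= 3 for N == 8), i.e.
    // immedVals + 3 * width.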
    //!!filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
    filterVertical_sp_altivec<N, width, height>(immedVals + 3 * width, width, dst, dstStride, idxY);
}

//ORIGINAL
#if 0
// Works with the following values:
// N = 8
// width >= 32 (multiple of 32)
// any height
template <int N, int width, int height>
void interp_horiz_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
{
    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    int headRoom = IF_FILTER_PREC;
    int offset = (1 << (headRoom - 1));
    uint16_t maxVal = (1 << X265_DEPTH) - 1;
    int cStride = 1;

    src -= (N / 2 - 1) * cStride;

    vector signed short v_coeff ;
    v_coeff = vec_xl(0, coeff) ;

    vector unsigned char v_pixel_char_0, v_pixel_char_1, v_pixel_char_2 ;
    vector signed short v_pixel_short_0, v_pixel_short_1, v_pixel_short_2, v_pixel_short_3, v_pixel_short_4 ;
    const vector signed short v_mask_unsigned_char_to_short = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
    const vector signed int v_zeros_int = {0, 0, 0, 0} ;
    const vector signed short v_zeros_short = {0, 0, 0, 0, 0, 0, 0, 0} ;

    vector signed int v_product_0_0, v_product_0_1 ;
    vector signed int v_product_1_0, v_product_1_1 ;
    vector signed int v_product_2_0, v_product_2_1 ;
    vector signed int v_product_3_0, v_product_3_1 ;

    vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;

    vector signed int v_sums_temp_col0, v_sums_temp_col1, v_sums_temp_col2, v_sums_temp_col3 ;
    vector signed int v_sums_col0_0, v_sums_col0_1 ;
    vector signed int v_sums_col1_0, v_sums_col1_1 ;
    vector signed int v_sums_col2_0, v_sums_col2_1 ;
    vector signed int v_sums_col3_0, v_sums_col3_1 ;

    const vector signed int v_offset = {offset, offset, offset, offset};
    const vector unsigned int v_headRoom = {headRoom, headRoom, headRoom, headRoom} ;

    vector unsigned char v_sums_shamt = {0x20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;

    pixel *next_src ;
    pixel *next_dst ;

    int row, col;
    for (row = 0; row < height; row++)
    {
        next_src = (pixel *)src + srcStride ;
        next_dst = (pixel *)dst + dstStride ;

        for(int col_iter=0; col_iter<width; col_iter+=32)
        {
            // Load a full row of pixels (32 + 7)
            v_pixel_char_0 = vec_xl(0, src) ;
            v_pixel_char_1 = vec_xl(16, src) ;
            v_pixel_char_2 = vec_xl(32, src) ;

            v_sums_temp_col0 = v_zeros_int ;
            v_sums_temp_col1 = v_zeros_int ;
            v_sums_temp_col2 = v_zeros_int ;
            v_sums_temp_col3 = v_zeros_int ;

            // Expand the loaded pixels into shorts
            v_pixel_short_0 = vec_unpackh((vector signed char)v_pixel_char_0) ;
            v_pixel_short_1 = vec_unpackl((vector signed char)v_pixel_char_0) ;
            v_pixel_short_2 = vec_unpackh((vector signed char)v_pixel_char_1) ;
            v_pixel_short_3 = vec_unpackl((vector signed char)v_pixel_char_1) ;
            v_pixel_short_4 = vec_unpackh((vector signed char)v_pixel_char_2) ;

            v_pixel_short_0 = vec_and(v_pixel_short_0, v_mask_unsigned_char_to_short) ;
            v_pixel_short_1 = vec_and(v_pixel_short_1, v_mask_unsigned_char_to_short) ;
            v_pixel_short_2 = vec_and(v_pixel_short_2, v_mask_unsigned_char_to_short) ;
            v_pixel_short_3 = vec_and(v_pixel_short_3, v_mask_unsigned_char_to_short) ;
            v_pixel_short_4 = vec_and(v_pixel_short_4, v_mask_unsigned_char_to_short) ;

            // Four column sets are processed below,
            // one column per set per iteration
            for(col=0; col < 8; col++)
            {
                // Multiply the pixels by the coefficients
                v_product_0_0 = vec_mule(v_pixel_short_0, v_coeff) ;
                v_product_0_1 = vec_mulo(v_pixel_short_0, v_coeff) ;

                v_product_1_0 = vec_mule(v_pixel_short_1, v_coeff) ;
                v_product_1_1 = vec_mulo(v_pixel_short_1, v_coeff) ;

                v_product_2_0 = vec_mule(v_pixel_short_2, v_coeff) ;
                v_product_2_1 = vec_mulo(v_pixel_short_2, v_coeff) ;

                v_product_3_0 = vec_mule(v_pixel_short_3, v_coeff) ;
                v_product_3_1 = vec_mulo(v_pixel_short_3, v_coeff) ;

                // Sum up the multiplication results
                v_sum_0 = vec_add(v_product_0_0, v_product_0_1) ;
                v_sum_0 = vec_sums(v_sum_0, v_zeros_int) ;

                v_sum_1 = vec_add(v_product_1_0, v_product_1_1) ;
                v_sum_1 = vec_sums(v_sum_1, v_zeros_int) ;

                v_sum_2 = vec_add(v_product_2_0, v_product_2_1) ;
                v_sum_2 = vec_sums(v_sum_2, v_zeros_int) ;

                v_sum_3 = vec_add(v_product_3_0, v_product_3_1) ;
                v_sum_3 = vec_sums(v_sum_3, v_zeros_int) ;

                // Insert the sum results into respective vectors
                v_sums_temp_col0 = vec_sro(v_sums_temp_col0, v_sums_shamt) ;
                v_sums_temp_col0 = vec_or(v_sum_0, v_sums_temp_col0) ;

                v_sums_temp_col1 = vec_sro(v_sums_temp_col1, v_sums_shamt) ;
                v_sums_temp_col1 = vec_or(v_sum_1, v_sums_temp_col1) ;

                v_sums_temp_col2 = vec_sro(v_sums_temp_col2, v_sums_shamt) ;
                v_sums_temp_col2 = vec_or(v_sum_2, v_sums_temp_col2) ;

                v_sums_temp_col3 = vec_sro(v_sums_temp_col3, v_sums_shamt) ;
                v_sums_temp_col3 = vec_or(v_sum_3, v_sums_temp_col3) ;

                if(col == 3)
                {
                    v_sums_col0_0 = v_sums_temp_col0 ;
                    v_sums_col1_0 = v_sums_temp_col1 ;
                    v_sums_col2_0 = v_sums_temp_col2 ;
                    v_sums_col3_0 = v_sums_temp_col3 ;

                    v_sums_temp_col0 = v_zeros_int ;
                    v_sums_temp_col1 = v_zeros_int ;
                    v_sums_temp_col2 = v_zeros_int ;
                    v_sums_temp_col3 = v_zeros_int ;
                }

                // Shift the pixels by 1 (short pixel)
                v_pixel_short_0 = vec_sld(v_pixel_short_1, v_pixel_short_0, 14) ;
                v_pixel_short_1 = vec_sld(v_pixel_short_2, v_pixel_short_1, 14) ;
                v_pixel_short_2 = vec_sld(v_pixel_short_3, v_pixel_short_2, 14) ;
                v_pixel_short_3 = vec_sld(v_pixel_short_4, v_pixel_short_3, 14) ;
                const vector unsigned char v_shift_right_two_bytes_shamt = {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
                v_pixel_short_4 = vec_sro(v_pixel_short_4, v_shift_right_two_bytes_shamt) ;
            }

            // Copy the sums result to the second vector (per column)
            v_sums_col0_1 = v_sums_temp_col0 ;
            v_sums_col1_1 = v_sums_temp_col1 ;
            v_sums_col2_1 = v_sums_temp_col2 ;
            v_sums_col3_1 = v_sums_temp_col3 ;

            // Post-processing and finally two stores
            // Original code:
            // int16_t val = (int16_t)((sum + offset) >> headRoom);
            // if (val < 0) val = 0;
            // if (val > maxVal) val = maxVal;
            // dst[col] = (pixel)val;

            v_sums_col0_0 = vec_sra(vec_add(v_sums_col0_0, v_offset), v_headRoom) ;
            v_sums_col0_1 = vec_sra(vec_add(v_sums_col0_1, v_offset), v_headRoom) ;
            v_sums_col1_0 = vec_sra(vec_add(v_sums_col1_0, v_offset), v_headRoom) ;
            v_sums_col1_1 = vec_sra(vec_add(v_sums_col1_1, v_offset), v_headRoom) ;
            v_sums_col2_0 = vec_sra(vec_add(v_sums_col2_0, v_offset), v_headRoom) ;
            v_sums_col2_1 = vec_sra(vec_add(v_sums_col2_1, v_offset), v_headRoom) ;
            v_sums_col3_0 = vec_sra(vec_add(v_sums_col3_0, v_offset), v_headRoom) ;
            v_sums_col3_1 = vec_sra(vec_add(v_sums_col3_1, v_offset), v_headRoom) ;

            vector signed short v_val_col0, v_val_col1, v_val_col2, v_val_col3 ;
            v_val_col0 = vec_pack(v_sums_col0_0, v_sums_col0_1) ;
            v_val_col1 = vec_pack(v_sums_col1_0, v_sums_col1_1) ;
            v_val_col2 = vec_pack(v_sums_col2_0, v_sums_col2_1) ;
            v_val_col3 = vec_pack(v_sums_col3_0, v_sums_col3_1) ;

            // if (val < 0) val = 0;
            vector bool short v_comp_zero_col0, v_comp_zero_col1, v_comp_zero_col2, v_comp_zero_col3 ;
            // Compute less than 0
            v_comp_zero_col0 = vec_cmplt(v_val_col0, v_zeros_short) ;
            v_comp_zero_col1 = vec_cmplt(v_val_col1, v_zeros_short) ;
            v_comp_zero_col2 = vec_cmplt(v_val_col2, v_zeros_short) ;
            v_comp_zero_col3 = vec_cmplt(v_val_col3, v_zeros_short) ;
            // Keep values that are greater than or equal to 0
            v_val_col0 = vec_andc(v_val_col0, v_comp_zero_col0) ;
            v_val_col1 = vec_andc(v_val_col1, v_comp_zero_col1) ;
            v_val_col2 = vec_andc(v_val_col2, v_comp_zero_col2) ;
            v_val_col3 = vec_andc(v_val_col3, v_comp_zero_col3) ;

            // if (val > maxVal) val = maxVal;
            vector bool short v_comp_max_col0, v_comp_max_col1, v_comp_max_col2, v_comp_max_col3 ;
            const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
            // Compute greater than max
            v_comp_max_col0 = vec_cmpgt(v_val_col0, v_maxVal) ;
            v_comp_max_col1 = vec_cmpgt(v_val_col1, v_maxVal) ;
            v_comp_max_col2 = vec_cmpgt(v_val_col2, v_maxVal) ;
            v_comp_max_col3 = vec_cmpgt(v_val_col3, v_maxVal) ;
            // Replace values greater than maxVal with maxVal
            v_val_col0 = vec_sel(v_val_col0, v_maxVal, v_comp_max_col0) ;
            v_val_col1 = vec_sel(v_val_col1, v_maxVal, v_comp_max_col1) ;
            v_val_col2 = vec_sel(v_val_col2, v_maxVal, v_comp_max_col2) ;
            v_val_col3 = vec_sel(v_val_col3, v_maxVal, v_comp_max_col3) ;

            // (pixel)val
            vector unsigned char v_final_result_0, v_final_result_1 ;
            v_final_result_0 = vec_pack((vector unsigned short)v_val_col0, (vector unsigned short)v_val_col1) ;
            v_final_result_1 = vec_pack((vector unsigned short)v_val_col2, (vector unsigned short)v_val_col3) ;

            // Store results
            vec_xst(v_final_result_0, 0, dst) ;
            vec_xst(v_final_result_1, 16, dst) ;

            src += 32 ;
            dst += 32 ;
        } // end for col_iter

        src = next_src ;
        dst = next_dst ;
    }
} // interp_horiz_pp_altivec()
#else
template<int N, int width, int height>
void interp_horiz_pp_altivec(const pixel* __restrict__ src, intptr_t srcStride, pixel* __restrict__ dst, intptr_t dstStride, int coeffIdx)
{
    const int16_t* __restrict__ coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
    int headRoom = IF_FILTER_PREC;
    int offset = (1 << (headRoom - 1));
    uint16_t maxVal = (1 << X265_DEPTH) - 1;
    int cStride = 1;

    src -= (N / 2 - 1) * cStride;

    vector signed short vcoeff0     = vec_splats(coeff[0]);
    vector signed short vcoeff1     = vec_splats(coeff[1]);
    vector signed short vcoeff2     = vec_splats(coeff[2]);
    vector signed short vcoeff3     = vec_splats(coeff[3]);
    vector signed short vcoeff4     = vec_splats(coeff[4]);
    vector signed short vcoeff5     = vec_splats(coeff[5]);
    vector signed short vcoeff6     = vec_splats(coeff[6]);
    vector signed short vcoeff7     = vec_splats(coeff[7]);
    vector signed short voffset     = vec_splats((short)offset);
    vector signed short vheadRoom   = vec_splats((short)headRoom);
    vector signed short vmaxVal     = vec_splats((short)maxVal);
    vector signed short  vzero_s16  = vec_splats( (signed short)0u);
    vector unsigned char vzero_u8   = vec_splats( (unsigned char)0u );

    vector signed short vsrcH, vsrcL, vsumH, vsumL;
    vector unsigned char vsrc;

    vector signed short vsrc2H, vsrc2L, vsum2H, vsum2L;
    vector unsigned char vsrc2;

    vector unsigned char vchar_to_short_maskH = {24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0};
    vector unsigned char vchar_to_short_maskL = {16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0};
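
    // Same one-instruction zero-extension trick (vec_perm against vzero_u8)
    // as in interp_vert_pp_altivec() above.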
1093 
1094     const pixel* __restrict__ src2 = src+srcStride;
1095     pixel* __restrict__ dst2 = dst+dstStride;
1096 
1097     int row, col;
1098     for (row = 0; row < height; row+=2)
1099     {
1100         for (col = 0; col < width; col+=16)
1101         {
1102             vsrc   = vec_xl(0, (unsigned char*)&src[col + 0*cStride]);
1103             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1104             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1105 
1106             vsumH  = vsrcH * vcoeff0;
1107             vsumL  = vsrcL * vcoeff0;
1108 
1109             vsrc   = vec_xl(0, (unsigned char*)&src[col + 1*cStride]);
1110             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1111             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1112             vsumH  += vsrcH * vcoeff1;
1113             vsumL  += vsrcL * vcoeff1;
1114 
1115             vsrc   = vec_xl(0, (unsigned char*)&src[col + 2*cStride]);
1116             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1117             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1118             vsumH  += vsrcH * vcoeff2;
1119             vsumL  += vsrcL * vcoeff2;
1120 
1121             vsrc   = vec_xl(0, (unsigned char*)&src[col + 3*cStride]);
1122             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1123             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1124             vsumH  += vsrcH * vcoeff3;
1125             vsumL  += vsrcL * vcoeff3;
1126 
1127             vsrc   = vec_xl(0, (unsigned char*)&src[col + 4*cStride]);
1128             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1129             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1130             vsumH  += vsrcH * vcoeff4;
1131             vsumL  += vsrcL * vcoeff4;
1132 
1133             vsrc   = vec_xl(0, (unsigned char*)&src[col + 5*cStride]);
1134             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1135             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1136             vsumH  += vsrcH * vcoeff5;
1137             vsumL  += vsrcL * vcoeff5;
1138 
1139             vsrc   = vec_xl(0, (unsigned char*)&src[col + 6*cStride]);
1140             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1141             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1142             vsumH  += vsrcH * vcoeff6;
1143             vsumL  += vsrcL * vcoeff6;
1144 
1145             vsrc   = vec_xl(0, (unsigned char*)&src[col + 7*cStride]);
1146             vsrcH  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskH );
1147             vsrcL  = (vector signed short)vec_perm( vzero_u8, vsrc, vchar_to_short_maskL );
1148             vsumH  += vsrcH * vcoeff7;
1149             vsumL  += vsrcL * vcoeff7;
1150 
            vector short vvalH = (vsumH + voffset) >> vheadRoom;
            vvalH = vec_max( vvalH, vzero_s16 );
            vvalH = vec_min( vvalH, vmaxVal );

            vector short vvalL = (vsumL + voffset) >> vheadRoom;
            vvalL = vec_max( vvalL, vzero_s16 );
            vvalL = vec_min( vvalL, vmaxVal );

            vector signed char vdst = vec_pack( vvalL, vvalH );
            vec_xst( vdst, 0, (signed char*)&dst[col] );
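
            // Second row (src2/dst2): same filter, interleaved into the same
            // loop iteration so that two independent dependency chains stay
            // in flight.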
            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 0*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );

            vsum2H  = vsrc2H * vcoeff0;
            vsum2L  = vsrc2L * vcoeff0;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 1*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff1;
            vsum2L += vsrc2L * vcoeff1;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 2*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff2;
            vsum2L += vsrc2L * vcoeff2;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 3*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff3;
            vsum2L += vsrc2L * vcoeff3;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 4*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff4;
            vsum2L += vsrc2L * vcoeff4;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 5*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff5;
            vsum2L += vsrc2L * vcoeff5;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 6*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff6;
            vsum2L += vsrc2L * vcoeff6;

            vsrc2   = vec_xl(0, (unsigned char*)&src2[col + 7*cStride]);
            vsrc2H  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskH );
            vsrc2L  = (vector signed short)vec_perm( vzero_u8, vsrc2, vchar_to_short_maskL );
            vsum2H += vsrc2H * vcoeff7;
            vsum2L += vsrc2L * vcoeff7;

            vector short vval2H = (vsum2H + voffset) >> vheadRoom;
            vval2H = vec_max( vval2H, vzero_s16 );
            vval2H = vec_min( vval2H, vmaxVal );

            vector short vval2L = (vsum2L + voffset) >> vheadRoom;
            vval2L = vec_max( vval2L, vzero_s16 );
            vval2L = vec_min( vval2L, vmaxVal );

            vector signed char vdst2 = vec_pack( vval2L, vval2H );
            vec_xst( vdst2, 0, (signed char*)&dst2[col] );
        }

        src  += 2*srcStride;
        dst  += 2*dstStride;

        src2 += 2*srcStride;
        dst2 += 2*dstStride;
    }
}
#endif
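
// For reference, a scalar sketch of the vertical filter the loops above
// implement (following interp_vert_pp_c; N, coeff, offset, headRoom and
// maxVal as set up at the top of the function):
//
//     for (row = 0; row < height; row++)
//     {
//         for (col = 0; col < width; col++)
//         {
//             int sum = 0;
//             for (int i = 0; i < N; i++)
//                 sum += src[col + i * srcStride] * coeff[i];
//             int16_t val = (int16_t)((sum + offset) >> headRoom);
//             if (val < 0) val = 0;
//             if (val > maxVal) val = maxVal;
//             dst[col] = (pixel)val;
//         }
//         src += srcStride;
//         dst += dstStride;
//     }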


// Works with the following values:
// N = 8
// width >= 32 (multiple of 32)
// any height
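// (Kept for reference only: the horizontal filter registered in
// setupFilterPrimitives_altivec below is the active interp_horiz_pp_altivec
// defined earlier in this file, not this commented-out version.)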
//template <int N, int width, int height>
//void interp_horiz_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
//{
//    const int16_t* coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
//    int headRoom = IF_FILTER_PREC;
//    int offset = (1 << (headRoom - 1));
//    uint16_t maxVal = (1 << X265_DEPTH) - 1;
//    int cStride = 1;
//
//    src -= (N / 2 - 1) * cStride;
//
//    vector signed short v_coeff ;
//    v_coeff = vec_xl(0, coeff) ;
//
//    vector unsigned char v_pixel_char_0, v_pixel_char_1, v_pixel_char_2 ;
//    vector signed short v_pixel_short_0, v_pixel_short_1, v_pixel_short_2, v_pixel_short_3, v_pixel_short_4 ;
//    const vector signed short v_mask_unsigned_char_to_short = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
//    const vector signed int v_zeros_int = {0, 0, 0, 0} ;
//    const vector signed short v_zeros_short = {0, 0, 0, 0, 0, 0, 0, 0} ;
//
//    vector signed int v_product_0_0, v_product_0_1 ;
//    vector signed int v_product_1_0, v_product_1_1 ;
//    vector signed int v_product_2_0, v_product_2_1 ;
//    vector signed int v_product_3_0, v_product_3_1 ;
//
//    vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
//
//    vector signed int v_sums_temp_col0, v_sums_temp_col1, v_sums_temp_col2, v_sums_temp_col3 ;
//    vector signed int v_sums_col0_0, v_sums_col0_1 ;
//    vector signed int v_sums_col1_0, v_sums_col1_1 ;
//    vector signed int v_sums_col2_0, v_sums_col2_1 ;
//    vector signed int v_sums_col3_0, v_sums_col3_1 ;
//
//    const vector signed int v_offset = {offset, offset, offset, offset};
//    const vector unsigned int v_headRoom = {headRoom, headRoom, headRoom, headRoom} ;
//
//    vector unsigned char v_sums_shamt = {0x20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
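//    // 0x20 here encodes a 4-byte shift for vec_sro: each use slides a sums
//    // vector over by one int lane before the next column's sum is OR-ed in.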
//
//    pixel *next_src ;
//    pixel *next_dst ;
//
//    int row, col;
//    for (row = 0; row < height; row++)
//    {
//        next_src = (pixel *)src + srcStride ;
//        next_dst = (pixel *)dst + dstStride ;
//
//        for(int col_iter = 0; col_iter < width; col_iter += 32)
//        {
//            // Load a full row of pixels (32 + 7)
//            v_pixel_char_0 = vec_xl(0, src) ;
//            v_pixel_char_1 = vec_xl(16, src) ;
//            v_pixel_char_2 = vec_xl(32, src) ;
//
//            v_sums_temp_col0 = v_zeros_int ;
//            v_sums_temp_col1 = v_zeros_int ;
//            v_sums_temp_col2 = v_zeros_int ;
//            v_sums_temp_col3 = v_zeros_int ;
//
//            // Expand the loaded pixels into shorts
//            v_pixel_short_0 = vec_unpackh((vector signed char)v_pixel_char_0) ;
//            v_pixel_short_1 = vec_unpackl((vector signed char)v_pixel_char_0) ;
//            v_pixel_short_2 = vec_unpackh((vector signed char)v_pixel_char_1) ;
//            v_pixel_short_3 = vec_unpackl((vector signed char)v_pixel_char_1) ;
//            v_pixel_short_4 = vec_unpackh((vector signed char)v_pixel_char_2) ;
//
//            v_pixel_short_0 = vec_and(v_pixel_short_0, v_mask_unsigned_char_to_short) ;
//            v_pixel_short_1 = vec_and(v_pixel_short_1, v_mask_unsigned_char_to_short) ;
//            v_pixel_short_2 = vec_and(v_pixel_short_2, v_mask_unsigned_char_to_short) ;
//            v_pixel_short_3 = vec_and(v_pixel_short_3, v_mask_unsigned_char_to_short) ;
//            v_pixel_short_4 = vec_and(v_pixel_short_4, v_mask_unsigned_char_to_short) ;
//
//            // Four column sets are processed below,
//            // one column per set per iteration
//            for(col = 0; col < 8; col++)
//            {
//                // Multiply the pixels by the coefficients
//                v_product_0_0 = vec_mule(v_pixel_short_0, v_coeff) ;
//                v_product_0_1 = vec_mulo(v_pixel_short_0, v_coeff) ;
//
//                v_product_1_0 = vec_mule(v_pixel_short_1, v_coeff) ;
//                v_product_1_1 = vec_mulo(v_pixel_short_1, v_coeff) ;
//
//                v_product_2_0 = vec_mule(v_pixel_short_2, v_coeff) ;
//                v_product_2_1 = vec_mulo(v_pixel_short_2, v_coeff) ;
//
//                v_product_3_0 = vec_mule(v_pixel_short_3, v_coeff) ;
//                v_product_3_1 = vec_mulo(v_pixel_short_3, v_coeff) ;
//
//                // Sum up the multiplication results
//                v_sum_0 = vec_add(v_product_0_0, v_product_0_1) ;
//                v_sum_0 = vec_sums(v_sum_0, v_zeros_int) ;
//
//                v_sum_1 = vec_add(v_product_1_0, v_product_1_1) ;
//                v_sum_1 = vec_sums(v_sum_1, v_zeros_int) ;
//
//                v_sum_2 = vec_add(v_product_2_0, v_product_2_1) ;
//                v_sum_2 = vec_sums(v_sum_2, v_zeros_int) ;
//
//                v_sum_3 = vec_add(v_product_3_0, v_product_3_1) ;
//                v_sum_3 = vec_sums(v_sum_3, v_zeros_int) ;
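//                // (vec_sums adds all four int elements of its first operand,
//                // plus one element of the second operand (zero here), into a
//                // single element, so each v_sum_* now carries one finished
//                // dot product.)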
//
//                // Insert the sum results into the respective vectors
//                v_sums_temp_col0 = vec_sro(v_sums_temp_col0, v_sums_shamt) ;
//                v_sums_temp_col0 = vec_or(v_sum_0, v_sums_temp_col0) ;
//
//                v_sums_temp_col1 = vec_sro(v_sums_temp_col1, v_sums_shamt) ;
//                v_sums_temp_col1 = vec_or(v_sum_1, v_sums_temp_col1) ;
//
//                v_sums_temp_col2 = vec_sro(v_sums_temp_col2, v_sums_shamt) ;
//                v_sums_temp_col2 = vec_or(v_sum_2, v_sums_temp_col2) ;
//
//                v_sums_temp_col3 = vec_sro(v_sums_temp_col3, v_sums_shamt) ;
//                v_sums_temp_col3 = vec_or(v_sum_3, v_sums_temp_col3) ;
//
//                if(col == 3)
//                {
//                    v_sums_col0_0 = v_sums_temp_col0 ;
//                    v_sums_col1_0 = v_sums_temp_col1 ;
//                    v_sums_col2_0 = v_sums_temp_col2 ;
//                    v_sums_col3_0 = v_sums_temp_col3 ;
//
//                    v_sums_temp_col0 = v_zeros_int ;
//                    v_sums_temp_col1 = v_zeros_int ;
//                    v_sums_temp_col2 = v_zeros_int ;
//                    v_sums_temp_col3 = v_zeros_int ;
//                }
//
//                // Shift the pixels by 1 (short pixel)
//                v_pixel_short_0 = vec_sld(v_pixel_short_1, v_pixel_short_0, 14) ;
//                v_pixel_short_1 = vec_sld(v_pixel_short_2, v_pixel_short_1, 14) ;
//                v_pixel_short_2 = vec_sld(v_pixel_short_3, v_pixel_short_2, 14) ;
//                v_pixel_short_3 = vec_sld(v_pixel_short_4, v_pixel_short_3, 14) ;
//                const vector unsigned char v_shift_right_two_bytes_shamt = {0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} ;
//                v_pixel_short_4 = vec_sro(v_pixel_short_4, v_shift_right_two_bytes_shamt) ;
//            }
//
//            // Copy the sums result to the second vector (per column)
//            v_sums_col0_1 = v_sums_temp_col0 ;
//            v_sums_col1_1 = v_sums_temp_col1 ;
//            v_sums_col2_1 = v_sums_temp_col2 ;
//            v_sums_col3_1 = v_sums_temp_col3 ;
//
//            // Post-processing and finally 2 stores
//            // Original code:
//            // int16_t val = (int16_t)((sum + offset) >> headRoom);
//            // if (val < 0) val = 0;
//            // if (val > maxVal) val = maxVal;
//            // dst[col] = (pixel)val;
//
//            v_sums_col0_0 = vec_sra(vec_add(v_sums_col0_0, v_offset), v_headRoom) ;
//            v_sums_col0_1 = vec_sra(vec_add(v_sums_col0_1, v_offset), v_headRoom) ;
//            v_sums_col1_0 = vec_sra(vec_add(v_sums_col1_0, v_offset), v_headRoom) ;
//            v_sums_col1_1 = vec_sra(vec_add(v_sums_col1_1, v_offset), v_headRoom) ;
//            v_sums_col2_0 = vec_sra(vec_add(v_sums_col2_0, v_offset), v_headRoom) ;
//            v_sums_col2_1 = vec_sra(vec_add(v_sums_col2_1, v_offset), v_headRoom) ;
//            v_sums_col3_0 = vec_sra(vec_add(v_sums_col3_0, v_offset), v_headRoom) ;
//            v_sums_col3_1 = vec_sra(vec_add(v_sums_col3_1, v_offset), v_headRoom) ;
//
//            vector signed short v_val_col0, v_val_col1, v_val_col2, v_val_col3 ;
//            v_val_col0 = vec_pack(v_sums_col0_0, v_sums_col0_1) ;
//            v_val_col1 = vec_pack(v_sums_col1_0, v_sums_col1_1) ;
//            v_val_col2 = vec_pack(v_sums_col2_0, v_sums_col2_1) ;
//            v_val_col3 = vec_pack(v_sums_col3_0, v_sums_col3_1) ;
//
//            // if (val < 0) val = 0;
//            vector bool short v_comp_zero_col0, v_comp_zero_col1, v_comp_zero_col2, v_comp_zero_col3 ;
//            // Compute less than 0
//            v_comp_zero_col0 = vec_cmplt(v_val_col0, v_zeros_short) ;
//            v_comp_zero_col1 = vec_cmplt(v_val_col1, v_zeros_short) ;
//            v_comp_zero_col2 = vec_cmplt(v_val_col2, v_zeros_short) ;
//            v_comp_zero_col3 = vec_cmplt(v_val_col3, v_zeros_short) ;
//            // Keep values that are greater than or equal to 0
//            v_val_col0 = vec_andc(v_val_col0, v_comp_zero_col0) ;
//            v_val_col1 = vec_andc(v_val_col1, v_comp_zero_col1) ;
//            v_val_col2 = vec_andc(v_val_col2, v_comp_zero_col2) ;
//            v_val_col3 = vec_andc(v_val_col3, v_comp_zero_col3) ;
//
//            // if (val > maxVal) val = maxVal;
//            vector bool short v_comp_max_col0, v_comp_max_col1, v_comp_max_col2, v_comp_max_col3 ;
//            const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
//            // Compute greater than max
//            v_comp_max_col0 = vec_cmpgt(v_val_col0, v_maxVal) ;
//            v_comp_max_col1 = vec_cmpgt(v_val_col1, v_maxVal) ;
//            v_comp_max_col2 = vec_cmpgt(v_val_col2, v_maxVal) ;
//            v_comp_max_col3 = vec_cmpgt(v_val_col3, v_maxVal) ;
//            // Replace values greater than maxVal with maxVal
//            v_val_col0 = vec_sel(v_val_col0, v_maxVal, v_comp_max_col0) ;
//            v_val_col1 = vec_sel(v_val_col1, v_maxVal, v_comp_max_col1) ;
//            v_val_col2 = vec_sel(v_val_col2, v_maxVal, v_comp_max_col2) ;
//            v_val_col3 = vec_sel(v_val_col3, v_maxVal, v_comp_max_col3) ;
//
//            // (pixel)val
//            vector unsigned char v_final_result_0, v_final_result_1 ;
//            v_final_result_0 = vec_pack((vector unsigned short)v_val_col0, (vector unsigned short)v_val_col1) ;
//            v_final_result_1 = vec_pack((vector unsigned short)v_val_col2, (vector unsigned short)v_val_col3) ;
//
//            // Store results
//            vec_xst(v_final_result_0, 0, dst) ;
//            vec_xst(v_final_result_1, 16, dst) ;
//
//            src += 32 ;
//            dst += 32 ;
//        } // end for col_iter
//
//        src = next_src ;
//        dst = next_dst ;
//    }
//} // interp_horiz_pp_altivec()


namespace X265_NS {

void setupFilterPrimitives_altivec(EncoderPrimitives& p)
{
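    // Replace the default C primitives with the AltiVec implementations
    // for the partition sizes covered below.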
    // interp_vert_pp_c
    p.pu[LUMA_16x16].luma_vpp   = interp_vert_pp_altivec<8, 16, 16> ;
    p.pu[LUMA_32x8].luma_vpp    = interp_vert_pp_altivec<8, 32, 8> ;
    p.pu[LUMA_16x12].luma_vpp   = interp_vert_pp_altivec<8, 16, 12> ;
    p.pu[LUMA_16x4].luma_vpp    = interp_vert_pp_altivec<8, 16, 4> ;
    p.pu[LUMA_32x32].luma_vpp   = interp_vert_pp_altivec<8, 32, 32> ;
    p.pu[LUMA_32x16].luma_vpp   = interp_vert_pp_altivec<8, 32, 16> ;
    p.pu[LUMA_16x32].luma_vpp   = interp_vert_pp_altivec<8, 16, 32> ;
    p.pu[LUMA_32x24].luma_vpp   = interp_vert_pp_altivec<8, 32, 24> ;
    p.pu[LUMA_64x64].luma_vpp   = interp_vert_pp_altivec<8, 64, 64> ;
    p.pu[LUMA_64x32].luma_vpp   = interp_vert_pp_altivec<8, 64, 32> ;
    p.pu[LUMA_32x64].luma_vpp   = interp_vert_pp_altivec<8, 32, 64> ;
    p.pu[LUMA_64x48].luma_vpp   = interp_vert_pp_altivec<8, 64, 48> ;
    p.pu[LUMA_48x64].luma_vpp   = interp_vert_pp_altivec<8, 48, 64> ;
    p.pu[LUMA_64x16].luma_vpp   = interp_vert_pp_altivec<8, 64, 16> ;
    p.pu[LUMA_16x64].luma_vpp   = interp_vert_pp_altivec<8, 16, 64> ;

    // interp_hv_pp_c
    p.pu[LUMA_32x32].luma_hvpp   = interp_hv_pp_altivec<8, 32, 32> ;
    p.pu[LUMA_32x16].luma_hvpp   = interp_hv_pp_altivec<8, 32, 16> ;
    p.pu[LUMA_32x24].luma_hvpp   = interp_hv_pp_altivec<8, 32, 24> ;
    p.pu[LUMA_32x8].luma_hvpp    = interp_hv_pp_altivec<8, 32, 8> ;
    p.pu[LUMA_64x64].luma_hvpp   = interp_hv_pp_altivec<8, 64, 64> ;
    p.pu[LUMA_64x32].luma_hvpp   = interp_hv_pp_altivec<8, 64, 32> ;
    p.pu[LUMA_32x64].luma_hvpp   = interp_hv_pp_altivec<8, 32, 64> ;
    p.pu[LUMA_64x48].luma_hvpp   = interp_hv_pp_altivec<8, 64, 48> ;
    p.pu[LUMA_64x16].luma_hvpp   = interp_hv_pp_altivec<8, 64, 16> ;

    // interp_horiz_pp_c
    p.pu[LUMA_32x32].luma_hpp   = interp_horiz_pp_altivec<8, 32, 32> ;
    p.pu[LUMA_32x16].luma_hpp   = interp_horiz_pp_altivec<8, 32, 16> ;
    p.pu[LUMA_32x24].luma_hpp   = interp_horiz_pp_altivec<8, 32, 24> ;
    p.pu[LUMA_32x8].luma_hpp    = interp_horiz_pp_altivec<8, 32, 8> ;
    p.pu[LUMA_64x64].luma_hpp   = interp_horiz_pp_altivec<8, 64, 64> ;
    p.pu[LUMA_64x32].luma_hpp   = interp_horiz_pp_altivec<8, 64, 32> ;
    p.pu[LUMA_32x64].luma_hpp   = interp_horiz_pp_altivec<8, 32, 64> ;
    p.pu[LUMA_64x48].luma_hpp   = interp_horiz_pp_altivec<8, 64, 48> ;
    p.pu[LUMA_64x16].luma_hpp   = interp_horiz_pp_altivec<8, 64, 16> ;
}
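
// Usage sketch (hypothetical variable names): once registered, the encoder
// invokes these entries through the primitives table, e.g. for a 16x16 luma PU:
//
//     p.pu[LUMA_16x16].luma_vpp(src, srcStride, dst, dstStride, coeffIdx);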

} // end namespace X265_NS