1 /****************************************************/
2 /*                                                  */
3 /*  intrinsic.hh:                                   */
4 /*                                                  */
5 /*                                                  */
6 /*  Nicolas Scaringella                             */
7 /*                                                  */
8 /****************************************************/
9 
// Now provided by the architecture files:
// inline void *aligned_calloc(size_t nmemb, size_t size) { return (void*)((unsigned)(calloc((nmemb*size)+15,sizeof(char)))+15 & 0xfffffff0); }
12 
13 #ifdef __SSE2__
14 
15 /****************************************************/
16 /*                                                  */
17 /*  			   SSE2 implementation      */
18 /*                                                  */
19 /****************************************************/
20 
21 #include <mmintrin.h>
22 #include <xmmintrin.h>
23 #include <emmintrin.h>
24 //#include <sse2mmx.h>
25 
/**
 * Four packed 32-bit signed integers stored in one SSE2 register.
 *
 * Converts implicitly from and to __m128i, so it can be handed directly
 * to the _mm_* integer intrinsics. Lane 0 is the lowest 32 bits.
 */
struct vec_int
{
    __m128i vec;

    /** Default construction leaves the register contents unspecified. */
    vec_int() {}

    /** Broadcasts a into all four lanes. */
    vec_int(int a) : vec(_mm_set1_epi32(a)) {}

    /** Builds the vector (a, b, c, d) with a in lane 0 and d in lane 3. */
    vec_int(int a, int b, int c, int d) : vec(_mm_set_epi32(d, c, b, a)) {}

    /** Wraps a raw SSE2 integer register. */
    vec_int(__m128i m) : vec(m) {}

    /** Gives the raw register back to intrinsics. */
    operator __m128i() const { return vec; }

    /**
     * Read-only access to lane i (0..3, not range-checked).
     * The cast keeps the const qualifier instead of stripping it.
     */
    const int& operator[](int i) const { return reinterpret_cast<const int*>(&vec)[i]; }

    /** Writable access to lane i (0..3, not range-checked). */
    int& operator[](int i) { return reinterpret_cast<int*>(&vec)[i]; }
};
39 
/**
 * Four packed single-precision floats stored in one SSE register.
 *
 * Converts implicitly from and to __m128, so it can be handed directly
 * to the _mm_*_ps intrinsics. Lane 0 is the lowest 32 bits.
 */
struct vec_float
{
    __m128 vec;

    /** Default construction leaves the register contents unspecified. */
    vec_float() {}

    /** Broadcasts a into all four lanes. */
    vec_float(float a) : vec(_mm_set1_ps(a)) {}

    /** Builds the vector (a, b, c, d) with a in lane 0 and d in lane 3. */
    vec_float(float a, float b, float c, float d) : vec(_mm_set_ps(d, c, b, a)) {}

    /** Wraps a raw SSE float register. */
    vec_float(__m128 m) : vec(m) {}

    // A converting constructor from vec_int is intentionally not provided;
    // use int2float() / conv2vec_float() for an explicit conversion.

    /** Gives the raw register back to intrinsics. */
    operator __m128() const { return vec; }

    /**
     * Read-only access to lane i (0..3, not range-checked).
     * The cast keeps the const qualifier instead of stripping it.
     */
    const float& operator[](int i) const { return reinterpret_cast<const float*>(&vec)[i]; }

    /** Writable access to lane i (0..3, not range-checked). */
    float& operator[](int i) { return reinterpret_cast<float*>(&vec)[i]; }
};
53 
// Flush-to-zero control in the MXCSR register.
// With flush-to-zero enabled, an operation that underflows returns zero
// instead of a denormalized number. This deviates from IEEE 754, which
// delivers a denormalized result on underflow.
//   NO_DENORMALIZE : enable flush-to-zero
//   DENORMALIZE    : disable flush-to-zero (IEEE behaviour)
#define NO_DENORMALIZE _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)
#define DENORMALIZE    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF)
59 
60 
// Constants.
//
// Each macro synthesizes its constant with register-only operations applied
// to `a`; the value held in `a` does not influence the result, it merely
// supplies a register of the right type. Arguments are parenthesized so
// that expressions such as `n-1` can be passed safely.

// 0 0 0 0
#define VEC_INT_ZERO(a)   _mm_xor_si128((a),(a))

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_INT_ONES(a)   _mm_cmpeq_epi32((a),(a))

// Example: 2^10 - 1 = 1023 -> VEC_INT_PW2_MINUS_1(a,10)
#define VEC_INT_PW2_MINUS_1(a,pw) _mm_srli_epi32(_mm_cmpeq_epi32((a),(a)),32-(pw))

// 1 1 1 1: special case
#define VEC_INT_ONE(a)    _mm_srli_epi32(_mm_cmpeq_epi32((a),(a)),31)

// Example: 2^10 = 1024 -> VEC_INT_PW2(a,10)
#define VEC_INT_PW2(a,pw) _mm_slli_epi32(_mm_srli_epi32(_mm_cmpeq_epi32((a),(a)),31),(pw))

// Example: -2^10 = -1024 -> VEC_INT_MINUS_PW2(a,10)
#define VEC_INT_MINUS_PW2(a,pw) _mm_slli_epi32(_mm_cmpeq_epi32((a),(a)),(pw))

// -1 -1 -1 -1: special case
#define VEC_INT_MINUS_ONE(a) _mm_cmpeq_epi32((a),(a))

// 0.0 0.0 0.0 0.0
#define VEC_FLOAT_ZERO(a) _mm_xor_ps((a),(a))

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
// The comparison is done on the integer bit pattern: a float compare
// (NaN != NaN) would leave zero in every lane of `a` that holds a NaN.
#define VEC_FLOAT_ONES(a) _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a),_mm_castps_si128(a)))
89 
90 // conversions entre vecteurs d'ints et de floats
float2int(vec_float a)91 inline vec_int   float2int( vec_float a)   { return _mm_cvtps_epi32(a); }
float2int(float a)92 inline int       float2int( float a )      { return int(a); }
93 
int2float(vec_int a)94 inline vec_float int2float( vec_int a)   { return _mm_cvtepi32_ps(a); }
int2float(int a)95 inline float     int2float( int a )      { return float(a); }
96 
97 // arithmetic
add_vec(vec_float a,vec_float b)98 inline vec_float add_vec( vec_float a, vec_float b)   	{ return _mm_add_ps(a,b); }
add_vec(vec_int a,vec_float b)99 inline vec_float add_vec( vec_int a, vec_float b)   	{ return _mm_add_ps(int2float(a),b); }
add_vec(vec_float a,vec_int b)100 inline vec_float add_vec( vec_float a, vec_int b)   	{ return _mm_add_ps(a,int2float(b)); }
add_vec(vec_int a,vec_int b)101 inline vec_int   add_vec( vec_int a, vec_int b)       { return _mm_add_epi32(a,b); }
add_scal(vec_float a,vec_float b)102 inline vec_float add_scal( vec_float a, vec_float b)  { return _mm_add_ss(a,b); }
add_scal(vec_int a,vec_int b)103 inline vec_int   add_scal( vec_int a, vec_int b)      { return _mm_add_epi32(a,b); } // _mm_add_pi32 en MMX
104 //inline scal_int  add_scal( scal_int a, scal_int b) { return _mm_add_pi32(a,b); }
105 
sub_vec(vec_float a,vec_float b)106 inline vec_float sub_vec( vec_float a, vec_float b)   { return _mm_sub_ps(a,b); }
sub_vec(vec_int a,vec_int b)107 inline vec_int   sub_vec( vec_int a, vec_int b)       { return _mm_sub_epi32(a,b); }
sub_scal(vec_float a,vec_float b)108 inline vec_float sub_scal( vec_float a, vec_float b)  { return _mm_sub_ss(a,b); }
sub_scal(vec_int a,vec_int b)109 inline vec_int   sub_scal( vec_int a, vec_int b)      { return _mm_sub_epi32(a,b); } // _mm_sub_pi32 en MMX
110 //inline scal_int  sub_scal( scal_int a, scal_int b) { return _mm_sub_pi32(a,b); }
111 
mul_vec(vec_float a,vec_float b)112 inline vec_float mul_vec( vec_float a, vec_float b)   	{ return _mm_mul_ps(a,b); }
mul_vec(vec_int a,vec_float b)113 inline vec_float mul_vec( vec_int a, vec_float b)   	{ return _mm_mul_ps(int2float(a),b); }
mul_vec(vec_float a,vec_int b)114 inline vec_float mul_vec( vec_float a, vec_int b)   	{ return _mm_mul_ps(a,int2float(b)); }
115 
mul_scal(vec_float a,vec_float b)116 inline vec_float mul_scal( vec_float a, vec_float b)  	{ return _mm_mul_ss(a,b); }
117 
118 // INTEGER MULTIPLICATION
119 // low 32 bits of a 32 * 32 bit multiplication: each double-word X and Y is broken down into two words, A & B and C & D:
120 // X = ( A << 16 ) + B
121 // Y = ( C << 16 ) + D
122 // then:
123 // X * Y = (( A << 16 ) + B ) * (( C << 16 ) + D )
124 // X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
125 // the partial result A*C does not appear in the low 32 bits result so does not need to be computed
126 // ( however, if it's different from zero, then there is an overflow )
127 
mul_vec(vec_int a,vec_int b)128 inline vec_int mul_vec( vec_int a, vec_int b) {
129 
130   vec_int temp0 = _mm_shufflehi_epi16( _mm_shufflelo_epi16( b, 0xB1), 0xB1);
131   vec_int temp1 = _mm_and_si128( b, _mm_srli_epi32( _mm_cmpeq_epi32( b,b), 16));
132 
133   vec_int temp2 = _mm_madd_epi16( a, temp0);
134   vec_int temp3 = _mm_madd_epi16( a, temp1);
135 
136   vec_int temp4 = _mm_slli_epi32( temp2, 16);
137 
138   return _mm_add_epi32( temp4, temp3);
139 }
140 
mul_scal(vec_int a,vec_int b)141 inline vec_int mul_scal( vec_int a, vec_int b) {
142 
143   vec_int temp0 = _mm_shufflelo_epi16( b, 0xB1);
144   vec_int temp1 = _mm_and_si128( b, _mm_cvtsi32_si128(0x00ff));
145 
146   vec_int temp2 = _mm_madd_epi16( a, temp0);
147   vec_int temp3 = _mm_madd_epi16( a, temp1);
148 
149   vec_int temp4 = _mm_slli_epi32( temp2, 16);
150 
151   return _mm_add_epi32( temp4, temp3);
152 }
153 
div_vec(vec_float a,vec_float b)154 inline vec_float div_vec( vec_float a, vec_float b)   { return _mm_mul_ps(a,_mm_rcp_ps(b)); /*_mm_div_ps(a,b);*/ }
div_vec(vec_int a,vec_int b)155 inline vec_int   div_vec( vec_int a, vec_int b)       { return _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(a),_mm_rcp_ps(_mm_cvtepi32_ps(b)))); } // A CHANGER !!!!
div_scal(vec_float a,vec_float b)156 inline vec_float div_scal( vec_float a, vec_float b)  { return _mm_mul_ss(a,_mm_rcp_ss(b)); /*_mm_div_ss(a,b);*/ }
div_scal(vec_int a,vec_int b)157 inline vec_int   div_scal( vec_int a, vec_int b)      { return _mm_cvtps_epi32(_mm_mul_ss(_mm_cvtepi32_ps(a),_mm_rcp_ss(_mm_cvtepi32_ps(b)))); } // A CHANGER !!!!!
158 //inline scal_int  div_scal( scal_int a, scal_int b) { return _mm_cvtsi32_si64((_mm_cvtsi64_si32(a))/(_mm_cvtsi64_si32(b))); }  // A CHANGER !!!!!
159 
mod_vec(vec_int a,vec_int N)160 inline vec_int   mod_vec( vec_int a, vec_int N) {
161 
162   vec_int temp = _mm_sub_epi32(a,N);
163   vec_int zero = _mm_xor_si128(a,a);
164 
165   vec_int select = _mm_xor_si128( _mm_cmpgt_epi32(temp,zero), _mm_cmpeq_epi32(temp,zero)); // a - N >= 0
166 
167   return _mm_or_si128(_mm_and_si128(select,temp),_mm_andnot_si128(select,a)); // if( a - N >=0 ) return a - N; else return a;
168 }
169 
mod_scal(vec_int a,vec_int N)170 inline vec_int   mod_scal( vec_int a, vec_int N) {
171 
172   vec_int temp = _mm_sub_epi32(a,N);
173   vec_int zero = _mm_xor_si128(a,a);
174 
175   vec_int select = _mm_xor_si128( _mm_cmpgt_epi32(temp,zero), _mm_cmpeq_epi32(temp,zero)); // a - N >= 0
176 
177   return _mm_or_si128(_mm_and_si128(select,temp),_mm_andnot_si128(select,a)); // if( a - N >=0 ) return a - N; else return a;
178 }
179 
180 
181 
// Emulated multiply-add: a*b + c.
// Written as macros so that the add_* / mul_* overloads are resolved from
// the actual argument types (float, int or mixed).
#define madd_vec(a,b,c)   add_vec(mul_vec((a),(b)),(c))
#define madd_scal(a,b,c)  add_scal(mul_scal((a),(b)),(c))
189 
190 
191 // simulation of  - ( a*b - c )
192 //inline vec_float nmsub_vec( vec_float a, vec_float b, vec_float c) {  }
193 
194 // simulation of a*(1/b) + c
divadd_vec(vec_float a,vec_float b,vec_float c)195 inline vec_float divadd_vec( vec_float a, vec_float b, vec_float c)  { return _mm_add_ps(_mm_mul_ps(a,_mm_rcp_ps(b)),c); }
divadd_scal(vec_float a,vec_float b,vec_float c)196 inline vec_float divadd_scal( vec_float a, vec_float b, vec_float c) { return _mm_add_ss(_mm_mul_ss(a,_mm_rcp_ss(b)),c); }
197 // simulation of - ( a*(1/b) - c )
198 //inline vec_float divsub_vec( vec_float a, vec_float b, vec_float c) {  }
199 
200 // shift ( and fill with 0's )
shift_left_vec(vec_int a,vec_int num)201 inline vec_int   shift_left_vec( vec_int a, vec_int num)     { return _mm_sll_epi32(a,num); }
shift_left_vec(vec_int a,int num)202 inline vec_int   shift_left_vec( vec_int a, int num)         { return _mm_slli_epi32(a,num); }
shift_left_scal(vec_int a,vec_int num)203 inline vec_int   shift_left_scal( vec_int a, vec_int num)     { return _mm_sll_epi32(a,num); } // _mm_sll_pi32(a,num) en MMX
204 //inline scal_int shift_left_scal( scal_int a, scal_int num) { return _mm_sll_pi32(a,num); }
shift_left_scal(vec_int a,int num)205 inline vec_int   shift_left_scal( vec_int a, int num)         { return _mm_slli_epi32(a,num); } // _mm_slli_pi32(a,num) en MMX
206 //inline scal_int shift_left_scal( scal_int a, int num) { return _mm_slli_pi32(a,num); }
207 
208 // shift ( and fill with the sign bit )
shift_right_vec(vec_int a,vec_int num)209 inline vec_int   shift_right_vec( vec_int a, vec_int num)     { return _mm_sra_epi32(a,num); }
shift_right_vec(vec_int a,int num)210 inline vec_int   shift_right_vec( vec_int a, int num)         { return _mm_srai_epi32(a,num); }
shift_right_scal(vec_int a,vec_int num)211 inline vec_int   shift_right_scal( vec_int a, vec_int num)     { return _mm_sra_epi32(a,num); } // _mm_sra_pi32(a,num) en MMX
212 //inline scal_int shift_right_scal( scal_int a, scal_int num) { return _mm_sra_pi32(a,num); }
shift_right_scal(vec_int a,int num)213 inline vec_int   shift_right_scal( vec_int a, int num)         { return _mm_srai_epi32(a,num); } // _mm_srai_pi32(a,num) en MMX
214 //inline scal_int shift_right_scal( scal_int a, int num) { return _mm_srai_pi32(a,num); }
215 
216 // shift ( and fill with 0's )
shift_right_vec_logical(vec_int a,vec_int num)217 inline vec_int   shift_right_vec_logical( vec_int a, vec_int num)     { return _mm_srl_epi32(a,num); }
shift_right_vec_logical(vec_int a,int num)218 inline vec_int   shift_right_vec_logical( vec_int a, int num)         { return _mm_srli_epi32(a,num); }
shift_right_scal_logical(vec_int a,vec_int num)219 inline vec_int   shift_right_scal_logical( vec_int a, vec_int num)     { return _mm_srl_epi32(a,num); } // _mm_sra_pi32(a,num) en MMX
220 //inline scal_int shift_right_scal_logical( scal_int a, scal_int num) { return _mm_srl_pi32(a,num); }
shift_right_scal_logical(vec_int a,int num)221 inline vec_int   shift_right_scal_logical( vec_int a, int num)         { return _mm_srli_epi32(a,num); } // _mm_srai_pi32(a,num) en MMX
222 //inline scal_int shift_right_scal_logical( scal_int a, int num) { return _mm_srli_pi32(a,num); }
223 
224 // Logic
225 // Ajouts YO;; supprime
226 //inline vec_float and_vec( vec_float a, vec_int b)   { return _mm_and_ps(a,b); }
227 //inline vec_float and_vec( vec_int a, vec_float b)   { return _mm_and_ps(a,b); }
228 
and_vec(vec_float a,vec_float b)229 inline vec_float and_vec( vec_float a, vec_float b)   { return _mm_and_ps(a,b); }
and_vec(vec_int a,vec_int b)230 inline vec_int   and_vec( vec_int a, vec_int b)       { return _mm_and_si128(a,b); }
and_scal(vec_float a,vec_float b)231 inline vec_float and_scal( vec_float a, vec_float b)  { return _mm_and_ps(a,b); }
and_scal(vec_int a,vec_int b)232 inline vec_int   and_scal( vec_int a, vec_int b)      { return _mm_and_si128(a,b); } // _mm_and_si64(a,b) en MMX
233 //inline scal_int   and_scal( scal_int a, scal_int b)      { return _mm_and_si64(a,b); }
234 
or_vec(vec_float a,vec_float b)235 inline vec_float or_vec( vec_float a, vec_float b)    { return _mm_or_ps(a,b); }
or_vec(vec_int a,vec_int b)236 inline vec_int   or_vec( vec_int a, vec_int b)        { return _mm_or_si128(a,b); }
or_scal(vec_float a,vec_float b)237 inline vec_float or_scal( vec_float a, vec_float b)   { return _mm_or_ps(a,b); }
or_scal(vec_int a,vec_int b)238 inline vec_int   or_scal( vec_int a, vec_int b)       { return _mm_or_si128(a,b); } // _mm_or_si64(a,b) en MMX
239 //inline scal_int or_scal( scal_int a, scal_int b) { return _mm_or_si64(a,b); }
240 
xor_vec(vec_float a,vec_float b)241 inline vec_float xor_vec( vec_float a, vec_float b)   { return _mm_xor_ps(a,b); }
xor_vec(vec_int a,vec_int b)242 inline vec_int   xor_vec( vec_int a, vec_int b)       { return _mm_xor_si128(a,b); }
xor_scal(vec_float a,vec_float b)243 inline vec_float xor_scal( vec_float a, vec_float b)  { return _mm_xor_ps(a,b); }
xor_scal(vec_int a,vec_int b)244 inline vec_int   xor_scal( vec_int a, vec_int b)      { return _mm_xor_si128(a,b); } // _mm_xor_si64(a,b) en MMX
245 //inline scal_int xor_scal( scal_int a, scal_int b) { return _mm_xor_si64(a,b); }
246 
//------------------------------------------------------------------------------------------------------------
// YO: in the comparison operations between vec_float values, "inline vec_float" was replaced by
// "inline vec_int" for better compatibility with vectorized compilation
//------------------------------------------------------------------------------------------------------------
251 
252 // cast (without conversion)
cast2vec_float(vec_int x)253 inline vec_float cast2vec_float(vec_int x)				{ return _mm_castsi128_ps(x); }
cast2vec_int(vec_float x)254 inline vec_int cast2vec_int(vec_float x)				{ return _mm_castps_si128(x); }
255 
256 // convertions
conv2vec_float(vec_int x)257 inline vec_float conv2vec_float(vec_int x)				{ return _mm_cvtepi32_ps(x); }
conv2vec_int(vec_float x)258 inline vec_int conv2vec_int(vec_float x)				{ return _mm_cvtps_epi32(x); }
259 
260 // comparaison
261 //inline vec_float int2float( vec_int a)   { return _mm_cvtepi32_ps(a); }
262 
gt_vec(vec_float a,vec_float b)263 inline vec_float gt_vec( vec_float a, vec_float b)    	{ return _mm_cmpgt_ps(a,b); }
gt_vec(vec_int a,vec_float b)264 inline vec_float gt_vec( vec_int a, vec_float b)    	{ return _mm_cmpgt_ps(_mm_cvtepi32_ps(a),b); }
gt_vec(vec_float a,vec_int b)265 inline vec_float gt_vec( vec_float a, vec_int b)    	{ return _mm_cmpgt_ps(a,_mm_cvtepi32_ps(b)); }
gt_vec(vec_int a,vec_int b)266 inline vec_int   gt_vec( vec_int a, vec_int b)      	{ return _mm_cmpgt_epi32(a,b); }
267 
gt_scal(vec_float a,vec_float b)268 inline vec_float gt_scal( vec_float a, vec_float b)    	{ return _mm_cmpgt_ps(a,b); }
gt_scal(vec_int a,vec_float b)269 inline vec_float gt_scal( vec_int a, vec_float b)    	{ return _mm_cmpgt_ps(_mm_cvtepi32_ps(a),b); }
gt_scal(vec_float a,vec_int b)270 inline vec_float gt_scal( vec_float a, vec_int b)    	{ return _mm_cmpgt_ps(a,_mm_cvtepi32_ps(b)); }
gt_scal(vec_int a,vec_int b)271 inline vec_int   gt_scal( vec_int a, vec_int b)      	{ return _mm_cmpgt_epi32(a,b); }
272 
273 // choose between two values choose(c,u,v) = c?u:v
274 // the type of the result depends of the types of u and v, not of the type of c
275 
choose(vec_float c,vec_float u,vec_float v)276 inline vec_float choose(vec_float c, vec_float u, vec_float v)	{ return _mm_or_ps(_mm_and_ps(c,u), _mm_andnot_ps(c,v)); }
choose(vec_float c,vec_int u,vec_float v)277 inline vec_float choose(vec_float c, vec_int u, vec_float v)	{ return _mm_or_ps(_mm_and_ps(c,_mm_cvtepi32_ps(u)), _mm_andnot_ps(c,v)); }
choose(vec_float c,vec_float u,vec_int v)278 inline vec_float choose(vec_float c, vec_float u, vec_int v)	{ return _mm_or_ps(_mm_and_ps(c,u), _mm_andnot_ps(c,_mm_cvtepi32_ps(v))); }
279 
choose(vec_int c,vec_float u,vec_float v)280 inline vec_float choose(vec_int c, vec_float u, vec_float v)	{ return choose(cast2vec_float(c), u, v); }
choose(vec_int c,vec_int u,vec_float v)281 inline vec_float choose(vec_int c, vec_int u, vec_float v)		{ return choose(cast2vec_float(c), u, v); }
choose(vec_int c,vec_float u,vec_int v)282 inline vec_float choose(vec_int c, vec_float u, vec_int v)		{ return choose(cast2vec_float(c), u, v); }
283 
choose(vec_int c,vec_int u,vec_int v)284 inline vec_int choose(vec_int c, vec_int u, vec_int v)			{ return _mm_or_si128(_mm_and_si128(c,u), _mm_andnot_si128(c,v)); }
choose(vec_float c,vec_int u,vec_int v)285 inline vec_int choose(vec_float c, vec_int u, vec_int v)		{ return choose(cast2vec_int(c), u, v); }
286 
287 // choose between two values choosezero(c,u) = c?u:0
choosezero(vec_float c,vec_float u)288 inline vec_float choosezero(vec_float c, vec_float u)			{ return _mm_and_ps(c,u); }
choosezero(vec_int c,vec_float u)289 inline vec_float choosezero(vec_int c, vec_float u)				{ return choosezero(cast2vec_float(c), u); }
290 
choosezero(vec_int c,vec_int u)291 inline vec_int choosezero(vec_int c, vec_int u)					{ return _mm_and_si128(c,u); }
choosezero(vec_float c,vec_int u)292 inline vec_int choosezero(vec_float c, vec_int u)				{ return choosezero(cast2vec_int(c), u); }
293 
294 
295 //inline vec_int gt_vec( vec_float a, vec_float b)    { return _mm_srli_epi32(_mm_cmpgt_ps(a,b),31); }
296 //inline vec_int gt_vec( vec_float a, vec_float b)    { vec_univ v; v.f4 = _mm_cmpgt_ps(a,b); return _mm_srli_epi32(v.i4,31); }
297 //inline vec_int   gt_vec( vec_int a, vec_int b)        { return _mm_cmpgt_epi32(a,b); }
298 //inline vec_int gt_scal( vec_float a, vec_float b)   { return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
299 //inline vec_int gt_scal( vec_float a, vec_float b)   { vec_univ v; v.f4 = _mm_cmpgt_ss(a,b); return _mm_srli_epi32(v.i4,31); }
300 //inline vec_int gt_scal( vec_float a, vec_float b)   	{ return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
301 //inline vec_int gt_scal( vec_int a, vec_float b)   	{ return _mm_srli_epi32(_mm_cmpgt_ss(a,b),31); }
302 //inline vec_int   gt_scal( vec_int a, vec_int b)       { return _mm_cmpgt_epi32(a,b); } // _mm_cmpgt_pi32(a,b) en MMX
303 
304 //inline scal_int gt_scal( scal_int a, scal_int b) { return (__m128i) _mm_cmpgt_pi32(a,b); }
305 
#if 0

// Disabled comparison operators (lt, ge, le, eq, neq), kept for reference.
// NOTE(review): the float variants return the __m128 result of a float
// compare from functions declared to return vec_int, and vec_int has no
// constructor taking __m128; they would need an explicit cast before this
// section could be enabled.

// a < b
inline vec_int lt_vec( vec_float a, vec_float b)    { return _mm_cmplt_ps(a,b); }
inline vec_int lt_vec( vec_int a, vec_int b)        { return _mm_cmpgt_epi32(b,a); }
inline vec_int lt_scal( vec_float a, vec_float b)   { return _mm_cmplt_ss(a,b); }
inline vec_int lt_scal( vec_int a, vec_int b)       { return _mm_cmpgt_epi32(b,a); } // MMX: _mm_cmpgt_pi32(b,a)

// a >= b  (integer form: greater-than combined with equal)
inline vec_int ge_vec( vec_float a, vec_float b)    { return _mm_cmpge_ps(a,b); }
inline vec_int ge_vec( vec_int a, vec_int b)        { return _mm_xor_si128( _mm_cmpgt_epi32(a,b), _mm_cmpeq_epi32(a,b)); }
inline vec_int ge_scal( vec_float a, vec_float b)   { return _mm_cmpge_ss(a,b); }
inline vec_int ge_scal( vec_int a, vec_int b)       { return _mm_xor_si128( _mm_cmpgt_epi32(a,b), _mm_cmpeq_epi32(a,b)); } // MMX: _mm_xor_si64, _mm_cmpgt_pi32, _mm_cmpeq_pi32

// a <= b
inline vec_int le_vec( vec_float a, vec_float b)    { return _mm_cmple_ps(a,b); }
inline vec_int le_vec( vec_int a, vec_int b)        { return _mm_xor_si128( _mm_cmpgt_epi32(b,a), _mm_cmpeq_epi32(b,a)); }
inline vec_int le_scal( vec_float a, vec_float b)   { return _mm_cmple_ss(a,b); }
inline vec_int le_scal( vec_int a, vec_int b)       { return _mm_xor_si128( _mm_cmpgt_epi32(b,a), _mm_cmpeq_epi32(b,a)); } // MMX: _mm_xor_si64, _mm_cmpgt_pi32, _mm_cmpeq_pi32

// a == b
inline vec_int eq_vec( vec_float a, vec_float b)    { return _mm_cmpeq_ps(a,b); }
inline vec_int eq_vec( vec_int a, vec_int b)        { return _mm_cmpeq_epi32(a,b); }
inline vec_int eq_scal( vec_float a, vec_float b)   { return _mm_cmpeq_ss(a,b); }
inline vec_int eq_scal( vec_int a, vec_int b)       { return _mm_cmpeq_epi32(a,b); } // MMX: _mm_cmpeq_pi32(a,b)

// a != b  (integer form: NOT(a == b), using a == a as the all-ones value)
inline vec_int neq_vec( vec_float a, vec_float b)   { return _mm_cmpneq_ps(a,b); }
inline vec_int neq_vec( vec_int a, vec_int b)       { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), _mm_cmpeq_epi32(a,a)); }
inline vec_int neq_scal( vec_float a, vec_float b)  { return _mm_cmpneq_ss(a,b); }
inline vec_int neq_scal( vec_int a, vec_int b)      { return _mm_andnot_si128(_mm_cmpeq_epi32(a,b), _mm_cmpeq_epi32(a,a)); } // MMX: _mm_andnot_si64, _mm_cmpeq_pi32

#endif
340 
// Memory.

#if 0
// Disabled broadcast helpers, superseded by the constructor-based set_vec
// overloads further down. Each one places the scalar in lane 0 and then
// replicates it into all four lanes.
inline vec_float set_vec( double a)     { float val = float(a); vec_float temp = _mm_load_ss(&val); return _mm_shuffle_ps(temp,temp,0x00); }
inline vec_float set_vec( float a)      { float val = a; vec_float temp = _mm_load_ss(&val); return _mm_shuffle_ps(temp,temp,0x00); }
inline vec_int   set_vec( long int a)   { vec_int temp = _mm_cvtsi32_si128(int(a)); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
inline vec_int   set_vec( int a)        { vec_int temp = _mm_cvtsi32_si128(a); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
inline vec_int   set_vec( short a)      { vec_int temp = _mm_cvtsi32_si128(int(a)); temp = _mm_unpacklo_epi32(temp,temp); return _mm_unpacklo_epi32(temp,temp); }
#endif
353 
#if 0

// Disabled four-element builders. They rely on helper types __vec_float /
// __vec_int (with members s[] and v) that are not declared in this part of
// the file.
inline vec_float set_vec( double a, double b, double c, double d)         { __vec_float temp; temp.s[0]=float(a); temp.s[1]=float(b); temp.s[2]=float(c); temp.s[3]=float(d); return temp.v; }
inline vec_float set_vec( float a, float b, float c, float d)             { __vec_float temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
inline vec_int   set_vec( int a, int b, int c, int d)                     { __vec_int temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
inline vec_int   set_vec( long int a, long int b, long int c, long int d) { __vec_int temp; temp.s[0]=int(a); temp.s[1]=int(b); temp.s[2]=int(c); temp.s[3]=int(d); return temp.v; }
inline vec_int   set_vec( short a, short b, short c, short d)             { __vec_int temp; temp.s[0]=short(a); temp.s[1]=short(b); temp.s[2]=short(c); temp.s[3]=short(d); return temp.v; }

#endif
363 
set_vec(float a,float b,float c,float d)364 inline vec_float set_vec( float a, float b, float c, float d)         	{ return vec_float(a,b,c,d); }
set_vec(int a,int b,int c,int d)365 inline vec_int   set_vec( int a, int b, int c, int d)             		{ return vec_int(a,b,c,d); }
366 
set_vec(float a)367 inline vec_float set_vec( float a)                                		{ return vec_float(a); }
set_vec(int a)368 inline vec_int   set_vec( int a)                                  		{ return vec_int(a); }
369 
load_a_vec(float * a)370 inline vec_float load_a_vec( float* a)                                { return _mm_load_ps(a); }
load_a_vec(int * a)371 inline vec_int   load_a_vec( int* a)                                  { return _mm_load_si128((__m128i*)a); }
372 
load_u_vec(float * a)373 inline vec_float load_u_vec( float* a)                                { return _mm_loadu_ps(a); }
load_u_vec(int * a)374 inline vec_int   load_u_vec( int* a)                                  { return _mm_loadu_si128((__m128i*)a); }
375 
376 // nouvelles fonctions d'�criture sans polluer le cache
store_stream(float * a,vec_float b)377 inline void store_stream( float* a, vec_float b)                       { return _mm_stream_ps(a,b); }
store_stream(int * a,vec_int b)378 inline void store_stream( int* a, vec_int b)                           { return _mm_stream_si128((__m128i*)a,b); }
379 
store_a_vec(float * a,vec_float b)380 inline void store_a_vec( float* a, vec_float b)                       { return _mm_store_ps(a,b); }
store_a_vec(int * a,vec_int b)381 inline void store_a_vec( int* a, vec_int b)                           { return _mm_store_si128((__m128i*)a,b); }
382 
store_u_vec(float * a,vec_float b)383 inline void store_u_vec( float* a, vec_float b)                       { return _mm_storeu_ps(a,b); }
store_u_vec(int * a,vec_int b)384 inline void store_u_vec( int* a, vec_int b)                           { return _mm_storeu_si128((__m128i*)a,b); }
385 
386 
load_scal(float * a)387 inline vec_float load_scal(float* a) { return _mm_load_ss(a); }
load_scal(int * a)388 inline vec_int   load_scal(int* a) { return _mm_cvtsi32_si128(*a); }
389 //inline scal_int  load_scal(int* a) { return _mm_cvtsi32_si64(*a); }
390 
store_scal(float * a,vec_float content)391 inline void store_scal(float* a, vec_float content) { return _mm_store_ss(a,content); }
store_scal(int * a,vec_int content)392 inline void store_scal(int* a, vec_int content)     { *a = _mm_cvtsi128_si32(content); return; }
393 //inline void store_scal(int* a, scal_int content) { *a = _mm_cvtsi64_si32(content); return; }
394 
REC0(vec_float a)395 inline vec_float REC0(vec_float a) { return _mm_unpacklo_ps(a,a); }
REC1(vec_float a)396 inline vec_float REC1(vec_float a) { return _mm_unpacklo_ps(a,a); }
REC2(vec_float a)397 inline vec_float REC2(vec_float a) { return _mm_shuffle_ps(a,a,0x90); }
REC3(vec_float a)398 inline vec_float REC3(vec_float a) { return _mm_shuffle_ps(a,a,0x1B); }
399 
REC0(vec_int a)400 inline vec_int REC0(vec_int a) { return _mm_unpacklo_epi32(a,a); }
REC1(vec_int a)401 inline vec_int REC1(vec_int a) { return _mm_unpacklo_epi32(a,a); }
REC2(vec_int a)402 inline vec_int REC2(vec_int a) { return _mm_shuffle_epi32(a,0x90); }
REC3(vec_int a)403 inline vec_int REC3(vec_int a) { return _mm_shuffle_epi32(a,0x1B); }
404 
405 // scalar to vector: takes 4 vector which lower elements stands for a scalar value and rebuild a vector from these 4 scalar
406 //inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) { return _mm_shuffle_ps(_mm_shuffle_ps(a0,a1,0x00),_mm_shuffle_ps(a2,a3,0x00),0x88); }
SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3)407 inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) { return _mm_unpacklo_ps(_mm_unpacklo_ps(a0,a2),_mm_unpacklo_ps(a1,a3)); }
SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3)408 inline vec_int   SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3) { return _mm_unpacklo_epi32(_mm_unpacklo_epi32(a0,a2),_mm_unpacklo_epi32(a1,a3)); }
409 //inline vec_int   SCAL2VEC(scal_int a0,scal_int a1,scal_int a2,scal_int a3) { return _mm_unpacklo_epi32( _mm_movpi64_epi64(_mm_unpacklo_pi32(a0,a2)), _mm_movpi64_epi64(_mm_unpacklo_pi32(a1,a3)) ); } // ou _mm_set_epi64( _mm_unpacklo_pi32(a0,a1), _mm_unpacklo_pi32(a2,a3))
410 
SCAL2VEC(double a0,double a1,double a2,double a3)411 inline vec_float SCAL2VEC(double a0, double a1, double a2, double a3) { return _mm_set_ps(float(a3),float(a2),float(a1),float(a0)); }
SCAL2VEC(float a0,float a1,float a2,float a3)412 inline vec_float SCAL2VEC(float a0, float a1, float a2, float a3) { return _mm_set_ps(a3,a2,a1,a0); }
SCAL2VEC(long a0,long a1,long a2,long a3)413 inline vec_int   SCAL2VEC(long a0, long a1, long a2, long a3) { return _mm_set_epi32(int(a3),int(a2),int(a1),int(a0)); }
SCAL2VEC(int a0,int a1,int a2,int a3)414 inline vec_int   SCAL2VEC(int a0, int a1, int a2, int a3) { return _mm_set_epi32(a3,a2,a1,a0); }
SCAL2VEC(short a0,short a1,short a2,short a3)415 inline vec_int   SCAL2VEC(short a0, short a1, short a2, short a3) { return _mm_set_epi32(int(a3),int(a2),int(a1),int(a0)); }
416 
417 
418 // vector to scalar: build a scalar vector from one element of the initial vector
VEC2SCALVEC0(vec_float a)419 inline vec_float VEC2SCALVEC0(vec_float a) { return a; } // return x,x,x,a0   // _mm_shuffle_ps(a,a,Ox00) would return a0,a0,a0,a0
VEC2SCALVEC1(vec_float a)420 inline vec_float VEC2SCALVEC1(vec_float a) { return _mm_shuffle_ps(a,a,0x55); } // return a1,a1,a1,a1
VEC2SCALVEC2(vec_float a)421 inline vec_float VEC2SCALVEC2(vec_float a) { return _mm_shuffle_ps(a,a,0xAA); } // return a2,a2,a2,a2
VEC2SCALVEC3(vec_float a)422 inline vec_float VEC2SCALVEC3(vec_float a) { return _mm_shuffle_ps(a,a,0xFF); } // return a3,a3,a3,a3
423 
VEC2SCALVEC0(vec_int a)424 inline vec_int VEC2SCALVEC0(vec_int a) { return a; } // return x,x,x,a0   // _mm_shuffle_epi32(a,Ox00) would return a0,a0,a0,a0
VEC2SCALVEC1(vec_int a)425 inline vec_int VEC2SCALVEC1(vec_int a) { return _mm_shuffle_epi32(a,0x55); } // return a1,a1,a1,a1
VEC2SCALVEC2(vec_int a)426 inline vec_int VEC2SCALVEC2(vec_int a) { return _mm_shuffle_epi32(a,0xAA); } // return a2,a2,a2,a2
VEC2SCALVEC3(vec_int a)427 inline vec_int VEC2SCALVEC3(vec_int a) { return _mm_shuffle_epi32(a,0xFF); } // return a3,a3,a3,a3
428 
429 //inline scal_int VEC2SCALVEC0(vec_int a) { return _mm_movepi64_pi64(a); }  // ATTENTION !!!! :
430 //inline scal_int VEC2SCALVEC1(vec_int a) { __m64 temp = _mm_movepi64_pi64(a); return _mm_unpackhi_pi32(temp,temp); } // VEC2SCALVEC0 et 1 peuvent �tre r�unis en une instruction plus efficace
431 //inline scal_int VEC2SCALVEC2(vec_int a) { return _mm_movepi64_pi64(_mm_shuffle_epi32(a,0xAA)); }
432 //inline scal_int VEC2SCALVEC3(vec_int a) { return _mm_movepi64_pi64(_mm_shuffle_epi32(a,0xFF)); }
433 
434 // vector to scalar: build a single scalar from a vector
VEC2SCAL0(vec_float a)435 inline float VEC2SCAL0(vec_float a) { float temp; _mm_store_ss(&temp,a); return temp; }
VEC2SCAL1(vec_float a)436 inline float VEC2SCAL1(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0x55)); return temp; }
VEC2SCAL2(vec_float a)437 inline float VEC2SCAL2(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0xAA)); return temp; }
VEC2SCAL3(vec_float a)438 inline float VEC2SCAL3(vec_float a) { float temp; _mm_store_ss(&temp,_mm_shuffle_ps(a,a,0xFF)); return temp; }
439 
VEC2SCAL0(vec_int a)440 inline int VEC2SCAL0(vec_int a) { return _mm_cvtsi128_si32(a); }
VEC2SCAL1(vec_int a)441 inline int VEC2SCAL1(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0x55)); }
VEC2SCAL2(vec_int a)442 inline int VEC2SCAL2(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0xAA)); }
VEC2SCAL3(vec_int a)443 inline int VEC2SCAL3(vec_int a) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(a,0xFF)); }
444 //inline int VEC2SCAL0(scal_int a) { return _mm_cvtsi64_si32(a); }
445 
446 // select: if( select == 0 ) then a ; else b ;
select_vec(vec_float select,vec_float a,vec_float b)447 inline vec_float select_vec( vec_float select, vec_float a, vec_float b) { return _mm_or_ps( _mm_andnot_ps(select,a), _mm_and_ps(select,b));}
select_scal(vec_float select,vec_float a,vec_float b)448 inline vec_float select_scal( vec_float select, vec_float a, vec_float b){ return _mm_or_ps( _mm_andnot_ps(select,a), _mm_and_ps(select,b));}
select_vec(vec_int select,vec_float a,vec_float b)449 inline vec_float select_vec( vec_int select, vec_float a, vec_float b) { __m128 temp = _mm_cvtepi32_ps(select); return _mm_or_ps( _mm_andnot_ps(temp,a), _mm_and_ps(temp,b));}
select_scal(vec_int select,vec_float a,vec_float b)450 inline vec_float select_scal( vec_int select, vec_float a, vec_float b){ __m128 temp = _mm_cvtepi32_ps(select); return _mm_or_ps( _mm_andnot_ps(temp,a), _mm_and_ps(temp,b));}
select_vec(vec_int select,vec_int a,vec_int b)451 inline vec_int select_vec( vec_int select, vec_int a, vec_int b)  { return _mm_or_si128( _mm_andnot_si128(select,a), _mm_and_si128(select,b)); }
select_scal(vec_int select,vec_int a,vec_int b)452 inline vec_int select_scal( vec_int select, vec_int a, vec_int b) { return _mm_or_si128( _mm_andnot_si128(select,a), _mm_and_si128(select,b)); } // ou MMX
453 
454 // vectorial version of the "mem" Faust key-word
455 // return a[2] a[1] a[0] b[3]
mem1_vec(vec_float a,vec_float b)456 inline vec_float mem1_vec( vec_float a, vec_float b)       { return _mm_shuffle_ps(_mm_shuffle_ps(b,a,0x4E),a,0x99); }
mem1_vec(vec_int a,vec_int b)457 inline vec_int   mem1_vec( vec_int a, vec_int b)           {
458   return _mm_unpacklo_epi32( _mm_shuffle_epi32( _mm_unpacklo_epi32( _mm_shuffle_epi32( b, 0xFF), a), 0xEE), _mm_shuffle_epi32( a, 0x88) );
459 }
460 
461 // return a[1] a[0] b[3] b[2]
mem2_vec(vec_float a,vec_float b)462 inline vec_float mem2_vec( vec_float a, vec_float b)       { return _mm_shuffle_ps(b,a,0x4E); }
mem2_vec(vec_int a,vec_int b)463 inline vec_int   mem2_vec( vec_int a, vec_int b)           { return _mm_shuffle_epi32( _mm_unpackhi_epi32( b, _mm_shuffle_epi32( a, 0x44)), 0xD8 );
464 
465 }
466 
467 // return a[0] b[3] b[2] b[1]
mem3_vec(vec_float a,vec_float b)468 inline vec_float mem3_vec( vec_float a, vec_float b)       { return _mm_shuffle_ps(b,_mm_shuffle_ps(b,a,0x4E),0x99);  }
mem3_vec(vec_int a,vec_int b)469 inline vec_int   mem3_vec( vec_int a, vec_int b)           {
470   return _mm_unpacklo_epi32( _mm_shuffle_epi32( b, 0x99), _mm_shuffle_epi32( _mm_unpackhi_epi32( b, _mm_shuffle_epi32( a, 0x00)), 0xEE) );
471 }
472 
473 // conversion
bool2float(vec_float a)474 inline vec_float bool2float( vec_float a )  { return _mm_and_ps(a,set_vec(1.0f)); }
bool2float(vec_int a)475 inline vec_float bool2float( vec_int a )    { return _mm_cvtepi32_ps(_mm_and_si128(a,_mm_sub_epi32(_mm_xor_si128(a,a),_mm_cmpeq_epi32(a,a)))); }
476 
bool2int(vec_int a)477 inline vec_int   bool2int( vec_int   a)     { return _mm_and_si128(a,_mm_sub_epi32(_mm_xor_si128(a,a),_mm_cmpeq_epi32(a,a))); }
bool2int(vec_float a)478 inline vec_int   bool2int( vec_float a )    { return _mm_cvtps_epi32(_mm_and_ps(a,set_vec(1.0f))); }
479 
boolfloat2boolint(vec_float a)480 inline vec_int   boolfloat2boolint( vec_float a ) { vec_int   temp; asm volatile("" : "=xmm" (temp) : "0" (a)); return temp; }
boolint2boolfloat(vec_int a)481 inline vec_float boolint2boolfloat( vec_int   a ) { vec_float temp; asm volatile("" : "=xmm" (temp) : "0" (a)); return temp; }
482 
483 #elif  defined(__ALTIVEC__)
484 
485 /****************************************************/
486 /*                                                  */
487 /*  			  ALTIVEC implementation    */
488 /*                                                  */
489 /****************************************************/
490 
491 //#define vec_float vector float
492 //#define vec_int vector signed int
493 //#define vec_bool vector bool int
494 
495 struct vec_int
496 {
497 	vector signed int vec;
vec_intvec_int498 	vec_int()								{}
vec_intvec_int499 	vec_int(vector signed int m)			{ vec = m; }
500 //	operator  __m128i() const				{ return vec; }
501 
502 };
503 
504 struct vec_float
505 {
506 //	union { __m128 vec; __m128i i4; };
507 	vector float vec;
vec_floatvec_float508 	vec_float()							{}
vec_floatvec_float509 	vec_float(vector float m)			{ vec = m; }
510 	//vec_float(vec_int a)   		{ vec = _mm_cvtepi32_ps(a); }
511 
512 	//operator  __m128() const	{ return vec; }
513 };
514 
515 typedef union{
516   float s[4];
517   vec_float v;
518 } __vec_float;
519 
520 
521 typedef union{
522   int s[4];
523   vec_int v;
524 } __vec_int;
525 
// Non-Java mode: during underflow a zero result is returned.
// Not compatible with the Java-IEEE-C9X standard (which
// delivers a denormalized result in case of underflow).
//
// The mode is controlled by the NJ bit of the VSCR, which is 0x00010000 in
// the last 32-bit word of the vector exchanged by vec_mfvscr / vec_mtvscr.
// NO_DENORMALIZE sets that bit; DENORMALIZE clears it again (an OR with zero
// would leave the register unchanged).
#define NO_DENORMALIZE vec_mtvscr(vec_or((vector unsigned int)vec_mfvscr(),(vector unsigned int)(0x00010000)))
#define DENORMALIZE    vec_mtvscr(vec_andc((vector unsigned int)vec_mfvscr(),(vector unsigned int)(0x00010000)))
531 
// constants
// Every macro argument and every expansion is parenthesised so that the
// macros can be used with arbitrary expressions and inside larger expressions.

// 0 0 0 0
#define VEC_INT_ZERO(a)   (vec_xor((a),(a)))

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
#define VEC_INT_ONES(a)   ((vector signed int)vec_cmpeq((a),(a)))

// 1 1 1 1
#define VEC_INT_ONE(a) (vec_splat_s32(1))

// -1 -1 -1 -1
#define VEC_INT_MINUS_ONE(a) (vec_splat_s32(-1))

// a must be a literal that belongs to [-16,15]
// no efficient equivalent with SSE2
#define VEC_INT_MINUS_16_TO_15(a) (vec_splat_s32(a))

// This is not exactly equivalent to the SSE2 version:
// the power must be a literal that belongs to [17,31]
// ( that is 2^17 - 1 = 131071 is the minimum
//   and  2^31 - 1 = 2147483647 is the maximum;
// if you need 2^32 - 1, use VEC_INT_ONES )
// Example: 2^19 - 1 = 524287 -> VEC_INT_PW2_MINUS_1(a,19)
#define VEC_INT_PW2_MINUS_1(a,pw) (vec_sr((vector signed int)vec_cmpeq((a),(a)), vec_splat_u32(32-(pw))))

// This is not exactly equivalent to the SSE2 version:
// the power must be a literal that belongs to [4,18]
// ( that is 2^18 = 262144 is the maximum
//   and  2^4 = 16 is the minimum;
// if you need 2^0 = 1, use VEC_INT_ONE,
// if you need 2^1 = 2, use VEC_INT_MINUS_16_TO_15(2)
// if you need 2^2 = 4, use VEC_INT_MINUS_16_TO_15(4)
// if you need 2^3 = 8, use VEC_INT_MINUS_16_TO_15(8) )
// Example: 2^10 = 1024 -> VEC_INT_PW2(a,10)
#define VEC_INT_PW2(a,pw) (vec_sl(vec_splat_s32(8), vec_splat_u32((pw)-3)))


//vec_sr(a,(vector unsigned int)num);

// 0.0 0.0 0.0 0.0
#define VEC_FLOAT_ZERO(a) (vec_xor((a),(a)))

// 0xffffffff 0xffffffff 0xffffffff 0xffffffff
// NOTE(review): a float compare of a NaN lane with itself is false, so a must
// not contain NaNs for every lane to come out as all ones.
#define VEC_FLOAT_ONES(a) ((vector float)vec_cmpeq((a),(a)))
577 
578 // arithmetic
add_vec(vec_float a,vec_float b)579 inline vec_float add_vec( vec_float a, vec_float b)   { return vec_add(a,b); }
add_scal(vec_float a,vec_float b)580 inline vec_float add_scal(vec_float a, vec_float b)   { return vec_add(a,b); }
add_vec(vec_int a,vec_int b)581 inline vec_int   add_vec( vec_int a, vec_int b)       { return vec_add(a,b); }
add_scal(vec_int a,vec_int b)582 inline vec_int   add_scal(vec_int a, vec_int b)       { return vec_add(a,b); }
583 
sub_vec(vec_float a,vec_float b)584 inline vec_float sub_vec( vec_float a, vec_float b)   { return vec_sub(a,b); }
sub_scal(vec_float a,vec_float b)585 inline vec_float sub_scal( vec_float a, vec_float b)  { return vec_sub(a,b); }
sub_vec(vec_int a,vec_int b)586 inline vec_int   sub_vec( vec_int a, vec_int b)       { return vec_sub(a,b); }
sub_scal(vec_int a,vec_int b)587 inline vec_int   sub_scal( vec_int a, vec_int b)      { return vec_sub(a,b); }
588 
mul_vec(vec_float a,vec_float b)589 inline vec_float mul_vec( vec_float a, vec_float b)   { return vec_madd(a,b,(vec_float)(vec_splat_s32(int(0x00000000)))); }
mul_scal(vec_float a,vec_float b)590 inline vec_float mul_scal( vec_float a, vec_float b)  { return vec_madd(a,b,(vec_float)(vec_splat_s32(int(0x00000000)))); }
591 
592 // low 32 bits of a 32 * 32 bit multiplication: each double-word X and Y is broken down into two words, A & B and C & D:
593 // X = ( A << 16 ) + B
594 // Y = ( C << 16 ) + D
595 // then:
596 // X * Y = (( A << 16 ) + B ) * (( C << 16 ) + D )
597 // X * Y = ( A*C << 32 ) + ( A*D << 16 ) + ( B*C << 16 ) + B*D
598 // the partial result A*C does not appear in the low 32 bits result so does not need to be computed ( however, if it's different
599 // from zero, then there is an overflow )
600 // In this implementation A*D + B*C is computed in a single "vec_msum"
601 
mul_vec(vec_int a,vec_int b)602 inline vec_int mul_vec( vec_int a, vec_int b)   {
603   const vector unsigned int VEC_SIXTEEN_UINT32 = vec_splat_u32(-16);
604   return (vector signed int)vec_add( vec_sl( vec_msum( (vector unsigned short)a, (vector unsigned short)(vec_rl( b, VEC_SIXTEEN_UINT32 ) ), vec_splat_u32(0)  ), VEC_SIXTEEN_UINT32 ),   vec_mulo( (vector unsigned short)a, (vector unsigned short)b ) );
605                                                 }
mul_scal(vec_int a,vec_int b)606 inline vec_int mul_scal( vec_int a, vec_int b)   {
607   const vector unsigned int VEC_SIXTEEN_UINT32 = vec_splat_u32(-16);
608   return (vector signed int)vec_add( vec_sl( vec_msum( (vector unsigned short)a, (vector unsigned short)(vec_rl( b, VEC_SIXTEEN_UINT32 ) ), vec_splat_u32(0)  ), VEC_SIXTEEN_UINT32 ),   vec_mulo( (vector unsigned short)a, (vector unsigned short)b ) );
609                                                 }
610 //inline vec_int mul_vec( vec_int a, vec_int b)   { return (vec_int)vec_round(vec_madd((vec_float)(a),(vec_float)(b),(vec_float)(vec_splat_s32(int(0x00000000))))); }
611 //inline vec_int mul_scal( vec_int a, vec_int b)  { return (vec_int)vec_round(vec_madd((vec_float)(a),(vec_float)(b),(vec_float)(vec_splat_s32(int(0x00000000))))); }
612 
div_vec(vec_float a,vec_float b)613 inline vec_float div_vec( vec_float a, vec_float b)   { return vec_madd(a,vec_re(b),(vec_float)(vec_splat_s32(int(0x00000000)))); }
div_scal(vec_float a,vec_float b)614 inline vec_float div_scal( vec_float a, vec_float b)  { return vec_madd(a,vec_re(b),(vec_float)(vec_splat_s32(int(0x00000000)))); }
div_vec(vec_int a,vec_int b)615 inline vec_int div_vec( vec_int a, vec_int b)         { return (vec_int)vec_round(vec_madd((vec_float)(a),vec_re((vec_float)(b)),(vec_float)(vec_splat_s32(int(0x00000000))))); }
div_scal(vec_int a,vec_int b)616 inline vec_int div_scal( vec_int a, vec_int b)        { return (vec_int)vec_round(vec_madd((vec_float)(a),vec_re((vec_float)(b)),(vec_float)(vec_splat_s32(int(0x00000000))))); }
617 
mod_vec(vec_int a,vec_int N)618 inline vec_int   mod_vec( vec_int a, vec_int N) {
619 
620   vec_int temp = vec_sub(a,N);
621   vec_int zero = vec_splat_s32(int(0x00000000));
622   vector bool int select = (vector bool int )(vec_xor(vec_cmpgt(temp,zero),vec_cmpeq(temp,zero))); // a - N >= 0
623   return vec_sel(a,temp,select); // if( a - N >=0 ) return a - N; else return a;
624 }
625 
mod_scal(vec_int a,vec_int N)626 inline vec_int   mod_scal( vec_int a, vec_int N) {
627 
628   vec_int temp = vec_sub(a,N);
629   vec_int zero = vec_splat_s32(int(0x00000000));
630   vector bool int select = (vector bool int )(vec_xor(vec_cmpgt(temp,zero),vec_cmpeq(temp,zero))); // a - N >= 0
631   return vec_sel(a,temp,select); // if( a - N >=0 ) return a - N; else return a;
632 }
633 
634 // return a*b + c
madd_vec(vec_float a,vec_float b,vec_float c)635 inline vec_float madd_vec( vec_float a, vec_float b, vec_float c)  { return vec_madd(a,b,c); }
madd_scal(vec_float a,vec_float b,vec_float c)636 inline vec_float madd_scal( vec_float a, vec_float b, vec_float c) { return vec_madd(a,b,c); }
637 
638 // return  - ( a*b - c )
nmsub_vec(vec_float a,vec_float b,vec_float c)639 inline vec_float nmsub_vec( vec_float a, vec_float b, vec_float c)  { return vec_nmsub(a,b,c); }
nmsub_scal(vec_float a,vec_float b,vec_float c)640 inline vec_float nmsub_scal( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,b,c); }
641 
642 // return a*(1/b) + c
divadd_vec(vec_float a,vec_float b,vec_float c)643 inline vec_float divadd_vec( vec_float a, vec_float b, vec_float c)  { return vec_madd(a,vec_re(b),c); }
divadd_scal(vec_float a,vec_float b,vec_float c)644 inline vec_float divadd_scal( vec_float a, vec_float b, vec_float c) { return vec_madd(a,vec_re(b),c); }
645 
646 // return - ( a*(1/b) - c )
divsub_vec(vec_float a,vec_float b,vec_float c)647 inline vec_float divsub_vec( vec_float a, vec_float b, vec_float c)  { return vec_nmsub(a,vec_re(b),c); }
divsub_scal(vec_float a,vec_float b,vec_float c)648 inline vec_float divsub_scal( vec_float a, vec_float b, vec_float c) { return vec_nmsub(a,vec_re(b),c); }
649 
650 // logic
and_vec(vec_float a,vec_float b)651 inline vec_float and_vec( vec_float a, vec_float b)   { return vec_and(a,b); }
and_scal(vec_float a,vec_float b)652 inline vec_float and_scal( vec_float a, vec_float b)  { return vec_and(a,b); }
and_vec(vec_int a,vec_int b)653 inline vec_int   and_vec( vec_int a, vec_int b)       { return vec_and(a,b); }
and_scal(vec_int a,vec_int b)654 inline vec_int   and_scal( vec_int a, vec_int b)      { return vec_and(a,b); }
655 
or_vec(vec_float a,vec_float b)656 inline vec_float or_vec( vec_float a, vec_float b)    { return vec_or(a,b); }
or_scal(vec_float a,vec_float b)657 inline vec_float or_scal( vec_float a, vec_float b)   { return vec_or(a,b); }
or_vec(vec_int a,vec_int b)658 inline vec_int   or_vec( vec_int a, vec_int b)        { return vec_or(a,b); }
or_scal(vec_int a,vec_int b)659 inline vec_int   or_scal( vec_int a, vec_int b)       { return vec_or(a,b); }
660 
xor_vec(vec_float a,vec_float b)661 inline vec_float xor_vec( vec_float a, vec_float b)   { return vec_xor(a,b); }
xor_scal(vec_float a,vec_float b)662 inline vec_float xor_scal( vec_float a, vec_float b)  { return vec_xor(a,b); }
xor_vec(vec_int a,vec_int b)663 inline vec_int   xor_vec( vec_int a, vec_int b)       { return vec_xor(a,b); }
xor_scal(vec_int a,vec_int b)664 inline vec_int   xor_scal( vec_int a, vec_int b)      { return vec_xor(a,b); }
665 
666 // shift left
shift_left_vec(vec_int a,vec_int num)667 inline vec_int   shift_left_vec( vec_int a, vec_int num)     { return vec_sl(a,(vector unsigned int)num); }
shift_left_scal(vec_int a,vec_int num)668 inline vec_int   shift_left_scal( vec_int a, vec_int num)    { return vec_sl(a,(vector unsigned int)num); }
669 
670 // shift ( and fill with the sign bit )
shift_right_vec(vec_int a,vec_int num)671 inline vec_int   shift_right_vec( vec_int a, vec_int num)     { return vec_sra(a,(vector unsigned int)num); }
shift_right_scal(vec_int a,vec_int num)672 inline vec_int   shift_right_scal( vec_int a, vec_int num)    { return vec_sra(a,(vector unsigned int)num); }
673 
674 // shift ( and fill with 0's )
675 //inline vec_int   shift_right_vec_logical( vec_int a, int num)         { return vec_sr(a, set_vec(num) ); }
676 //inline vec_int   shift_right_scal_logical( vec_int a, int num)        { return vec_sr(a, set_vec(num) ); ); }
shift_right_vec_logical(vec_int a,vec_int num)677 inline vec_int   shift_right_vec_logical( vec_int a, vec_int num)     { return vec_sr(a,(vector unsigned int)num); }
shift_right_scal_logical(vec_int a,vec_int num)678 inline vec_int   shift_right_scal_logical( vec_int a, vec_int num)    { return vec_sr(a,(vector unsigned int)num); }
679 
680 // comparaison
gt_vec(vec_float a,vec_float b)681 inline vec_float gt_vec( vec_float a, vec_float b)    { return (vector float)vec_cmpgt(a,b); }
gt_scal(vec_float a,vec_float b)682 inline vec_float gt_scal( vec_float a, vec_float b)   { return (vector float)vec_cmpgt(a,b); }
gt_vec(vec_int a,vec_int b)683 inline vec_int gt_vec( vec_int a, vec_int b)        { return (vector signed int)vec_cmpgt(a,b); }
gt_scal(vec_int a,vec_int b)684 inline vec_int gt_scal( vec_int a, vec_int b)       { return (vector signed int)vec_cmpgt(a,b); }
685 
lt_vec(vec_float a,vec_float b)686 inline vec_float lt_vec( vec_float a, vec_float b)    { return (vector float)vec_cmplt(a,b); }
lt_scal(vec_float a,vec_float b)687 inline vec_float lt_scal( vec_float a, vec_float b)   { return (vector float)vec_cmplt(a,b); }
lt_vec(vec_int a,vec_int b)688 inline vec_int lt_vec( vec_int a, vec_int b)        { return (vector signed int)vec_cmplt(a,b); }
lt_scal(vec_int a,vec_int b)689 inline vec_int lt_scal( vec_int a, vec_int b)       { return (vector signed int)vec_cmplt(a,b); }
690 
ge_vec(vec_float a,vec_float b)691 inline vec_float ge_vec( vec_float a, vec_float b)    { return (vector float)vec_cmpge(a,b); }
ge_scal(vec_float a,vec_float b)692 inline vec_float ge_scal( vec_float a, vec_float b)   { return (vector float)vec_cmpge(a,b); }
ge_vec(vec_int a,vec_int b)693 inline vec_int ge_vec( vec_int a, vec_int b)        { return (vector signed int)vec_xor(vec_cmpgt(a,b),vec_cmpeq(a,b)); }
ge_scal(vec_int a,vec_int b)694 inline vec_int ge_scal( vec_int a, vec_int b)       { return (vector signed int)vec_xor(vec_cmpgt(a,b),vec_cmpeq(a,b)); }
695 
le_vec(vec_float a,vec_float b)696 inline vec_float le_vec( vec_float a, vec_float b)    { return (vector float)vec_cmple(a,b); }
le_scal(vec_float a,vec_float b)697 inline vec_float le_scal( vec_float a, vec_float b)   { return (vector float)vec_cmple(a,b); }
le_vec(vec_int a,vec_int b)698 inline vec_int le_vec( vec_int a, vec_int b)        { return (vector signed int)vec_xor(vec_cmplt(a,b),vec_cmpeq(a,b)); }
le_scal(vec_int a,vec_int b)699 inline vec_int le_scal( vec_int a, vec_int b)       { return (vector signed int)vec_xor(vec_cmplt(a,b),vec_cmpeq(a,b)); }
700 
701 
eq_vec(vec_float a,vec_float b)702 inline vec_float eq_vec( vec_float a, vec_float b)    { return (vector float)vec_cmpeq(a,b); }
eq_scal(vec_float a,vec_float b)703 inline vec_float eq_scal( vec_float a, vec_float b)   { return (vector float)vec_cmpeq(a,b); }
eq_vec(vec_int a,vec_int b)704 inline vec_int eq_vec( vec_int a, vec_int b)        { return (vector signed int)vec_cmpeq(a,b); }
eq_scal(vec_int a,vec_int b)705 inline vec_int eq_scal( vec_int a, vec_int b)       { return (vector signed int)vec_cmpeq(a,b); }
706 
707 
neq_vec(vec_float a,vec_float b)708 inline vec_float neq_vec( vec_float a, vec_float b)    { return (vector float)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
neq_scal(vec_float a,vec_float b)709 inline vec_float neq_scal( vec_float a, vec_float b)   { return (vector float)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
neq_vec(vec_int a,vec_int b)710 inline vec_int neq_vec( vec_int a, vec_int b)        { return (vector signed int)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
neq_scal(vec_int a,vec_int b)711 inline vec_int neq_scal( vec_int a, vec_int b)       { return (vector signed int)vec_xor(vec_cmpeq(a,b),vec_cmpeq(a,a)); }
712 
713 // memory
set_vec(vec_float a)714 inline vec_float set_vec( vec_float a)                { return a; }
set_vec(__vec_float a)715 inline vec_float set_vec( __vec_float a)              { return a.v; }
set_vec(vec_int a)716 inline vec_int   set_vec( vec_int a)                  { return a; }
set_vec(__vec_int a)717 inline vec_int   set_vec( __vec_int a)                { return a.v; }
718 
set_vec(double a)719 inline vec_float set_vec( double a)                   {
720 	float af;
721 	af = (float)a;
722 	vector float temp; temp = vec_lde(0,&af);
723 	temp = vec_perm(temp,temp,vec_lvsl(0,&af));
724 	return vec_splat(temp,0);
725 	//__vec_float temp; float af = float(a); temp.s[0]=af; temp.s[1]=af; temp.s[2]=af; temp.s[3]=af; return temp.v;
726 }
set_vec(float a)727 inline vec_float set_vec( float a)                    {
728 	float af;
729 	af = a;
730 	vector float temp; temp = vec_lde(0,&af);
731 	temp = vec_perm(temp,temp,vec_lvsl(0,&af));
732 	return vec_splat(temp,0);
733 	//__vec_float temp; temp.s[0]=a; temp.s[1]=a; temp.s[2]=a; temp.s[3]=a; return temp.v;
734 }
set_vec(long int a)735 inline vec_int   set_vec( long int a)                 {
736 	int ai;
737 	ai = (int)a;
738 	vector signed int temp; temp = vec_lde(0,&ai);
739 	temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
740 	return vec_splat(temp,0);
741 	//__vec_int temp; int al = int(a); temp.s[0]=al; temp.s[1]=al; temp.s[2]=al; temp.s[3]=al; return temp.v;
742 }
set_vec(int a)743 inline vec_int   set_vec( int a)                      {
744 	int ai;
745 	ai = a;
746 	vector signed int temp; temp = vec_lde(0,&ai);
747 	temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
748 	return vec_splat(temp,0);
749 	//__vec_int temp; temp.s[0]=a; temp.s[1]=a; temp.s[2]=a; temp.s[3]=a; return temp.v;
750 }
set_vec(short a)751 inline vec_int   set_vec( short a)                    {
752 	int ai;
753 	ai = (int)a;
754 	vector signed int temp; temp = vec_lde(0,&ai);
755 	temp = vec_perm(temp,temp,vec_lvsl(0,&ai));
756 	return vec_splat(temp,0);
757 	//__vec_int temp; int as = int(a); temp.s[0]=as; temp.s[1]=as; temp.s[2]=as; temp.s[3]=as; return temp.v;
758 }
759 
set_vec(double a,double b,double c,double d)760 inline vec_float set_vec( double a, double b, double c, double d)         { __vec_float temp; temp.s[0]=float(a); temp.s[1]=float(b); temp.s[2]=float(c); temp.s[3]=float(d); return temp.v; }
set_vec(float a,float b,float c,float d)761 inline vec_float set_vec( float a, float b, float c, float d)             { __vec_float temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
set_vec(int a,int b,int c,int d)762 inline vec_int   set_vec( int a, int b, int c, int d)                     { __vec_int temp; temp.s[0]=a; temp.s[1]=b; temp.s[2]=c; temp.s[3]=d; return temp.v; }
set_vec(long int a,long int b,long int c,long int d)763 inline vec_int   set_vec( long int a, long int b, long int c, long int d) { __vec_int temp; temp.s[0]=int(a); temp.s[1]=int(b); temp.s[2]=int(c); temp.s[3]=int(d); return temp.v; }
set_vec(short a,short b,short c,short d)764 inline vec_int   set_vec( short a, short b, short c, short d)             { __vec_int temp; temp.s[0]=short(a); temp.s[1]=short(b); temp.s[2]=short(c); temp.s[3]=short(d); return temp.v; }
765 
load_a_vec(float * a)766 inline vec_float load_a_vec( float* a)                                { return vec_ld(0,a); }
767 //inline vec_float load_u_vec( float* a)                                {  }
load_a_vec(int * a)768 inline vec_int   load_a_vec( int* a)                                  { return vec_ld(0,a); }
769 //inline vec_int   load_u_vec( int* a)                                  {  }
770 
store_a_vec(float * a,vec_float b)771 inline void store_a_vec( float* a, vec_float b)                       { return vec_st(b,0,a); }
772 //inline void store_u_vec( float* a, vec_float b)                       {  }
store_a_vec(int * a,vec_int b)773 inline void store_a_vec( int* a, vec_int b)                           { return vec_st(b,0,a); }
774 //inline void store_u_vec( int* a, vec_int b)                           {  }
775 
776 
load_scal(float * a)777 inline vec_float load_scal(float* a) { vector float temp; temp = vec_lde(0,a); return vec_perm(temp,temp,vec_lvsl(0,a)); }
load_scal(int * a)778 inline vec_int   load_scal(int* a)   { vector signed int temp; temp = vec_lde(0,a); return vec_perm(temp,temp,vec_lvsl(0,a)); }
store_scal(float * a,vec_float content)779 inline void store_scal(float* a, vec_float content) { vec_float temp = vec_splat(content,0); return vec_ste(temp,0,a); }
store_scal(int * a,vec_int content)780 inline void store_scal(int* a, vec_int content)     { vec_int temp = vec_splat(content,0); return vec_ste(temp,0,a); }
781 
REC0(vec_float a)782 inline vec_float REC0(vec_float a) { return vec_mergeh(a,a); }
REC1(vec_float a)783 inline vec_float REC1(vec_float a) { return vec_mergeh(a,a); }
REC2(vec_float a)784 inline vec_float REC2(vec_float a) { return vec_sld(vec_splat(a,0),a,12); }
REC3(vec_float a)785 inline vec_float REC3(vec_float a) {
786   vector float temp1 = vec_mergel(a,a);
787   vector float temp2 = vec_mergeh(a,a);
788   return vec_mergel(vec_mergel(temp1,temp2),vec_mergeh(temp1,temp2));
789 }
790 
REC0(vec_int a)791 inline vec_int REC0(vec_int a) { return vec_mergeh(a,a); }
REC1(vec_int a)792 inline vec_int REC1(vec_int a) { return vec_mergeh(a,a); }
REC2(vec_int a)793 inline vec_int REC2(vec_int a) { return vec_sld(vec_splat(a,0),a,12); }
REC3(vec_int a)794 inline vec_int REC3(vec_int a) {
795   vector signed int temp1 = vec_mergel(a,a);
796   vector signed int temp2 = vec_mergeh(a,a);
797   return vec_mergel(vec_mergel(temp1,temp2),vec_mergeh(temp1,temp2));
798 }
799 
800 // scalar to vector: takes 4 vector which lower elements stands for a scalar value and rebuild a vector from these 4 scalar
SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3)801 inline vec_float SCAL2VEC(vec_float a0,vec_float a1,vec_float a2,vec_float a3) {
802 	return vec_perm(vec_mergeh(a0,a1),vec_mergeh(a2,a3),vec_lvsl(8,(float*)(0)));
803 	//return vec_perm(vec_perm(a0,a1,VEC_PERM0),vec_perm(a2,a3,VEC_PERM0),VEC_PERM1);
804 }
SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3)805 inline vec_int   SCAL2VEC(vec_int a0,vec_int a1,vec_int a2,vec_int a3) {
806 	return vec_perm(vec_mergeh(a0,a1),vec_mergeh(a2,a3),vec_lvsl(8,(int*)(0)));
807 	//return vec_perm(vec_perm(a0,a1,VEC_PERM0),vec_perm(a2,a3,VEC_PERM0),VEC_PERM1);
808 }
809 
SCAL2VEC(float a0,float a1,float a2,float a3)810 inline vec_float SCAL2VEC(float a0, float a1, float a2, float a3) { __vec_float temp; temp.s[0]=a0; temp.s[1]=a1; temp.s[2]=a2; temp.s[3]=a3; return temp.v; }
SCAL2VEC(int a0,int a1,int a2,int a3)811 inline vec_int   SCAL2VEC(int a0, int a1, int a2, int a3) { __vec_int temp; temp.s[0]=a0; temp.s[1]=a1; temp.s[2]=a2; temp.s[3]=a3; return temp.v; }
812 
813 // vector to scalar: build a scalar vector from one element of the initial vector
VEC2SCALVEC0(vec_float a)814 inline vec_float VEC2SCALVEC0(vec_float a) { return a; } // return x,x,x,a0   // vec_splat(a,0) would return a0,a0,a0,a0
VEC2SCALVEC1(vec_float a)815 inline vec_float VEC2SCALVEC1(vec_float a) { return vec_splat(a,1); } // return a1,a1,a1,a1
VEC2SCALVEC2(vec_float a)816 inline vec_float VEC2SCALVEC2(vec_float a) { return vec_splat(a,2); } // return a2,a2,a2,a2
VEC2SCALVEC3(vec_float a)817 inline vec_float VEC2SCALVEC3(vec_float a) { return vec_splat(a,3); } // return a3,a3,a3,a3
818 
VEC2SCALVEC0(vec_int a)819 inline vec_int VEC2SCALVEC0(vec_int a) { return a; } // return x,x,x,a0   // vec_splat(a,0) would return a0,a0,a0,a0
VEC2SCALVEC1(vec_int a)820 inline vec_int VEC2SCALVEC1(vec_int a) { return vec_splat(a,1); } // return a1,a1,a1,a1
VEC2SCALVEC2(vec_int a)821 inline vec_int VEC2SCALVEC2(vec_int a) { return vec_splat(a,2); } // return a2,a2,a2,a2
VEC2SCALVEC3(vec_int a)822 inline vec_int VEC2SCALVEC3(vec_int a) { return vec_splat(a,3); } // return a3,a3,a3,a3
823 
824 // vector to scalar: build a single scalar from a vector
VEC2SCAL0(vec_float a)825 inline float VEC2SCAL0(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,0),0,&temp); return temp; }
VEC2SCAL1(vec_float a)826 inline float VEC2SCAL1(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,1),0,&temp); return temp; }
VEC2SCAL2(vec_float a)827 inline float VEC2SCAL2(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,2),0,&temp); return temp; }
VEC2SCAL3(vec_float a)828 inline float VEC2SCAL3(vec_float a) { float temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,3),0,&temp); return temp; }
829 
VEC2SCAL0(vec_int a)830 inline int VEC2SCAL0(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,0),0,&temp); return temp; }
VEC2SCAL1(vec_int a)831 inline int VEC2SCAL1(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,1),0,&temp); return temp; }
VEC2SCAL2(vec_int a)832 inline int VEC2SCAL2(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,2),0,&temp); return temp; }
VEC2SCAL3(vec_int a)833 inline int VEC2SCAL3(vec_int a) { int temp __attribute__ ((aligned(16))); vec_ste(vec_splat(a,3),0,&temp); return temp; }
834 
835 // select: if( select == 0 ) then a ; else b ;
select_vec(vec_float select,vec_float a,vec_float b)836 inline vec_float select_vec( vec_float select, vec_float a, vec_float b)  { return vec_sel(a,b,(vector bool int)select ); }
select_scal(vec_float select,vec_float a,vec_float b)837 inline vec_float select_scal( vec_float select, vec_float a, vec_float b) {return vec_sel(a,b,(vector bool int)select );  }
select_vec(vec_int select,vec_float a,vec_float b)838 inline vec_float select_vec( vec_int select, vec_float a, vec_float b)  { return vec_sel(a,b,(vector bool int)select ); }
select_scal(vec_int select,vec_float a,vec_float b)839 inline vec_float select_scal( vec_int select, vec_float a, vec_float b) {return vec_sel(a,b,(vector bool int)select );  }
select_vec(vec_int select,vec_int a,vec_int b)840 inline vec_int select_vec( vec_int select, vec_int a, vec_int b)  { return vec_sel(a,b,(vector bool int)select ); }
select_scal(vec_int select,vec_int a,vec_int b)841 inline vec_int select_scal( vec_int select, vec_int a, vec_int b) { return vec_sel(a,b,(vector bool int)select ); }
842 
843 // vectorial version of the "mem" Faust key-word
844 // result = { a[2] a[1] a[0] b[3] }
mem1_vec(vec_float a,vec_float b)845 inline vec_float mem1_vec( vec_float a, vec_float b)       { return vec_sld(b,a,12); }
mem1_vec(vec_int a,vec_int b)846 inline vec_int   mem1_vec( vec_int a, vec_int b)           { return vec_sld(b,a,12); }
847 
848 // result = { a[1] a[0] b[3] b[2] }
mem2_vec(vec_float a,vec_float b)849 inline vec_float mem2_vec( vec_float a, vec_float b)       { return vec_sld(b,a,8); }
mem2_vec(vec_int a,vec_int b)850 inline vec_int   mem2_vec( vec_int a, vec_int b)           { return vec_sld(b,a,8); }
851 
852 // result = { a[0] b[3] b[2] b[1] }
mem3_vec(vec_float a,vec_float b)853 inline vec_float mem3_vec( vec_float a, vec_float b)       { return vec_sld(b,a,4); }
mem3_vec(vec_int a,vec_int b)854 inline vec_int   mem3_vec( vec_int a, vec_int b)           { return vec_sld(b,a,4); }
855 
856 // conversion
bool2float(vec_float a)857 inline vec_float bool2float( vec_float a )  { return vec_and(a,(vec_float)(vec_splat_s32(int(0x00000001)))); }
bool2float(vec_int a)858 inline vec_float bool2float( vec_int a )    { return (vec_float)(vec_and(a,vec_splat_s32(int(0x00000001)))); }
859 
bool2int(vec_int a)860 inline vec_int   bool2int( vec_int   a)     { return (vec_int)vec_and(a,vec_splat_s32(int(0x00000001))); }
bool2int(vec_float a)861 inline vec_int   bool2int( vec_float a )    { return (vec_int)vec_round(vec_and(a,(vec_float)(vec_splat_s32(int(0x00000001))))); }
862 
boolfloat2boolint(vec_float a)863 inline vec_int   boolfloat2boolint( vec_float a ) { return (vector signed int)a; }
boolint2boolfloat(vec_int a)864 inline vec_float boolint2boolfloat( vec_int   a ) { return (vector float)a; }
865 
float2int(vec_float a)866 inline vec_int   float2int( vec_float a)   { return (vec_int)vec_round(a); }
float2int(float a)867 inline int       float2int( float a )      { return int(a); }
868 
int2float(vec_int a)869 inline vec_float int2float( vec_int a)   { return vec_ctf(a,0); }
int2float(int a)870 inline float     int2float( int a )      { return float(a); }
871 
872 #endif
873