1 /*
2  * hhutil-inl.h
3  *
4  *  Created on: Mar 28, 2014
5  *      Author: meiermark
6  */
7 
8 #ifndef HHUTIL_INL_H_
9 #define HHUTIL_INL_H_
10 
11 #include "simd.h"
12 
13 /////////////////////////////////////////////////////////////////////////////////////
14 // Transform a character to lower case and '.' to '-' and vice versa
15 /////////////////////////////////////////////////////////////////////////////////////
MatchChr(char c)16 inline char MatchChr(char c)  {
17 	return ((c>='a' && c<='z')? c-'a'+'A' : (c=='.'? '-':c) );
18 }
19 
InsertChr(char c)20 inline char InsertChr(char c) {
21 	return ((c>='A' && c<='Z')? c+'a'-'A' : ((c>='0' && c<='9') || c=='-')? '.':c );
22 }
23 
WordChr(char c)24 inline int  WordChr(char c) {
25 	return (int)((c>='A' && c<='Z') || (c>='a' && c<='z'));
26 }
27 
28 // Compute the sum of bits of one or two integers
NumberOfSetBits(int i)29 inline int NumberOfSetBits(int i)
30 {
31     i = i - ((i >> 1) & 0x55555555);
32     i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
33     return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
34 }
35 
36 //TODO: check
37 //inline int NumberOfSetBits(int i)
38 //{
39 //  return _mm_popcnt_u32(i);
40 //}
41 
42 /////////////////////////////////////////////////////////////////////////////////////
43 // Transforms the one-letter amino acid code into an integer between 0 and 22
44 /////////////////////////////////////////////////////////////////////////////////////
aa2i(char c)45 inline char aa2i(char c)
46 {
47   //A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V
48   if (c>='a' && c<='z') c+='A'-'a';
49   switch (c)
50     {
51     case 'A': return 0;
52     case 'R': return 1;
53     case 'N': return 2;
54     case 'D': return 3;
55     case 'C': return 4;
56     case 'Q': return 5;
57     case 'E': return 6;
58     case 'G': return 7;
59     case 'H': return 8;
60     case 'I': return 9;
61     case 'L': return 10;
62     case 'K': return 11;
63     case 'M': return 12;
64     case 'F': return 13;
65     case 'P': return 14;
66     case 'S': return 15;
67     case 'T': return 16;
68     case 'W': return 17;
69     case 'Y': return 18;
70     case 'V': return 19;
71     case 'X': return ANY;
72     case 'J': return ANY;
73     case 'O': return ANY;
74     case 'U': return 4;  //Selenocystein -> Cystein
75     case 'B': return 3;  //D (or N)
76     case 'Z': return 6;  //E (or Q)
77     case '-': return GAP;
78     case '.': return GAP;
79     case '_': return GAP;
80     }
81   if (c>=0 && c<=32) return -1; // white space and control characters
82   return -2;
83 }
84 
85 /////////////////////////////////////////////////////////////////////////////////////
86 // Transforms integers between 0 and 22 into the one-letter amino acid code
87 /////////////////////////////////////////////////////////////////////////////////////
i2aa(char c)88 inline char i2aa(char c)
89 {
90   //A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V
91   switch (c)
92     {
93     case 0: return 'A';
94     case 1: return 'R';
95     case 2: return 'N';
96     case 3: return 'D';
97     case 4: return 'C';
98     case 5: return 'Q';
99     case 6: return 'E';
100     case 7: return 'G';
101     case 8: return 'H';
102     case 9: return 'I';
103     case 10: return 'L';
104     case 11: return 'K';
105     case 12: return 'M';
106     case 13: return 'F';
107     case 14: return 'P';
108     case 15: return 'S';
109     case 16: return 'T';
110     case 17: return 'W';
111     case 18: return 'Y';
112     case 19: return 'V';
113     case ANY: return 'X';
114     case GAP: return '-';
115     case ENDGAP: return '-';
116     }
117   return '?';
118 }
119 
120 /////////////////////////////////////////////////////////////////////////////////////
121 // Transforms the dssp/psipred secondary structure code into an integer number
122 /////////////////////////////////////////////////////////////////////////////////////
ss2i(char c)123 inline char ss2i(char c)
124 {
125   //- H E C S T G B
126   if (c>='a' && c<='z') c+='A'-'a';
127   switch (c)
128     {
129     case '.': return 0;
130     case '-': return 0;
131     case 'X': return 0;
132     case 'H': return 1;
133     case 'E': return 2;
134     case 'C': return 3;
135     case '~': return 3;
136     case 'S': return 4;
137     case 'T': return 5;
138     case 'G': return 6;
139     case 'B': return 7;
140     case 'I': return 3;
141     case ' ': return -1;
142     case '\t': return -1;
143     case '\n': return -1;
144     }
145   return -2;
146 }
147 
148 /////////////////////////////////////////////////////////////////////////////////////
149 // Transforms integers between 0 and 8 into the dssp/psipred secondary structure code
150 /////////////////////////////////////////////////////////////////////////////////////
i2ss(int c)151 inline char i2ss(int c)
152 {
153   //- H E C S T G B
154   switch (c)
155     {
156     case 0: return '-';
157     case 1: return 'H';
158     case 2: return 'E';
159     case 3: return 'C';
160     case 4: return 'S';
161     case 5: return 'T';
162     case 6: return 'G';
163     case 7: return 'B';
164     case 8: return 'I';
165     }
166   return '?';
167 }
168 
169 
170 /////////////////////////////////////////////////////////////////////////////////////
171 // Transforms the solvend accessiblity code into an integer number
172 /////////////////////////////////////////////////////////////////////////////////////
sa2i(char c)173 inline char sa2i(char c)
174 {
175   //- A B C D E
176   if (c>='a' && c<='z') c+='A'-'a';
177   switch (c)
178     {
179     case '.': return 0;
180     case '-': return 0;
181     case 'A': return 1;
182     case 'B': return 2;
183     case 'C': return 3;
184     case 'D': return 4;
185     case 'E': return 5;
186     case 'F': return 6;
187     case ' ': return -1;
188     case '\t': return -1;
189     case '\n': return -1;
190     }
191   return -2;
192 }
193 
194 /////////////////////////////////////////////////////////////////////////////////////
195 // Transforms integers between 0 and 5 into the solvent accessibility code
196 /////////////////////////////////////////////////////////////////////////////////////
i2sa(int c)197 inline char i2sa(int c)
198 {
199   //- H E C S T G B
200   switch (c)
201     {
202     case 0: return '-';
203     case 1: return 'A';
204     case 2: return 'B';
205     case 3: return 'C';
206     case 4: return 'D';
207     case 5: return 'E';
208     case 6: return 'F';
209     }
210   return '?';
211 }
212 
213 
214 /////////////////////////////////////////////////////////////////////////////////////
215 // Transforms alternative secondary structure symbols into symbols
216 /////////////////////////////////////////////////////////////////////////////////////
ss2ss(char c)217 inline char ss2ss(char c)
218 {
219   //- H E C S T G B
220   switch (c)
221     {
222     case '~': return 'C';
223     case 'I': return 'C';
224     case 'i': return 'c';
225     case 'H':
226     case 'E':
227     case 'C':
228     case 'S':
229     case 'T':
230     case 'G':
231     case 'B':
232     case 'h':
233     case 'e':
234     case 'c':
235     case 's':
236     case 't':
237     case 'g':
238     case 'b':
239     case '.':
240       return c;
241     }
242   return '-';
243 }
244 
245 /////////////////////////////////////////////////////////////////////////////////////
246 // Transforms confidence values of psipred into internal code
247 /////////////////////////////////////////////////////////////////////////////////////
cf2i(char c)248 inline char cf2i(char c)
249 {
250   switch (c)
251     {
252     case '-': return 0;
253     case '.': return 0;
254     case '0': return 1;
255     case '1': return 2;
256     case '2': return 3;
257     case '3': return 4;
258     case '4': return 5;
259     case '5': return 6;
260     case '6': return 7;
261     case '7': return 8;
262     case '8': return 9;
263     case '9': return 10;
264     }
265   return 0;
266 }
267 
268 /////////////////////////////////////////////////////////////////////////////////////
269 // Transforms internal representation of psipred confidence values into printable chars
270 /////////////////////////////////////////////////////////////////////////////////////
i2cf(char c)271 inline char i2cf(char c)
272 {
273   switch (c)
274     {
275     case 0: return '-';
276     case 1: return '0';
277     case 2: return '1';
278     case 3: return '2';
279     case 4: return '3';
280     case 5: return '4';
281     case 6: return '5';
282     case 7: return '6';
283     case 8: return '7';
284     case 9: return '8';
285     case 10: return '9';
286     }
287   return '-';
288 }
289 
290 
291 /////////////////////////////////////////////////////////////////////////////////////
292 // Fast lookup of log2(1+2^(-x)) for x>=0 (precision < 0.35%)
293 /////////////////////////////////////////////////////////////////////////////////////
fast_addscore(float x)294 inline float fast_addscore(float x)
295 {
296   static float val[2001];         // val[i]=log2(1+2^(-x))
297   static char initialized=0;
298   if (x>20) return 0.0;
299   if (x<0)
300     {
301       fprintf(stderr,"Error in function fast_addscore: argument %g is negative\n",x);
302       exit(7);
303     }
304   if (!initialized)   //First fill in the log2-vector
305     {
306       for (int i=0; i<=2000; i++) val[i]=flog2(1.0+fpow2(-0.01*(i+0.5)));
307       initialized=1;
308     }
309   return val[(int)(100.0*x)];
310 }
311 
312 
313 
314 /////////////////////////////////////////////////////////////////////////////////////
315 // Little utilities for output
316 /////////////////////////////////////////////////////////////////////////////////////
fout(FILE * outf,int d)317 inline void fout(FILE* outf, int d)
318 {
319   if (d>=99999) fprintf(outf,"*\t"); else fprintf(outf,"%i\t",d);
320   return;
321 }
322 
323 
sout(std::stringstream & out,int d)324 inline void sout(std::stringstream& out, int d) {
325   if (d>=99999)
326     out << "*\t";
327   else
328     out << d << "\t";
329   return;
330 }
331 
332 
333 /////////////////////////////////////////////////////////////////////////////////////
334 //// Takes family code (eg. a.1.2.3) and returns strings 'a', 'a.1', and 'a.1.2'
335 /////////////////////////////////////////////////////////////////////////////////////
ScopID(char cl[],char fold[],char sfam[],const char fam[])336 inline void  ScopID(char cl[], char fold[], char sfam[], const char fam[])
337 {
338   char* ptr;
339 
340   //get scop class ID
341   strcpy(cl,fam);
342   ptr = strchr(cl,'.');               //return adress of next '.' in name
343   if(ptr) ptr[0]='\0';
344 
345   //get scop fold ID
346   strcpy(fold,fam);
347   ptr = strchr(fold,'.');             //return adress of next '.' in name
348   if(ptr) ptr = strchr(ptr+1,'.');    //return adress of next '.' in name
349   if(ptr) ptr[0]='\0';
350 
351   //get scop superfamily ID
352   strcpy(sfam,fam);
353   ptr = strchr(sfam,'.');            //return adress of next '.' in name
354   if(ptr) ptr = strchr(ptr+1,'.');   //return adress of next '.' in name
355   if(ptr) ptr = strchr(ptr+1,'.');   //return adress of next '.' in name
356   if(ptr) ptr[0]='\0';
357   return;
358 }
359 
360 /////////////////////////////////////////////////////////////////////////////////////
361 // SIMD 2^x for four floats
362 // Calculate float of 2pow(x) for four floats in parallel with SSE2
363 // ATTENTION: need to compile with g++ -fno-strict-aliasing when using -O2 or -O3!!!
364 // Relative deviation < 4.6E-6  (< 2.3E-7 with 5'th order polynomial)
365 //
366 // Internal representation of float number according to IEEE 754 (__m128 --> 4x):
367 //   1bit sign, 8 bits exponent, 23 bits mantissa: seee eeee emmm mmmm mmmm mmmm mmmm mmmm
368 //                                    0x4b400000 = 0100 1011 0100 0000 0000 0000 0000 0000
369 //   In summary: x = (-1)^s * 1.mmmmmmmmmmmmmmmmmmmmmm * 2^(eeeeeee-127)
370 /////////////////////////////////////////////////////////////////////////////////////
simdf32_fpow2(simd_float X)371 inline simd_float simdf32_fpow2(simd_float X) {
372 
373     simd_int* xPtr = (simd_int*) &X;    // store address of float as pointer to int
374 
375     const simd_float CONST32_05f       = simdf32_set(0.5f); // Initialize a vector (4x32) with 0.5f
376     // (3 << 22) --> Initialize a large integer vector (shift left)
377     const simd_int CONST32_3i          = simdi32_set(3);
378     const simd_int CONST32_3shift22    = simdi32_slli(CONST32_3i, 22);
379     const simd_float CONST32_1f        = simdf32_set(1.0f);
380     const simd_float CONST32_FLTMAXEXP = simdf32_set(FLT_MAX_EXP);
381     const simd_float CONST32_FLTMAX    = simdf32_set(FLT_MAX);
382     const simd_float CONST32_FLTMINEXP = simdf32_set(FLT_MIN_EXP);
383     // fifth order
384     const simd_float CONST32_A = simdf32_set(0.00187682f);
385     const simd_float CONST32_B = simdf32_set(0.00898898f);
386     const simd_float CONST32_C = simdf32_set(0.0558282f);
387     const simd_float CONST32_D = simdf32_set(0.240153f);
388     const simd_float CONST32_E = simdf32_set(0.693153f);
389 
390     simd_float tx;
391     simd_int lx;
392     simd_float dx;
393     simd_float result    = simdf32_set(0.0f);
394     simd_float maskedMax = simdf32_set(0.0f);
395     simd_float maskedMin = simdf32_set(0.0f);
396 
397     // Check wheter one of the values is bigger or smaller than FLT_MIN_EXP or FLT_MAX_EXP
398     // The correct FLT_MAX_EXP value is written to the right place
399     maskedMax = simdf32_gt(X, CONST32_FLTMAXEXP);
400     maskedMin = simdf32_gt(X, CONST32_FLTMINEXP);
401     maskedMin = simdf32_xor(maskedMin, maskedMax);
402     // If a value is bigger than FLT_MAX_EXP --> replace the later result with FLTMAX
403     maskedMax = simdf32_and(CONST32_FLTMAX, simdf32_gt(X, CONST32_FLTMAXEXP));
404 
405     tx = simdf32_add((simd_float ) CONST32_3shift22, simdf32_sub(X, CONST32_05f)); // temporary value for truncation: x-0.5 is added to a large integer (3<<22),
406                                                                              // 3<<22 = (1.1bin)*2^23 = (1.1bin)*2^(150-127),
407                                                                              // which, in internal bits, is written 0x4b400000 (since 10010110bin = 150)
408 
409     lx = simdf32_f2i(tx);                                       // integer value of x
410 
411     dx = simdf32_sub(X, simdi32_i2f(lx));                       // float remainder of x
412 
413     //   x = 1.0f + dx*(0.693153f             // polynomial apporoximation of 2^x for x in the range [0, 1]
414     //            + dx*(0.240153f             // Gives relative deviation < 2.3E-7
415     //            + dx*(0.0558282f            // Speed: 2.3E-8s
416     //            + dx*(0.00898898f
417     //            + dx* 0.00187682f ))));
418     X = simdf32_mul(dx, CONST32_A);
419     X = simdf32_add(CONST32_B, X);  // add constant B
420     X = simdf32_mul(dx, X);
421     X = simdf32_add(CONST32_C, X);  // add constant C
422     X = simdf32_mul(dx, X);
423     X = simdf32_add(CONST32_D, X);  // add constant D
424     X = simdf32_mul(dx, X);
425     X = simdf32_add(CONST32_E, X);  // add constant E
426     X = simdf32_mul(dx, X);
427     X = simdf32_add(X, CONST32_1f); // add 1.0f
428 
429     simd_int lxExp = simdi32_slli(lx, 23); // add integer power of 2 to exponent
430 
431     *xPtr = simdi32_add(*xPtr, lxExp); // add integer power of 2 to exponent
432 
433     // Add all Values that are greater than min and less than max
434     result = simdf32_and(maskedMin, X);
435     // Add MAX_FLT values where entry values were > FLT_MAX_EXP
436     result = simdf32_or(result, maskedMax);
437 
438     return result;
439 }
440 
441 // Fast SIMD log2 for four floats
442 // Calculate integer of log2 for four floats in parallel with SSE2
443 // Maximum deviation: +/- 2.1E-5
444 // Run time: ~5.6ns on Intel core2 2.13GHz.
445 // For a negative argument, nonsense is returned. Otherwise, when <1E-38, a value
446 // close to -126 is returned and when >1.7E38, +128 is returned.
447 // The function makes use of the representation of 4-byte floating point numbers:
448 // seee eeee emmm mmmm mmmm mmmm mmmm mmmm
449 // s is the sign, eee eee e gives the exponent + 127 (in hex: 0x7f).
450 // The following 23 bits give the mantisse, the binary digits after the decimal
451 // point:  x = (-1)^s * 1.mmmmmmmmmmmmmmmmmmmmmmm * 2^(eeeeeeee-127)
452 // Therefore,  log2(x) = eeeeeeee-127 + log2(1.mmmmmm...)
453 //                     = eeeeeeee-127 + log2(1+y),  where y = 0.mmmmmm...
454 //                     ~ eeeeeeee-127 + ((a*y+b)*y+c)*y
455 // The coefficients a, b  were determined by a least squares fit, and c=1-a-b to get 1 at y=1.
456 // Lower/higher order polynomials may be used for faster or more precise calculation:
457 // Order 1: log2(1+y) ~ y
458 // Order 2: log2(1+y) = (a*y + 1-a)*y, a=-0.3427
459 //  => max dev = +/- 8E-3, run time ~ 3.8ns
460 // Order 3: log2(1+y) = ((a*y+b)*y + 1-a-b)*y, a=0.1564, b=-0.5773
461 //  => max dev = +/- 1E-3, run time ~ 4.4ns
462 // Order 4: log2(1+y) = (((a*y+b)*y+c)*y + 1-a-b-c)*y, a=-0.0803 b=0.3170 c=-0.6748
463 //  => max dev = +/- 1.4E-4, run time ~ 5.0ns?
464 // Order 5: log2(1+y) = ((((a*y+b)*y+c)*y+d)*y + 1-a-b-c-d)*y, a=0.0440047 b=-0.1903190 c=0.4123442 d=-0.7077702
465 //  => max dev = +/- 2.1E-5, run time ~ 5.6ns?
466 
simdf32_flog2(simd_float X)467 inline simd_float simdf32_flog2(simd_float X) {
468 
469   const simd_int CONST32_0x7f = simdi32_set(0x7f);
470   const simd_int CONST32_0x7fffff = simdi32_set(0x7fffff);
471   const simd_int CONST32_0x3f800000 = simdi32_set(0x3f800000);
472   const simd_float  CONST32_1f = simdf32_set(1.0);
473   // const float a=0.1564, b=-0.5773, c=1.0-a-b;  // third order
474   const float a=0.0440047f, b=-0.1903190f, c=0.4123442f, d=-0.7077702f, e=1.0-a-b-c-d; // fifth order
475   const simd_float  CONST32_A = simdf32_set(a);
476   const simd_float  CONST32_B = simdf32_set(b);
477   const simd_float  CONST32_C = simdf32_set(c);
478   const simd_float  CONST32_D = simdf32_set(d);
479   const simd_float  CONST32_E = simdf32_set(e);
480   simd_int E; // exponents of X
481   simd_float R; //  result
482   E = simdi32_srli((simd_int) X, 23);    // shift right by 23 bits to obtain exponent+127
483   E = simdi32_sub(E, CONST32_0x7f);     // subtract 127 = 0x7f
484   X = (simd_float) simdi_and((simd_int) X, CONST32_0x7fffff);  // mask out exponent => mantisse
485   X = (simd_float) simdi_or ((simd_int) X, CONST32_0x3f800000); // set exponent to 127 (i.e., 0)
486   X = simdf32_sub(X, CONST32_1f);          // subtract one from mantisse
487   R = simdf32_mul(X, CONST32_A);           // R = a*X
488   R = simdf32_add(R, CONST32_B);           // R = a*X+b
489   R = simdf32_mul(R, X);                   // R = (a*X+b)*X
490   R = simdf32_add(R, CONST32_C);           // R = (a*X+b)*X+c
491   R = simdf32_mul(R, X);                   // R = ((a*X+b)*X+c)*X
492   R = simdf32_add(R, CONST32_D);           // R = ((a*X+b)*X+c)*X+d
493   R = simdf32_mul(R, X);                   // R = (((a*X+b)*X+c)*X+d)*X
494   R = simdf32_add(R, CONST32_E);           // R = (((a*X+b)*X+c)*X+d)*X+e
495   R = simdf32_mul(R, X);                   // R = ((((a*X+b)*X+c)*X+d)*X+e)*X ~ log2(1+X) !!
496   R = simdf32_add(R, simdi32_i2f(E));  // convert integer exponent to float and add to mantisse
497   return R;
498 
499 }
500 
501 #define LOG_POLY_DEGREE 4
502 #define POLY0(x, c0) simdf32_set(c0)
503 #define POLY1(x, c0, c1) simdf32_add(simdf32_mul(POLY0(x, c1), x), simdf32_set(c0))
504 #define POLY2(x, c0, c1, c2) simdf32_add(simdf32_mul(POLY1(x, c1, c2), x), simdf32_set(c0))
505 #define POLY3(x, c0, c1, c2, c3) simdf32_add(simdf32_mul(POLY2(x, c1, c2, c3), x), simdf32_set(c0))
506 #define POLY4(x, c0, c1, c2, c3, c4) simdf32_add(simdf32_mul(POLY3(x, c1, c2, c3, c4), x), simdf32_set(c0))
507 #define POLY5(x, c0, c1, c2, c3, c4, c5) simdf32_add(simdf32_mul(POLY4(x, c1, c2, c3, c4, c5), x), simdf32_set(c0))
508 
log2f4(simd_float x)509 inline simd_float log2f4(simd_float x)
510 {
511     simd_int exp  = simdi32_set(0x7F800000);
512     simd_int mant = simdi32_set(0x007FFFFF);
513 
514     simd_float one = simdf32_set( 1.0f);
515 
516     simd_int i = simdf_f2icast(x);
517 
518     simd_float e = simdi32_i2f(simdi32_sub(simdi32_srli(simdi_and(i, exp), 23), simdi32_set(127)));
519 
520     simd_float m = simdf32_or(simdi_i2fcast(simdi_and(i, mant)), one);
521 
522     simd_float p;
523 
524     /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
525 #if LOG_POLY_DEGREE == 6
526     p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
527 #elif LOG_POLY_DEGREE == 5
528     p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
529 #elif LOG_POLY_DEGREE == 4
530     p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
531 #elif LOG_POLY_DEGREE == 3
532     p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
533 #else
534 #error
535 #endif
536 
537     /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
538     p = simdf32_mul(p, simdf32_sub(m, one));
539 
540     return simdf32_add(p, e);
541 }
542 
543 //// Perform log-sum-exp calculation with six SIMD variables
544 ////              result = x1 + x2 + x3 + x4 + x5 + x6
545 ////  -->     result = log2( 2^(x1) + 2^(x2) + 2^(x3) + 2^(x4) + 2^(x5) + 2^(x6))
546 ////      // to prevent overflows apply log sum of exp trick
547 ////      --> xMax = max(x1, x2, x3, x4, x5, x6)
548 ////  -->     result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax)))
549 ////  -->     result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
550 ////  -->     result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
551 //// WHERE x1, x2, x3, x4, x5 and x6 contain the log-values respectively!
552 //inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2,
553 //        simd_float x3, simd_float x4, simd_float x5, simd_float x6) {
554 //
555 //    // Calculate the maximum out of the six variables
556 //    simd_float x_max0 = simdf32_max(x1,x2);
557 //    simd_float x_max1 = simdf32_max(x3,x4);
558 //    simd_float x_max2 = simdf32_max(x5,x6);
559 //    x_max0 = simdf32_max(x_max0, x_max1);
560 //    simd_float x_max  = simdf32_max(x_max2, x_max0);
561 //
562 //    simd_float max_comp_vec = simdf32_set(-FLT_MAX);
563 //    simd_float term0 = simdf32_fpow2(simdf32_sub(x1, x_max));
564 //    simd_float term1 = simdf32_fpow2(simdf32_sub(x2, x_max));
565 //    simd_float term2 = simdf32_fpow2(simdf32_sub(x3, x_max));
566 //    simd_float term3 = simdf32_fpow2(simdf32_sub(x4, x_max));
567 //    simd_float term4 = simdf32_fpow2(simdf32_sub(x5, x_max));
568 //    simd_float term5 = simdf32_fpow2(simdf32_sub(x6, x_max));
569 //
570 //    term0 = simdf32_add(term0, term1);          //      2^(x1 - x_max)
571 //    term2 = simdf32_add(term2, term3);          // +    2^(x2 - x_max)
572 //    term4 = simdf32_add(term4, term5);          // +    2^(x3 - x_max)
573 //    term0 = simdf32_add(term0, term2);          // +    2^(x4 - x_max)
574 //    term4 = simdf32_add(term4, term0);          // +    2^(x5 - x_max)
575 //     //      max(-FLT_MAX, x_max + log2(sum(terms)))
576 //    return simdf32_max(max_comp_vec, simdf32_add(x_max, simdf32_flog2(term4)));
577 //}
578 
579 // Perform log-sum-exp calculation with three SIMD variables
580 //              result = x1 + x2 + x3
581 //  -->     result = log2( 2^(x1) + 2^(x2) + 2^(x3) )
582 //      // to prevent overflows apply log sum of exp trick
583 //      --> xMax = max(x1, x2, x3)
584 //  -->     result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax)))
585 //  -->     result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
586 //  -->     result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
587 // WHERE x1, x2 and x3 contain the log-values respectively!
simd_flog2_sum_fpow2(simd_float x1,simd_float x2,simd_float x3)588 inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2, simd_float x3) {
589 
590     // Calculate the maximum out of the six variables
591     simd_float x_max = simdf32_max(x1, simdf32_max(x2, x3));
592 
593     simd_float max_comp_vec = simdf32_set(-FLT_MAX);
594 
595 
596     return simdf32_max(max_comp_vec, simdf32_add(x_max, simdf32_flog2(  //      x_max + log2(
597                     simdf32_add(simdf32_fpow2(simdf32_sub(x1, x_max)),                  //      2^(x1 - x_max)
598                             simdf32_add(simdf32_fpow2(simdf32_sub(x2, x_max)),          // +    2^(x2 - x_max)
599                                     simdf32_fpow2(simdf32_sub(x3, x_max)))))));                 // +    2^(x3 - x_max))
600 
601 }
602 
603 //// Perform log-sum-exp calculation with two SIMD variables
604 ////              result = x1 + x2
605 ////  -->     result = log2(2^(x1) + 2^(x2))
606 ////      // to prevent overflows apply log sum of exp trick
607 ////      --> xMax = max(x1, x2)
608 ////  -->     result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax)))
609 ////  -->     result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax))
610 ////  -->     result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax))
611 //// WHERE x1, x2 and x3 contain the log-values respectively!
612 //inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2) {
613 //
614 //    // Calculate the maximum out of the six variables
615 //    simd_float x_max = simdf32_max(x1, x2);
616 //
617 //    simd_float max_comp_vec = simdf32_set(-FLT_MAX);
618 //
619 //    return simdf32_max(max_comp_vec, simdf32_add(x_max, simdf32_flog2(  //      fmax(-FLT_MAX, x_max + log2(
620 //                    simdf32_add(simdf32_fpow2(simdf32_sub(x1, x_max)),                  //      2^(x1 - x_max)
621 //                                        simdf32_fpow2(simdf32_sub(x2, x_max))))));              // +    2^(x2 - x_max))
622 //
623 //}
624 
625 // Perform log-sum-exp calculation with two SIMD f32 variables
626 // Calculate log2( 2^x1 + 2^x2) in stable fashion using
627 //  max(-FLT_MAX, max(x1,x2) + flog2(1.0 + fpow2(min(x1,x2) - max(x1,x2)));
simd_flog2_sum_fpow2(simd_float x1,simd_float x2)628 inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2) {
629 
630     const simd_float CONST_1f = simdf32_set(1.0f);
631     const simd_float CONST_max_comp_vec = simdf32_set(-FLT_MAX);
632     simd_float xmax = simdf32_max(x1, x2);
633     simd_float x    = simdf32_min(x1, x2);
634     x = simdf32_sub(x,xmax);
635     x = simdf32_fpow2(x);
636     x = simdf32_add( CONST_1f, x );
637     x = simdf32_flog2(x);
638     x = simdf32_add(xmax,x);
639     return simdf32_max(CONST_max_comp_vec, x);
640 }
641 
642 
643 
644 // Perform log-sum-exp calculation with six variables
645 //                              result = x1 + x2 + x3 + x4 + x5 + x6
646 //      -->             result = log2( 2^(x1) + 2^(x2) + 2^(x3) + 2^(x4) + 2^(x5) + 2^(x6))
647 //              // to prevent overflows apply log sum of exp trick
648 //              --> xMax = max(x1, x2, x3, x4, x5, x6)
649 //      -->     result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax)))
650 //      -->             result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
651 //      -->             result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
652 // WHERE x1, x2, x3, x4, x5 and x6 contain the log-values respectively!
flog2_sum_fpow2(float x1,float x2,float x3,float x4,float x5,float x6)653 inline float flog2_sum_fpow2(float x1, float x2, float x3, float x4, float x5, float x6) {
654 
655     float xmax = fmax(x1, fmax(x2, fmax(x3, fmax(x4, fmax(x5, x6)))));
656 
657     return fmax(-FLT_MAX, xmax + flog2(
658                                        fpow2(x1 - xmax) +
659                                        fpow2(x2 - xmax) +
660                                        fpow2(x3 - xmax) +
661                                        fpow2(x4 - xmax) +
662                                        fpow2(x5 - xmax) +
663                                        fpow2(x6 - xmax)));
664     //      return fmax(-FLT_MAX, xmax + log2f(
665     //                                              powf(2, x1 - xmax) +
666     //                                              powf(2, x2 - xmax) +
667     //                                              powf(2, x3 - xmax) +
668     //                                              powf(2, x4 - xmax) +
669     //                                              powf(2, x5 - xmax) +
670     //                                              powf(2, x6 - xmax)));
671 
672 }
673 
674 //
675 //inline float fast_flog2_sum_fpow2(float x1, float x2, float x3, float x4, float x5, float x6) {
676 //
677 //    simd_float X = _mm256_set_ps(x1,x2,x3,x4,x5,x6,-FLT_MAX_EXP,-FLT_MAX_EXP);
678 //
679 //    // Compute the maximum of the eight floats in X
680 //    // xmax = fmax( fmax( fmax(x1,x2), fmax(x3, x4) ) , fmax(x5, x6) );
681 //    simd_float Xshuf = _mm256_permute_ps(X,0xB1); // permute mask of 2bits: (2,3,0,1) = (10,11,00,01) = 0xB1
682 //    simd_float Xmax  = simdf32_max(X,Xshuf);
683 //    Xshuf = _mm256_permute_ps(Xmax,0x8E); // permute mask of 2bits: (1,0,3,2) = (01,00,11,10) = 0x8E
684 //    Xmax  = simdf32_max(Xmax,Xshuf);
685 //    Xshuf = _mm256_permute2f128_ps(Xmax,Xmax,0x08); // permute mask of 2bits: (0,0,1,0) = (00,00,01,00) = 0x08
686 //    Xmax  = simdf32_max(Xmax,Xshuf);
687 //
688 //    // Compute pow2(X-Xmax) for all f32 floats
689 //    X = simdf32_sub(X,Xmax);
690 //    X = simdf32_fpow2(X);
691 //
692 //    // Horizontally add f32 floats
693 //    Xshuf = _mm256_permute_ps(X,0xB1); // permute mask of 2bits: (2,3,0,1) = (10,11,00,01) = 0xB1
694 //    X = simdf32_add(X,Xshuf);
695 //    Xshuf = _mm256_permute_ps(X,0x8E); // permute mask of 2bits: (2,3,0,1) = (10,11,00,01) = 0xB1
696 //    X = simdf32_add(X,Xshuf);
697 //    Xshuf = _mm256_permute2f128_ps(X,X,0x08); // permute mask of 2bits: (0,0,1,0) = (00,00,01,00) = 0x08
698 //    X  = simdf32_add(X,Xshuf);
699 //
700 //    float xmax = simdf32_extract(Xmax,0x00);
701 //    float x    = simdf32_extract(X, 0x00);
702 //    return fmax(-FLT_MAX, xmax + flog2(x) );
703 //}
704 //
705 
706 
707 
708 
709 
710 
711 
712 
713 // Perform log-sum-exp calculation with three variables
714 //                              result = x1 + x2 + x3
715 //      -->             result = log2( 2^(x1) + 2^(x2) + 2^(x3) )
716 //              // to prevent overflows apply log sum of exp trick
717 //              --> xMax = max(x1, x2, x3)
718 //      -->     result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax)))
719 //      -->             result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
720 //      -->             result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
721 // WHERE x1, x2 and x3 contain the log-values respectively!
flog2_sum_fpow2(float x1,float x2,float x3)722 inline float flog2_sum_fpow2(float x1, float x2, float x3) {
723 
724     //      float arr[] = {x1, x2, x3};
725     //      std::sort(arr, arr + 3);
726     //
727     //      return fmax(-FLT_MAX, arr[2] + logo(arr[1] - arr[2] + logo(arr[0] - arr[1])));
728 
729     float xmax = fmax(x1, fmax(x2, x3));
730 
731     return fmax(-FLT_MAX, xmax + flog2(
732                                        fpow2(x1 - xmax) +
733                                        fpow2(x2 - xmax) +
734                                        fpow2(x3 - xmax)));
735     //      return fmax(-FLT_MAX, xmax + log2f(
736     //                                              powf(2, x1 - xmax) +
737     //                                              powf(2, x2 - xmax) +
738     //                                              powf(2, x3 - xmax)));
739 
740 }
741 
742 // Perform log-sum-exp calculation with two variables
743 //                              result = x1 + x2
744 //      -->             result = log2(2^(x1) + 2^(x2))
745 //              // to prevent overflows apply log sum of exp trick
746 //              --> xMax = max(x1, x2)
747 //      -->     result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax)))
748 //      -->             result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax))
749 //      -->             result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax))
750 // WHERE x1 and x2 contain the log-values respectively!
flog2_sum_fpow2(float x1,float x2)751 inline float flog2_sum_fpow2(float x1, float x2) {
752 
753     //      float arr[] = {x1, x2};
754     //      std::sort(arr, arr + 2);
755     //
756     //      return fmax(-FLT_MAX, arr[1] + logo(arr[0] - arr[1]));
757 
758     float xmax = fmax(x1, x2);
759 
760     return fmax(-FLT_MAX, xmax + flog2(
761                                        fpow2(x1 - xmax) +
762                                        fpow2(x2 - xmax)));
763     //      return fmax(-FLT_MAX, xmax + log2f(
764     //                                              powf(2, x1 - xmax) +
765     //                                              powf(2, x2 - xmax)));
766 
767 }
768 
769 
770 
771 #endif /* HHUTIL_INL_H_ */
772