1 /*
2 * hhutil-inl.h
3 *
4 * Created on: Mar 28, 2014
5 * Author: meiermark
6 */
7
8 #ifndef HHUTIL_INL_H_
9 #define HHUTIL_INL_H_
10
11 #include "simd.h"
12
13 /////////////////////////////////////////////////////////////////////////////////////
14 // Transform a character to lower case and '.' to '-' and vice versa
15 /////////////////////////////////////////////////////////////////////////////////////
MatchChr(char c)16 inline char MatchChr(char c) {
17 return ((c>='a' && c<='z')? c-'a'+'A' : (c=='.'? '-':c) );
18 }
19
InsertChr(char c)20 inline char InsertChr(char c) {
21 return ((c>='A' && c<='Z')? c+'a'-'A' : ((c>='0' && c<='9') || c=='-')? '.':c );
22 }
23
WordChr(char c)24 inline int WordChr(char c) {
25 return (int)((c>='A' && c<='Z') || (c>='a' && c<='z'));
26 }
27
28 // Compute the sum of bits of one or two integers
NumberOfSetBits(int i)29 inline int NumberOfSetBits(int i)
30 {
31 i = i - ((i >> 1) & 0x55555555);
32 i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
33 return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
34 }
35
36 //TODO: check
37 //inline int NumberOfSetBits(int i)
38 //{
39 // return _mm_popcnt_u32(i);
40 //}
41
42 /////////////////////////////////////////////////////////////////////////////////////
43 // Transforms the one-letter amino acid code into an integer between 0 and 22
44 /////////////////////////////////////////////////////////////////////////////////////
aa2i(char c)45 inline char aa2i(char c)
46 {
47 //A R N D C Q E G H I L K M F P S T W Y V
48 if (c>='a' && c<='z') c+='A'-'a';
49 switch (c)
50 {
51 case 'A': return 0;
52 case 'R': return 1;
53 case 'N': return 2;
54 case 'D': return 3;
55 case 'C': return 4;
56 case 'Q': return 5;
57 case 'E': return 6;
58 case 'G': return 7;
59 case 'H': return 8;
60 case 'I': return 9;
61 case 'L': return 10;
62 case 'K': return 11;
63 case 'M': return 12;
64 case 'F': return 13;
65 case 'P': return 14;
66 case 'S': return 15;
67 case 'T': return 16;
68 case 'W': return 17;
69 case 'Y': return 18;
70 case 'V': return 19;
71 case 'X': return ANY;
72 case 'J': return ANY;
73 case 'O': return ANY;
74 case 'U': return 4; //Selenocystein -> Cystein
75 case 'B': return 3; //D (or N)
76 case 'Z': return 6; //E (or Q)
77 case '-': return GAP;
78 case '.': return GAP;
79 case '_': return GAP;
80 }
81 if (c>=0 && c<=32) return -1; // white space and control characters
82 return -2;
83 }
84
85 /////////////////////////////////////////////////////////////////////////////////////
86 // Transforms integers between 0 and 22 into the one-letter amino acid code
87 /////////////////////////////////////////////////////////////////////////////////////
i2aa(char c)88 inline char i2aa(char c)
89 {
90 //A R N D C Q E G H I L K M F P S T W Y V
91 switch (c)
92 {
93 case 0: return 'A';
94 case 1: return 'R';
95 case 2: return 'N';
96 case 3: return 'D';
97 case 4: return 'C';
98 case 5: return 'Q';
99 case 6: return 'E';
100 case 7: return 'G';
101 case 8: return 'H';
102 case 9: return 'I';
103 case 10: return 'L';
104 case 11: return 'K';
105 case 12: return 'M';
106 case 13: return 'F';
107 case 14: return 'P';
108 case 15: return 'S';
109 case 16: return 'T';
110 case 17: return 'W';
111 case 18: return 'Y';
112 case 19: return 'V';
113 case ANY: return 'X';
114 case GAP: return '-';
115 case ENDGAP: return '-';
116 }
117 return '?';
118 }
119
120 /////////////////////////////////////////////////////////////////////////////////////
121 // Transforms the dssp/psipred secondary structure code into an integer number
122 /////////////////////////////////////////////////////////////////////////////////////
ss2i(char c)123 inline char ss2i(char c)
124 {
125 //- H E C S T G B
126 if (c>='a' && c<='z') c+='A'-'a';
127 switch (c)
128 {
129 case '.': return 0;
130 case '-': return 0;
131 case 'X': return 0;
132 case 'H': return 1;
133 case 'E': return 2;
134 case 'C': return 3;
135 case '~': return 3;
136 case 'S': return 4;
137 case 'T': return 5;
138 case 'G': return 6;
139 case 'B': return 7;
140 case 'I': return 3;
141 case ' ': return -1;
142 case '\t': return -1;
143 case '\n': return -1;
144 }
145 return -2;
146 }
147
148 /////////////////////////////////////////////////////////////////////////////////////
149 // Transforms integers between 0 and 8 into the dssp/psipred secondary structure code
150 /////////////////////////////////////////////////////////////////////////////////////
i2ss(int c)151 inline char i2ss(int c)
152 {
153 //- H E C S T G B
154 switch (c)
155 {
156 case 0: return '-';
157 case 1: return 'H';
158 case 2: return 'E';
159 case 3: return 'C';
160 case 4: return 'S';
161 case 5: return 'T';
162 case 6: return 'G';
163 case 7: return 'B';
164 case 8: return 'I';
165 }
166 return '?';
167 }
168
169
170 /////////////////////////////////////////////////////////////////////////////////////
171 // Transforms the solvend accessiblity code into an integer number
172 /////////////////////////////////////////////////////////////////////////////////////
sa2i(char c)173 inline char sa2i(char c)
174 {
175 //- A B C D E
176 if (c>='a' && c<='z') c+='A'-'a';
177 switch (c)
178 {
179 case '.': return 0;
180 case '-': return 0;
181 case 'A': return 1;
182 case 'B': return 2;
183 case 'C': return 3;
184 case 'D': return 4;
185 case 'E': return 5;
186 case 'F': return 6;
187 case ' ': return -1;
188 case '\t': return -1;
189 case '\n': return -1;
190 }
191 return -2;
192 }
193
194 /////////////////////////////////////////////////////////////////////////////////////
195 // Transforms integers between 0 and 5 into the solvent accessibility code
196 /////////////////////////////////////////////////////////////////////////////////////
i2sa(int c)197 inline char i2sa(int c)
198 {
199 //- H E C S T G B
200 switch (c)
201 {
202 case 0: return '-';
203 case 1: return 'A';
204 case 2: return 'B';
205 case 3: return 'C';
206 case 4: return 'D';
207 case 5: return 'E';
208 case 6: return 'F';
209 }
210 return '?';
211 }
212
213
214 /////////////////////////////////////////////////////////////////////////////////////
215 // Transforms alternative secondary structure symbols into symbols
216 /////////////////////////////////////////////////////////////////////////////////////
ss2ss(char c)217 inline char ss2ss(char c)
218 {
219 //- H E C S T G B
220 switch (c)
221 {
222 case '~': return 'C';
223 case 'I': return 'C';
224 case 'i': return 'c';
225 case 'H':
226 case 'E':
227 case 'C':
228 case 'S':
229 case 'T':
230 case 'G':
231 case 'B':
232 case 'h':
233 case 'e':
234 case 'c':
235 case 's':
236 case 't':
237 case 'g':
238 case 'b':
239 case '.':
240 return c;
241 }
242 return '-';
243 }
244
245 /////////////////////////////////////////////////////////////////////////////////////
246 // Transforms confidence values of psipred into internal code
247 /////////////////////////////////////////////////////////////////////////////////////
cf2i(char c)248 inline char cf2i(char c)
249 {
250 switch (c)
251 {
252 case '-': return 0;
253 case '.': return 0;
254 case '0': return 1;
255 case '1': return 2;
256 case '2': return 3;
257 case '3': return 4;
258 case '4': return 5;
259 case '5': return 6;
260 case '6': return 7;
261 case '7': return 8;
262 case '8': return 9;
263 case '9': return 10;
264 }
265 return 0;
266 }
267
268 /////////////////////////////////////////////////////////////////////////////////////
269 // Transforms internal representation of psipred confidence values into printable chars
270 /////////////////////////////////////////////////////////////////////////////////////
i2cf(char c)271 inline char i2cf(char c)
272 {
273 switch (c)
274 {
275 case 0: return '-';
276 case 1: return '0';
277 case 2: return '1';
278 case 3: return '2';
279 case 4: return '3';
280 case 5: return '4';
281 case 6: return '5';
282 case 7: return '6';
283 case 8: return '7';
284 case 9: return '8';
285 case 10: return '9';
286 }
287 return '-';
288 }
289
290
291 /////////////////////////////////////////////////////////////////////////////////////
292 // Fast lookup of log2(1+2^(-x)) for x>=0 (precision < 0.35%)
293 /////////////////////////////////////////////////////////////////////////////////////
fast_addscore(float x)294 inline float fast_addscore(float x)
295 {
296 static float val[2001]; // val[i]=log2(1+2^(-x))
297 static char initialized=0;
298 if (x>20) return 0.0;
299 if (x<0)
300 {
301 fprintf(stderr,"Error in function fast_addscore: argument %g is negative\n",x);
302 exit(7);
303 }
304 if (!initialized) //First fill in the log2-vector
305 {
306 for (int i=0; i<=2000; i++) val[i]=flog2(1.0+fpow2(-0.01*(i+0.5)));
307 initialized=1;
308 }
309 return val[(int)(100.0*x)];
310 }
311
312
313
314 /////////////////////////////////////////////////////////////////////////////////////
315 // Little utilities for output
316 /////////////////////////////////////////////////////////////////////////////////////
fout(FILE * outf,int d)317 inline void fout(FILE* outf, int d)
318 {
319 if (d>=99999) fprintf(outf,"*\t"); else fprintf(outf,"%i\t",d);
320 return;
321 }
322
323
sout(std::stringstream & out,int d)324 inline void sout(std::stringstream& out, int d) {
325 if (d>=99999)
326 out << "*\t";
327 else
328 out << d << "\t";
329 return;
330 }
331
332
333 /////////////////////////////////////////////////////////////////////////////////////
334 //// Takes family code (eg. a.1.2.3) and returns strings 'a', 'a.1', and 'a.1.2'
335 /////////////////////////////////////////////////////////////////////////////////////
ScopID(char cl[],char fold[],char sfam[],const char fam[])336 inline void ScopID(char cl[], char fold[], char sfam[], const char fam[])
337 {
338 char* ptr;
339
340 //get scop class ID
341 strcpy(cl,fam);
342 ptr = strchr(cl,'.'); //return adress of next '.' in name
343 if(ptr) ptr[0]='\0';
344
345 //get scop fold ID
346 strcpy(fold,fam);
347 ptr = strchr(fold,'.'); //return adress of next '.' in name
348 if(ptr) ptr = strchr(ptr+1,'.'); //return adress of next '.' in name
349 if(ptr) ptr[0]='\0';
350
351 //get scop superfamily ID
352 strcpy(sfam,fam);
353 ptr = strchr(sfam,'.'); //return adress of next '.' in name
354 if(ptr) ptr = strchr(ptr+1,'.'); //return adress of next '.' in name
355 if(ptr) ptr = strchr(ptr+1,'.'); //return adress of next '.' in name
356 if(ptr) ptr[0]='\0';
357 return;
358 }
359
360 /////////////////////////////////////////////////////////////////////////////////////
361 // SIMD 2^x for four floats
362 // Calculate float of 2pow(x) for four floats in parallel with SSE2
363 // ATTENTION: need to compile with g++ -fno-strict-aliasing when using -O2 or -O3!!!
364 // Relative deviation < 4.6E-6 (< 2.3E-7 with 5'th order polynomial)
365 //
366 // Internal representation of float number according to IEEE 754 (__m128 --> 4x):
367 // 1bit sign, 8 bits exponent, 23 bits mantissa: seee eeee emmm mmmm mmmm mmmm mmmm mmmm
368 // 0x4b400000 = 0100 1011 0100 0000 0000 0000 0000 0000
369 // In summary: x = (-1)^s * 1.mmmmmmmmmmmmmmmmmmmmmm * 2^(eeeeeee-127)
370 /////////////////////////////////////////////////////////////////////////////////////
simdf32_fpow2(simd_float X)371 inline simd_float simdf32_fpow2(simd_float X) {
372
373 simd_int* xPtr = (simd_int*) &X; // store address of float as pointer to int
374
375 const simd_float CONST32_05f = simdf32_set(0.5f); // Initialize a vector (4x32) with 0.5f
376 // (3 << 22) --> Initialize a large integer vector (shift left)
377 const simd_int CONST32_3i = simdi32_set(3);
378 const simd_int CONST32_3shift22 = simdi32_slli(CONST32_3i, 22);
379 const simd_float CONST32_1f = simdf32_set(1.0f);
380 const simd_float CONST32_FLTMAXEXP = simdf32_set(FLT_MAX_EXP);
381 const simd_float CONST32_FLTMAX = simdf32_set(FLT_MAX);
382 const simd_float CONST32_FLTMINEXP = simdf32_set(FLT_MIN_EXP);
383 // fifth order
384 const simd_float CONST32_A = simdf32_set(0.00187682f);
385 const simd_float CONST32_B = simdf32_set(0.00898898f);
386 const simd_float CONST32_C = simdf32_set(0.0558282f);
387 const simd_float CONST32_D = simdf32_set(0.240153f);
388 const simd_float CONST32_E = simdf32_set(0.693153f);
389
390 simd_float tx;
391 simd_int lx;
392 simd_float dx;
393 simd_float result = simdf32_set(0.0f);
394 simd_float maskedMax = simdf32_set(0.0f);
395 simd_float maskedMin = simdf32_set(0.0f);
396
397 // Check wheter one of the values is bigger or smaller than FLT_MIN_EXP or FLT_MAX_EXP
398 // The correct FLT_MAX_EXP value is written to the right place
399 maskedMax = simdf32_gt(X, CONST32_FLTMAXEXP);
400 maskedMin = simdf32_gt(X, CONST32_FLTMINEXP);
401 maskedMin = simdf32_xor(maskedMin, maskedMax);
402 // If a value is bigger than FLT_MAX_EXP --> replace the later result with FLTMAX
403 maskedMax = simdf32_and(CONST32_FLTMAX, simdf32_gt(X, CONST32_FLTMAXEXP));
404
405 tx = simdf32_add((simd_float ) CONST32_3shift22, simdf32_sub(X, CONST32_05f)); // temporary value for truncation: x-0.5 is added to a large integer (3<<22),
406 // 3<<22 = (1.1bin)*2^23 = (1.1bin)*2^(150-127),
407 // which, in internal bits, is written 0x4b400000 (since 10010110bin = 150)
408
409 lx = simdf32_f2i(tx); // integer value of x
410
411 dx = simdf32_sub(X, simdi32_i2f(lx)); // float remainder of x
412
413 // x = 1.0f + dx*(0.693153f // polynomial apporoximation of 2^x for x in the range [0, 1]
414 // + dx*(0.240153f // Gives relative deviation < 2.3E-7
415 // + dx*(0.0558282f // Speed: 2.3E-8s
416 // + dx*(0.00898898f
417 // + dx* 0.00187682f ))));
418 X = simdf32_mul(dx, CONST32_A);
419 X = simdf32_add(CONST32_B, X); // add constant B
420 X = simdf32_mul(dx, X);
421 X = simdf32_add(CONST32_C, X); // add constant C
422 X = simdf32_mul(dx, X);
423 X = simdf32_add(CONST32_D, X); // add constant D
424 X = simdf32_mul(dx, X);
425 X = simdf32_add(CONST32_E, X); // add constant E
426 X = simdf32_mul(dx, X);
427 X = simdf32_add(X, CONST32_1f); // add 1.0f
428
429 simd_int lxExp = simdi32_slli(lx, 23); // add integer power of 2 to exponent
430
431 *xPtr = simdi32_add(*xPtr, lxExp); // add integer power of 2 to exponent
432
433 // Add all Values that are greater than min and less than max
434 result = simdf32_and(maskedMin, X);
435 // Add MAX_FLT values where entry values were > FLT_MAX_EXP
436 result = simdf32_or(result, maskedMax);
437
438 return result;
439 }
440
441 // Fast SIMD log2 for four floats
442 // Calculate integer of log2 for four floats in parallel with SSE2
443 // Maximum deviation: +/- 2.1E-5
444 // Run time: ~5.6ns on Intel core2 2.13GHz.
445 // For a negative argument, nonsense is returned. Otherwise, when <1E-38, a value
446 // close to -126 is returned and when >1.7E38, +128 is returned.
447 // The function makes use of the representation of 4-byte floating point numbers:
448 // seee eeee emmm mmmm mmmm mmmm mmmm mmmm
449 // s is the sign, eee eee e gives the exponent + 127 (in hex: 0x7f).
450 // The following 23 bits give the mantisse, the binary digits after the decimal
451 // point: x = (-1)^s * 1.mmmmmmmmmmmmmmmmmmmmmmm * 2^(eeeeeeee-127)
452 // Therefore, log2(x) = eeeeeeee-127 + log2(1.mmmmmm...)
453 // = eeeeeeee-127 + log2(1+y), where y = 0.mmmmmm...
454 // ~ eeeeeeee-127 + ((a*y+b)*y+c)*y
455 // The coefficients a, b were determined by a least squares fit, and c=1-a-b to get 1 at y=1.
456 // Lower/higher order polynomials may be used for faster or more precise calculation:
457 // Order 1: log2(1+y) ~ y
458 // Order 2: log2(1+y) = (a*y + 1-a)*y, a=-0.3427
459 // => max dev = +/- 8E-3, run time ~ 3.8ns
460 // Order 3: log2(1+y) = ((a*y+b)*y + 1-a-b)*y, a=0.1564, b=-0.5773
461 // => max dev = +/- 1E-3, run time ~ 4.4ns
462 // Order 4: log2(1+y) = (((a*y+b)*y+c)*y + 1-a-b-c)*y, a=-0.0803 b=0.3170 c=-0.6748
463 // => max dev = +/- 1.4E-4, run time ~ 5.0ns?
464 // Order 5: log2(1+y) = ((((a*y+b)*y+c)*y+d)*y + 1-a-b-c-d)*y, a=0.0440047 b=-0.1903190 c=0.4123442 d=-0.7077702
465 // => max dev = +/- 2.1E-5, run time ~ 5.6ns?
466
simdf32_flog2(simd_float X)467 inline simd_float simdf32_flog2(simd_float X) {
468
469 const simd_int CONST32_0x7f = simdi32_set(0x7f);
470 const simd_int CONST32_0x7fffff = simdi32_set(0x7fffff);
471 const simd_int CONST32_0x3f800000 = simdi32_set(0x3f800000);
472 const simd_float CONST32_1f = simdf32_set(1.0);
473 // const float a=0.1564, b=-0.5773, c=1.0-a-b; // third order
474 const float a=0.0440047f, b=-0.1903190f, c=0.4123442f, d=-0.7077702f, e=1.0-a-b-c-d; // fifth order
475 const simd_float CONST32_A = simdf32_set(a);
476 const simd_float CONST32_B = simdf32_set(b);
477 const simd_float CONST32_C = simdf32_set(c);
478 const simd_float CONST32_D = simdf32_set(d);
479 const simd_float CONST32_E = simdf32_set(e);
480 simd_int E; // exponents of X
481 simd_float R; // result
482 E = simdi32_srli((simd_int) X, 23); // shift right by 23 bits to obtain exponent+127
483 E = simdi32_sub(E, CONST32_0x7f); // subtract 127 = 0x7f
484 X = (simd_float) simdi_and((simd_int) X, CONST32_0x7fffff); // mask out exponent => mantisse
485 X = (simd_float) simdi_or ((simd_int) X, CONST32_0x3f800000); // set exponent to 127 (i.e., 0)
486 X = simdf32_sub(X, CONST32_1f); // subtract one from mantisse
487 R = simdf32_mul(X, CONST32_A); // R = a*X
488 R = simdf32_add(R, CONST32_B); // R = a*X+b
489 R = simdf32_mul(R, X); // R = (a*X+b)*X
490 R = simdf32_add(R, CONST32_C); // R = (a*X+b)*X+c
491 R = simdf32_mul(R, X); // R = ((a*X+b)*X+c)*X
492 R = simdf32_add(R, CONST32_D); // R = ((a*X+b)*X+c)*X+d
493 R = simdf32_mul(R, X); // R = (((a*X+b)*X+c)*X+d)*X
494 R = simdf32_add(R, CONST32_E); // R = (((a*X+b)*X+c)*X+d)*X+e
495 R = simdf32_mul(R, X); // R = ((((a*X+b)*X+c)*X+d)*X+e)*X ~ log2(1+X) !!
496 R = simdf32_add(R, simdi32_i2f(E)); // convert integer exponent to float and add to mantisse
497 return R;
498
499 }
500
501 #define LOG_POLY_DEGREE 4
502 #define POLY0(x, c0) simdf32_set(c0)
503 #define POLY1(x, c0, c1) simdf32_add(simdf32_mul(POLY0(x, c1), x), simdf32_set(c0))
504 #define POLY2(x, c0, c1, c2) simdf32_add(simdf32_mul(POLY1(x, c1, c2), x), simdf32_set(c0))
505 #define POLY3(x, c0, c1, c2, c3) simdf32_add(simdf32_mul(POLY2(x, c1, c2, c3), x), simdf32_set(c0))
506 #define POLY4(x, c0, c1, c2, c3, c4) simdf32_add(simdf32_mul(POLY3(x, c1, c2, c3, c4), x), simdf32_set(c0))
507 #define POLY5(x, c0, c1, c2, c3, c4, c5) simdf32_add(simdf32_mul(POLY4(x, c1, c2, c3, c4, c5), x), simdf32_set(c0))
508
log2f4(simd_float x)509 inline simd_float log2f4(simd_float x)
510 {
511 simd_int exp = simdi32_set(0x7F800000);
512 simd_int mant = simdi32_set(0x007FFFFF);
513
514 simd_float one = simdf32_set( 1.0f);
515
516 simd_int i = simdf_f2icast(x);
517
518 simd_float e = simdi32_i2f(simdi32_sub(simdi32_srli(simdi_and(i, exp), 23), simdi32_set(127)));
519
520 simd_float m = simdf32_or(simdi_i2fcast(simdi_and(i, mant)), one);
521
522 simd_float p;
523
524 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
525 #if LOG_POLY_DEGREE == 6
526 p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
527 #elif LOG_POLY_DEGREE == 5
528 p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
529 #elif LOG_POLY_DEGREE == 4
530 p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
531 #elif LOG_POLY_DEGREE == 3
532 p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
533 #else
534 #error
535 #endif
536
537 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
538 p = simdf32_mul(p, simdf32_sub(m, one));
539
540 return simdf32_add(p, e);
541 }
542
543 //// Perform log-sum-exp calculation with six SIMD variables
544 //// result = x1 + x2 + x3 + x4 + x5 + x6
545 //// --> result = log2( 2^(x1) + 2^(x2) + 2^(x3) + 2^(x4) + 2^(x5) + 2^(x6))
546 //// // to prevent overflows apply log sum of exp trick
547 //// --> xMax = max(x1, x2, x3, x4, x5, x6)
548 //// --> result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax)))
549 //// --> result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
550 //// --> result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
551 //// WHERE x1, x2, x3, x4, x5 and x6 contain the log-values respectively!
552 //inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2,
553 // simd_float x3, simd_float x4, simd_float x5, simd_float x6) {
554 //
555 // // Calculate the maximum out of the six variables
556 // simd_float x_max0 = simdf32_max(x1,x2);
557 // simd_float x_max1 = simdf32_max(x3,x4);
558 // simd_float x_max2 = simdf32_max(x5,x6);
559 // x_max0 = simdf32_max(x_max0, x_max1);
560 // simd_float x_max = simdf32_max(x_max2, x_max0);
561 //
562 // simd_float max_comp_vec = simdf32_set(-FLT_MAX);
563 // simd_float term0 = simdf32_fpow2(simdf32_sub(x1, x_max));
564 // simd_float term1 = simdf32_fpow2(simdf32_sub(x2, x_max));
565 // simd_float term2 = simdf32_fpow2(simdf32_sub(x3, x_max));
566 // simd_float term3 = simdf32_fpow2(simdf32_sub(x4, x_max));
567 // simd_float term4 = simdf32_fpow2(simdf32_sub(x5, x_max));
568 // simd_float term5 = simdf32_fpow2(simdf32_sub(x6, x_max));
569 //
570 // term0 = simdf32_add(term0, term1); // 2^(x1 - x_max)
571 // term2 = simdf32_add(term2, term3); // + 2^(x2 - x_max)
572 // term4 = simdf32_add(term4, term5); // + 2^(x3 - x_max)
573 // term0 = simdf32_add(term0, term2); // + 2^(x4 - x_max)
574 // term4 = simdf32_add(term4, term0); // + 2^(x5 - x_max)
575 // // max(-FLT_MAX, x_max + log2(sum(terms)))
576 // return simdf32_max(max_comp_vec, simdf32_add(x_max, simdf32_flog2(term4)));
577 //}
578
579 // Perform log-sum-exp calculation with three SIMD variables
580 // result = x1 + x2 + x3
581 // --> result = log2( 2^(x1) + 2^(x2) + 2^(x3) )
582 // // to prevent overflows apply log sum of exp trick
583 // --> xMax = max(x1, x2, x3)
584 // --> result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax)))
585 // --> result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
586 // --> result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
587 // WHERE x1, x2 and x3 contain the log-values respectively!
simd_flog2_sum_fpow2(simd_float x1,simd_float x2,simd_float x3)588 inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2, simd_float x3) {
589
590 // Calculate the maximum out of the six variables
591 simd_float x_max = simdf32_max(x1, simdf32_max(x2, x3));
592
593 simd_float max_comp_vec = simdf32_set(-FLT_MAX);
594
595
596 return simdf32_max(max_comp_vec, simdf32_add(x_max, simdf32_flog2( // x_max + log2(
597 simdf32_add(simdf32_fpow2(simdf32_sub(x1, x_max)), // 2^(x1 - x_max)
598 simdf32_add(simdf32_fpow2(simdf32_sub(x2, x_max)), // + 2^(x2 - x_max)
599 simdf32_fpow2(simdf32_sub(x3, x_max))))))); // + 2^(x3 - x_max))
600
601 }
602
603 //// Perform log-sum-exp calculation with two SIMD variables
604 //// result = x1 + x2
605 //// --> result = log2(2^(x1) + 2^(x2))
606 //// // to prevent overflows apply log sum of exp trick
607 //// --> xMax = max(x1, x2)
608 //// --> result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax)))
609 //// --> result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax))
610 //// --> result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax))
611 //// WHERE x1, x2 and x3 contain the log-values respectively!
612 //inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2) {
613 //
614 // // Calculate the maximum out of the six variables
615 // simd_float x_max = simdf32_max(x1, x2);
616 //
617 // simd_float max_comp_vec = simdf32_set(-FLT_MAX);
618 //
619 // return simdf32_max(max_comp_vec, simdf32_add(x_max, simdf32_flog2( // fmax(-FLT_MAX, x_max + log2(
620 // simdf32_add(simdf32_fpow2(simdf32_sub(x1, x_max)), // 2^(x1 - x_max)
621 // simdf32_fpow2(simdf32_sub(x2, x_max)))))); // + 2^(x2 - x_max))
622 //
623 //}
624
625 // Perform log-sum-exp calculation with two SIMD f32 variables
626 // Calculate log2( 2^x1 + 2^x2) in stable fashion using
627 // max(-FLT_MAX, max(x1,x2) + flog2(1.0 + fpow2(min(x1,x2) - max(x1,x2)));
simd_flog2_sum_fpow2(simd_float x1,simd_float x2)628 inline simd_float simd_flog2_sum_fpow2(simd_float x1, simd_float x2) {
629
630 const simd_float CONST_1f = simdf32_set(1.0f);
631 const simd_float CONST_max_comp_vec = simdf32_set(-FLT_MAX);
632 simd_float xmax = simdf32_max(x1, x2);
633 simd_float x = simdf32_min(x1, x2);
634 x = simdf32_sub(x,xmax);
635 x = simdf32_fpow2(x);
636 x = simdf32_add( CONST_1f, x );
637 x = simdf32_flog2(x);
638 x = simdf32_add(xmax,x);
639 return simdf32_max(CONST_max_comp_vec, x);
640 }
641
642
643
644 // Perform log-sum-exp calculation with six variables
645 // result = x1 + x2 + x3 + x4 + x5 + x6
646 // --> result = log2( 2^(x1) + 2^(x2) + 2^(x3) + 2^(x4) + 2^(x5) + 2^(x6))
647 // // to prevent overflows apply log sum of exp trick
648 // --> xMax = max(x1, x2, x3, x4, x5, x6)
649 // --> result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax)))
650 // --> result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
651 // --> result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax) + 2^(x4 - xMax) + 2^(x5 - xMax) + 2^(x6 - xMax))
652 // WHERE x1, x2, x3, x4, x5 and x6 contain the log-values respectively!
flog2_sum_fpow2(float x1,float x2,float x3,float x4,float x5,float x6)653 inline float flog2_sum_fpow2(float x1, float x2, float x3, float x4, float x5, float x6) {
654
655 float xmax = fmax(x1, fmax(x2, fmax(x3, fmax(x4, fmax(x5, x6)))));
656
657 return fmax(-FLT_MAX, xmax + flog2(
658 fpow2(x1 - xmax) +
659 fpow2(x2 - xmax) +
660 fpow2(x3 - xmax) +
661 fpow2(x4 - xmax) +
662 fpow2(x5 - xmax) +
663 fpow2(x6 - xmax)));
664 // return fmax(-FLT_MAX, xmax + log2f(
665 // powf(2, x1 - xmax) +
666 // powf(2, x2 - xmax) +
667 // powf(2, x3 - xmax) +
668 // powf(2, x4 - xmax) +
669 // powf(2, x5 - xmax) +
670 // powf(2, x6 - xmax)));
671
672 }
673
674 //
675 //inline float fast_flog2_sum_fpow2(float x1, float x2, float x3, float x4, float x5, float x6) {
676 //
677 // simd_float X = _mm256_set_ps(x1,x2,x3,x4,x5,x6,-FLT_MAX_EXP,-FLT_MAX_EXP);
678 //
679 // // Compute the maximum of the eight floats in X
680 // // xmax = fmax( fmax( fmax(x1,x2), fmax(x3, x4) ) , fmax(x5, x6) );
681 // simd_float Xshuf = _mm256_permute_ps(X,0xB1); // permute mask of 2bits: (2,3,0,1) = (10,11,00,01) = 0xB1
682 // simd_float Xmax = simdf32_max(X,Xshuf);
683 // Xshuf = _mm256_permute_ps(Xmax,0x8E); // permute mask of 2bits: (1,0,3,2) = (01,00,11,10) = 0x8E
684 // Xmax = simdf32_max(Xmax,Xshuf);
685 // Xshuf = _mm256_permute2f128_ps(Xmax,Xmax,0x08); // permute mask of 2bits: (0,0,1,0) = (00,00,01,00) = 0x08
686 // Xmax = simdf32_max(Xmax,Xshuf);
687 //
688 // // Compute pow2(X-Xmax) for all f32 floats
689 // X = simdf32_sub(X,Xmax);
690 // X = simdf32_fpow2(X);
691 //
692 // // Horizontally add f32 floats
693 // Xshuf = _mm256_permute_ps(X,0xB1); // permute mask of 2bits: (2,3,0,1) = (10,11,00,01) = 0xB1
694 // X = simdf32_add(X,Xshuf);
695 // Xshuf = _mm256_permute_ps(X,0x8E); // permute mask of 2bits: (2,3,0,1) = (10,11,00,01) = 0xB1
696 // X = simdf32_add(X,Xshuf);
697 // Xshuf = _mm256_permute2f128_ps(X,X,0x08); // permute mask of 2bits: (0,0,1,0) = (00,00,01,00) = 0x08
698 // X = simdf32_add(X,Xshuf);
699 //
700 // float xmax = simdf32_extract(Xmax,0x00);
701 // float x = simdf32_extract(X, 0x00);
702 // return fmax(-FLT_MAX, xmax + flog2(x) );
703 //}
704 //
705
706
707
708
709
710
711
712
713 // Perform log-sum-exp calculation with three variables
714 // result = x1 + x2 + x3
715 // --> result = log2( 2^(x1) + 2^(x2) + 2^(x3) )
716 // // to prevent overflows apply log sum of exp trick
717 // --> xMax = max(x1, x2, x3)
718 // --> result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax)))
719 // --> result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
720 // --> result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax) + 2^(x3 - xMax))
721 // WHERE x1, x2 and x3 contain the log-values respectively!
flog2_sum_fpow2(float x1,float x2,float x3)722 inline float flog2_sum_fpow2(float x1, float x2, float x3) {
723
724 // float arr[] = {x1, x2, x3};
725 // std::sort(arr, arr + 3);
726 //
727 // return fmax(-FLT_MAX, arr[2] + logo(arr[1] - arr[2] + logo(arr[0] - arr[1])));
728
729 float xmax = fmax(x1, fmax(x2, x3));
730
731 return fmax(-FLT_MAX, xmax + flog2(
732 fpow2(x1 - xmax) +
733 fpow2(x2 - xmax) +
734 fpow2(x3 - xmax)));
735 // return fmax(-FLT_MAX, xmax + log2f(
736 // powf(2, x1 - xmax) +
737 // powf(2, x2 - xmax) +
738 // powf(2, x3 - xmax)));
739
740 }
741
742 // Perform log-sum-exp calculation with two variables
743 // result = x1 + x2
744 // --> result = log2(2^(x1) + 2^(x2))
745 // // to prevent overflows apply log sum of exp trick
746 // --> xMax = max(x1, x2)
747 // --> result = log2(2^(xMax) * (2^(x1 - xMax) + 2^(x2 - xMax)))
748 // --> result = log2(2^(xMax)) + log2(2^(x1 - xMax) + 2^(x2 - xMax))
749 // --> result = xMax + log2(2^(x1 - xMax) + 2^(x2 - xMax))
750 // WHERE x1 and x2 contain the log-values respectively!
flog2_sum_fpow2(float x1,float x2)751 inline float flog2_sum_fpow2(float x1, float x2) {
752
753 // float arr[] = {x1, x2};
754 // std::sort(arr, arr + 2);
755 //
756 // return fmax(-FLT_MAX, arr[1] + logo(arr[0] - arr[1]));
757
758 float xmax = fmax(x1, x2);
759
760 return fmax(-FLT_MAX, xmax + flog2(
761 fpow2(x1 - xmax) +
762 fpow2(x2 - xmax)));
763 // return fmax(-FLT_MAX, xmax + log2f(
764 // powf(2, x1 - xmax) +
765 // powf(2, x2 - xmax)));
766
767 }
768
769
770
771 #endif /* HHUTIL_INL_H_ */
772