1 // hhhmm.C
2
3 #include "hhhmm.h"
4
5 /////////////////////////////////////////////////////////////////////////////////////
6 //// Class HMM
7 /////////////////////////////////////////////////////////////////////////////////////
8
9 /////////////////////////////////////////////////////////////////////////////////////
10 // Object constructor
11 /////////////////////////////////////////////////////////////////////////////////////
HMM(int maxseqdis,int maxres)12 HMM::HMM(int maxseqdis, int maxres) : maxseqdis(maxseqdis), maxres(maxres) {
13 sname = new char*[maxseqdis]; // names of stored sequences
14 seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!)
15 Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
16 Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
17 Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
18 longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field)
19 // Make Clang memory sanitizer happy, avoid copying uninitialized values in HMM::operator=()
20 name[0] = file[0] = fam[0] = sfam[0] = fold[0] = cl[0] = '\0';
21 ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B
22 sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed)
23 ss_pred = new char[maxres]; // predicted secondary structure 0:- 1:H 2:E 3:C
24 ss_conf = new char[maxres]; // confidence value of prediction 0:- 1:0 ... 10:9
25 l = new int[maxres]; // l[i] = pos. of j'th match state in aligment
26 f = new float*[maxres]; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
27 g = new float*[maxres]; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
28 p = new float*[maxres]; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
29 tr = new float*[maxres]; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D
30 for (int i = 0; i < maxres; i++)
31 f[i] = new float[NAA + 3];
32 for (int i = 0; i < maxres; i++)
33 g[i] = new float[NAA];
34 for (int i = 0; i < maxres; i++)
35 p[i] = (float*) mem_align(ALIGN_FLOAT, NAA * sizeof(float)); // align memory on 16B/32B boundaries for SSE2 / AVX
36 for (int i = 0; i < maxres; i++)
37 tr[i] = (float*) mem_align(ALIGN_FLOAT, NTRANS * sizeof(float));
38
39 L = 0;
40 Neff_HMM = 0;
41 n_display = n_seqs = N_in = N_filtered = 0;
42 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
43 lamda = 0.0;
44 mu = 0.0;
45 name[0] = longname[0] = fam[0] = '\0';
46 trans_lin = 0; // transition probs in log space
47 dont_delete_seqs = false;
48 has_pseudocounts = false;
49 divided_by_local_bg_freqs = false;
50 }
51
52 /////////////////////////////////////////////////////////////////////////////////////
53 // Object destructor
54 /////////////////////////////////////////////////////////////////////////////////////
~HMM()55 HMM::~HMM() {
56 //Delete name and seq matrices
57 if (!dont_delete_seqs) // don't delete sname and seq if flat copy to hit object has been made
58 {
59 for (int k = 0; k < n_seqs; k++)
60 delete[] sname[k];
61 for (int k = 0; k < n_seqs; k++)
62 delete[] seq[k];
63 } else // Delete all not shown sequences (lost otherwise)
64 {
65 if (n_seqs > n_display) {
66 for (int k = n_display; k < n_seqs; k++)
67 delete[] sname[k];
68 for (int k = n_display; k < n_seqs; k++)
69 delete[] seq[k];
70 }
71 }
72 delete[] sname;
73 delete[] seq;
74 delete[] Neff_M;
75 delete[] Neff_D;
76 delete[] Neff_I;
77 delete[] longname;
78 delete[] ss_dssp;
79 delete[] sa_dssp;
80 delete[] ss_pred;
81 delete[] ss_conf;
82 delete[] l;
83 for (int i = 0; i < maxres; i++)
84 if (f[i])
85 delete[] f[i];
86 else
87 break;
88 for (int i = 0; i < maxres; i++)
89 if (g[i])
90 delete[] g[i];
91 else
92 break;
93 for (int i = 0; i < maxres; i++)
94 if (p[i])
95 free(p[i]);
96 else
97 break;
98 for (int i = 0; i < maxres; i++)
99 if (tr[i])
100 free(tr[i]);
101 else
102 break;
103 delete[] f;
104 delete[] g;
105 delete[] p;
106 delete[] tr;
107 }
108
109 /////////////////////////////////////////////////////////////////////////////////////
110 // Deep-copy constructor
111 /////////////////////////////////////////////////////////////////////////////////////
operator =(HMM & q)112 HMM& HMM::operator=(HMM& q) {
113 if (!dont_delete_seqs) // don't delete sname and seq if flat copy to hit object has been made
114 {
115 for (int k = 0; k < n_seqs; k++)
116 delete[] sname[k];
117 for (int k = 0; k < n_seqs; k++)
118 delete[] seq[k];
119 } else // Delete all not shown sequences (lost otherwise)
120 {
121 if (n_seqs > n_display) {
122 for (int k = n_display; k < n_seqs; k++)
123 delete[] sname[k];
124 for (int k = n_display; k < n_seqs; k++)
125 delete[] seq[k];
126 }
127 }
128
129 L = q.L;
130 for (int i = 0; i <= L + 1; ++i) {
131 for (int a = 0; a < NAA; ++a) {
132 f[i][a] = q.f[i][a];
133 g[i][a] = q.g[i][a];
134 p[i][a] = q.p[i][a];
135 }
136 for (int a = 0; a < NTRANS; ++a)
137 tr[i][a] = q.tr[i][a];
138 ss_dssp[i] = q.ss_dssp[i];
139 sa_dssp[i] = q.sa_dssp[i];
140 ss_pred[i] = q.ss_pred[i];
141 ss_conf[i] = q.ss_conf[i];
142 l[i] = q.l[i];
143 }
144
145 n_display = q.n_display;
146 n_seqs = q.n_seqs;
147 for (int k = 0; k < n_seqs; k++) {
148 sname[k] = new char[strlen(q.sname[k]) + 1];
149 if (!sname[k])
150 MemoryError("array of names for sequences to display", __FILE__,
151 __LINE__, __func__);
152 strcpy(sname[k], q.sname[k]);
153 }
154 for (int k = 0; k < n_seqs; k++) {
155 seq[k] = new char[strlen(q.seq[k]) + 1];
156 if (!seq[k])
157 MemoryError("array of names for sequences to display", __FILE__,
158 __LINE__, __func__);
159 strcpy(seq[k], q.seq[k]);
160 }
161 ncons = q.ncons;
162 nfirst = q.nfirst;
163 nss_dssp = q.nss_dssp;
164 nsa_dssp = q.nsa_dssp;
165 nss_pred = q.nss_pred;
166 nss_conf = q.nss_conf;
167
168 for (int i = 0; i <= L + 1; ++i)
169 Neff_M[i] = q.Neff_M[i];
170 for (int i = 0; i <= L + 1; ++i)
171 Neff_I[i] = q.Neff_I[i];
172 for (int i = 0; i <= L + 1; ++i)
173 Neff_D[i] = q.Neff_D[i];
174 Neff_HMM = q.Neff_HMM;
175
176 strcpy(longname, q.longname);
177 strmcpy(name, q.name, NAMELEN - 1);
178 strmcpy(file, q.file, NAMELEN - 1);
179 strmcpy(fam, q.fam, NAMELEN - 1);
180 strmcpy(sfam, q.sfam, IDLEN - 1);
181 strmcpy(fold, q.fold, IDLEN - 1);
182 strmcpy(cl, q.cl, IDLEN - 1);
183
184 lamda = q.lamda;
185 mu = q.mu;
186 trans_lin = q.trans_lin; // transition probs in log space
187 dont_delete_seqs = q.dont_delete_seqs;
188 has_pseudocounts = q.has_pseudocounts;
189 divided_by_local_bg_freqs = q.divided_by_local_bg_freqs;
190
191 for (int a = 0; a < NAA; ++a)
192 pav[a] = q.pav[a];
193 N_in = q.N_in;
194 N_filtered = q.N_filtered;
195 return (HMM&) (*this);
196 }
197
198 /////////////////////////////////////////////////////////////////////////////////////
199 //// Read an HMM from an HHsearch .hhm file; return 0 at end of file
200 /////////////////////////////////////////////////////////////////////////////////////
Read(FILE * dbf,const int maxcol,const int nseqdis,float * pb,char * path)201 int HMM::Read(FILE* dbf, const int maxcol, const int nseqdis, float* pb,
202 char* path) {
203 char line[LINELEN] = ""; // input line
204 char str3[8] = "", str4[8] = ""; // first 3 and 4 letters of input line
205 char* ptr; // pointer for string manipulation
206 int i = 0; // index for match state (first=1)
207 int a; // amino acid index
208 int warn = 0;
209
210 //Delete name and seq matrices
211 if (!dont_delete_seqs) // Delete all sname and seq if no flat copy to hit object has been made
212 {
213 for (int k = 0; k < n_seqs; k++)
214 delete[] sname[k];
215 for (int k = 0; k < n_seqs; k++)
216 delete[] seq[k];
217 } else // Otherwise, delete only sequences not diplayed (lost otherwise)
218 {
219 if (n_seqs > n_display) {
220 for (int k = n_display; k < n_seqs; k++)
221 delete[] sname[k];
222 for (int k = n_display; k < n_seqs; k++)
223 delete[] seq[k];
224 }
225 }
226
227 L = 0;
228 Neff_HMM = 0;
229 n_display = N_in = N_filtered = 0;
230 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
231 lamda = mu = 0.0;
232 name[0] = longname[0] = fam[0] = '\0';
233 trans_lin = 0; // transition probs in log space
234 has_pseudocounts = false;
235 dont_delete_seqs = false;
236 divided_by_local_bg_freqs = false;
237 //If at the end of while-loop L is still 0 then we have reached end of db file
238
239 while (fgetline(line, LINELEN - 1, dbf)
240 && !(line[0] == '/' && line[1] == '/')) {
241
242 if (strscn(line) == NULL)
243 continue; // skip lines that contain only white space
244 substr(str3, line, 0, 2); // copy the first three characters into str3
245 substr(str4, line, 0, 3); // copy the first four characters into str4
246
247 if (!strncmp("HH", line, 2))
248 continue;
249
250 if (!strcmp("NAME", str4)) {
251 ptr = strscn(line + 4); //advance to first non-white-space character
252 if (ptr) {
253 strmcpy(longname, ptr, DESCLEN - 1); //copy full name to longname
254 strmcpy(name, ptr, NAMELEN - 1); //copy longname to name...
255 strcut(name); //...cut after first word...
256 } else {
257 strcpy(longname, "undefined");
258 strcpy(name, "undefined");
259 }
260
261 HH_LOG(DEBUG1) << "Reading in HMM " << name << ":" << std::endl;
262 }
263
264 else if (!strcmp("FAM", str3)) {
265 ptr = strscn(line + 3); //advance to first non-white-space character
266 if (ptr)
267 strmcpy(fam, ptr, IDLEN - 1);
268 else
269 strcpy(fam, ""); //copy family name to basename
270 ScopID(cl, fold, sfam, fam); //get scop classification from basename (e.g. a.1.2.3.4)
271 }
272
273 else if (!strcmp("FILE", str4)) {
274 if (path)
275 strmcpy(file, path, NAMELEN - 1);
276 else
277 *file = '\0'; // copy path to file variable
278 ptr = strscn(line + 4); //advance to first non-white-space character
279 if (ptr)
280 strncat(file, ptr, NAMELEN - 1 - strlen(file)); // append file name read from file to path
281 else
282 strcat(file, "*");
283 }
284
285 else if (!strcmp("LENG", str4)) {
286 ptr = line + 4;
287 L = strint(ptr); //read next integer (number of match states)
288 } else if (!strcmp("FILT", str4) || !strcmp("NSEQ", str4)) {
289 ptr = line + 4;
290 N_filtered = strint(ptr); //read next integer: number of sequences after filtering
291 N_in = strint(ptr); //read next integer: number of sequences in alignment
292 }
293
294 else if (!strcmp("NEFF", str4) || !strcmp("NAA", str3))
295 sscanf(line + 6, "%f", &Neff_HMM);
296
297 else if (!strcmp("EVD", str3)) {
298 // char key[IDLEN];
299 sscanf(line + 6, "%f %f", &lamda, &mu);
300 // sscanf(line+22,"%s",key);
301 // lamda_hash.Add(key,lamda);
302 // mu_hash.Add(key,mu);
303 }
304
305 else if (!strcmp("PCT", str3)) {
306 has_pseudocounts = true;
307 } else if (!strcmp("DESC", str4))
308 continue;
309 else if (!strcmp("COM", str3))
310 continue;
311 else if (!strcmp("DATE", str4))
312 continue;
313
314 /////////////////////////////////////////////////////////////////////////////////////
315 // Read template sequences that should get displayed in output alignments
316 else if (!strcmp("SEQ", str3)) {
317 //char cur_seq[par.maxcol]=""; //Sequence currently read in
318 char* cur_seq = new char[maxcol]; //Sequence currently read in
319 int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n
320 int h; // index for character in input line
321 int l = 1; // index of character in sequence seq[k]
322 int i = 1; // index of match states in ss_dssp[i] and ss_pred[i] sequence
323 int n_seq = 0; // number of sequences to be displayed EXCLUDING ss sequences
324 cur_seq[0] = '-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
325 k = -1;
326 while (fgetline(line, LINELEN - 1, dbf) && line[0] != '#') {
327 HH_LOG(DEBUG1) << "Read from file:" << line << std::endl;
328 if (line[0] == '>') //line contains sequence name
329 {
330 if (k >= MAXSEQDIS - 1) //maximum number of allowable sequences exceeded
331 {
332 HH_LOG(WARNING) << "Warning in " << __FILE__ << ":"
333 << __LINE__ << ": " << __func__ << ":"
334 << std::endl;
335 HH_LOG(WARNING) << "\tnumber of sequences in " << file
336 << " exceeds maximum allowed number of "
337 << MAXSEQDIS << ". Skipping sequences.\n";
338 while (fgetline(line, LINELEN - 1, dbf)
339 && line[0] != '#')
340 ;
341 break;
342 }
343 k++;
344 if (!strncmp(line, ">ss_dssp", 8))
345 nss_dssp = k;
346 else if (!strncmp(line, ">sa_dssp", 8))
347 nsa_dssp = k;
348 else if (!strncmp(line, ">ss_pred", 8))
349 nss_pred = k;
350 else if (!strncmp(line, ">ss_conf", 8))
351 nss_conf = k;
352 else if (!strncmp(line, ">Cons-", 6)
353 || !strncmp(line, ">Consensus", 10))
354 ncons = k;
355 else {
356 if (nfirst == -1)
357 nfirst = k;
358 // if (n_seq>=par.nseqdis)
359 // {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;}
360 n_seq++;
361 }
362
363 //If this is not the first sequence then store residues of previous sequence
364 if (k > 0) {
365 seq[k - 1] = new char[strlen(cur_seq) + 1];
366 if (!seq[k - 1])
367 MemoryError("array of sequences to display",
368 __FILE__, __LINE__, __func__);
369 strcpy(seq[k - 1], cur_seq);
370 }
371
372 // store sequence name
373 strcut(line + 1); //find next white-space character and overwrite it with end-of-string character
374 sname[k] = new char[strlen(line + 1) + 1]; //+1 for terminating '\0'
375 if (!sname[k])
376 MemoryError("array of names for sequences to display",
377 __FILE__, __LINE__, __func__);
378 strcpy(sname[k], line + 1); //store sequence name in **name
379 l = 1;
380 i = 1;
381 } else //line contains sequence residues
382 {
383 if (k == -1) {
384 HH_LOG(WARNING) << "Ignoring following line while reading HMM" << name << ":\n\'" << line << "\'" << std::endl;
385 continue;
386 }
387
388 h = 0; //counts characters in current line
389
390 // Check whether all characters are correct; store into cur_seq
391 if (k == nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
392 {
393 while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
394 if (ss2i(line[h]) >= 0 && line[h] != '.') {
395 char c = ss2ss(line[h]);
396 cur_seq[l] = c;
397 if (c != '.' && !(c >= 'a' && c <= 'z'))
398 ss_dssp[i++] = ss2i(c);
399 l++;
400 } else if (ss2i(line[h]) == -2) {
401 HH_LOG(WARNING) << std::endl
402 << "Ignoring invalid symbol \'"
403 << line[h] << "\' at pos. " << h
404 << " in line '" << line << "' of HMM "
405 << name << "\n";
406 }
407 h++;
408 }
409 }
410 if (k == nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E)
411 {
412 while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
413 if (sa2i(line[h]) >= 0) {
414 char c = line[h];
415 cur_seq[l] = c;
416 if (c != '.' && !(c >= 'a' && c <= 'z'))
417 sa_dssp[i++] = sa2i(c);
418 l++;
419 } else if (sa2i(line[h]) == -2) {
420 HH_LOG(WARNING) << std::endl
421 << "Ignoring invalid symbol \'"
422 << line[h] << "\' at pos. " << h
423 << " in line '" << line << "' of HMM "
424 << name << "\n";
425 }
426 h++;
427 }
428 } else if (k == nss_pred) // lines with predicted secondary structure (. - H E C)
429 {
430 while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
431 if (ss2i(line[h]) >= 0 && ss2i(line[h]) <= 3
432 && line[h] != '.') {
433 char c = ss2ss(line[h]);
434 cur_seq[l] = c;
435 if (c != '.' && !(c >= 'a' && c <= 'z'))
436 ss_pred[i++] = ss2i(c);
437 l++;
438 } else if (ss2i(line[h]) == -2) {
439 HH_LOG(WARNING) << std::endl
440 << "Ignoring invalid symbol \'"
441 << line[h] << "\' at pos. " << h
442 << " in line '" << line << "' of HMM "
443 << name << "\n";
444 }
445 h++;
446 }
447 } else if (k == nss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
448 {
449 while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
450 if (line[h] == '-'
451 || (line[h] >= '0' && line[h] <= '9')) {
452 cur_seq[l] = line[h];
453 ss_conf[l] = cf2i(line[h]);
454 l++;
455 } else if (cf2i(line[h]) == -2) {
456 HH_LOG(WARNING) << std::endl
457 << "Ignoring invalid symbol \'"
458 << line[h] << "\' at pos. " << h
459 << " in line '" << line << "' of HMM "
460 << name << "\n";
461 }
462 h++;
463 }
464 } else // normal line containing residues
465 {
466 while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
467 // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1)
468 if (aa2i(line[h]) >= 0 && line[h] != '.') {
469 cur_seq[l] = line[h];
470 l++;
471 } else if (aa2i(line[h]) == -2) {
472 HH_LOG(WARNING) << std::endl
473 << "Ignoring invalid symbol \'"
474 << line[h] << "\' at pos. " << h
475 << " in line '" << line << "' of HMM "
476 << name << "\n";
477 }
478 h++;
479 }
480 }
481 cur_seq[l] = '\0'; //Ensure that cur_seq ends with a '\0' character
482
483 } //end else
484
485 if (n_seq <= nseqdis)
486 n_display = k + 1;
487 } //while(getline)
488
489 //If this is not the first sequence some residues have already been read in
490 if (k >= 0) {
491 seq[k] = new char[strlen(cur_seq) + 1];
492 if (!seq[k])
493 MemoryError("array of sequences to display", __FILE__,
494 __LINE__, __func__);
495 strcpy(seq[k], cur_seq);
496 }
497 n_seqs = k + 1;
498
499 // DEBUG
500 if (Log::reporting_level() >= DEBUG1) {
501 HH_LOG(DEBUG1) << "nss_dssp=" << nss_dssp << " nsa_dssp=" << nsa_dssp << " nss_pred=" << nss_pred << " nss_conf=" << nss_conf << " nfirst=" << nfirst << std::endl;
502 for (k = 0; k < n_display; k++) {
503 int j;
504 HH_LOG(DEBUG1) << ">" << sname[k] << "(k=" << k << ")\n";
505 if (k == nss_dssp) {
506 for (j = 1; j <= L; j++)
507 HH_LOG(DEBUG1) << char(i2ss(ss_dssp[j]));
508 } else if (k == nsa_dssp) {
509 for (j = 1; j <= L; j++)
510 HH_LOG(DEBUG1) << char(i2sa(sa_dssp[j]));
511 } else if (k == nss_pred) {
512 for (j = 1; j <= L; j++)
513 HH_LOG(DEBUG1) << char(i2ss(ss_pred[j]));
514 } else if (k == nss_conf) {
515 for (j = 1; j <= L; j++)
516 HH_LOG(DEBUG1) << int(ss_conf[j] - 1);
517 } else {
518 for (j = 1; j <= L; j++)
519 HH_LOG(DEBUG1) << seq[k][j];
520 }
521 HH_LOG(DEBUG1) << "\n";
522 }
523 }
524
525 delete[] cur_seq;
526
527 } //end if("SEQ")
528
529 /////////////////////////////////////////////////////////////////////////////////////
530 // Read average amino acid frequencies for HMM
531 else if (!strcmp("FREQ", str4))
532 FormatError(file, __FILE__, __LINE__, __func__,
533 "File has obsolete format. Please use hhmake version > 1.1 to generate hhm files.\n");
534
535 else if (!strcmp("AVER", str4)) {
536 } // AVER line scrapped
537 else if (!strcmp("NULL", str4)) {
538 ptr = line + 4;
539 for (a = 0; a < 20 && ptr; ++a)
540 //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
541 pb[s2a[a]] = (float) fpow2(float(-strinta(ptr)) / HMMSCALE);
542 if (!ptr)
543 return Warning(dbf, line, name);
544 if (Log::reporting_level() >= DEBUG1) {
545 HH_LOG(DEBUG1) << "\nNULL ";
546 for (a = 0; a < 20; ++a)
547 HH_LOG(DEBUG1) << 100. * pb[s2a[a]] << " ";
548 HH_LOG(DEBUG1) << std::endl;
549 }
550 }
551
552 /////////////////////////////////////////////////////////////////////////////////////
553 // Read transition probabilities from start state
554 else if (!strcmp("HMM", str3)) {
555 fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
556 fgetline(line, LINELEN - 1, dbf); // Skip line with transition labels
557 ptr = line;
558
559 for (a = 0; a <= D2D && ptr; ++a)
560 tr[0][a] = float(-strinta(ptr)) / HMMSCALE; //store transition probabilites as log2 values
561 // strinta returns next integer in string and puts ptr to first char
562 // after the integer. Returns -99999 if '*' is found.
563 // ptr is set to 0 if no integer is found after ptr.
564 Neff_M[0] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with M->? transition
565 Neff_I[0] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with I->? transition
566 Neff_D[0] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with D->? transition
567 if (!ptr)
568 return Warning(dbf, line, name);
569
570 /////////////////////////////////////////////////////////////////////////////////////
571 // Read columns of HMM
572 int next_i = 0; // index of next column
573 while (fgetline(line, LINELEN - 2, dbf)
574 && !(line[0] == '/' && line[1] == '/') && line[0] != '#') {
575 if (strscn(line) == NULL)
576 continue; // skip lines that contain only white space
577
578 // Read in AA probabilities
579 ptr = line + 1;
580 int prev_i = next_i;
581 next_i = strint(ptr);
582 ++i;
583 if (next_i != prev_i + 1)
584 if (++warn <= 5) {
585 HH_LOG(WARNING) << "In HMM " << name
586 << " state " << prev_i
587 << " is followed by state " << next_i << "\n";
588 if (warn == 5) {
589 HH_LOG(WARNING) << std::endl
590 << "Further warnings while reading HMMs will be suppressed.\n";
591 }
592 }
593 if (i > L) {
594 HH_LOG(WARNING) << std::endl << "In HMM " << name
595 << " there are more columns than the stated length "
596 << L << ". Skipping HMM\n";
597 return 2;
598 }
599 if (i > maxres - 2) {
600 fgetline(line, LINELEN - 1, dbf); // Skip line
601 continue;
602 }
603
604 for (a = 0; a < 20 && ptr; ++a)
605 // f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE);
606 f[i][s2a[a]] = fpow2(float(-strinta(ptr)) / HMMSCALE); // speed-up ~5 s for 10000 SCOP domains
607
608 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
609 l[i] = strint(ptr);
610 if (!ptr)
611 return Warning(dbf, line, name);
612 if (Log::reporting_level() >= DEBUG1) {
613 HH_LOG(DEBUG1) << line;
614 HH_LOG(DEBUG1) << i << " ";
615 for (a = 0; a < 20; ++a)
616 HH_LOG(DEBUG1) << 100 * f[i][s2a[a]] << " ";
617 HH_LOG(DEBUG1) << l[i];
618 HH_LOG(DEBUG1) << std::endl;
619 }
620
621 // Read transition probabilities
622 fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
623 if (line[0] != ' ' && line[0] != '\t')
624 return Warning(dbf, line, name);
625 ptr = line;
626 for (a = 0; a <= D2D && ptr; ++a)
627 tr[i][a] = float(-strinta(ptr)) / HMMSCALE; //store transition prob's as log2-values
628 Neff_M[i] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with M->? transition
629 if (Neff_M[i] == 0) {
630 Neff_M[i] = 1;
631 }
632 Neff_I[i] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with I->? transition
633 Neff_D[i] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with D->? transition
634 if (!ptr)
635 return Warning(dbf, line, name);
636 if (Log::reporting_level() >= DEBUG1) {
637 HH_LOG(DEBUG1) << " ";
638 for (a = 0; a <= D2D; ++a)
639 HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
640 HH_LOG(DEBUG1) << Neff_M[i] << " " << Neff_I[i] << " " << Neff_D[i] << std::endl;
641 }
642 }
643 if (line[0] == '/' && line[1] == '/')
644 break;
645 } else
646 HH_LOG(WARNING) << std::endl << "Ignoring line\n\'" << line
647 << "\'\nin HMM " << name << "\n";
648
649 } //while(getline)
650
651 if (L == 0)
652 return 0; //End of db file -> stop reading in
653
654 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
655 // lamda = lamda_hash.Show(par.Key());
656 // mu = mu_hash.Show(par.Key());
657 if (lamda) {
658 HH_LOG(DEBUG) << "HMM " << name << " is already calibrated: lamda="
659 << lamda << ", mu=" << mu << std::endl;
660 }
661
662 if (i != L) {
663 HH_LOG(WARNING) << "In HMM " << name << " there are only " << i
664 << " columns while the stated length is " << L << "\n";
665 }
666
667 if (i > maxres - 2) {
668 i = maxres - 2;
669 HH_LOG(WARNING) << "Maximum number " << maxres - 2
670 << " of residues exceeded while reading HMM " << name << "\n";
671 }
672 if (!i) {
673 HH_LOG(WARNING) << std::endl << "HMM " << name
674 << " contains no match states. Check the alignment that gave rise to this HMM.\n";
675 }
676
677 HH_LOG(DEBUG) << "Read in HMM " << name << " with " << L
678 << " match states and effective number of sequences = "
679 << Neff_HMM << "\n";
680 L = i;
681
682 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
683 for (a = 0; a < 20; ++a)
684 f[0][a] = f[L + 1][a] = pb[a];
685 Neff_M[L + 1] = 1.0f;
686 Neff_I[L + 1] = Neff_D[L + 1] = 0.0f;
687
688 return 1; //return status: ok
689 }
690
691 /////////////////////////////////////////////////////////////////////////////////////
692 //// Read an HMM from a HMMer .hmm file; return 0 at end of file
693 /////////////////////////////////////////////////////////////////////////////////////
ReadHMMer(FILE * dbf,const char showcons,float * pb,char * filestr)694 int HMM::ReadHMMer(FILE* dbf, const char showcons, float* pb, char* filestr) {
695 char line[LINELEN] = ""; // input line
696 char desc[DESCLEN] = ""; // description of family
697 char str4[5] = ""; // first 4 letters of input line
698 char* ptr; // pointer for string manipulation
699 int i = 0; // index for match state (first=1)
700 int a; // amino acid index
701 char dssp = 0; // 1 if a consensus SS has been found in the transition prob lines
702 char annot = 0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
703 int k = 0; // index for seq[k]
704 static char ignore_hmmer_cal = 0;
705 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
706 annotchr = new char[maxres]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
707 int warn = 0;
708
709 L = 0;
710 Neff_HMM = 0;
711 n_seqs = n_display = N_in = N_filtered = 0;
712 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
713 lamda = mu = 0.0;
714 trans_lin = 0; // transition probs in log space
715 has_pseudocounts = true; // !!
716 dont_delete_seqs = false;
717 divided_by_local_bg_freqs = false;
718 name[0] = longname[0] = desc[0] = fam[0] = '\0';
719 //If at the end of while-loop L is still 0 then we have reached end of db file
720
721 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
722
723 while (fgetline(line, LINELEN - 1, dbf)
724 && !(line[0] == '/' && line[1] == '/')) {
725
726 if (strscn(line) == NULL)
727 continue; // skip lines that contain only white space
728 if (!strncmp("HMMER", line, 5))
729 continue;
730
731 substr(str4, line, 0, 3); // copy the first four characters into str4
732
733 if (!strcmp("NAME", str4) && name[0] == '\0') {
734 ptr = strscn(line + 4); // advance to first non-white-space character
735 strmcpy(name, ptr, NAMELEN - 1); // copy full name to name
736 strcut(name); // ...cut after first word...
737 HH_LOG(DEBUG1) << "Reading in HMM " << name << ":" << std::endl;
738 }
739
740 else if (!strcmp("ACC ", str4)) {
741 ptr = strscn(line + 4); // advance to first non-white-space character
742 strmcpy(longname, ptr, DESCLEN - 1); // copy Accession id to longname...
743 }
744
745 else if (!strcmp("DESC", str4)) {
746 ptr = strscn(line + 4); // advance to first non-white-space character
747 if (ptr) {
748 strmcpy(desc, ptr, DESCLEN - 1); // copy description to name...
749 strcut(ptr); // ...cut after first word...
750 }
751 if (!ptr || ptr[1] != '.' || strchr(ptr + 3, '.') == NULL)
752 strcpy(fam, "");
753 else
754 strmcpy(fam, ptr, NAMELEN - 1); // could not find two '.' in name?
755 }
756
757 else if (!strcmp("LENG", str4)) {
758 ptr = line + 4;
759 L = strint(ptr); //read next integer (number of match states)
760 }
761
762 else if (!strcmp("ALPH", str4))
763 continue;
764 else if (!strcmp("RF ", str4))
765 continue;
766 else if (!strcmp("CS ", str4))
767 continue;
768 else if (!strcmp("MAP ", str4))
769 continue;
770 else if (!strcmp("COM ", str4))
771 continue;
772 else if (!strcmp("NSEQ", str4)) {
773 ptr = line + 4;
774 N_in = N_filtered = strint(ptr); //read next integer: number of sequences after filtering
775 }
776
777 else if (!strcmp("DATE", str4))
778 continue;
779 else if (!strncmp("CKSUM ", line, 5))
780 continue;
781 else if (!strcmp("GA ", str4))
782 continue;
783 else if (!strcmp("TC ", str4))
784 continue;
785 else if (!strcmp("NC ", str4))
786 continue;
787
788 else if (!strncmp("SADSS", line, 5)) {
789 if (nsa_dssp < 0) {
790 nsa_dssp = k++;
791 seq[nsa_dssp] = new char[maxres + 2];
792 sname[nsa_dssp] = new char[15];
793 strcpy(seq[nsa_dssp], " ");
794 strcpy(sname[nsa_dssp], "sa_dssp");
795
796 }
797 ptr = strscn(line + 5);
798 if (ptr) {
799 strcut(ptr);
800 if (strlen(seq[nsa_dssp]) + strlen(ptr) >= (unsigned) (maxres))
801 printf(
802 "HMM %s has SADSS records with more than %i residues.\n",
803 name, maxres);
804 else
805 strcat(seq[nsa_dssp], ptr);
806 }
807 }
808
809 else if (!strncmp("SSPRD", line, 5)) {
810 if (nss_pred < 0) {
811 nss_pred = k++;
812 seq[nss_pred] = new char[maxres + 2];
813 sname[nss_pred] = new char[15];
814 strcpy(seq[nss_pred], " ");
815 strcpy(sname[nss_pred], "ss_pred");
816
817 }
818 ptr = strscn(line + 5);
819 if (ptr) {
820 strcut(ptr);
821 if (strlen(seq[nss_pred]) + strlen(ptr) >= (unsigned) (maxres))
822 printf(
823 "HMM %s has SSPRD records with more than %i residues.\n",
824 name, maxres);
825 else
826 strcat(seq[nss_pred], ptr);
827 }
828 }
829
830 else if (!strncmp("SSCON", line, 5)) {
831 if (nss_conf < 0) {
832 nss_conf = k++;
833 seq[nss_conf] = new char[maxres + 2];
834 sname[nss_conf] = new char[15];
835 strcpy(seq[nss_conf], " ");
836 strcpy(sname[nss_conf], "ss_conf");
837 }
838 ptr = strscn(line + 5);
839 if (ptr) {
840 strcut(ptr);
841 if (strlen(seq[nss_conf]) + strlen(ptr) >= (unsigned) (maxres))
842 printf(
843 "HMM %s has SSPRD records with more than %i residues.\n",
844 name, maxres);
845 else
846 strcat(seq[nss_conf], ptr);
847 }
848 }
849
850 else if (!strncmp("SSCIT", line, 5))
851 continue;
852 else if (!strcmp("XT ", str4))
853 continue;
854 else if (!strcmp("NULT", str4))
855 continue;
856
857 else if (!strcmp("NULE", str4)) {
858 ptr = line + 4;
859 for (a = 0; a < 20 && ptr; ++a)
860 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
861 pb[s2a[a]] = (float) 0.05
862 * fpow2(float(strinta(ptr, -99999)) / HMMSCALE);
863 if (!ptr)
864 return Warning(dbf, line, name);
865 if (Log::reporting_level() >= DEBUG1) {
866 HH_LOG(DEBUG1) << "\nNULL ";
867 for (a = 0; a < 20; ++a)
868 HH_LOG(DEBUG1) << 100. * pb[s2a[a]] << " ";
869 HH_LOG(DEBUG1) << std::endl;
870 }
871 }
872
873 else if (!strcmp("EVD ", str4)) {
874 char* ptr = line + 4;
875 ptr = strscn(ptr);
876 sscanf(ptr, "%f", &lamda);
877 ptr = strscn(ptr);
878 sscanf(ptr, "%f", &mu);
879 if (lamda < 0) {
880 if (ignore_hmmer_cal == 0) {
881 HH_LOG(WARNING) << std::endl
882 << "Some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n";
883 }
884 ignore_hmmer_cal = 1;
885 mu = lamda = 0.0;
886 }
887 }
888
889 /////////////////////////////////////////////////////////////////////////////////////
890 // Read transition probabilities from start state
891 else if (!strncmp("HMM", line, 3)) {
892 fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
893 fgetline(line, LINELEN - 1, dbf); // Skip line with transition labels
894 ptr = line;
895 for (a = 0; a <= M2D && ptr; ++a)
896 tr[0][a] = float(strinta(ptr, -99999)) / HMMSCALE; //store transition probabilites as log2 values
897 // strinta returns next integer in string and puts ptr to first char
898 // after the integer. Returns -99999 if '*' is found.
899 // ptr is set to 0 if no integer is found after ptr.
900 tr[0][I2M] = tr[0][D2M] = 0.0;
901 tr[0][I2I] = tr[0][D2D] = -99999.0;
902 if (!ptr)
903 return Warning(dbf, line, name);
904 if (Log::reporting_level() >= DEBUG1) {
905 HH_LOG(DEBUG1) << " ";
906 for (a = 0; a <= D2D && ptr; ++a)
907 HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
908 HH_LOG(DEBUG1) << std::endl;
909 }
910
911 // Prepare to store DSSP states (if there are none, delete afterwards)
912 nss_dssp = k++;
913 seq[nss_dssp] = new char[maxres + 2];
914 sname[nss_dssp] = new char[15];
915 strcpy(sname[nss_dssp], "ss_dssp");
916
917 /////////////////////////////////////////////////////////////////////////////////////
918 // Read columns of HMM
919 int next_i = 0; // index of next column
920 while (fgetline(line, LINELEN - 1, dbf)
921 && !(line[0] == '/' && line[1] == '/') && line[0] != '#') {
922 if (strscn(line) == NULL)
923 continue; // skip lines that contain only white space
924
925 // Read in AA probabilities
926 ptr = line;
927 int prev_i = next_i;
928 next_i = strint(ptr);
929 ++i;
930 if (next_i != prev_i + 1)
931 if (++warn < 5) {
932 HH_LOG(WARNING) << std::endl << "In HMM " << name
933 << " state " << prev_i
934 << " is followed by state " << next_i << "\n";
935 if (warn == 5) {
936 HH_LOG(WARNING) << std::endl
937 << "Further warnings while reading HMMs will be suppressed.\n";
938 }
939 }
940 if (i > L) {
941 HH_LOG(WARNING) << std::endl << "In HMM " << name
942 << " there are more columns than the stated length "
943 << L << ". Skipping columns.\n";
944 break;
945 }
946 if (i > L) {
947 HH_LOG(WARNING) << std::endl << "In HMM " << name
948 << " there are more columns than the stated length "
949 << L << "\n";
950 }
951 if (i >= maxres - 2) {
952 fgetline(line, LINELEN - 1, dbf); // Skip two lines
953 fgetline(line, LINELEN - 1, dbf);
954 continue;
955 }
956
957 for (a = 0; a < 20 && ptr; ++a)
958 f[i][s2a[a]] = (float) pb[s2a[a]]
959 * fpow2(float(strinta(ptr, -99999)) / HMMSCALE);
960 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
961 if (!ptr)
962 return Warning(dbf, line, name);
963 if (Log::reporting_level() >= DEBUG1) {
964 HH_LOG(WARNING) << i;
965 for (a = 0; a < 20; ++a)
966 HH_LOG(WARNING) << 100 * f[i][s2a[a]] << " ";
967 HH_LOG(WARNING) << std::endl;
968 }
969
970 // Read insert emission line
971 fgetline(line, LINELEN - 1, dbf);
972 ptr = strscn(line);
973 if (!ptr)
974 return Warning(dbf, line, name);
975 annotchr[i] = uprchr(*ptr);
976 if (*ptr != '-' && *ptr != ' ' && *ptr != 'X' && *ptr != 'x')
977 annot = 1;
978
979 // Read annotation character and seven transition probabilities
980 fgetline(line, LINELEN - 1, dbf);
981 ptr = strscn(line);
982 switch (*ptr) {
983 case 'H':
984 ss_dssp[i] = 1;
985 seq[nss_dssp][i] = *ptr;
986 dssp = 1;
987 break;
988 case 'E':
989 ss_dssp[i] = 2;
990 seq[nss_dssp][i] = *ptr;
991 dssp = 1;
992 break;
993 case 'C':
994 ss_dssp[i] = 3;
995 seq[nss_dssp][i] = *ptr;
996 dssp = 1;
997 break;
998 case 'S':
999 ss_dssp[i] = 4;
1000 seq[nss_dssp][i] = *ptr;
1001 dssp = 1;
1002 break;
1003 case 'T':
1004 ss_dssp[i] = 5;
1005 seq[nss_dssp][i] = *ptr;
1006 dssp = 1;
1007 break;
1008 case 'G':
1009 ss_dssp[i] = 6;
1010 seq[nss_dssp][i] = *ptr;
1011 dssp = 1;
1012 break;
1013 case 'B':
1014 ss_dssp[i] = 7;
1015 seq[nss_dssp][i] = *ptr;
1016 dssp = 1;
1017 break;
1018 case 'I':
1019 dssp = 1;
1020 case '~':
1021 ss_dssp[i] = 3;
1022 seq[nss_dssp][i] = *ptr;
1023 break;
1024 case '-': // no SS available from any template
1025 case '.': // no clear consensus SS structure
1026 case 'X': // no clear consensus SS structure
1027 ss_dssp[i] = 0;
1028 seq[nss_dssp][i] = '-';
1029 break;
1030 default:
1031 ss_dssp[i] = 0;
1032 seq[nss_dssp][i] = *ptr;
1033 break;
1034 }
1035
1036 ptr += 2;
1037 for (a = 0; a <= D2D && ptr; ++a)
1038 tr[i][a] = float(strinta(ptr, -99999)) / HMMSCALE; //store transition prob's as log2-values
1039 if (!ptr)
1040 return Warning(dbf, line, name);
1041 if (Log::reporting_level() >= DEBUG1) {
1042 HH_LOG(DEBUG1) << " ";
1043 for (a = 0; a <= D2D; ++a)
1044 HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
1045 HH_LOG(DEBUG1) << std::endl;
1046 }
1047 }
1048
1049 if (line[0] == '/' && line[1] == '/')
1050 break;
1051
1052 }
1053
1054 } //while(getline)
1055
1056 if (L == 0)
1057 return 0; //End of db file -> stop reading in
1058
1059 // Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
1060 // lamda = lamda_hash.Show(par.Key());
1061 // mu = mu_hash.Show(par.Key());
1062 if (lamda) {
1063 HH_LOG(DEBUG) << "HMM "<< name <<" is already calibrated: lamda="<< lamda<<", mu="<<mu<< std::endl;
1064 }
1065
1066 if (i != L) {
1067 HH_LOG(WARNING) << std::endl << "In HMM " << name
1068 << " there are only " << i
1069 << " columns while the stated length is " << L << std::endl;
1070 }
1071 if (i >= maxres - 2) {
1072 i = maxres - 2;
1073 HH_LOG(WARNING) << std::endl << "Maximum number " << maxres - 2
1074 << " of residues exceeded while reading HMM " << name << std::endl;
1075 }
1076 if (!i) {
1077 HH_LOG(WARNING) << std::endl << "HMM " << name
1078 << " contains no match states. Check the alignment that gave rise to this HMM." << std::endl;
1079 }
1080 L = i;
1081
1082 if (strlen(longname) > 0)
1083 strcat(longname, " ");
1084 strncat(longname, name, DESCLEN - strlen(longname) - 1); // longname = ACC NAME DESC
1085 if (strlen(name) > 0)
1086 strcat(longname, " ");
1087 strncat(longname, desc, DESCLEN - strlen(longname) - 1);
1088 longname[DESCLEN - 1] = '\0';
1089 ScopID(cl, fold, sfam, fam); // get scop classification from basename (e.g. a.1.2.3.4)
1090 RemoveExtension(file, filestr); // copy name of dbfile without extension into 'file'
1091
1092 // Secondary structure
1093 if (!dssp) {
1094 // remove dssp sequence
1095 delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1096 delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1097 nss_dssp = -1;
1098 k--;
1099 } else {
1100 seq[nss_dssp][0] = '-';
1101 seq[nss_dssp][L + 1] = '\0';
1102 }
1103
1104 if (nss_pred >= 0) {
1105 for (i = 1; i <= L; ++i)
1106 ss_pred[i] = ss2i(seq[nss_pred][i]);
1107 if (nss_conf >= 0)
1108 for (i = 1; i <= L; ++i)
1109 ss_conf[i] = cf2i(seq[nss_conf][i]);
1110 else
1111 for (i = 1; i <= L; ++i)
1112 ss_conf[i] = 5;
1113 }
1114
1115 // Copy query (first sequence) and consensus residues?
1116 if (showcons) {
1117 sname[k] = new char[10];
1118 strcpy(sname[k], "Consensus");
1119 sname[k + 1] = new char[strlen(longname) + 1];
1120 strcpy(sname[k + 1], longname);
1121 seq[k] = new char[L + 2];
1122 seq[k][0] = ' ';
1123 seq[k][L + 1] = '\0';
1124 seq[k + 1] = new char[L + 2];
1125 seq[k + 1][0] = ' ';
1126 seq[k + 1][L + 1] = '\0';
1127 for (i = 1; i <= L; ++i) {
1128 float pmax = 0.0;
1129 int amax = 0;
1130 for (a = 0; a < NAA; ++a)
1131 if (f[i][a] > pmax) {
1132 amax = a;
1133 pmax = f[i][a];
1134 }
1135 if (pmax > 0.6)
1136 seq[k][i] = i2aa(amax);
1137 else if (pmax > 0.4)
1138 seq[k][i] = lwrchr(i2aa(amax));
1139 else
1140 seq[k][i] = 'x';
1141 seq[k + 1][i] = i2aa(amax);
1142 }
1143 ncons = k++; // nfirst is set later!
1144 } else {
1145 sname[k] = new char[strlen(longname) + 1];
1146 strcpy(sname[k], longname);
1147 seq[k] = new char[L + 2];
1148 seq[k][0] = ' ';
1149 seq[k][L + 1] = '\0';
1150 }
1151
1152 if (annot) // read in some annotation characters?
1153 {
1154 annotchr[0] = ' ';
1155 annotchr[L + 1] = '\0';
1156 strcpy(seq[k], annotchr); // overwrite the consensus sequence with the annotation characters
1157 } else if (!showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1158 {
1159 for (i = 1; i <= L; ++i) {
1160 float pmax = 0.0;
1161 int amax = 0;
1162 for (a = 0; a < NAA; ++a)
1163 if (f[i][a] > pmax) {
1164 amax = a;
1165 pmax = f[i][a];
1166 }
1167 seq[k][i] = i2aa(amax);
1168 }
1169 }
1170 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
1171 nfirst = k++;
1172
1173 n_display = k;
1174 n_seqs = k;
1175
1176 // Calculate overall Neff_HMM
1177 Neff_HMM = 0;
1178 for (i = 1; i <= L; ++i) {
1179 float S = 0.0;
1180 for (a = 0; a < 20; ++a)
1181 if (f[i][a] > 1E-10)
1182 S -= f[i][a] * fast_log2(f[i][a]);
1183 Neff_HMM += (float) fpow2(S);
1184 }
1185 Neff_HMM /= L;
1186 for (i = 0; i <= L; ++i)
1187 Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1188 Neff_M[L + 1] = 1.0f;
1189 Neff_I[L + 1] = Neff_D[L + 1] = 0.0f;
1190
1191 HH_LOG(DEBUG) << "Read in HMM " << name << " with " << L
1192 << " match states and effective number of sequences = "
1193 << Neff_HMM << "\n";
1194
1195 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1196 for (a = 0; a < 20; ++a)
1197 f[0][a] = f[L + 1][a] = pb[a];
1198 delete[] annotchr;
1199 return 1; //return status: ok
1200 }
1201
1202 /////////////////////////////////////////////////////////////////////////////////////
1203 //// Read an HMM from a HMMER3 .hmm file; return 0 at end of file
1204 /////////////////////////////////////////////////////////////////////////////////////
ReadHMMer3(FILE * dbf,const char showcons,float * pb,char * filestr)1205 int HMM::ReadHMMer3(FILE* dbf, const char showcons, float* pb, char* filestr) {
1206 char line[LINELEN] = ""; // input line
1207 char desc[DESCLEN] = ""; // description of family
1208 char str4[5] = ""; // first 4 letters of input line
1209 char* ptr; // pointer for string manipulation
1210 int i = 0; // index for match state (first=1)
1211 int a; // amino acid index
1212 char dssp = 0; // 1 if a consensus SS has been found in the transition prob lines
1213 char annot = 0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
1214 int k = 0; // index for seq[k]
1215 char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1216 annotchr = new char[maxres]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1217 int warn = 0;
1218
1219 L = 0;
1220 Neff_HMM = 0;
1221 n_seqs = n_display = N_in = N_filtered = 0;
1222 nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
1223 lamda = mu = 0.0;
1224 trans_lin = 0; // transition probs in log space
1225 has_pseudocounts = true; // !!
1226 dont_delete_seqs = false;
1227 divided_by_local_bg_freqs = false;
1228 name[0] = longname[0] = desc[0] = fam[0] = '\0';
1229 //If at the end of while-loop L is still 0 then we have reached end of db file
1230
1231 // Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
1232
1233 while (fgetline(line, LINELEN - 1, dbf)
1234 && !(line[0] == '/' && line[1] == '/')) {
1235
1236 if (strscn(line) == NULL)
1237 continue; // skip lines that contain only white space
1238 if (!strncmp("HMMER", line, 5))
1239 continue;
1240
1241 substr(str4, line, 0, 3); // copy the first four characters into str4
1242
1243 if (!strcmp("NAME", str4) && name[0] == '\0') {
1244 ptr = strscn(line + 4); // advance to first non-white-space character
1245 strmcpy(name, ptr, NAMELEN - 1); // copy full name to name
1246 strcut(name); // ...cut after first word...
1247 HH_LOG(DEBUG1) << "Reading in HMM " << name << ":\n";
1248 }
1249
1250 else if (!strcmp("ACC ", str4)) {
1251 ptr = strscn(line + 4); // advance to first non-white-space character
1252 strmcpy(longname, ptr, DESCLEN - 1); // copy Accession id to longname...
1253 }
1254
1255 else if (!strcmp("DESC", str4)) {
1256 ptr = strscn(line + 4); // advance to first non-white-space character
1257 if (ptr) {
1258 strmcpy(desc, ptr, DESCLEN - 1); // copy description to name...
1259 desc[DESCLEN - 1] = '\0';
1260 strcut(ptr); // ...cut after first word...
1261 }
1262 if (!ptr || ptr[1] != '.' || strchr(ptr + 3, '.') == NULL)
1263 strcpy(fam, "");
1264 else
1265 strmcpy(fam, ptr, NAMELEN - 1); // could not find two '.' in name?
1266 }
1267
1268 else if (!strcmp("LENG", str4)) {
1269 ptr = line + 4;
1270 L = strint(ptr); //read next integer (number of match states)
1271 }
1272
1273 else if (!strcmp("ALPH", str4))
1274 continue;
1275 else if (!strcmp("RF ", str4))
1276 continue;
1277 else if (!strcmp("CS ", str4))
1278 continue;
1279 else if (!strcmp("MAP ", str4))
1280 continue;
1281 else if (!strcmp("COM ", str4))
1282 continue;
1283 else if (!strcmp("NSEQ", str4)) {
1284 ptr = line + 4;
1285 N_in = N_filtered = strint(ptr); //read next integer: number of sequences after filtering
1286 }
1287
1288 else if (!strcmp("DATE", str4))
1289 continue;
1290 else if (!strncmp("CKSUM ", line, 5))
1291 continue;
1292 else if (!strcmp("GA ", str4))
1293 continue;
1294 else if (!strcmp("TC ", str4))
1295 continue;
1296 else if (!strcmp("NC ", str4))
1297 continue;
1298
1299 //////////////////////////////////////////////////////////////////////////////////////////////////////
1300 // Still needed???
1301
1302 else if (!strncmp("SADSS", line, 5)) {
1303 if (nsa_dssp < 0) {
1304 nsa_dssp = k++;
1305 seq[nsa_dssp] = new char[maxres + 2];
1306 sname[nsa_dssp] = new char[15];
1307 strcpy(seq[nsa_dssp], " ");
1308 strcpy(sname[nsa_dssp], "sa_dssp");
1309
1310 }
1311 ptr = strscn(line + 5);
1312 if (ptr) {
1313 strcut(ptr);
1314 if (strlen(seq[nsa_dssp]) + strlen(ptr) >= (unsigned) (maxres))
1315 printf(
1316 "HMM %s has SADSS records with more than %i residues.\n",
1317 name, maxres);
1318 else
1319 strcat(seq[nsa_dssp], ptr);
1320 }
1321 }
1322
1323 else if (!strncmp("SSPRD", line, 5)) {
1324 if (nss_pred < 0) {
1325 nss_pred = k++;
1326 seq[nss_pred] = new char[maxres + 2];
1327 sname[nss_pred] = new char[15];
1328 strcpy(seq[nss_pred], " ");
1329 strcpy(sname[nss_pred], "ss_pred");
1330
1331 }
1332 ptr = strscn(line + 5);
1333 if (ptr) {
1334 strcut(ptr);
1335 if (strlen(seq[nss_pred]) + strlen(ptr) >= (unsigned) (maxres))
1336 printf(
1337 "HMM %s has SSPRD records with more than %i residues.\n",
1338 name, maxres);
1339 else
1340 strcat(seq[nss_pred], ptr);
1341 }
1342 }
1343
1344 else if (!strncmp("SSCON", line, 5)) {
1345 if (nss_conf < 0) {
1346 nss_conf = k++;
1347 seq[nss_conf] = new char[maxres + 2];
1348 sname[nss_conf] = new char[15];
1349 strcpy(seq[nss_conf], " ");
1350 strcpy(sname[nss_conf], "ss_conf");
1351 }
1352 ptr = strscn(line + 5);
1353 if (ptr) {
1354 strcut(ptr);
1355 if (strlen(seq[nss_conf]) + strlen(ptr) >= (unsigned) (maxres))
1356 printf(
1357 "HMM %s has SSPRD records with more than %i residues.\n",
1358 name, maxres);
1359 else
1360 strcat(seq[nss_conf], ptr);
1361 }
1362 }
1363
1364 else if (!strncmp("SSCIT", line, 5))
1365 continue;
1366 else if (!strcmp("XT ", str4))
1367 continue;
1368 //////////////////////////////////////////////////////////////////////////////////////////////////////
1369
1370 else if (!strncmp("STATS LOCAL", line, 11))
1371 continue;
1372
1373 else if (!strcmp("EFFN", str4)) {
1374 ptr = line + 4;
1375 float effn = strflt(ptr);
1376 // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d (fitted with scop25 dataset)
1377 Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5)
1378 - 0.2885410 * effn - 1.108568;
1379 }
1380
1381 /////////////////////////////////////////////////////////////////////////////////////
1382 // Read transition probabilities from start state
1383 else if (!strncmp("HMM", line, 3)) {
1384 fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
1385 fgetline(line, LINELEN - 1, dbf); // Skip line with transition labels
1386 ptr = strscn(line);
1387
1388 if (!strncmp("COMPO", ptr, 5)) {
1389 ptr = ptr + 5;
1390 for (a = 0; a < 20 && ptr; ++a)
1391 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1392 pb[s2a[a]] = (float) exp(-1.0 * strflta(ptr, 99999));
1393 if (!ptr)
1394 return Warning(dbf, line, name);
1395 if (Log::reporting_level() >= DEBUG1) {
1396 HH_LOG(DEBUG1) << "\nNULL ";
1397 for (a = 0; a < 20; ++a)
1398 HH_LOG(DEBUG1) << 100. * pb[s2a[a]] << " ";
1399 HH_LOG(DEBUG1) << std::endl;
1400 }
1401 fgetline(line, LINELEN - 1, dbf); // Read next line
1402 }
1403
1404 fgetline(line, LINELEN - 1, dbf); // Skip line with 0-states insert probabilities
1405
1406 ptr = strscn(line);
1407 for (a = 0; a <= D2D && ptr; ++a)
1408 tr[0][a] = log2((float) exp(-1.0 * strflta(ptr, 99999))); //store transition probabilites as log2 values
1409 // strinta returns next integer in string and puts ptr to first char
1410 // after the integer. Returns -99999 if '*' is found.
1411 // ptr is set to 0 if no integer is found after ptr.
1412 if (!ptr)
1413 return Warning(dbf, line, name);
1414 if (Log::reporting_level() >= DEBUG1) {
1415 HH_LOG(DEBUG1) << " ";
1416 for (a = 0; a <= D2D && ptr; ++a)
1417 HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
1418 HH_LOG(DEBUG1) << std::endl;
1419 }
1420
1421 // Prepare to store DSSP states (if there are none, delete afterwards)
1422 nss_dssp = k++;
1423 seq[nss_dssp] = new char[maxres + 2];
1424 sname[nss_dssp] = new char[15];
1425 strcpy(sname[nss_dssp], "ss_dssp");
1426
1427 /////////////////////////////////////////////////////////////////////////////////////
1428 // Read columns of HMM
1429 int next_i = 0; // index of next column
1430 while (fgetline(line, LINELEN - 1, dbf)
1431 && !(line[0] == '/' && line[1] == '/') && line[0] != '#') {
1432 if (strscn(line) == NULL)
1433 continue; // skip lines that contain only white space
1434
1435 // Read in AA probabilities
1436 ptr = line;
1437 int prev_i = next_i;
1438 next_i = strint(ptr);
1439 ++i;
1440 if (next_i != prev_i + 1)
1441 if (++warn < 5) {
1442 HH_LOG(WARNING) << "In HMM " << name
1443 << " state " << prev_i
1444 << " is followed by state " << next_i << "\n";
1445 if (warn == 5) {
1446 HH_LOG(WARNING) << std::endl
1447 << "Further warnings while reading HMMs will be suppressed.\n";
1448 }
1449 }
1450 if (i > L) {
1451 HH_LOG(WARNING) << std::endl << "In HMM " << name
1452 << " there are more columns than the stated length "
1453 << L << ". Skipping columns.\n";
1454 break;
1455 }
1456 if (i > L) {
1457 HH_LOG(WARNING) << std::endl << "In HMM " << name
1458 << " there are more columns than the stated length "
1459 << L << "\n";
1460 }
1461 if (i >= maxres - 2) {
1462 fgetline(line, LINELEN - 1, dbf); // Skip two lines
1463 fgetline(line, LINELEN - 1, dbf);
1464 continue;
1465 }
1466
1467 for (a = 0; a < 20 && ptr; ++a)
1468 f[i][s2a[a]] = (float) exp(-1.0 * strflta(ptr, 99999));
1469 //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1470 if (!ptr)
1471 return Warning(dbf, line, name);
1472 if (Log::reporting_level() >= DEBUG1) {
1473 HH_LOG(WARNING) << i << " ";
1474 for (a = 0; a < 20; ++a)
1475 HH_LOG(WARNING) << 100 * f[i][s2a[a]] << " ";
1476 HH_LOG(WARNING) << std::endl;
1477 }
1478
1479 // Ignore MAP annotation
1480 ptr = strscn(ptr); //find next word
1481 ptr = strscn_ws(ptr); // ignore word
1482
1483 // Read RF and CS annotation
1484 ptr = strscn(ptr);
1485 if (!ptr)
1486 return Warning(dbf, line, name);
1487 annotchr[i] = uprchr(*ptr);
1488 if (*ptr != '-' && *ptr != ' ' && *ptr != 'X' && *ptr != 'x')
1489 annot = 1;
1490
1491 ptr = strscn(ptr);
1492 switch (*ptr) {
1493 case 'H':
1494 ss_dssp[i] = 1;
1495 seq[nss_dssp][i] = *ptr;
1496 dssp = 1;
1497 break;
1498 case 'E':
1499 ss_dssp[i] = 2;
1500 seq[nss_dssp][i] = *ptr;
1501 dssp = 1;
1502 break;
1503 case 'C':
1504 ss_dssp[i] = 3;
1505 seq[nss_dssp][i] = *ptr;
1506 dssp = 1;
1507 break;
1508 case 'S':
1509 ss_dssp[i] = 4;
1510 seq[nss_dssp][i] = *ptr;
1511 dssp = 1;
1512 break;
1513 case 'T':
1514 ss_dssp[i] = 5;
1515 seq[nss_dssp][i] = *ptr;
1516 dssp = 1;
1517 break;
1518 case 'G':
1519 ss_dssp[i] = 6;
1520 seq[nss_dssp][i] = *ptr;
1521 dssp = 1;
1522 break;
1523 case 'B':
1524 ss_dssp[i] = 7;
1525 seq[nss_dssp][i] = *ptr;
1526 dssp = 1;
1527 break;
1528 case 'I':
1529 dssp = 1;
1530 case '~':
1531 ss_dssp[i] = 3;
1532 seq[nss_dssp][i] = *ptr;
1533 break;
1534 case '-': // no SS available from any template
1535 case '.': // no clear consensus SS structure
1536 case 'X': // no clear consensus SS structure
1537 ss_dssp[i] = 0;
1538 seq[nss_dssp][i] = '-';
1539 break;
1540 default:
1541 ss_dssp[i] = 0;
1542 seq[nss_dssp][i] = *ptr;
1543 break;
1544 }
1545
1546 // Read insert emission line
1547 fgetline(line, LINELEN - 1, dbf);
1548
1549 // Read seven transition probabilities
1550 fgetline(line, LINELEN - 1, dbf);
1551
1552 ptr = line;
1553 for (a = 0; a <= D2D && ptr; ++a)
1554 tr[i][a] = log2((float) exp(-1.0 * strflta(ptr, 99999))); //store transition prob's as log2-values
1555 if (!ptr)
1556 return Warning(dbf, line, name);
1557 if (Log::reporting_level() >= DEBUG1) {
1558 HH_LOG(DEBUG1) << " ";
1559 for (a = 0; a <= D2D; ++a)
1560 HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
1561 HH_LOG(DEBUG1) << std::endl;
1562 }
1563 }
1564
1565 if (line[0] == '/' && line[1] == '/')
1566 break;
1567
1568 }
1569
1570 } //while(getline)
1571
1572 if (L == 0)
1573 return 0; //End of db file -> stop reading in
1574
1575 if (i != L) {
1576 HH_LOG(WARNING) << std::endl << "In HMM " << name
1577 << " there are only " << i
1578 << " columns while the stated length is " << L << "\n";
1579 }
1580 if (i >= maxres - 2) {
1581 i = maxres - 2;
1582 HH_LOG(WARNING) << std::endl << "Maximum number " << maxres - 2
1583 << " of residues exceeded while reading HMM " << name << "\n";
1584 }
1585 if (!i) {
1586 HH_LOG(WARNING) << std::endl << "HMM " << name
1587 << " contains no match states. Check the alignment that gave rise to this HMM.\n";
1588 }
1589 L = i;
1590
1591 if (strlen(longname) > 0)
1592 strcat(longname, " ");
1593 strncat(longname, name, DESCLEN - strlen(longname) - 1); // longname = ACC NAME DESC
1594 if (strlen(name) > 0)
1595 strcat(longname, " ");
1596 strncat(longname, desc, DESCLEN - strlen(longname) - 1);
1597 longname[DESCLEN - 1] = '\0';
1598 ScopID(cl, fold, sfam, fam); // get scop classification from basename (e.g. a.1.2.3.4)
1599 RemoveExtension(file, filestr); // copy name of dbfile without extension into 'file'
1600
1601 // Secondary structure
1602 if (!dssp) {
1603 // remove dssp sequence
1604 delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1605 delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1606 nss_dssp = -1;
1607 k--;
1608 } else {
1609 seq[nss_dssp][0] = '-';
1610 seq[nss_dssp][L + 1] = '\0';
1611 }
1612
1613 if (nss_pred >= 0) {
1614 for (i = 1; i <= L; ++i)
1615 ss_pred[i] = ss2i(seq[nss_pred][i]);
1616 if (nss_conf >= 0)
1617 for (i = 1; i <= L; ++i)
1618 ss_conf[i] = cf2i(seq[nss_conf][i]);
1619 else
1620 for (i = 1; i <= L; ++i)
1621 ss_conf[i] = 5;
1622 }
1623
1624 // Copy query (first sequence) and consensus residues?
1625 if (showcons) {
1626 sname[k] = new char[10];
1627 strcpy(sname[k], "Consensus");
1628 sname[k + 1] = new char[strlen(longname) + 1];
1629 strcpy(sname[k + 1], longname);
1630 seq[k] = new char[L + 2];
1631 seq[k][0] = ' ';
1632 seq[k][L + 1] = '\0';
1633 seq[k + 1] = new char[L + 2];
1634 seq[k + 1][0] = ' ';
1635 seq[k + 1][L + 1] = '\0';
1636 for (i = 1; i <= L; ++i) {
1637 float pmax = 0.0;
1638 int amax = 0;
1639 for (a = 0; a < NAA; ++a)
1640 if (f[i][a] > pmax) {
1641 amax = a;
1642 pmax = f[i][a];
1643 }
1644 if (pmax > 0.6)
1645 seq[k][i] = i2aa(amax);
1646 else if (pmax > 0.4)
1647 seq[k][i] = lwrchr(i2aa(amax));
1648 else
1649 seq[k][i] = 'x';
1650 seq[k + 1][i] = i2aa(amax);
1651 }
1652 ncons = k++; // nfirst is set later!
1653 } else {
1654 sname[k] = new char[strlen(longname) + 1];
1655 strcpy(sname[k], longname);
1656 seq[k] = new char[L + 2];
1657 seq[k][0] = ' ';
1658 seq[k][L + 1] = '\0';
1659 }
1660
1661 if (annot) // read in some annotation characters?
1662 {
1663 annotchr[0] = ' ';
1664 annotchr[L + 1] = '\0';
1665 strcpy(seq[k], annotchr); // overwrite the consensus sequence with the annotation characters
1666 } else if (!showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1667 {
1668 for (i = 1; i <= L; ++i) {
1669 float pmax = 0.0;
1670 int amax = 0;
1671 for (a = 0; a < NAA; ++a)
1672 if (f[i][a] > pmax) {
1673 amax = a;
1674 pmax = f[i][a];
1675 }
1676 seq[k][i] = i2aa(amax);
1677 }
1678 }
1679 // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]);
1680 nfirst = k++;
1681
1682 n_display = k;
1683 n_seqs = k;
1684
1685 // If no effektive number of sequences is given, calculate Neff_HMM by given profile
1686 if (Neff_HMM == 0) {
1687 for (i = 1; i <= L; ++i) {
1688 float S = 0.0;
1689 for (a = 0; a < 20; ++a)
1690 if (f[i][a] > 1E-10)
1691 S -= f[i][a] * fast_log2(f[i][a]);
1692 Neff_HMM += (float) fpow2(S);
1693 }
1694 Neff_HMM /= L;
1695 }
1696
1697 for (i = 0; i <= L; ++i)
1698 Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1699 Neff_M[L + 1] = 1.0f;
1700 Neff_I[L + 1] = Neff_D[L + 1] = 0.0f;
1701
1702 HH_LOG(DEBUG) << "Read in HMM " << name << " with " << L
1703 << " match states and effective number of sequences = "
1704 << Neff_HMM << "\n";
1705
1706 ///////////////////////////////////////////////////////////////////
1707
1708 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1709 for (a = 0; a < 20; ++a)
1710 f[0][a] = f[L + 1][a] = pb[a];
1711 delete[] annotchr;
1712 return 1; //return status: ok
1713 }
1714
1715 /////////////////////////////////////////////////////////////////////////////////////
1716 // Add transition pseudocounts to HMM (and calculate lin-space transition probs)
1717 /////////////////////////////////////////////////////////////////////////////////////
AddTransitionPseudocounts(float gapd,float gape,float gapf,float gapg,float gaph,float gapi,float gapb,const float par_gapb)1718 void HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf,
1719 float gapg, float gaph, float gapi, float gapb, const float par_gapb) {
1720 int i; //position in alignment
1721 float sum;
1722 float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M;
1723 float p0, p1, p2;
1724 if (gapb <= 0)
1725 return;
1726 if (trans_lin == 1) {
1727 HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
1728 HH_LOG(WARNING) << "\tAdding transition pseudocounts to linear representation of " << name
1729 << " not allowed. Please report this error to the HHsearch developers.\n";
1730 exit(6);
1731 }
1732 if (trans_lin == 2) {
1733 HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
1734 HH_LOG(ERROR) << "\tAdding transition pseudocounts twice in " << name
1735 << " not allowed. Please report this error to the HHsearch developers.\n";
1736 exit(6);
1737 }
1738 trans_lin = 2;
1739
1740 // Calculate pseudocount transition probabilities
1741 pM2D = pM2I = gapd * 0.0286; //a-priori probability for inserts and deletions
1742 pM2M = 1 - pM2D - pM2I;
1743 // gape=0 -> pI2I=0 gape=1 -> pI2I=0.75 gape=inf -> pI2I=1.
1744 pI2I = 1.0 * gape / (gape - 1 + 1.0 / 0.75);
1745 pI2M = 1 - pI2I;
1746 // gape=0 -> pD2D=0 gape=1 -> pD2D=0.75 gape=inf -> pD2D=1.
1747 pD2D = 1.0 * gape / (gape - 1 + 1.0 / 0.75);
1748 pD2M = 1 - pD2D;
1749
1750 for (i = 0; i <= L; ++i) //for all columns in HMM
1751 {
1752 // Transitions from M state
1753 p0 = (Neff_M[i] - 1) * fpow2(tr[i][M2M]) + gapb * pM2M;
1754 p1 = (Neff_M[i] - 1) * fpow2(tr[i][M2D]) + gapb * pM2D;
1755 p2 = (Neff_M[i] - 1) * fpow2(tr[i][M2I]) + gapb * pM2I;
1756 if (i == 0)
1757 p1 = p2 = 0; //from M(0) no transition to D(1) and I(0) possible
1758 if (i == L)
1759 p1 = p2 = 0; //from M(L) no transition to D(L+1) and I(L+1) possible
1760 sum = p0 + p1 + p2 + FLT_MIN;
1761 tr[i][M2M] = fast_log2(p0 / sum);
1762 tr[i][M2D] = fast_log2(p1 / sum) * gapf;
1763 tr[i][M2I] = fast_log2(p2 / sum) * gapg;
1764
1765 // Transitions from I state
1766 p0 = Neff_I[i] * fpow2(tr[i][I2M]) + gapb * pI2M;
1767 p1 = Neff_I[i] * fpow2(tr[i][I2I]) + gapb * pI2I;
1768 sum = p0 + p1 + FLT_MIN;
1769 tr[i][I2M] = fast_log2(p0 / sum);
1770 tr[i][I2I] = fast_log2(p1 / sum) * gapi;
1771
1772 // Transitions from D state
1773 p0 = Neff_D[i] * fpow2(tr[i][D2M]) + gapb * pD2M;
1774 p1 = Neff_D[i] * fpow2(tr[i][D2D]) + gapb * pD2D;
1775 if (i == L)
1776 p1 = 0; //from D(L) no transition to D(L+1) possible
1777 sum = p0 + p1 + FLT_MIN;
1778 tr[i][D2M] = fast_log2(p0 / sum);
1779 tr[i][D2D] = fast_log2(p1 / sum) * gaph;
1780
1781 }
1782
1783 if (Log::reporting_level() >= DEBUG1) {
1784 HH_LOG(DEBUG1) << "\nPseudocount transition probabilities:\n";
1785 HH_LOG(DEBUG1) << "pM2M=" << 100*pM2M <<"%, pM2I="<< 100 * pM2I <<"%, pM2D="<<100*pM2D<<"%, ";
1786 HH_LOG(DEBUG1) << "pI2M=" << 100*pI2M << "%, pI2I=" << 100*pI2I << "%, ";
1787 HH_LOG(DEBUG1) << "pD2M=" << 100*pD2M << "%, pD2D=" << 100*pD2D << "% ";
1788 HH_LOG(DEBUG1) << "tau = " << 100. * gapb / (Neff_HMM - 1 + gapb) << "%\n\n";
1789 HH_LOG(DEBUG1) << "Listing transition probabilities WITH pseudocounts:\n";
1790 HH_LOG(DEBUG1) << " i dssp pred sacc M->M M->I M->D I->M I->I D->M D->D\n";
1791
1792 //for all columns in HMM
1793 for (i = 1; i <= L; ++i) {
1794 HH_LOG(DEBUG1) << i << "\t" << i2ss(ss_dssp[i]) << "\t" << i2ss(ss_pred[i]) << "\t" << i2sa(sa_dssp[i]) << "\t" << fpow2(tr[i][M2M]) << "\t" << fpow2(tr[i][M2I]) << "\t" << fpow2(tr[i][M2D]) << "\t";
1795 HH_LOG(DEBUG1) << fpow2(tr[i][I2M]) << "\t" << fpow2(tr[i][I2I]) << "\t";
1796 HH_LOG(DEBUG1) << fpow2(tr[i][D2M]) << "\t" << fpow2(tr[i][D2D]) << "\t";
1797 HH_LOG(DEBUG1) << ss_pred[i] << "\t" << ss_conf[i] << "\t" << ss_dssp[i] << std::endl;
1798 }
1799 HH_LOG(DEBUG1) << std::endl;
1800 HH_LOG(DEBUG1) << "nss_dssp=" << nss_dssp << " nss_pred=" << nss_pred << std::endl;
1801 }
1802 }
1803
1804 /////////////////////////////////////////////////////////////////////////////////////
1805 // Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
1806 /////////////////////////////////////////////////////////////////////////////////////
PreparePseudocounts(const float R[20][20])1807 void HMM::PreparePseudocounts(const float R[20][20]) {
1808 for (int i = 0; i <= L + 1; ++i)
1809 for (int a = 0; a < 20; ++a)
1810 g[i][a] = ScalarProd20(R[a], f[i]);
1811 }
1812
1813 /////////////////////////////////////////////////////////////////////////////////////
1814 // Add pseudocounts to profile p[]
1815 /////////////////////////////////////////////////////////////////////////////////////
AddContextSpecificPseudocounts(cs::Pseudocounts<cs::AA> * pc,cs::Admix * admix)1816 void HMM::AddContextSpecificPseudocounts(cs::Pseudocounts<cs::AA>* pc,
1817 cs::Admix* admix) {
1818 if (has_pseudocounts || pc == NULL || admix == NULL) {
1819 for (int i = 1; i <= L; ++i) {
1820 for (int a = 0; a < 20; ++a) {
1821 p[i][a] = f[i][a];
1822 }
1823 }
1824 } else {
1825 cs::CountProfile<cs::AA> cp(L);
1826 fillCountProfile(&cp);
1827 cs::Profile<cs::AA> profile = pc->AddTo(cp, *admix);
1828 for (int i = 1; i <= L; ++i) {
1829 for (int a = 0; a < 20; ++a) {
1830 p[i][a] = profile[i - 1][a];
1831 }
1832 }
1833 }
1834 }
1835
1836 /////////////////////////////////////////////////////////////////////////////////////
1837 // Fill CountProfile with actual aa-frequencies
1838 /////////////////////////////////////////////////////////////////////////////////////
fillCountProfile(cs::CountProfile<cs::AA> * csProfile)1839 void HMM::fillCountProfile(cs::CountProfile<cs::AA> *csProfile) {
1840 for (int i = 0; i < L; ++i) {
1841 csProfile->neff[i] = Neff_M[i + 1];
1842 for (int a = 0; a < 20; ++a)
1843 csProfile->counts[i][a] = f[i + 1][a] * Neff_M[i + 1];
1844 }
1845 }
1846
1847 /////////////////////////////////////////////////////////////////////////////////////
1848 // Calculate amino acid background frequencies in HMM
1849 /////////////////////////////////////////////////////////////////////////////////////
CalculateAminoAcidBackground(const float * pb)1850 void HMM::CalculateAminoAcidBackground(const float* pb) {
1851 int a, i;
1852 // initialize vector of average aa freqs with pseudocounts
1853 for (a = 0; a < 20; ++a)
1854 pav[a] = pb[a] * 100.0f / Neff_HMM;
1855 // calculate averages
1856 for (i = 1; i <= L; ++i)
1857 for (a = 0; a < 20; ++a)
1858 pav[a] += p[i][a];
1859 // Normalize vector of average aa frequencies pav[a]
1860 NormalizeTo1(pav, NAA);
1861 for (a = 0; a < 20; ++a)
1862 p[0][a] = p[L + 1][a] = pav[a];
1863
1864 }
1865
1866 /////////////////////////////////////////////////////////////////////////////////////
1867 // Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a]
1868 // Pseudocounts: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1869 /////////////////////////////////////////////////////////////////////////////////////
AddAminoAcidPseudocounts(char pcm,float pca,float pcb,float pcc)1870 void HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc) {
1871 int i; //position in HMM
1872 int a; //amino acid (0..19)
1873 float sum;
1874 float tau; //tau = pseudocount admixture
1875
1876 if (has_pseudocounts) {
1877 pcm = 0;
1878 }
1879
1880 // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a]
1881 switch (pcm) {
1882 case 0: //no pseudocounts whatsoever: tau=0
1883 for (i = 1; i <= L; ++i)
1884 for (a = 0; a < 20; ++a)
1885 p[i][a] = f[i][a];
1886 break;
1887 case 1: //constant pseudocounts (for optimization): tau = pca
1888 tau = pca;
1889 for (i = 1; i <= L; ++i)
1890 for (a = 0; a < 20; ++a)
1891 p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1892 break;
1893 case 2: //divergence-dependent pseudocounts and rate matrix rescaling
1894 if (pcc == 1.0f)
1895 for (i = 1; i <= L; ++i) {
1896 tau = fmin(1.0, pca / (1. + Neff_M[i] / pcb));
1897 for (a = 0; a < 20; ++a)
1898 p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1899 }
1900 else
1901 for (i = 1; i <= L; ++i) {
1902 tau = fmin(1.0, pca / (1. + pow((Neff_M[i]) / pcb, pcc)));
1903 for (a = 0; a < 20; ++a)
1904 p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1905 }
1906 break;
1907 case 3: // constant-diversity pseudocounts // Is this still used? => scrap? (JS)
1908 for (i = 1; i <= L; ++i) {
1909 float x = Neff_M[i] / pcb;
1910 pca = 0.793 + 0.048 * (pcb - 10.0);
1911 tau = fmax(0.0, pca * (1 - x + pcc * x * (1 - x)));
1912 for (a = 0; a < 20; ++a)
1913 p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1914 }
1915 HH_LOG(DEBUG) << "Divergence before / after addition of amino acid pseudocounts: " << Neff_HMM << " / " << CalcNeff() << std::endl;
1916 break;
1917 } //end switch (pcm)
1918
1919 //turn on pseudocount switch to indicate that HMM contains pseudocounts
1920 if (pcm != 0)
1921 has_pseudocounts = true;
1922
1923 // DEBUGGING output
1924 if (Log::reporting_level() >= DEBUG) {
1925 switch (pcm) {
1926 case 0:
1927 HH_LOG(DEBUG) << "No pseudocounts added (-pcm 0)\n";
1928 return;
1929 case 1:
1930 HH_LOG(DEBUG) << "Adding constant AA pseudocount admixture of " << pca
1931 << " to HMM " << name << "\n";
1932 break;
1933 case 2:
1934 HH_LOG(DEBUG)
1935 << "Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of "
1936 << fmin(1.0, pca / (1. + Neff_HMM / pcb)) << " to HMM "
1937 << name << "\n";
1938 break;
1939 } //end switch (pcm)
1940
1941 if ( Log::reporting_level() >= DEBUG1) {
1942 HH_LOG(DEBUG1) << "Amino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V" << std::endl;
1943 for (i=1; i<=L; ++i)
1944 {
1945 HH_LOG(DEBUG1) << i << ": " << f[i][0] * 100 << " " << f[i][1] * 100 << " " << f[i][2] * 100 << " " << f[i][3] * 100 << " " << f[i][4] * 100 << " "
1946 << f[i][5] * 100 << " " << f[i][6] * 100 << " " << f[i][7] * 100 << " " << f[i][8] * 100 << " " << f[i][9] * 100 << " " << f[i][10] * 100 << " "
1947 << f[i][11] * 100 << " " << f[i][12] * 100 << " " << f[i][13] * 100 << " " << f[i][14] * 100 << " " << f[i][15] * 100 << " "
1948 << f[i][16] * 100 << " " << f[i][17] * 100 << " " << f[i][18] * 100 << " " << f[i][19] * 100 << " sum=" << sum << std::endl;
1949 }
1950 HH_LOG(DEBUG1) << "Amino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V" << std::endl;
1951 for (i=1; i<=L; ++i)
1952 {
1953 HH_LOG(DEBUG1) << std::setprecision(1) << i << ": " << p[i][0] * 100 << " " << p[i][1] * 100 << " " << p[i][2] * 100 << " " << p[i][3] * 100 << " " << p[i][4] * 100 << " "
1954 << p[i][5] * 100 << " " << p[i][6] * 100 << " " << p[i][7] * 100 << " " << p[i][8] * 100 << " " << p[i][9] * 100 << " " << p[i][10] * 100 << " "
1955 << p[i][11] * 100 << " " << p[i][12] * 100 << " " << p[i][13] * 100 << " " << p[i][14] * 100 << " " << p[i][15] * 100 << " "
1956 << p[i][16] * 100 << " " << p[i][17] * 100 << " " << p[i][18] * 100 << " " << p[i][19] * 100 << " sum=" << sum << std::endl;
1957 }
1958 }
1959 }
1960 }
1961
1962
computeScoreSSMode(HMM * q,HMM * t)1963 int HMM::computeScoreSSMode( HMM *q, HMM *t){
1964 int returnMode = HMM::NO_SS_INFORMATION;
1965 returnMode |= (q->nss_pred>=0 && t->nss_dssp>=0) ? HMM::PRED_DSSP : 0;
1966 returnMode |= (q->nss_dssp>=0 && t->nss_pred>=0) ? HMM::DSSP_PRED : 0;
1967 returnMode |= (q->nss_pred>=0 && t->nss_pred>=0) ? HMM::PRED_PRED : 0;
1968 return returnMode;
1969 }
1970
1971 /////////////////////////////////////////////////////////////////////////////////////
1972 // Divide aa probabilties by square root of locally averaged background frequencies
1973 // !!!!! ATTENTION!!!!!!! after this p is not the same as after adding pseudocounts !!!
1974 /////////////////////////////////////////////////////////////////////////////////////
DivideBySqrtOfLocalBackgroundFreqs(const int D,const float * pb)1975 void HMM::DivideBySqrtOfLocalBackgroundFreqs(const int D, const float* pb) // 2*D+1 is window size
1976 {
1977 if (divided_by_local_bg_freqs) {
1978 HH_LOG(WARNING) << "Already divided probs by local aa frequencies!" << std::endl;
1979 return;
1980 }
1981 divided_by_local_bg_freqs = 1;
1982
1983 int i; // query and template match state indices
1984 int a; // amino acid index
1985 const float pc = 10.0; // amount of pseudocounts on local amino acid frequencies
1986 float fac = 1.0 / (2.0 * (float) D + 1.0 + pc); // 1 / window size
1987
1988 float** pnul = new float*[L + 1]; // null model probabilities
1989 for (i = 0; i <= L; ++i)
1990 pnul[i] = new float[NAA];
1991 for (a = 0; a < NAA; ++a)
1992 pnul[0][a] = pc * pb[a];
1993
1994 // HMM shorter than window length? => average over entire length L
1995 if (L <= 2 * D + 1) {
1996 for (i = 1; i <= L; ++i)
1997 for (a = 0; a < NAA; ++a)
1998 pnul[0][a] += p[i][a];
1999 for (i = 1; i <= L; ++i)
2000 for (a = 0; a < NAA; ++a)
2001 pnul[i][a] = pnul[0][a];
2002 fac = 1.0 / ((float) L + pc);
2003 }
2004 // HMM longer than window length? => average over window size 2*D+1
2005 else {
2006 // Calculate local amino acid background frequencies in leftmost window (1,..,2*D+1)
2007 for (i = 1; i <= 2 * D + 1; ++i)
2008 for (a = 0; a < NAA; ++a)
2009 pnul[0][a] += p[i][a];
2010
2011 // Copy local amino acid background frequencies in leftmost window to positions 1,..,D+1
2012 for (i = 1; i <= D + 1; ++i)
2013 for (a = 0; a < NAA; ++a)
2014 pnul[i][a] = pnul[0][a];
2015
2016 // Calculate local amino acid background frequencies in window of size 2*D+1 around each residue
2017 for (i = D + 2; i <= L - D; ++i)
2018 for (a = 0; a < NAA; ++a)
2019 pnul[i][a] = pnul[i - 1][a] + p[i + D][a] - p[i - 1 - D][a];
2020
2021 // Copy local amino acid background frequencies from pos. L-D to positions L-D+1,..,L
2022 for (i = L - D + 1; i <= L; ++i)
2023 for (a = 0; a < NAA; ++a)
2024 pnul[i][a] = pnul[L - D][a];
2025 }
2026
2027 // Divide amino acid probs by sqrt of local amino acid background frequencies
2028 for (i = 1; i <= L; ++i)
2029 for (a = 0; a < NAA; ++a)
2030 p[i][a] /= sqrt(fac * pnul[i][a]);
2031
2032 if (Log::reporting_level() >= DEBUG1) {
2033 HH_LOG(DEBUG1) << "\nLocal amino acid background frequencies\n";
2034 HH_LOG(DEBUG1) << " A R N D C Q E G H I L K M F P S T W Y V sum\n";
2035 for (i = 1; i <= L; ++i) {
2036 HH_LOG(DEBUG1) << i << " ";
2037 float sum = 0.0;
2038 for (a = 0; a < 20; ++a) {
2039 HH_LOG(DEBUG1) << 100 * fac * pnul[i][a] << " ";
2040 sum += fac * pnul[i][a];
2041 }
2042 HH_LOG(DEBUG1) << 100 * sum << std::endl;
2043 }
2044 }
2045
2046 for (i = 0; i <= L; ++i)
2047 delete[] pnul[i];
2048 delete[] pnul;
2049 }
2050
2051 /////////////////////////////////////////////////////////////////////////////////////
2052 // Factor Null model into HMM t
2053 // !!!!! ATTENTION!!!!!!! after this t->p is not the same as after adding pseudocounts !!!
2054 /////////////////////////////////////////////////////////////////////////////////////
IncludeNullModelInHMM(HMM * q,HMM * t,int columnscore,const int half_window_size_local_aa_bg_freqs,const float * pb)2055 void HMM::IncludeNullModelInHMM(HMM* q, HMM* t, int columnscore,
2056 const int half_window_size_local_aa_bg_freqs, const float* pb) {
2057
2058 int i, j; //query and template match state indices
2059 int a; //amino acid index
2060
2061 // Multiply template frequencies with amino acid weights = 1/background_freq(a) (for all but SOP scores)
2062 switch (columnscore) {
2063 default:
2064 case 0: // Null model with background prob. from database
2065 for (j = 0; j <= t->L + 1; ++j)
2066 for (a = 0; a < 20; ++a)
2067 t->p[j][a] /= pb[a];
2068 break;
2069
2070 case 1: // Null model with background prob. equal average from query and template
2071 float pnul[NAA]; // null model probabilities used in comparison (only set in template/db HMMs)
2072 for (a = 0; a < 20; ++a)
2073 pnul[a] = 0.5 * (q->pav[a] + t->pav[a]);
2074 for (j = 0; j <= t->L + 1; ++j)
2075 for (a = 0; a < 20; ++a)
2076 t->p[j][a] /= pnul[a];
2077 break;
2078
2079 case 2: // Null model with background prob. from template protein
2080 for (j = 0; j <= t->L + 1; ++j)
2081 for (a = 0; a < 20; ++a)
2082 t->p[j][a] /= t->pav[a];
2083 break;
2084
2085 case 3: // Null model with background prob. from query protein
2086 for (j = 0; j <= t->L + 1; ++j)
2087 for (a = 0; a < 20; ++a)
2088 t->p[j][a] /= q->pav[a];
2089 break;
2090
2091 case 5: // Null model with local background prob. from template and query protein
2092 // if (!q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs);
2093 if (!q->divided_by_local_bg_freqs)
2094 InternalError("No local amino acid bias correction on query HMM!",
2095 __FILE__, __LINE__, __func__);
2096 if (!t->divided_by_local_bg_freqs)
2097 t->DivideBySqrtOfLocalBackgroundFreqs(
2098 half_window_size_local_aa_bg_freqs, pb);
2099 break;
2100
2101 case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??)
2102 for (i = 0; i <= q->L + 1; ++i) {
2103 float sum = 0.0;
2104 for (a = 0; a < 20; ++a)
2105 sum += pb[a] * q->p[i][a];
2106 sum = 1.0 / sqrt(sum);
2107 for (a = 0; a < 20; ++a)
2108 q->p[i][a] *= sum;
2109 }
2110 for (j = 0; j <= t->L + 1; j++) {
2111 float sum = 0.0;
2112 for (a = 0; a < 20; ++a)
2113 sum += pb[a] * t->p[j][a];
2114 sum = 1.0 / sqrt(sum);
2115 for (a = 0; a < 20; ++a)
2116 t->p[j][a] *= sum;
2117 }
2118 break;
2119
2120 case 11: // log co-emission probability (no null model)
2121 for (a = 0; a < 20; ++a)
2122 pnul[a] = 0.05;
2123 break;
2124
2125 }
2126
2127 if (Log::reporting_level() >= DEBUG1) {
2128 HH_LOG(DEBUG1) << "\nAverage amino acid frequencies\n";
2129 HH_LOG(DEBUG1) << " A R N D C Q E G H I L K M F P S T W Y V\n";
2130 HH_LOG(DEBUG1) << "Q: ";
2131 for (a = 0; a < 20; ++a)
2132 HH_LOG(DEBUG1) << 100 * q->pav[a] << " ";
2133 HH_LOG(DEBUG1) << "\nT: ";
2134 for (a = 0; a < 20; ++a)
2135 HH_LOG(DEBUG1) << 100 * t->pav[a] << " ";
2136 HH_LOG(DEBUG1) << "\npb: ";
2137 for (a = 0; a < 20; ++a)
2138 HH_LOG(DEBUG1) << 100 * pb[a] << " ";
2139 }
2140 }
2141
WriteToFile(char * outfile,const char append,const int max_seqid,const int coverage,const int qid,const int Ndiff,const float qsc,const int argc,const char ** argv,const float * pb)2142 void HMM::WriteToFile(char* outfile, const char append, const int max_seqid,
2143 const int coverage, const int qid, const int Ndiff, const float qsc,
2144 const int argc, const char** argv, const float* pb) {
2145 std::stringstream out;
2146 WriteToFile(out, max_seqid, coverage, qid, Ndiff, qsc, argc, argv, pb);
2147
2148 if (strcmp(outfile, "stdout") == 0) {
2149 std::cout << out.str();
2150 } else {
2151 std::fstream outf;
2152 if (append)
2153 outf.open(outfile, std::ios::out | std::ios::app);
2154 else
2155 outf.open(outfile, std::ios::out);
2156
2157 if (!outf.good())
2158 OpenFileError(outfile, __FILE__, __LINE__, __func__);
2159
2160 outf << out.str();
2161
2162 outf.close();
2163 }
2164 }
2165
2166 /////////////////////////////////////////////////////////////////////////////////////
2167 // Write HMM to output file
2168 /////////////////////////////////////////////////////////////////////////////////////
WriteToFile(std::stringstream & out,const int max_seqid,const int coverage,const int qid,const int Ndiff,const float qsc,const int argc,const char ** argv,const float * pb)2169 void HMM::WriteToFile(std::stringstream& out, const int max_seqid,
2170 const int coverage, const int qid, const int Ndiff, const float qsc,
2171 const int argc, const char** argv, const float* pb) {
2172 const int SEQLEN = 100; // number of residues per line for sequences to be displayed
2173 char line[LINELEN];
2174
2175 if (trans_lin == 1)
2176 InternalError(
2177 "tried to write HMM file with transition pseudocounts in linear representation",
2178 __FILE__, __LINE__, __func__);
2179 if (divided_by_local_bg_freqs)
2180 InternalError(
2181 "tried to write HMM file with amino acid probabilities divided by sqrt of local background frequencies\n",
2182 __FILE__, __LINE__, __func__);
2183
2184 // format specification
2185 out << "HHsearch 1.5" << std::endl;
2186 out << "NAME " << longname << std::endl; // name of first sequence
2187 out << "FAM " << fam << std::endl; // family name
2188
2189 //TODO
2190 // char file_nopath[NAMELEN];
2191 // RemoveExtension(file, outfile);
2192 // RemovePath(file_nopath, file);
2193 // fprintf(outf,"FILE %s\n",file_nopath); // base name of alignment file
2194
2195 // Print command line
2196 out << "COM ";
2197 for (int i = 0; i < argc; i++)
2198 if (strlen(argv[i]) <= 100)
2199 out << argv[i] << " ";
2200 else
2201 out << "<" << strlen(argv[i]) << " characters> ";
2202 out << std::endl;
2203
2204 // print out date stamp
2205 time_t* tp = new time_t;
2206 *tp = time(NULL);
2207 out << "DATE " << ctime(tp);
2208 delete tp;
2209
2210 // Print out some statistics of alignment
2211 out << "LENG " << L << " match states, " << l[L]
2212 << " columns in multiple alignment\n" << std::endl;
2213 out << "FILT " << N_filtered << " out of " << N_in
2214 << " sequences passed filter (-id " << max_seqid << " -cov "
2215 << coverage << " -qid " << qid << " -qsc " << qsc << " -diff "
2216 << Ndiff << ")" << std::endl;
2217 sprintf(line, "NEFF %-4.1f\n", Neff_HMM);
2218 out << line;
2219
2220 if (has_pseudocounts) {
2221 out << "PCT true\n";
2222 }
2223
2224 // Print selected sequences from alignment (including secondary structure and confidence values, if known)
2225 out << "SEQ" << std::endl;
2226 for (int n = 0; n < n_display; n++) {
2227 out << ">" << sname[n] << std::endl;
2228 //first sequence character starts at 1; 0 not used.
2229 for (unsigned int j = 0; j < strlen(seq[n] + 1); j += SEQLEN) {
2230 sprintf(line, "%-.*s\n", SEQLEN, seq[n] + 1 + j);
2231 out << line;
2232 }
2233 }
2234 out << "#" << std::endl;
2235
2236 // print null model background probabilities from substitution matrix
2237 out << "NULL ";
2238 for (int a = 0; a < 20; ++a)
2239 sout(out, -iround(fast_log2(pb[s2a[a]]) * HMMSCALE));
2240 out << std::endl;
2241
2242 // print table header line with amino acids
2243 out << "HMM ";
2244 for (int a = 0; a < 20; ++a) {
2245 out << i2aa(s2a[a]) << "\t";
2246 }
2247 out << std::endl;
2248
2249 // print table header line with state transitions
2250 out
2251 << " M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D"
2252 << std::endl;
2253
2254 // print out transition probabilities from begin state (virtual match state)
2255 out << " ";
2256 for (int a = 0; a <= D2D; ++a)
2257 sout(out, -iround(tr[0][a] * HMMSCALE));
2258
2259 sout(out, iround(Neff_M[0] * HMMSCALE));
2260 sout(out, iround(Neff_I[0] * HMMSCALE));
2261 sout(out, iround(Neff_D[0] * HMMSCALE));
2262 out << std::endl;
2263
2264 // Start loop for printing HMM columns
2265 int h = 1;
2266 for (int i = 1; i <= L; ++i) {
2267 while (islower(seq[nfirst][h]) && seq[nfirst][h])
2268 h++;
2269
2270 sprintf(line, "%1c %-4i ", seq[nfirst][h++], i);
2271 out << line;
2272
2273 // Print emission probabilities for match state
2274 for (int a = 0; a < 20; ++a) {
2275 sout(out, -iround(fast_log2(p[i][s2a[a]]) * HMMSCALE));
2276 }
2277
2278 sprintf(line, "%-i", l[i]);
2279 out << line;
2280 out << std::endl;
2281
2282 // Print transition probabilities
2283 out << " ";
2284 for (int a = 0; a <= D2D; ++a) {
2285 sout(out, -iround(tr[i][a] * HMMSCALE));
2286 }
2287 sout(out, iround(Neff_M[i] * HMMSCALE));
2288 sout(out, iround(Neff_I[i] * HMMSCALE));
2289 sout(out, iround(Neff_D[i] * HMMSCALE));
2290
2291 out << std::endl << std::endl;
2292 }
2293 out << "//" << std::endl;
2294 }
2295
2296
2297 /////////////////////////////////////////////////////////////////////////////////////
2298 // Transform log to lin transition probs
2299 /////////////////////////////////////////////////////////////////////////////////////
Log2LinTransitionProbs(float beta)2300 void HMM::Log2LinTransitionProbs(float beta) {
2301 if (trans_lin == 1)
2302 return;
2303 trans_lin = 1;
2304 for (int i = 0; i <= L; ++i) {
2305 for (int a = 0; a < NTRANS; ++a)
2306 tr[i][a] = pow(2.0f,beta * tr[i][a]);
2307 }
2308 }
2309
2310
2311 /////////////////////////////////////////////////////////////////////////////////////
2312 // Set query columns in His-tags etc to Null model distribution
2313 /////////////////////////////////////////////////////////////////////////////////////
NeutralizeTags(const float * pb)2314 void HMM::NeutralizeTags(const float* pb) {
2315 char* qseq = seq[nfirst];
2316 char* pt;
2317 int a, i;
2318
2319 // Neutralize His tag
2320 if ((pt = strstr(qseq, "HHHHH"))) {
2321 int i0 = pt - qseq + 1;
2322 for (i = imax(i0 - 8, 1); i < i0; ++i) // neutralize leading 5 columns
2323 for (a = 0; a < NAA; ++a)
2324 p[i][a] = f[i][a] = pb[a];
2325 for (; (*pt) == 'H'; ++i, ++pt) // neutralize His columns
2326 for (a = 0; a < NAA; ++a)
2327 p[i][a] = f[i][a] = pb[a];
2328 int i1 = i;
2329 for (; i < imin(i1 + 8, L + 1); ++i) // neutralize trailing 5 columns
2330 for (a = 0; a < NAA; ++a)
2331 p[i][a] = f[i][a] = pb[a];
2332 HH_LOG(INFO) << "Neutralized His-tag between positions " << imax(i0 - 8, 1) << " and " << i-1 << std::endl;
2333 }
2334
2335 // Neutralize C-myc tag
2336 if ((pt = strstr(qseq, "EQKLISEEDL"))) {
2337 HH_LOG(INFO) << "Neutralized C-myc-tag at position " << int(pt - qseq) + 1 << std::endl;
2338 for (i = pt - qseq + 1; i <= pt - qseq + 10; ++i)
2339 for (a = 0; a < NAA; ++a)
2340 p[i][a] = f[i][a] = pb[a];
2341 }
2342 // Neutralize FLAG tag
2343 if ((pt = strstr(qseq, "DYKDDDDK"))) {
2344 HH_LOG(INFO) << "Neutralized FLAG-tag at position " << int(pt - qseq) + 1;
2345 for (i = pt - qseq + 1; i <= pt - qseq + 8; ++i)
2346 for (a = 0; a < NAA; ++a)
2347 p[i][a] = f[i][a] = pb[a];
2348 }
2349 }
2350
2351 /////////////////////////////////////////////////////////////////////////////////////
2352 // Calculate effective number of sequences using profiles INCLUDING pseudocounts
2353 /////////////////////////////////////////////////////////////////////////////////////
CalcNeff()2354 float HMM::CalcNeff() {
2355 float Neff = 0;
2356 for (int i = 1; i <= L; ++i)
2357 for (int a = 0; a < 20; ++a)
2358 if (p[i][a] > 1E-10)
2359 Neff -= p[i][a] * fast_log2(p[i][a]);
2360 return fpow2(Neff / L);
2361 }
2362
2363 /////////////////////////////////////////////////////////////////////////////////////
2364 // Add secondary structure prediction to alignment (overwrite existing prediction)
2365 /////////////////////////////////////////////////////////////////////////////////////
AddSSPrediction(char seq_pred[],char seq_conf[])2366 void HMM::AddSSPrediction(char seq_pred[], char seq_conf[]) {
2367 unsigned int i;
2368
2369 if ((int) strlen(seq_pred) != L + 1) {
2370 HH_LOG(WARNING) << "Could not add secondary struture prediction - unequal length!" << std::endl;
2371 return;
2372 }
2373
2374 if (nss_pred >= 0 && nss_conf >= 0) {
2375 strcpy(seq[nss_pred], seq_pred);
2376 for (i = 0; i < strlen(seq_pred); i++)
2377 ss_pred[i] = ss2i(seq_pred[i]);
2378 strcpy(seq[nss_conf], seq_conf);
2379 for (i = 0; i < strlen(seq_conf); i++)
2380 ss_conf[i] = cf2i(seq_conf[i]);
2381 } else {
2382 // Shift existing sequences two positions down
2383 for (int k = imin(n_display - 1, MAXSEQDIS - 3); k >= 0; --k) {
2384 seq[k + 2] = seq[k];
2385 sname[k + 2] = sname[k];
2386 }
2387 if (nss_dssp >= 0) {
2388 nss_dssp += 2;
2389 }
2390 if (nsa_dssp >= 0) {
2391 nsa_dssp += 2;
2392 }
2393 if (ncons >= 0) {
2394 ncons += 2;
2395 }
2396 if (nfirst >= 0) {
2397 nfirst += 2;
2398 }
2399
2400 nss_pred = 0;
2401 seq[nss_pred] = new char[L + 2];
2402 strcpy(seq[nss_pred], seq_pred);
2403 for (i = 0; i < strlen(seq_pred); i++)
2404 ss_pred[i] = ss2i(seq_pred[i]);
2405 sname[nss_pred] = new char[50];
2406 strcpy(sname[nss_pred],
2407 "ss_pred PSIPRED predicted secondary structure");
2408
2409 nss_conf = 1;
2410 seq[nss_conf] = new char[L + 2];
2411 strcpy(seq[nss_conf], seq_conf);
2412 for (i = 0; i < strlen(seq_conf); i++)
2413 ss_conf[i] = cf2i(seq_conf[i]);
2414 sname[nss_conf] = new char[50];
2415 strcpy(sname[nss_conf], "ss_conf PSIPRED confidence values");
2416 n_display += 2;
2417 n_seqs += 2;
2418 }
2419 }
2420
2421