1 // hhhmm.C
2 
3 #include "hhhmm.h"
4 
5 /////////////////////////////////////////////////////////////////////////////////////
6 //// Class HMM
7 /////////////////////////////////////////////////////////////////////////////////////
8 
9 /////////////////////////////////////////////////////////////////////////////////////
10 // Object constructor
11 /////////////////////////////////////////////////////////////////////////////////////
HMM(int maxseqdis,int maxres)12 HMM::HMM(int maxseqdis, int maxres) : maxseqdis(maxseqdis), maxres(maxres) {
13 	sname = new char*[maxseqdis];   // names of stored sequences
14 	seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!)
15 	Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i
16 	Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i
17 	Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i
18 	longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field)
19 	// Make Clang memory sanitizer happy, avoid copying uninitialized values in HMM::operator=()
20 	name[0] = file[0] = fam[0] = sfam[0] = fold[0] = cl[0] = '\0';
21 	ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:-  1:H  2:E  3:C  4:S  5:T  6:G  7:B
22 	sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:-  1:A (absolutely buried) 2:B  3:C  4:D  5:E (exposed)
23 	ss_pred = new char[maxres]; // predicted secondary structure          0:-  1:H  2:E  3:C
24 	ss_conf = new char[maxres]; // confidence value of prediction         0:-  1:0 ... 10:9
25 	l = new int[maxres];          // l[i] = pos. of j'th match state in aligment
26 	f = new float*[maxres]; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts
27 	g = new float*[maxres]; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts
28 	p = new float*[maxres]; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts
29 	tr = new float*[maxres]; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D
30 	for (int i = 0; i < maxres; i++)
31 		f[i] = new float[NAA + 3];
32 	for (int i = 0; i < maxres; i++)
33 		g[i] = new float[NAA];
34 	for (int i = 0; i < maxres; i++)
35 		p[i] = (float*) mem_align(ALIGN_FLOAT, NAA * sizeof(float)); // align memory on 16B/32B boundaries for SSE2 / AVX
36 	for (int i = 0; i < maxres; i++)
37 		tr[i] = (float*) mem_align(ALIGN_FLOAT, NTRANS * sizeof(float));
38 
39 	L = 0;
40 	Neff_HMM = 0;
41 	n_display = n_seqs = N_in = N_filtered = 0;
42 	nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
43 	lamda = 0.0;
44 	mu = 0.0;
45 	name[0] = longname[0] = fam[0] = '\0';
46 	trans_lin = 0; // transition probs in log space
47 	dont_delete_seqs = false;
48 	has_pseudocounts = false;
49 	divided_by_local_bg_freqs = false;
50 }
51 
52 /////////////////////////////////////////////////////////////////////////////////////
53 // Object destructor
54 /////////////////////////////////////////////////////////////////////////////////////
~HMM()55 HMM::~HMM() {
56 	//Delete name and seq matrices
57 	if (!dont_delete_seqs) // don't delete sname and seq if flat copy to hit object has been made
58 	{
59 		for (int k = 0; k < n_seqs; k++)
60 			delete[] sname[k];
61 		for (int k = 0; k < n_seqs; k++)
62 			delete[] seq[k];
63 	} else // Delete all not shown sequences (lost otherwise)
64 	{
65 		if (n_seqs > n_display) {
66 			for (int k = n_display; k < n_seqs; k++)
67 				delete[] sname[k];
68 			for (int k = n_display; k < n_seqs; k++)
69 				delete[] seq[k];
70 		}
71 	}
72 	delete[] sname;
73 	delete[] seq;
74 	delete[] Neff_M;
75 	delete[] Neff_D;
76 	delete[] Neff_I;
77 	delete[] longname;
78 	delete[] ss_dssp;
79 	delete[] sa_dssp;
80 	delete[] ss_pred;
81 	delete[] ss_conf;
82 	delete[] l;
83 	for (int i = 0; i < maxres; i++)
84 		if (f[i])
85 			delete[] f[i];
86 		else
87 			break;
88 	for (int i = 0; i < maxres; i++)
89 		if (g[i])
90 			delete[] g[i];
91 		else
92 			break;
93 	for (int i = 0; i < maxres; i++)
94 		if (p[i])
95 			free(p[i]);
96 		else
97 			break;
98 	for (int i = 0; i < maxres; i++)
99 		if (tr[i])
100 			free(tr[i]);
101 		else
102 			break;
103 	delete[] f;
104 	delete[] g;
105 	delete[] p;
106 	delete[] tr;
107 }
108 
109 /////////////////////////////////////////////////////////////////////////////////////
110 // Deep-copy constructor
111 /////////////////////////////////////////////////////////////////////////////////////
operator =(HMM & q)112 HMM& HMM::operator=(HMM& q) {
113 	if (!dont_delete_seqs) // don't delete sname and seq if flat copy to hit object has been made
114 	{
115 		for (int k = 0; k < n_seqs; k++)
116 			delete[] sname[k];
117 		for (int k = 0; k < n_seqs; k++)
118 			delete[] seq[k];
119 	} else // Delete all not shown sequences (lost otherwise)
120 	{
121 		if (n_seqs > n_display) {
122 			for (int k = n_display; k < n_seqs; k++)
123 				delete[] sname[k];
124 			for (int k = n_display; k < n_seqs; k++)
125 				delete[] seq[k];
126 		}
127 	}
128 
129 	L = q.L;
130 	for (int i = 0; i <= L + 1; ++i) {
131 		for (int a = 0; a < NAA; ++a) {
132 			f[i][a] = q.f[i][a];
133 			g[i][a] = q.g[i][a];
134 			p[i][a] = q.p[i][a];
135 		}
136 		for (int a = 0; a < NTRANS; ++a)
137 			tr[i][a] = q.tr[i][a];
138 		ss_dssp[i] = q.ss_dssp[i];
139 		sa_dssp[i] = q.sa_dssp[i];
140 		ss_pred[i] = q.ss_pred[i];
141 		ss_conf[i] = q.ss_conf[i];
142 		l[i] = q.l[i];
143 	}
144 
145 	n_display = q.n_display;
146 	n_seqs = q.n_seqs;
147 	for (int k = 0; k < n_seqs; k++) {
148 		sname[k] = new char[strlen(q.sname[k]) + 1];
149 		if (!sname[k])
150 			MemoryError("array of names for sequences to display", __FILE__,
151 					__LINE__, __func__);
152 		strcpy(sname[k], q.sname[k]);
153 	}
154 	for (int k = 0; k < n_seqs; k++) {
155 		seq[k] = new char[strlen(q.seq[k]) + 1];
156 		if (!seq[k])
157 			MemoryError("array of names for sequences to display", __FILE__,
158 					__LINE__, __func__);
159 		strcpy(seq[k], q.seq[k]);
160 	}
161 	ncons = q.ncons;
162 	nfirst = q.nfirst;
163 	nss_dssp = q.nss_dssp;
164 	nsa_dssp = q.nsa_dssp;
165 	nss_pred = q.nss_pred;
166 	nss_conf = q.nss_conf;
167 
168 	for (int i = 0; i <= L + 1; ++i)
169 		Neff_M[i] = q.Neff_M[i];
170 	for (int i = 0; i <= L + 1; ++i)
171 		Neff_I[i] = q.Neff_I[i];
172 	for (int i = 0; i <= L + 1; ++i)
173 		Neff_D[i] = q.Neff_D[i];
174 	Neff_HMM = q.Neff_HMM;
175 
176 	strcpy(longname, q.longname);
177 	strmcpy(name, q.name, NAMELEN - 1);
178 	strmcpy(file, q.file, NAMELEN - 1);
179 	strmcpy(fam, q.fam, NAMELEN - 1);
180 	strmcpy(sfam, q.sfam, IDLEN - 1);
181 	strmcpy(fold, q.fold, IDLEN - 1);
182 	strmcpy(cl, q.cl, IDLEN - 1);
183 
184 	lamda = q.lamda;
185 	mu = q.mu;
186 	trans_lin = q.trans_lin; // transition probs in log space
187 	dont_delete_seqs = q.dont_delete_seqs;
188 	has_pseudocounts = q.has_pseudocounts;
189 	divided_by_local_bg_freqs = q.divided_by_local_bg_freqs;
190 
191 	for (int a = 0; a < NAA; ++a)
192 		pav[a] = q.pav[a];
193 	N_in = q.N_in;
194 	N_filtered = q.N_filtered;
195 	return (HMM&) (*this);
196 }
197 
198 /////////////////////////////////////////////////////////////////////////////////////
199 //// Read an HMM from an HHsearch .hhm file; return 0 at end of file
200 /////////////////////////////////////////////////////////////////////////////////////
Read(FILE * dbf,const int maxcol,const int nseqdis,float * pb,char * path)201 int HMM::Read(FILE* dbf, const int maxcol, const int nseqdis, float* pb,
202 		char* path) {
203 	char line[LINELEN] = "";    // input line
204 	char str3[8] = "", str4[8] = ""; // first 3 and 4 letters of input line
205 	char* ptr;                // pointer for string manipulation
206 	int i = 0;                  // index for match state (first=1)
207 	int a;                    // amino acid index
208 	int warn = 0;
209 
210 	//Delete name and seq matrices
211 	if (!dont_delete_seqs) // Delete all sname and seq if no flat copy to hit object has been made
212 	{
213 		for (int k = 0; k < n_seqs; k++)
214 			delete[] sname[k];
215 		for (int k = 0; k < n_seqs; k++)
216 			delete[] seq[k];
217 	} else // Otherwise, delete only sequences not diplayed (lost otherwise)
218 	{
219 		if (n_seqs > n_display) {
220 			for (int k = n_display; k < n_seqs; k++)
221 				delete[] sname[k];
222 			for (int k = n_display; k < n_seqs; k++)
223 				delete[] seq[k];
224 		}
225 	}
226 
227 	L = 0;
228 	Neff_HMM = 0;
229 	n_display = N_in = N_filtered = 0;
230 	nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
231 	lamda = mu = 0.0;
232 	name[0] = longname[0] = fam[0] = '\0';
233 	trans_lin = 0; // transition probs in log space
234 	has_pseudocounts = false;
235 	dont_delete_seqs = false;
236 	divided_by_local_bg_freqs = false;
237 	//If at the end of while-loop L is still 0 then we have reached end of db file
238 
239 	while (fgetline(line, LINELEN - 1, dbf)
240 			&& !(line[0] == '/' && line[1] == '/')) {
241 
242 		if (strscn(line) == NULL)
243 			continue;    // skip lines that contain only white space
244 		substr(str3, line, 0, 2);   // copy the first three characters into str3
245 		substr(str4, line, 0, 3);    // copy the first four characters into str4
246 
247 		if (!strncmp("HH", line, 2))
248 			continue;
249 
250 		if (!strcmp("NAME", str4)) {
251 			ptr = strscn(line + 4); //advance to first non-white-space character
252 			if (ptr) {
253 				strmcpy(longname, ptr, DESCLEN - 1); //copy full name to longname
254 				strmcpy(name, ptr, NAMELEN - 1);     //copy longname to name...
255 				strcut(name);                    //...cut after first word...
256 			} else {
257 				strcpy(longname, "undefined");
258 				strcpy(name, "undefined");
259 			}
260 
261 			HH_LOG(DEBUG1) << "Reading in HMM " << name << ":" << std::endl;
262 		}
263 
264 		else if (!strcmp("FAM", str3)) {
265 			ptr = strscn(line + 3); //advance to first non-white-space character
266 			if (ptr)
267 				strmcpy(fam, ptr, IDLEN - 1);
268 			else
269 				strcpy(fam, ""); //copy family name to basename
270 			ScopID(cl, fold, sfam, fam); //get scop classification from basename (e.g. a.1.2.3.4)
271 		}
272 
273 		else if (!strcmp("FILE", str4)) {
274 			if (path)
275 				strmcpy(file, path, NAMELEN - 1);
276 			else
277 				*file = '\0'; // copy path to file variable
278 			ptr = strscn(line + 4); //advance to first non-white-space character
279 			if (ptr)
280 				strncat(file, ptr, NAMELEN - 1 - strlen(file)); // append file name read from file to path
281 			else
282 				strcat(file, "*");
283 		}
284 
285 		else if (!strcmp("LENG", str4)) {
286 			ptr = line + 4;
287 			L = strint(ptr);        //read next integer (number of match states)
288 		} else if (!strcmp("FILT", str4) || !strcmp("NSEQ", str4)) {
289 			ptr = line + 4;
290 			N_filtered = strint(ptr); //read next integer: number of sequences after filtering
291 			N_in = strint(ptr); //read next integer: number of sequences in alignment
292 		}
293 
294 		else if (!strcmp("NEFF", str4) || !strcmp("NAA", str3))
295 			sscanf(line + 6, "%f", &Neff_HMM);
296 
297 		else if (!strcmp("EVD", str3)) {
298 			//        char key[IDLEN];
299 			sscanf(line + 6, "%f %f", &lamda, &mu);
300 			//        sscanf(line+22,"%s",key);
301 			//        lamda_hash.Add(key,lamda);
302 			//        mu_hash.Add(key,mu);
303 		}
304 
305 		else if (!strcmp("PCT", str3)) {
306 			has_pseudocounts = true;
307 		} else if (!strcmp("DESC", str4))
308 			continue;
309 		else if (!strcmp("COM", str3))
310 			continue;
311 		else if (!strcmp("DATE", str4))
312 			continue;
313 
314 		/////////////////////////////////////////////////////////////////////////////////////
315 		// Read template sequences that should get displayed in output alignments
316 		else if (!strcmp("SEQ", str3)) {
317 			//char cur_seq[par.maxcol]=""; //Sequence currently read in
318 			char* cur_seq = new char[maxcol]; //Sequence currently read in
319 			int k; // sequence index; start with -1; after reading name of n'th sequence-> k=n
320 			int h;                // index for character in input line
321 			int l = 1;              // index of character in sequence seq[k]
322 			int i = 1; // index of match states in ss_dssp[i] and ss_pred[i] sequence
323 			int n_seq = 0; // number of sequences to be displayed EXCLUDING ss sequences
324 			cur_seq[0] = '-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
325 			k = -1;
326 			while (fgetline(line, LINELEN - 1, dbf) && line[0] != '#') {
327 				HH_LOG(DEBUG1) << "Read from file:" << line << std::endl;
328 				if (line[0] == '>') //line contains sequence name
329 						{
330 					if (k >= MAXSEQDIS - 1) //maximum number of allowable sequences exceeded
331 					{
332 						HH_LOG(WARNING) << "Warning in " << __FILE__ << ":"
333 									<< __LINE__ << ": " << __func__ << ":"
334 									<< std::endl;
335 						HH_LOG(WARNING) << "\tnumber of sequences in " << file
336 									<< " exceeds maximum allowed number of "
337 									<< MAXSEQDIS << ". Skipping sequences.\n";
338 						while (fgetline(line, LINELEN - 1, dbf)
339 								&& line[0] != '#')
340 							;
341 						break;
342 					}
343 					k++;
344 					if (!strncmp(line, ">ss_dssp", 8))
345 						nss_dssp = k;
346 					else if (!strncmp(line, ">sa_dssp", 8))
347 						nsa_dssp = k;
348 					else if (!strncmp(line, ">ss_pred", 8))
349 						nss_pred = k;
350 					else if (!strncmp(line, ">ss_conf", 8))
351 						nss_conf = k;
352 					else if (!strncmp(line, ">Cons-", 6)
353 							|| !strncmp(line, ">Consensus", 10))
354 						ncons = k;
355 					else {
356 						if (nfirst == -1)
357 							nfirst = k;
358 						//                      if (n_seq>=par.nseqdis)
359 						//  {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;}
360 						n_seq++;
361 					}
362 
363 					//If this is not the first sequence then store residues of previous sequence
364 					if (k > 0) {
365 						seq[k - 1] = new char[strlen(cur_seq) + 1];
366 						if (!seq[k - 1])
367 							MemoryError("array of sequences to display",
368 									__FILE__, __LINE__, __func__);
369 						strcpy(seq[k - 1], cur_seq);
370 					}
371 
372 					// store sequence name
373 					strcut(line + 1); //find next white-space character and overwrite it with end-of-string character
374 					sname[k] = new char[strlen(line + 1) + 1]; //+1 for terminating '\0'
375 					if (!sname[k])
376 						MemoryError("array of names for sequences to display",
377 								__FILE__, __LINE__, __func__);
378 					strcpy(sname[k], line + 1);  //store sequence name in **name
379 					l = 1;
380 					i = 1;
381 				} else //line contains sequence residues
382 				{
383 					if (k == -1) {
384 					  HH_LOG(WARNING) 	<< "Ignoring following line while reading HMM" << name << ":\n\'" << line << "\'" << std::endl;
385 						continue;
386 					}
387 
388 					h = 0; //counts characters in current line
389 
390 					// Check whether all characters are correct; store into cur_seq
391 					if (k == nss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
392 							{
393 						while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
394 							if (ss2i(line[h]) >= 0 && line[h] != '.') {
395 								char c = ss2ss(line[h]);
396 								cur_seq[l] = c;
397 								if (c != '.' && !(c >= 'a' && c <= 'z'))
398 									ss_dssp[i++] = ss2i(c);
399 								l++;
400 							} else if (ss2i(line[h]) == -2) {
401 								HH_LOG(WARNING) << std::endl
402 										<< "Ignoring invalid symbol \'"
403 										<< line[h] << "\' at pos. " << h
404 										<< " in line '" << line << "' of HMM "
405 										<< name << "\n";
406 							}
407 							h++;
408 						}
409 					}
410 					if (k == nsa_dssp) // lines with dssp secondary solvent accessibility (- A B C D E)
411 							{
412 						while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
413 							if (sa2i(line[h]) >= 0) {
414 								char c = line[h];
415 								cur_seq[l] = c;
416 								if (c != '.' && !(c >= 'a' && c <= 'z'))
417 									sa_dssp[i++] = sa2i(c);
418 								l++;
419 							} else if (sa2i(line[h]) == -2) {
420 								HH_LOG(WARNING) << std::endl
421 										<< "Ignoring invalid symbol \'"
422 										<< line[h] << "\' at pos. " << h
423 										<< " in line '" << line << "' of HMM "
424 										<< name << "\n";
425 							}
426 							h++;
427 						}
428 					} else if (k == nss_pred) // lines with predicted secondary structure (. - H E C)
429 							{
430 						while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
431 							if (ss2i(line[h]) >= 0 && ss2i(line[h]) <= 3
432 									&& line[h] != '.') {
433 								char c = ss2ss(line[h]);
434 								cur_seq[l] = c;
435 								if (c != '.' && !(c >= 'a' && c <= 'z'))
436 									ss_pred[i++] = ss2i(c);
437 								l++;
438 							} else if (ss2i(line[h]) == -2) {
439 								HH_LOG(WARNING) << std::endl
440 										<< "Ignoring invalid symbol \'"
441 										<< line[h] << "\' at pos. " << h
442 										<< " in line '" << line << "' of HMM "
443 										<< name << "\n";
444 							}
445 							h++;
446 						}
447 					} else if (k == nss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
448 							{
449 						while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
450 							if (line[h] == '-'
451 									|| (line[h] >= '0' && line[h] <= '9')) {
452 								cur_seq[l] = line[h];
453 								ss_conf[l] = cf2i(line[h]);
454 								l++;
455 							} else if (cf2i(line[h]) == -2) {
456 								HH_LOG(WARNING) << std::endl
457 										<< "Ignoring invalid symbol \'"
458 										<< line[h] << "\' at pos. " << h
459 										<< " in line '" << line << "' of HMM "
460 										<< name << "\n";
461 							}
462 							h++;
463 						}
464 					} else // normal line containing residues
465 					{
466 						while (h < LINELEN && line[h] > '\0' && l < maxcol - 1) {
467 							// ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1)
468 							if (aa2i(line[h]) >= 0 && line[h] != '.') {
469 								cur_seq[l] = line[h];
470 								l++;
471 							} else if (aa2i(line[h]) == -2) {
472 								HH_LOG(WARNING) << std::endl
473 										<< "Ignoring invalid symbol \'"
474 										<< line[h] << "\' at pos. " << h
475 										<< " in line '" << line << "' of HMM "
476 										<< name << "\n";
477 							}
478 							h++;
479 						}
480 					}
481 					cur_seq[l] = '\0'; //Ensure that cur_seq ends with a '\0' character
482 
483 				} //end else
484 
485 				if (n_seq <= nseqdis)
486 					n_display = k + 1;
487 			} //while(getline)
488 
489 			//If this is not the first sequence some residues have already been read in
490 			if (k >= 0) {
491 				seq[k] = new char[strlen(cur_seq) + 1];
492 				if (!seq[k])
493 					MemoryError("array of sequences to display", __FILE__,
494 							__LINE__, __func__);
495 				strcpy(seq[k], cur_seq);
496 			}
497 			n_seqs = k + 1;
498 
499 			// DEBUG
500 			if (Log::reporting_level() >= DEBUG1) {
501 				HH_LOG(DEBUG1) << "nss_dssp=" << nss_dssp << "  nsa_dssp=" << nsa_dssp << "  nss_pred=" << nss_pred << "  nss_conf=" << nss_conf << "  nfirst=" << nfirst << std::endl;
502 				for (k = 0; k < n_display; k++) {
503 					int j;
504 					HH_LOG(DEBUG1) << ">" << sname[k] << "(k=" << k << ")\n";
505 					if (k == nss_dssp) {
506 						for (j = 1; j <= L; j++)
507 							HH_LOG(DEBUG1) << char(i2ss(ss_dssp[j]));
508 					} else if (k == nsa_dssp) {
509 						for (j = 1; j <= L; j++)
510 							HH_LOG(DEBUG1) << char(i2sa(sa_dssp[j]));
511 					} else if (k == nss_pred) {
512 						for (j = 1; j <= L; j++)
513 							HH_LOG(DEBUG1) << char(i2ss(ss_pred[j]));
514 					} else if (k == nss_conf) {
515 						for (j = 1; j <= L; j++)
516 							HH_LOG(DEBUG1) << int(ss_conf[j] - 1);
517 					} else {
518 						for (j = 1; j <= L; j++)
519 							HH_LOG(DEBUG1) << seq[k][j];
520 					}
521 					HH_LOG(DEBUG1) << "\n";
522 				}
523 			}
524 
525 			delete[] cur_seq;
526 
527 		} //end if("SEQ")
528 
529 		/////////////////////////////////////////////////////////////////////////////////////
530 		// Read average amino acid frequencies for HMM
531 		else if (!strcmp("FREQ", str4))
532 			FormatError(file, __FILE__, __LINE__, __func__,
533 					"File has obsolete format. Please use hhmake version > 1.1 to generate hhm files.\n");
534 
535 		else if (!strcmp("AVER", str4)) {
536 		} // AVER line scrapped
537 		else if (!strcmp("NULL", str4)) {
538 			ptr = line + 4;
539 			for (a = 0; a < 20 && ptr; ++a)
540 				//s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
541 				pb[s2a[a]] = (float) fpow2(float(-strinta(ptr)) / HMMSCALE);
542 			if (!ptr)
543 				return Warning(dbf, line, name);
544 			if (Log::reporting_level() >= DEBUG1) {
545 				HH_LOG(DEBUG1) << "\nNULL  ";
546 				for (a = 0; a < 20; ++a)
547 					HH_LOG(DEBUG1) << 100. * pb[s2a[a]] << " ";
548 				HH_LOG(DEBUG1) << std::endl;
549 			}
550 		}
551 
552 		/////////////////////////////////////////////////////////////////////////////////////
553 		// Read transition probabilities from start state
554 		else if (!strcmp("HMM", str3)) {
555 			fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
556 			fgetline(line, LINELEN - 1, dbf); // Skip line with transition labels
557 			ptr = line;
558 
559 			for (a = 0; a <= D2D && ptr; ++a)
560 				tr[0][a] = float(-strinta(ptr)) / HMMSCALE; //store transition probabilites as log2 values
561 			// strinta returns next integer in string and puts ptr to first char
562 			// after the integer. Returns -99999 if '*' is found.
563 			// ptr is set to 0 if no integer is found after ptr.
564 			Neff_M[0] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with M->? transition
565 			Neff_I[0] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with I->? transition
566 			Neff_D[0] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with D->? transition
567 			if (!ptr)
568 				return Warning(dbf, line, name);
569 
570 			/////////////////////////////////////////////////////////////////////////////////////
571 			// Read columns of HMM
572 			int next_i = 0;  // index of next column
573 			while (fgetline(line, LINELEN - 2, dbf)
574 					&& !(line[0] == '/' && line[1] == '/') && line[0] != '#') {
575 				if (strscn(line) == NULL)
576 					continue; // skip lines that contain only white space
577 
578 				// Read in AA probabilities
579 				ptr = line + 1;
580 				int prev_i = next_i;
581 				next_i = strint(ptr);
582 				++i;
583 				if (next_i != prev_i + 1)
584 					if (++warn <= 5) {
585 						HH_LOG(WARNING) << "In HMM " << name
586 								<< " state " << prev_i
587 								<< " is followed by state " << next_i << "\n";
588 						if (warn == 5) {
589 							HH_LOG(WARNING) << std::endl
590 									<< "Further warnings while reading HMMs will be suppressed.\n";
591 						}
592 					}
593 				if (i > L) {
594 					HH_LOG(WARNING) << std::endl << "In HMM " << name
595 							<< " there are more columns than the stated length "
596 							<< L << ". Skipping HMM\n";
597 					return 2;
598 				}
599 				if (i > maxres - 2) {
600 					fgetline(line, LINELEN - 1, dbf); // Skip line
601 					continue;
602 				}
603 
604 				for (a = 0; a < 20 && ptr; ++a)
605 					//              f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE);
606 					f[i][s2a[a]] = fpow2(float(-strinta(ptr)) / HMMSCALE); // speed-up ~5 s for 10000 SCOP domains
607 
608 				//s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
609 				l[i] = strint(ptr);
610 				if (!ptr)
611 					return Warning(dbf, line, name);
612 				if (Log::reporting_level() >= DEBUG1) {
613 					HH_LOG(DEBUG1) << line;
614 					HH_LOG(DEBUG1) << i << " ";
615 					for (a = 0; a < 20; ++a)
616 						HH_LOG(DEBUG1) << 100 * f[i][s2a[a]] << " ";
617 					HH_LOG(DEBUG1) << l[i];
618 					HH_LOG(DEBUG1) << std::endl;
619 				}
620 
621 				// Read transition probabilities
622 				fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
623 				if (line[0] != ' ' && line[0] != '\t')
624 					return Warning(dbf, line, name);
625 				ptr = line;
626 				for (a = 0; a <= D2D && ptr; ++a)
627 					tr[i][a] = float(-strinta(ptr)) / HMMSCALE; //store transition prob's as log2-values
628 				Neff_M[i] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with M->? transition
629 				if (Neff_M[i] == 0) {
630 					Neff_M[i] = 1;
631 				}
632 				Neff_I[i] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with I->? transition
633 				Neff_D[i] = float(strinta(ptr)) / HMMSCALE; // Read eff. number of sequences with D->? transition
634 				if (!ptr)
635 					return Warning(dbf, line, name);
636 				if (Log::reporting_level() >= DEBUG1) {
637 					HH_LOG(DEBUG1) << "       ";
638 					for (a = 0; a <= D2D; ++a)
639 						HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
640 					HH_LOG(DEBUG1) << Neff_M[i] << " " << Neff_I[i] << " " << Neff_D[i] << std::endl;
641 				}
642 			}
643 			if (line[0] == '/' && line[1] == '/')
644 				break;
645 		} else
646 			HH_LOG(WARNING) << std::endl << "Ignoring line\n\'" << line
647 					<< "\'\nin HMM " << name << "\n";
648 
649 	} //while(getline)
650 
651 	if (L == 0)
652 		return 0; //End of db file -> stop reading in
653 
654 	// Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
655 	//   lamda = lamda_hash.Show(par.Key());
656 	//   mu    = mu_hash.Show(par.Key());
657 	if (lamda) {
658 		HH_LOG(DEBUG) << "HMM " << name << " is already calibrated: lamda="
659 		    << lamda << ", mu=" << mu << std::endl;
660 	}
661 
662 	if (i != L) {
663 		HH_LOG(WARNING) << "In HMM " << name << " there are only " << i
664 		    << " columns while the stated length is " << L << "\n";
665 	}
666 
667 	if (i > maxres - 2) {
668 		i = maxres - 2;
669 		HH_LOG(WARNING) << "Maximum number " << maxres - 2
670 				<< " of residues exceeded while reading HMM " << name << "\n";
671 	}
672 	if (!i) {
673 		HH_LOG(WARNING) << std::endl << "HMM " << name
674 		    << " contains no match states. Check the alignment that gave rise to this HMM.\n";
675 	}
676 
677 	HH_LOG(DEBUG) << "Read in HMM " << name << " with " << L
678 				<< " match states and effective number of sequences = "
679 				<< Neff_HMM << "\n";
680 	L = i;
681 
682 	// Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
683 	for (a = 0; a < 20; ++a)
684 		f[0][a] = f[L + 1][a] = pb[a];
685 	Neff_M[L + 1] = 1.0f;
686 	Neff_I[L + 1] = Neff_D[L + 1] = 0.0f;
687 
688 	return 1; //return status: ok
689 }
690 
691 /////////////////////////////////////////////////////////////////////////////////////
692 //// Read an HMM from a HMMer .hmm file; return 0 at end of file
693 /////////////////////////////////////////////////////////////////////////////////////
ReadHMMer(FILE * dbf,const char showcons,float * pb,char * filestr)694 int HMM::ReadHMMer(FILE* dbf, const char showcons, float* pb, char* filestr) {
695 	char line[LINELEN] = "";    // input line
696 	char desc[DESCLEN] = "";    // description of family
697 	char str4[5] = "";          // first 4 letters of input line
698 	char* ptr;                // pointer for string manipulation
699 	int i = 0;                  // index for match state (first=1)
700 	int a;                    // amino acid index
701 	char dssp = 0; // 1 if a consensus SS has been found in the transition prob lines
702 	char annot = 0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
703 	int k = 0;                  // index for seq[k]
704 	static char ignore_hmmer_cal = 0;
705 	char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
706 	annotchr = new char[maxres]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
707 	int warn = 0;
708 
709 	L = 0;
710 	Neff_HMM = 0;
711 	n_seqs = n_display = N_in = N_filtered = 0;
712 	nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
713 	lamda = mu = 0.0;
714 	trans_lin = 0; // transition probs in log space
715 	has_pseudocounts = true; // !!
716 	dont_delete_seqs = false;
717 	divided_by_local_bg_freqs = false;
718 	name[0] = longname[0] = desc[0] = fam[0] = '\0';
719 	//If at the end of while-loop L is still 0 then we have reached end of db file
720 
721 	// Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
722 
723 	while (fgetline(line, LINELEN - 1, dbf)
724 			&& !(line[0] == '/' && line[1] == '/')) {
725 
726 		if (strscn(line) == NULL)
727 			continue;   // skip lines that contain only white space
728 		if (!strncmp("HMMER", line, 5))
729 			continue;
730 
731 		substr(str4, line, 0, 3);    // copy the first four characters into str4
732 
733 		if (!strcmp("NAME", str4) && name[0] == '\0') {
734 			ptr = strscn(line + 4); // advance to first non-white-space character
735 			strmcpy(name, ptr, NAMELEN - 1);    // copy full name to name
736 			strcut(name);                   // ...cut after first word...
737 			HH_LOG(DEBUG1) << "Reading in HMM " << name << ":" << std::endl;
738 		}
739 
740 		else if (!strcmp("ACC ", str4)) {
741 			ptr = strscn(line + 4); // advance to first non-white-space character
742 			strmcpy(longname, ptr, DESCLEN - 1); // copy Accession id to longname...
743 		}
744 
745 		else if (!strcmp("DESC", str4)) {
746 			ptr = strscn(line + 4); // advance to first non-white-space character
747 			if (ptr) {
748 				strmcpy(desc, ptr, DESCLEN - 1); // copy description to name...
749 				strcut(ptr);                 // ...cut after first word...
750 			}
751 			if (!ptr || ptr[1] != '.' || strchr(ptr + 3, '.') == NULL)
752 				strcpy(fam, "");
753 			else
754 				strmcpy(fam, ptr, NAMELEN - 1); // could not find two '.' in name?
755 		}
756 
757 		else if (!strcmp("LENG", str4)) {
758 			ptr = line + 4;
759 			L = strint(ptr);        //read next integer (number of match states)
760 		}
761 
762 		else if (!strcmp("ALPH", str4))
763 			continue;
764 		else if (!strcmp("RF  ", str4))
765 			continue;
766 		else if (!strcmp("CS  ", str4))
767 			continue;
768 		else if (!strcmp("MAP ", str4))
769 			continue;
770 		else if (!strcmp("COM ", str4))
771 			continue;
772 		else if (!strcmp("NSEQ", str4)) {
773 			ptr = line + 4;
774 			N_in = N_filtered = strint(ptr); //read next integer: number of sequences after filtering
775 		}
776 
777 		else if (!strcmp("DATE", str4))
778 			continue;
779 		else if (!strncmp("CKSUM ", line, 5))
780 			continue;
781 		else if (!strcmp("GA  ", str4))
782 			continue;
783 		else if (!strcmp("TC  ", str4))
784 			continue;
785 		else if (!strcmp("NC  ", str4))
786 			continue;
787 
788 		else if (!strncmp("SADSS", line, 5)) {
789 			if (nsa_dssp < 0) {
790 				nsa_dssp = k++;
791 				seq[nsa_dssp] = new char[maxres + 2];
792 				sname[nsa_dssp] = new char[15];
793 				strcpy(seq[nsa_dssp], " ");
794 				strcpy(sname[nsa_dssp], "sa_dssp");
795 
796 			}
797 			ptr = strscn(line + 5);
798 			if (ptr) {
799 				strcut(ptr);
800 				if (strlen(seq[nsa_dssp]) + strlen(ptr) >= (unsigned) (maxres))
801 					printf(
802 							"HMM %s has SADSS records with more than %i residues.\n",
803 							name, maxres);
804 				else
805 					strcat(seq[nsa_dssp], ptr);
806 			}
807 		}
808 
809 		else if (!strncmp("SSPRD", line, 5)) {
810 			if (nss_pred < 0) {
811 				nss_pred = k++;
812 				seq[nss_pred] = new char[maxres + 2];
813 				sname[nss_pred] = new char[15];
814 				strcpy(seq[nss_pred], " ");
815 				strcpy(sname[nss_pred], "ss_pred");
816 
817 			}
818 			ptr = strscn(line + 5);
819 			if (ptr) {
820 				strcut(ptr);
821 				if (strlen(seq[nss_pred]) + strlen(ptr) >= (unsigned) (maxres))
822 					printf(
823 							"HMM %s has SSPRD records with more than %i residues.\n",
824 							name, maxres);
825 				else
826 					strcat(seq[nss_pred], ptr);
827 			}
828 		}
829 
830 		else if (!strncmp("SSCON", line, 5)) {
831 			if (nss_conf < 0) {
832 				nss_conf = k++;
833 				seq[nss_conf] = new char[maxres + 2];
834 				sname[nss_conf] = new char[15];
835 				strcpy(seq[nss_conf], " ");
836 				strcpy(sname[nss_conf], "ss_conf");
837 			}
838 			ptr = strscn(line + 5);
839 			if (ptr) {
840 				strcut(ptr);
841 				if (strlen(seq[nss_conf]) + strlen(ptr) >= (unsigned) (maxres))
842 					printf(
843 							"HMM %s has SSPRD records with more than %i residues.\n",
844 							name, maxres);
845 				else
846 					strcat(seq[nss_conf], ptr);
847 			}
848 		}
849 
850 		else if (!strncmp("SSCIT", line, 5))
851 			continue;
852 		else if (!strcmp("XT  ", str4))
853 			continue;
854 		else if (!strcmp("NULT", str4))
855 			continue;
856 
857 		else if (!strcmp("NULE", str4)) {
858 			ptr = line + 4;
859 			for (a = 0; a < 20 && ptr; ++a)
860 				//s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
861 				pb[s2a[a]] = (float) 0.05
862 						* fpow2(float(strinta(ptr, -99999)) / HMMSCALE);
863 			if (!ptr)
864 				return Warning(dbf, line, name);
865 			if (Log::reporting_level() >= DEBUG1) {
866 				HH_LOG(DEBUG1) << "\nNULL  ";
867 				for (a = 0; a < 20; ++a)
868 					HH_LOG(DEBUG1) << 100. * pb[s2a[a]] << " ";
869 				HH_LOG(DEBUG1) << std::endl;
870 			}
871 		}
872 
873 		else if (!strcmp("EVD ", str4)) {
874 			char* ptr = line + 4;
875 			ptr = strscn(ptr);
876 			sscanf(ptr, "%f", &lamda);
877 			ptr = strscn(ptr);
878 			sscanf(ptr, "%f", &mu);
879 			if (lamda < 0) {
880 				if (ignore_hmmer_cal == 0) {
881 					HH_LOG(WARNING) << std::endl
882 							<< "Some HMMs have been calibrated with HMMER's 'hmmcalibrate'. These calibrations will be ignored\n";
883 				}
884 				ignore_hmmer_cal = 1;
885 				mu = lamda = 0.0;
886 			}
887 		}
888 
889 		/////////////////////////////////////////////////////////////////////////////////////
890 		// Read transition probabilities from start state
891 		else if (!strncmp("HMM", line, 3)) {
892 			fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
893 			fgetline(line, LINELEN - 1, dbf); // Skip line with transition labels
894 			ptr = line;
895 			for (a = 0; a <= M2D && ptr; ++a)
896 				tr[0][a] = float(strinta(ptr, -99999)) / HMMSCALE; //store transition probabilites as log2 values
897 			// strinta returns next integer in string and puts ptr to first char
898 			// after the integer. Returns -99999 if '*' is found.
899 			// ptr is set to 0 if no integer is found after ptr.
900 			tr[0][I2M] = tr[0][D2M] = 0.0;
901 			tr[0][I2I] = tr[0][D2D] = -99999.0;
902 			if (!ptr)
903 				return Warning(dbf, line, name);
904 			if (Log::reporting_level() >= DEBUG1) {
905 				HH_LOG(DEBUG1) << "       ";
906 				for (a = 0; a <= D2D && ptr; ++a)
907 					HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
908 				HH_LOG(DEBUG1) << std::endl;
909 			}
910 
911 			// Prepare to store DSSP states (if there are none, delete afterwards)
912 			nss_dssp = k++;
913 			seq[nss_dssp] = new char[maxres + 2];
914 			sname[nss_dssp] = new char[15];
915 			strcpy(sname[nss_dssp], "ss_dssp");
916 
917 			/////////////////////////////////////////////////////////////////////////////////////
918 			// Read columns of HMM
919 			int next_i = 0;  // index of next column
920 			while (fgetline(line, LINELEN - 1, dbf)
921 					&& !(line[0] == '/' && line[1] == '/') && line[0] != '#') {
922 				if (strscn(line) == NULL)
923 					continue; // skip lines that contain only white space
924 
925 				// Read in AA probabilities
926 				ptr = line;
927 				int prev_i = next_i;
928 				next_i = strint(ptr);
929 				++i;
930 				if (next_i != prev_i + 1)
931 					if (++warn < 5) {
932 						HH_LOG(WARNING) << std::endl << "In HMM " << name
933 								<< " state " << prev_i
934 								<< " is followed by state " << next_i << "\n";
935 						if (warn == 5) {
936 							HH_LOG(WARNING) << std::endl
937 									<< "Further warnings while reading HMMs will be suppressed.\n";
938 						}
939 					}
940 				if (i > L) {
941 					HH_LOG(WARNING) << std::endl << "In HMM " << name
942 							<< " there are more columns than the stated length "
943 							<< L << ". Skipping columns.\n";
944 					break;
945 				}
946 				if (i > L) {
947 					HH_LOG(WARNING) << std::endl << "In HMM " << name
948 							<< " there are more columns than the stated length "
949 							<< L << "\n";
950 				}
951 				if (i >= maxres - 2) {
952 					fgetline(line, LINELEN - 1, dbf); // Skip two lines
953 					fgetline(line, LINELEN - 1, dbf);
954 					continue;
955 				}
956 
957 				for (a = 0; a < 20 && ptr; ++a)
958 					f[i][s2a[a]] = (float) pb[s2a[a]]
959 							* fpow2(float(strinta(ptr, -99999)) / HMMSCALE);
960 				//s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
961 				if (!ptr)
962 					return Warning(dbf, line, name);
963 				if (Log::reporting_level() >= DEBUG1) {
964 					HH_LOG(WARNING) << i;
965 					for (a = 0; a < 20; ++a)
966 						HH_LOG(WARNING) << 100 * f[i][s2a[a]] << " ";
967 					HH_LOG(WARNING) << std::endl;
968 				}
969 
970 				// Read insert emission line
971 				fgetline(line, LINELEN - 1, dbf);
972 				ptr = strscn(line);
973 				if (!ptr)
974 					return Warning(dbf, line, name);
975 				annotchr[i] = uprchr(*ptr);
976 				if (*ptr != '-' && *ptr != ' ' && *ptr != 'X' && *ptr != 'x')
977 					annot = 1;
978 
979 				// Read annotation character and seven transition probabilities
980 				fgetline(line, LINELEN - 1, dbf);
981 				ptr = strscn(line);
982 				switch (*ptr) {
983 				case 'H':
984 					ss_dssp[i] = 1;
985 					seq[nss_dssp][i] = *ptr;
986 					dssp = 1;
987 					break;
988 				case 'E':
989 					ss_dssp[i] = 2;
990 					seq[nss_dssp][i] = *ptr;
991 					dssp = 1;
992 					break;
993 				case 'C':
994 					ss_dssp[i] = 3;
995 					seq[nss_dssp][i] = *ptr;
996 					dssp = 1;
997 					break;
998 				case 'S':
999 					ss_dssp[i] = 4;
1000 					seq[nss_dssp][i] = *ptr;
1001 					dssp = 1;
1002 					break;
1003 				case 'T':
1004 					ss_dssp[i] = 5;
1005 					seq[nss_dssp][i] = *ptr;
1006 					dssp = 1;
1007 					break;
1008 				case 'G':
1009 					ss_dssp[i] = 6;
1010 					seq[nss_dssp][i] = *ptr;
1011 					dssp = 1;
1012 					break;
1013 				case 'B':
1014 					ss_dssp[i] = 7;
1015 					seq[nss_dssp][i] = *ptr;
1016 					dssp = 1;
1017 					break;
1018 				case 'I':
1019 					dssp = 1;
1020 				case '~':
1021 					ss_dssp[i] = 3;
1022 					seq[nss_dssp][i] = *ptr;
1023 					break;
1024 				case '-': // no SS available from any template
1025 				case '.': // no clear consensus SS structure
1026 				case 'X': // no clear consensus SS structure
1027 					ss_dssp[i] = 0;
1028 					seq[nss_dssp][i] = '-';
1029 					break;
1030 				default:
1031 					ss_dssp[i] = 0;
1032 					seq[nss_dssp][i] = *ptr;
1033 					break;
1034 				}
1035 
1036 				ptr += 2;
1037 				for (a = 0; a <= D2D && ptr; ++a)
1038 					tr[i][a] = float(strinta(ptr, -99999)) / HMMSCALE; //store transition prob's as log2-values
1039 				if (!ptr)
1040 					return Warning(dbf, line, name);
1041 				if (Log::reporting_level() >= DEBUG1) {
1042 					HH_LOG(DEBUG1) << "       ";
1043 					for (a = 0; a <= D2D; ++a)
1044 						HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
1045 					HH_LOG(DEBUG1) << std::endl;
1046 				}
1047 			}
1048 
1049 			if (line[0] == '/' && line[1] == '/')
1050 				break;
1051 
1052 		}
1053 
1054 	} //while(getline)
1055 
1056 	if (L == 0)
1057 		return 0; //End of db file -> stop reading in
1058 
1059 	// Set coefficients of EVD (= 0.0 if not calibrated for these parameters)
1060 	//   lamda = lamda_hash.Show(par.Key());
1061 	//   mu    = mu_hash.Show(par.Key());
1062 	if (lamda) {
1063 		HH_LOG(DEBUG) << "HMM "<< name <<" is already calibrated: lamda="<< lamda<<", mu="<<mu<< std::endl;
1064 	}
1065 
1066 	if (i != L) {
1067 		HH_LOG(WARNING) << std::endl << "In HMM " << name
1068 				<< " there are only " << i
1069 				<< " columns while the stated length is " << L << std::endl;
1070 	}
1071 	if (i >= maxres - 2) {
1072 		i = maxres - 2;
1073 		HH_LOG(WARNING) << std::endl << "Maximum number " << maxres - 2
1074 				<< " of residues exceeded while reading HMM " << name << std::endl;
1075 	}
1076 	if (!i) {
1077 		HH_LOG(WARNING) << std::endl << "HMM " << name
1078 				<< " contains no match states. Check the alignment that gave rise to this HMM." << std::endl;
1079 	}
1080 	L = i;
1081 
1082 	if (strlen(longname) > 0)
1083 		strcat(longname, " ");
1084 	strncat(longname, name, DESCLEN - strlen(longname) - 1); // longname = ACC NAME DESC
1085 	if (strlen(name) > 0)
1086 		strcat(longname, " ");
1087 	strncat(longname, desc, DESCLEN - strlen(longname) - 1);
1088 	longname[DESCLEN - 1] = '\0';
1089 	ScopID(cl, fold, sfam, fam); // get scop classification from basename (e.g. a.1.2.3.4)
1090 	RemoveExtension(file, filestr); // copy name of dbfile without extension into 'file'
1091 
1092 	// Secondary structure
1093 	if (!dssp) {
1094 		// remove dssp sequence
1095 		delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1096 		delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1097 		nss_dssp = -1;
1098 		k--;
1099 	} else {
1100 		seq[nss_dssp][0] = '-';
1101 		seq[nss_dssp][L + 1] = '\0';
1102 	}
1103 
1104 	if (nss_pred >= 0) {
1105 		for (i = 1; i <= L; ++i)
1106 			ss_pred[i] = ss2i(seq[nss_pred][i]);
1107 		if (nss_conf >= 0)
1108 			for (i = 1; i <= L; ++i)
1109 				ss_conf[i] = cf2i(seq[nss_conf][i]);
1110 		else
1111 			for (i = 1; i <= L; ++i)
1112 				ss_conf[i] = 5;
1113 	}
1114 
1115 	// Copy query (first sequence) and consensus  residues?
1116 	if (showcons) {
1117 		sname[k] = new char[10];
1118 		strcpy(sname[k], "Consensus");
1119 		sname[k + 1] = new char[strlen(longname) + 1];
1120 		strcpy(sname[k + 1], longname);
1121 		seq[k] = new char[L + 2];
1122 		seq[k][0] = ' ';
1123 		seq[k][L + 1] = '\0';
1124 		seq[k + 1] = new char[L + 2];
1125 		seq[k + 1][0] = ' ';
1126 		seq[k + 1][L + 1] = '\0';
1127 		for (i = 1; i <= L; ++i) {
1128 			float pmax = 0.0;
1129 			int amax = 0;
1130 			for (a = 0; a < NAA; ++a)
1131 				if (f[i][a] > pmax) {
1132 					amax = a;
1133 					pmax = f[i][a];
1134 				}
1135 			if (pmax > 0.6)
1136 				seq[k][i] = i2aa(amax);
1137 			else if (pmax > 0.4)
1138 				seq[k][i] = lwrchr(i2aa(amax));
1139 			else
1140 				seq[k][i] = 'x';
1141 			seq[k + 1][i] = i2aa(amax);
1142 		}
1143 		ncons = k++; // nfirst is set later!
1144 	} else {
1145 		sname[k] = new char[strlen(longname) + 1];
1146 		strcpy(sname[k], longname);
1147 		seq[k] = new char[L + 2];
1148 		seq[k][0] = ' ';
1149 		seq[k][L + 1] = '\0';
1150 	}
1151 
1152 	if (annot) // read in some annotation characters?
1153 	{
1154 		annotchr[0] = ' ';
1155 		annotchr[L + 1] = '\0';
1156 		strcpy(seq[k], annotchr); // overwrite the consensus sequence with the annotation characters
1157 	} else if (!showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1158 	{
1159 		for (i = 1; i <= L; ++i) {
1160 			float pmax = 0.0;
1161 			int amax = 0;
1162 			for (a = 0; a < NAA; ++a)
1163 				if (f[i][a] > pmax) {
1164 					amax = a;
1165 					pmax = f[i][a];
1166 				}
1167 			seq[k][i] = i2aa(amax);
1168 		}
1169 	}
1170 	//   printf("%i query name=%s  seq=%s\n",n,sname[n],seq[n]);
1171 	nfirst = k++;
1172 
1173 	n_display = k;
1174 	n_seqs = k;
1175 
1176 	// Calculate overall Neff_HMM
1177 	Neff_HMM = 0;
1178 	for (i = 1; i <= L; ++i) {
1179 		float S = 0.0;
1180 		for (a = 0; a < 20; ++a)
1181 			if (f[i][a] > 1E-10)
1182 				S -= f[i][a] * fast_log2(f[i][a]);
1183 		Neff_HMM += (float) fpow2(S);
1184 	}
1185 	Neff_HMM /= L;
1186 	for (i = 0; i <= L; ++i)
1187 		Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1188 	Neff_M[L + 1] = 1.0f;
1189 	Neff_I[L + 1] = Neff_D[L + 1] = 0.0f;
1190 
1191 	HH_LOG(DEBUG) << "Read in HMM " << name << " with " << L
1192 				<< " match states and effective number of sequences = "
1193 				<< Neff_HMM << "\n";
1194 
1195 	// Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1196 	for (a = 0; a < 20; ++a)
1197 		f[0][a] = f[L + 1][a] = pb[a];
1198 	delete[] annotchr;
1199 	return 1; //return status: ok
1200 }
1201 
1202 /////////////////////////////////////////////////////////////////////////////////////
1203 //// Read an HMM from a HMMER3 .hmm file; return 0 at end of file
1204 /////////////////////////////////////////////////////////////////////////////////////
ReadHMMer3(FILE * dbf,const char showcons,float * pb,char * filestr)1205 int HMM::ReadHMMer3(FILE* dbf, const char showcons, float* pb, char* filestr) {
1206 	char line[LINELEN] = "";    // input line
1207 	char desc[DESCLEN] = "";    // description of family
1208 	char str4[5] = "";          // first 4 letters of input line
1209 	char* ptr;                // pointer for string manipulation
1210 	int i = 0;                  // index for match state (first=1)
1211 	int a;                    // amino acid index
1212 	char dssp = 0; // 1 if a consensus SS has been found in the transition prob lines
1213 	char annot = 0; // 1 if at least one annotation character in insert lines is ne '-' or ' '
1214 	int k = 0;                  // index for seq[k]
1215 	char* annotchr; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1216 	annotchr = new char[maxres]; // consensus amino acids in ASCII format, or, in HMMER format, the reference annotation character in insert line
1217 	int warn = 0;
1218 
1219 	L = 0;
1220 	Neff_HMM = 0;
1221 	n_seqs = n_display = N_in = N_filtered = 0;
1222 	nss_dssp = nsa_dssp = nss_pred = nss_conf = nfirst = ncons = -1;
1223 	lamda = mu = 0.0;
1224 	trans_lin = 0; // transition probs in log space
1225 	has_pseudocounts = true; // !!
1226 	dont_delete_seqs = false;
1227 	divided_by_local_bg_freqs = false;
1228 	name[0] = longname[0] = desc[0] = fam[0] = '\0';
1229 	//If at the end of while-loop L is still 0 then we have reached end of db file
1230 
1231 	// Do not delete name and seq vectors because their adresses are transferred to hitlist as part of a hit!!
1232 
1233 	while (fgetline(line, LINELEN - 1, dbf)
1234 			&& !(line[0] == '/' && line[1] == '/')) {
1235 
1236 		if (strscn(line) == NULL)
1237 			continue;   // skip lines that contain only white space
1238 		if (!strncmp("HMMER", line, 5))
1239 			continue;
1240 
1241 		substr(str4, line, 0, 3);    // copy the first four characters into str4
1242 
1243 		if (!strcmp("NAME", str4) && name[0] == '\0') {
1244 			ptr = strscn(line + 4); // advance to first non-white-space character
1245 			strmcpy(name, ptr, NAMELEN - 1);      // copy full name to name
1246 			strcut(name);                   // ...cut after first word...
1247 			HH_LOG(DEBUG1) << "Reading in HMM " << name << ":\n";
1248 		}
1249 
1250 		else if (!strcmp("ACC ", str4)) {
1251 			ptr = strscn(line + 4); // advance to first non-white-space character
1252 			strmcpy(longname, ptr, DESCLEN - 1); // copy Accession id to longname...
1253 		}
1254 
1255 		else if (!strcmp("DESC", str4)) {
1256 			ptr = strscn(line + 4); // advance to first non-white-space character
1257 			if (ptr) {
1258 				strmcpy(desc, ptr, DESCLEN - 1);  // copy description to name...
1259 				desc[DESCLEN - 1] = '\0';
1260 				strcut(ptr);                 // ...cut after first word...
1261 			}
1262 			if (!ptr || ptr[1] != '.' || strchr(ptr + 3, '.') == NULL)
1263 				strcpy(fam, "");
1264 			else
1265 				strmcpy(fam, ptr, NAMELEN - 1); // could not find two '.' in name?
1266 		}
1267 
1268 		else if (!strcmp("LENG", str4)) {
1269 			ptr = line + 4;
1270 			L = strint(ptr);        //read next integer (number of match states)
1271 		}
1272 
1273 		else if (!strcmp("ALPH", str4))
1274 			continue;
1275 		else if (!strcmp("RF  ", str4))
1276 			continue;
1277 		else if (!strcmp("CS  ", str4))
1278 			continue;
1279 		else if (!strcmp("MAP ", str4))
1280 			continue;
1281 		else if (!strcmp("COM ", str4))
1282 			continue;
1283 		else if (!strcmp("NSEQ", str4)) {
1284 			ptr = line + 4;
1285 			N_in = N_filtered = strint(ptr); //read next integer: number of sequences after filtering
1286 		}
1287 
1288 		else if (!strcmp("DATE", str4))
1289 			continue;
1290 		else if (!strncmp("CKSUM ", line, 5))
1291 			continue;
1292 		else if (!strcmp("GA  ", str4))
1293 			continue;
1294 		else if (!strcmp("TC  ", str4))
1295 			continue;
1296 		else if (!strcmp("NC  ", str4))
1297 			continue;
1298 
1299 		//////////////////////////////////////////////////////////////////////////////////////////////////////
1300 		// Still needed???
1301 
1302 		else if (!strncmp("SADSS", line, 5)) {
1303 			if (nsa_dssp < 0) {
1304 				nsa_dssp = k++;
1305 				seq[nsa_dssp] = new char[maxres + 2];
1306 				sname[nsa_dssp] = new char[15];
1307 				strcpy(seq[nsa_dssp], " ");
1308 				strcpy(sname[nsa_dssp], "sa_dssp");
1309 
1310 			}
1311 			ptr = strscn(line + 5);
1312 			if (ptr) {
1313 				strcut(ptr);
1314 				if (strlen(seq[nsa_dssp]) + strlen(ptr) >= (unsigned) (maxres))
1315 					printf(
1316 							"HMM %s has SADSS records with more than %i residues.\n",
1317 							name, maxres);
1318 				else
1319 					strcat(seq[nsa_dssp], ptr);
1320 			}
1321 		}
1322 
1323 		else if (!strncmp("SSPRD", line, 5)) {
1324 			if (nss_pred < 0) {
1325 				nss_pred = k++;
1326 				seq[nss_pred] = new char[maxres + 2];
1327 				sname[nss_pred] = new char[15];
1328 				strcpy(seq[nss_pred], " ");
1329 				strcpy(sname[nss_pred], "ss_pred");
1330 
1331 			}
1332 			ptr = strscn(line + 5);
1333 			if (ptr) {
1334 				strcut(ptr);
1335 				if (strlen(seq[nss_pred]) + strlen(ptr) >= (unsigned) (maxres))
1336 					printf(
1337 							"HMM %s has SSPRD records with more than %i residues.\n",
1338 							name, maxres);
1339 				else
1340 					strcat(seq[nss_pred], ptr);
1341 			}
1342 		}
1343 
1344 		else if (!strncmp("SSCON", line, 5)) {
1345 			if (nss_conf < 0) {
1346 				nss_conf = k++;
1347 				seq[nss_conf] = new char[maxres + 2];
1348 				sname[nss_conf] = new char[15];
1349 				strcpy(seq[nss_conf], " ");
1350 				strcpy(sname[nss_conf], "ss_conf");
1351 			}
1352 			ptr = strscn(line + 5);
1353 			if (ptr) {
1354 				strcut(ptr);
1355 				if (strlen(seq[nss_conf]) + strlen(ptr) >= (unsigned) (maxres))
1356 					printf(
1357 							"HMM %s has SSPRD records with more than %i residues.\n",
1358 							name, maxres);
1359 				else
1360 					strcat(seq[nss_conf], ptr);
1361 			}
1362 		}
1363 
1364 		else if (!strncmp("SSCIT", line, 5))
1365 			continue;
1366 		else if (!strcmp("XT  ", str4))
1367 			continue;
1368 		//////////////////////////////////////////////////////////////////////////////////////////////////////
1369 
1370 		else if (!strncmp("STATS LOCAL", line, 11))
1371 			continue;
1372 
1373 		else if (!strcmp("EFFN", str4)) {
1374 			ptr = line + 4;
1375 			float effn = strflt(ptr);
1376 			// Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d  (fitted with scop25 dataset)
1377 			Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5)
1378 					- 0.2885410 * effn - 1.108568;
1379 		}
1380 
1381 		/////////////////////////////////////////////////////////////////////////////////////
1382 		// Read transition probabilities from start state
1383 		else if (!strncmp("HMM", line, 3)) {
1384 			fgetline(line, LINELEN - 1, dbf); // Skip line with amino acid labels
1385 			fgetline(line, LINELEN - 1, dbf); // Skip line with transition labels
1386 			ptr = strscn(line);
1387 
1388 			if (!strncmp("COMPO", ptr, 5)) {
1389 				ptr = ptr + 5;
1390 				for (a = 0; a < 20 && ptr; ++a)
1391 					//s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1392 					pb[s2a[a]] = (float) exp(-1.0 * strflta(ptr, 99999));
1393 				if (!ptr)
1394 					return Warning(dbf, line, name);
1395 				if (Log::reporting_level() >= DEBUG1) {
1396 					HH_LOG(DEBUG1) << "\nNULL ";
1397 					for (a = 0; a < 20; ++a)
1398 						HH_LOG(DEBUG1) << 100. * pb[s2a[a]] << " ";
1399 					HH_LOG(DEBUG1) << std::endl;
1400 				}
1401 				fgetline(line, LINELEN - 1, dbf); // Read next line
1402 			}
1403 
1404 			fgetline(line, LINELEN - 1, dbf); // Skip line with 0-states insert probabilities
1405 
1406 			ptr = strscn(line);
1407 			for (a = 0; a <= D2D && ptr; ++a)
1408 				tr[0][a] = log2((float) exp(-1.0 * strflta(ptr, 99999))); //store transition probabilites as log2 values
1409 				// strinta returns next integer in string and puts ptr to first char
1410 				// after the integer. Returns -99999 if '*' is found.
1411 				// ptr is set to 0 if no integer is found after ptr.
1412 			if (!ptr)
1413 				return Warning(dbf, line, name);
1414 			if (Log::reporting_level() >= DEBUG1) {
1415 				HH_LOG(DEBUG1) << "       ";
1416 				for (a = 0; a <= D2D && ptr; ++a)
1417 					HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
1418 				HH_LOG(DEBUG1) << std::endl;
1419 			}
1420 
1421 			// Prepare to store DSSP states (if there are none, delete afterwards)
1422 			nss_dssp = k++;
1423 			seq[nss_dssp] = new char[maxres + 2];
1424 			sname[nss_dssp] = new char[15];
1425 			strcpy(sname[nss_dssp], "ss_dssp");
1426 
1427 			/////////////////////////////////////////////////////////////////////////////////////
1428 			// Read columns of HMM
1429 			int next_i = 0;  // index of next column
1430 			while (fgetline(line, LINELEN - 1, dbf)
1431 					&& !(line[0] == '/' && line[1] == '/') && line[0] != '#') {
1432 				if (strscn(line) == NULL)
1433 					continue; // skip lines that contain only white space
1434 
1435 				// Read in AA probabilities
1436 				ptr = line;
1437 				int prev_i = next_i;
1438 				next_i = strint(ptr);
1439 				++i;
1440 				if (next_i != prev_i + 1)
1441 					if (++warn < 5) {
1442 						HH_LOG(WARNING) << "In HMM " << name
1443 								<< " state " << prev_i
1444 								<< " is followed by state " << next_i << "\n";
1445 						if (warn == 5) {
1446 							HH_LOG(WARNING) << std::endl
1447 									<< "Further warnings while reading HMMs will be suppressed.\n";
1448 						}
1449 					}
1450 				if (i > L) {
1451 					HH_LOG(WARNING) << std::endl << "In HMM " << name
1452 							<< " there are more columns than the stated length "
1453 							<< L << ". Skipping columns.\n";
1454 					break;
1455 				}
1456 				if (i > L) {
1457 					HH_LOG(WARNING) << std::endl << "In HMM " << name
1458 							<< " there are more columns than the stated length "
1459 							<< L << "\n";
1460 				}
1461 				if (i >= maxres - 2) {
1462 					fgetline(line, LINELEN - 1, dbf); // Skip two lines
1463 					fgetline(line, LINELEN - 1, dbf);
1464 					continue;
1465 				}
1466 
1467 				for (a = 0; a < 20 && ptr; ++a)
1468 					f[i][s2a[a]] = (float) exp(-1.0 * strflta(ptr, 99999));
1469 				//s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids
1470 				if (!ptr)
1471 					return Warning(dbf, line, name);
1472 				if (Log::reporting_level() >= DEBUG1) {
1473 					HH_LOG(WARNING) << i << " ";
1474 					for (a = 0; a < 20; ++a)
1475 						HH_LOG(WARNING) << 100 * f[i][s2a[a]] << " ";
1476 					HH_LOG(WARNING) << std::endl;
1477 				}
1478 
1479 				// Ignore MAP annotation
1480 				ptr = strscn(ptr); //find next word
1481 				ptr = strscn_ws(ptr); // ignore word
1482 
1483 				// Read RF and CS annotation
1484 				ptr = strscn(ptr);
1485 				if (!ptr)
1486 					return Warning(dbf, line, name);
1487 				annotchr[i] = uprchr(*ptr);
1488 				if (*ptr != '-' && *ptr != ' ' && *ptr != 'X' && *ptr != 'x')
1489 					annot = 1;
1490 
1491 				ptr = strscn(ptr);
1492 				switch (*ptr) {
1493 				case 'H':
1494 					ss_dssp[i] = 1;
1495 					seq[nss_dssp][i] = *ptr;
1496 					dssp = 1;
1497 					break;
1498 				case 'E':
1499 					ss_dssp[i] = 2;
1500 					seq[nss_dssp][i] = *ptr;
1501 					dssp = 1;
1502 					break;
1503 				case 'C':
1504 					ss_dssp[i] = 3;
1505 					seq[nss_dssp][i] = *ptr;
1506 					dssp = 1;
1507 					break;
1508 				case 'S':
1509 					ss_dssp[i] = 4;
1510 					seq[nss_dssp][i] = *ptr;
1511 					dssp = 1;
1512 					break;
1513 				case 'T':
1514 					ss_dssp[i] = 5;
1515 					seq[nss_dssp][i] = *ptr;
1516 					dssp = 1;
1517 					break;
1518 				case 'G':
1519 					ss_dssp[i] = 6;
1520 					seq[nss_dssp][i] = *ptr;
1521 					dssp = 1;
1522 					break;
1523 				case 'B':
1524 					ss_dssp[i] = 7;
1525 					seq[nss_dssp][i] = *ptr;
1526 					dssp = 1;
1527 					break;
1528 				case 'I':
1529 					dssp = 1;
1530 				case '~':
1531 					ss_dssp[i] = 3;
1532 					seq[nss_dssp][i] = *ptr;
1533 					break;
1534 				case '-': // no SS available from any template
1535 				case '.': // no clear consensus SS structure
1536 				case 'X': // no clear consensus SS structure
1537 					ss_dssp[i] = 0;
1538 					seq[nss_dssp][i] = '-';
1539 					break;
1540 				default:
1541 					ss_dssp[i] = 0;
1542 					seq[nss_dssp][i] = *ptr;
1543 					break;
1544 				}
1545 
1546 				// Read insert emission line
1547 				fgetline(line, LINELEN - 1, dbf);
1548 
1549 				// Read seven transition probabilities
1550 				fgetline(line, LINELEN - 1, dbf);
1551 
1552 				ptr = line;
1553 				for (a = 0; a <= D2D && ptr; ++a)
1554 					tr[i][a] = log2((float) exp(-1.0 * strflta(ptr, 99999))); //store transition prob's as log2-values
1555 				if (!ptr)
1556 					return Warning(dbf, line, name);
1557 				if (Log::reporting_level() >= DEBUG1) {
1558 					HH_LOG(DEBUG1) << "       ";
1559 					for (a = 0; a <= D2D; ++a)
1560 						HH_LOG(DEBUG1) << 100 * fpow2(tr[i][a]) << " ";
1561 					HH_LOG(DEBUG1) << std::endl;
1562 				}
1563 			}
1564 
1565 			if (line[0] == '/' && line[1] == '/')
1566 				break;
1567 
1568 		}
1569 
1570 	} //while(getline)
1571 
1572 	if (L == 0)
1573 		return 0; //End of db file -> stop reading in
1574 
1575 	if (i != L) {
1576 		HH_LOG(WARNING) << std::endl << "In HMM " << name
1577 				<< " there are only " << i
1578 				<< " columns while the stated length is " << L << "\n";
1579 	}
1580 	if (i >= maxres - 2) {
1581 		i = maxres - 2;
1582 		HH_LOG(WARNING) << std::endl << "Maximum number " << maxres - 2
1583 				<< " of residues exceeded while reading HMM " << name << "\n";
1584 	}
1585 	if (!i) {
1586 		HH_LOG(WARNING) << std::endl << "HMM " << name
1587 				<< " contains no match states. Check the alignment that gave rise to this HMM.\n";
1588 	}
1589 	L = i;
1590 
1591 	if (strlen(longname) > 0)
1592 		strcat(longname, " ");
1593 	strncat(longname, name, DESCLEN - strlen(longname) - 1); // longname = ACC NAME DESC
1594 	if (strlen(name) > 0)
1595 		strcat(longname, " ");
1596 	strncat(longname, desc, DESCLEN - strlen(longname) - 1);
1597 	longname[DESCLEN - 1] = '\0';
1598 	ScopID(cl, fold, sfam, fam); // get scop classification from basename (e.g. a.1.2.3.4)
1599 	RemoveExtension(file, filestr); // copy name of dbfile without extension into 'file'
1600 
1601 	// Secondary structure
1602 	if (!dssp) {
1603 		// remove dssp sequence
1604 		delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1605 		delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed
1606 		nss_dssp = -1;
1607 		k--;
1608 	} else {
1609 		seq[nss_dssp][0] = '-';
1610 		seq[nss_dssp][L + 1] = '\0';
1611 	}
1612 
1613 	if (nss_pred >= 0) {
1614 		for (i = 1; i <= L; ++i)
1615 			ss_pred[i] = ss2i(seq[nss_pred][i]);
1616 		if (nss_conf >= 0)
1617 			for (i = 1; i <= L; ++i)
1618 				ss_conf[i] = cf2i(seq[nss_conf][i]);
1619 		else
1620 			for (i = 1; i <= L; ++i)
1621 				ss_conf[i] = 5;
1622 	}
1623 
1624 	// Copy query (first sequence) and consensus  residues?
1625 	if (showcons) {
1626 		sname[k] = new char[10];
1627 		strcpy(sname[k], "Consensus");
1628 		sname[k + 1] = new char[strlen(longname) + 1];
1629 		strcpy(sname[k + 1], longname);
1630 		seq[k] = new char[L + 2];
1631 		seq[k][0] = ' ';
1632 		seq[k][L + 1] = '\0';
1633 		seq[k + 1] = new char[L + 2];
1634 		seq[k + 1][0] = ' ';
1635 		seq[k + 1][L + 1] = '\0';
1636 		for (i = 1; i <= L; ++i) {
1637 			float pmax = 0.0;
1638 			int amax = 0;
1639 			for (a = 0; a < NAA; ++a)
1640 				if (f[i][a] > pmax) {
1641 					amax = a;
1642 					pmax = f[i][a];
1643 				}
1644 			if (pmax > 0.6)
1645 				seq[k][i] = i2aa(amax);
1646 			else if (pmax > 0.4)
1647 				seq[k][i] = lwrchr(i2aa(amax));
1648 			else
1649 				seq[k][i] = 'x';
1650 			seq[k + 1][i] = i2aa(amax);
1651 		}
1652 		ncons = k++; // nfirst is set later!
1653 	} else {
1654 		sname[k] = new char[strlen(longname) + 1];
1655 		strcpy(sname[k], longname);
1656 		seq[k] = new char[L + 2];
1657 		seq[k][0] = ' ';
1658 		seq[k][L + 1] = '\0';
1659 	}
1660 
1661 	if (annot) // read in some annotation characters?
1662 	{
1663 		annotchr[0] = ' ';
1664 		annotchr[L + 1] = '\0';
1665 		strcpy(seq[k], annotchr); // overwrite the consensus sequence with the annotation characters
1666 	} else if (!showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence)
1667 	{
1668 		for (i = 1; i <= L; ++i) {
1669 			float pmax = 0.0;
1670 			int amax = 0;
1671 			for (a = 0; a < NAA; ++a)
1672 				if (f[i][a] > pmax) {
1673 					amax = a;
1674 					pmax = f[i][a];
1675 				}
1676 			seq[k][i] = i2aa(amax);
1677 		}
1678 	}
1679 	//   printf("%i query name=%s  seq=%s\n",n,sname[n],seq[n]);
1680 	nfirst = k++;
1681 
1682 	n_display = k;
1683 	n_seqs = k;
1684 
1685 	// If no effektive number of sequences is given, calculate Neff_HMM by given profile
1686 	if (Neff_HMM == 0) {
1687 		for (i = 1; i <= L; ++i) {
1688 			float S = 0.0;
1689 			for (a = 0; a < 20; ++a)
1690 				if (f[i][a] > 1E-10)
1691 					S -= f[i][a] * fast_log2(f[i][a]);
1692 			Neff_HMM += (float) fpow2(S);
1693 		}
1694 		Neff_HMM /= L;
1695 	}
1696 
1697 	for (i = 0; i <= L; ++i)
1698 		Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts!
1699 	Neff_M[L + 1] = 1.0f;
1700 	Neff_I[L + 1] = Neff_D[L + 1] = 0.0f;
1701 
1702 	HH_LOG(DEBUG) << "Read in HMM " << name << " with " << L
1703 			<< " match states and effective number of sequences = "
1704 			<< Neff_HMM << "\n";
1705 
1706 	///////////////////////////////////////////////////////////////////
1707 
1708 	// Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
1709 	for (a = 0; a < 20; ++a)
1710 		f[0][a] = f[L + 1][a] = pb[a];
1711 	delete[] annotchr;
1712 	return 1; //return status: ok
1713 }
1714 
1715 /////////////////////////////////////////////////////////////////////////////////////
1716 // Add transition pseudocounts to HMM (and calculate lin-space transition probs)
1717 /////////////////////////////////////////////////////////////////////////////////////
AddTransitionPseudocounts(float gapd,float gape,float gapf,float gapg,float gaph,float gapi,float gapb,const float par_gapb)1718 void HMM::AddTransitionPseudocounts(float gapd, float gape, float gapf,
1719 		float gapg, float gaph, float gapi, float gapb, const float par_gapb) {
1720 	int i;               //position in alignment
1721 	float sum;
1722 	float pM2M, pM2I, pM2D, pI2I, pI2M, pD2D, pD2M;
1723 	float p0, p1, p2;
1724 	if (gapb <= 0)
1725 		return;
1726 	if (trans_lin == 1) {
1727 	  HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
1728 	  HH_LOG(WARNING) << "\tAdding transition pseudocounts to linear representation of " << name
1729 	      << " not allowed. Please report this error to the HHsearch developers.\n";
1730 		exit(6);
1731 	}
1732 	if (trans_lin == 2) {
1733 	  HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
1734 	  HH_LOG(ERROR) << "\tAdding transition pseudocounts twice in " << name
1735 				<< " not allowed. Please report this error to the HHsearch developers.\n";
1736 		exit(6);
1737 	}
1738 	trans_lin = 2;
1739 
1740 	// Calculate pseudocount transition probabilities
1741 	pM2D = pM2I = gapd * 0.0286; //a-priori probability for inserts and deletions
1742 	pM2M = 1 - pM2D - pM2I;
1743 	// gape=0 -> pI2I=0   gape=1 -> pI2I=0.75    gape=inf -> pI2I=1.
1744 	pI2I = 1.0 * gape / (gape - 1 + 1.0 / 0.75);
1745 	pI2M = 1 - pI2I;
1746 	// gape=0 -> pD2D=0   gape=1 -> pD2D=0.75    gape=inf -> pD2D=1.
1747 	pD2D = 1.0 * gape / (gape - 1 + 1.0 / 0.75);
1748 	pD2M = 1 - pD2D;
1749 
1750 	for (i = 0; i <= L; ++i) //for all columns in HMM
1751 			{
1752 		// Transitions from M state
1753 		p0 = (Neff_M[i] - 1) * fpow2(tr[i][M2M]) + gapb * pM2M;
1754 		p1 = (Neff_M[i] - 1) * fpow2(tr[i][M2D]) + gapb * pM2D;
1755 		p2 = (Neff_M[i] - 1) * fpow2(tr[i][M2I]) + gapb * pM2I;
1756 		if (i == 0)
1757 			p1 = p2 = 0;     //from M(0) no transition to D(1) and I(0) possible
1758 		if (i == L)
1759 			p1 = p2 = 0; //from M(L) no transition to D(L+1) and I(L+1) possible
1760 		sum = p0 + p1 + p2 + FLT_MIN;
1761 		tr[i][M2M] = fast_log2(p0 / sum);
1762 		tr[i][M2D] = fast_log2(p1 / sum) * gapf;
1763 		tr[i][M2I] = fast_log2(p2 / sum) * gapg;
1764 
1765 		// Transitions from I state
1766 		p0 = Neff_I[i] * fpow2(tr[i][I2M]) + gapb * pI2M;
1767 		p1 = Neff_I[i] * fpow2(tr[i][I2I]) + gapb * pI2I;
1768 		sum = p0 + p1 + FLT_MIN;
1769 		tr[i][I2M] = fast_log2(p0 / sum);
1770 		tr[i][I2I] = fast_log2(p1 / sum) * gapi;
1771 
1772 		// Transitions from D state
1773 		p0 = Neff_D[i] * fpow2(tr[i][D2M]) + gapb * pD2M;
1774 		p1 = Neff_D[i] * fpow2(tr[i][D2D]) + gapb * pD2D;
1775 		if (i == L)
1776 			p1 = 0;          //from D(L) no transition to D(L+1) possible
1777 		sum = p0 + p1 + FLT_MIN;
1778 		tr[i][D2M] = fast_log2(p0 / sum);
1779 		tr[i][D2D] = fast_log2(p1 / sum) * gaph;
1780 
1781 	}
1782 
1783 	if (Log::reporting_level() >= DEBUG1) {
1784 		HH_LOG(DEBUG1) << "\nPseudocount transition probabilities:\n";
1785 		HH_LOG(DEBUG1) << "pM2M=" << 100*pM2M <<"%, pM2I="<< 100 * pM2I <<"%, pM2D="<<100*pM2D<<"%, ";
1786 		HH_LOG(DEBUG1) << "pI2M=" << 100*pI2M << "%, pI2I=" << 100*pI2I << "%, ";
1787 		HH_LOG(DEBUG1) << "pD2M=" << 100*pD2M << "%, pD2D=" << 100*pD2D << "% ";
1788 		HH_LOG(DEBUG1) << "tau = " << 100. * gapb / (Neff_HMM - 1 + gapb) << "%\n\n";
1789 		HH_LOG(DEBUG1) << "Listing transition probabilities WITH pseudocounts:\n";
1790 		HH_LOG(DEBUG1) << "   i dssp pred sacc     M->M   M->I   M->D   I->M   I->I   D->M   D->D\n";
1791 
1792 		//for all columns in HMM
1793 		for (i = 1; i <= L; ++i) {
1794 			HH_LOG(DEBUG1) << i << "\t" << i2ss(ss_dssp[i]) << "\t" << i2ss(ss_pred[i]) << "\t" << i2sa(sa_dssp[i]) << "\t" << fpow2(tr[i][M2M]) << "\t" << fpow2(tr[i][M2I]) << "\t" << fpow2(tr[i][M2D]) << "\t";
1795 			HH_LOG(DEBUG1) << fpow2(tr[i][I2M]) << "\t" << fpow2(tr[i][I2I]) << "\t";
1796 			HH_LOG(DEBUG1) << fpow2(tr[i][D2M]) << "\t" << fpow2(tr[i][D2D]) << "\t";
1797 			HH_LOG(DEBUG1) << ss_pred[i] << "\t" << ss_conf[i] << "\t" << ss_dssp[i] << std::endl;
1798 		}
1799 		HH_LOG(DEBUG1) << std::endl;
1800 		HH_LOG(DEBUG1) << "nss_dssp=" << nss_dssp << "  nss_pred=" << nss_pred << std::endl;
1801 	}
1802 }
1803 
1804 /////////////////////////////////////////////////////////////////////////////////////
1805 // Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1)
1806 /////////////////////////////////////////////////////////////////////////////////////
PreparePseudocounts(const float R[20][20])1807 void HMM::PreparePseudocounts(const float R[20][20]) {
1808 	for (int i = 0; i <= L + 1; ++i)
1809 		for (int a = 0; a < 20; ++a)
1810 			g[i][a] = ScalarProd20(R[a], f[i]);
1811 }
1812 
1813 /////////////////////////////////////////////////////////////////////////////////////
1814 // Add pseudocounts to profile p[]
1815 /////////////////////////////////////////////////////////////////////////////////////
AddContextSpecificPseudocounts(cs::Pseudocounts<cs::AA> * pc,cs::Admix * admix)1816 void HMM::AddContextSpecificPseudocounts(cs::Pseudocounts<cs::AA>* pc,
1817 		cs::Admix* admix) {
1818 	if (has_pseudocounts || pc == NULL || admix == NULL) {
1819 		for (int i = 1; i <= L; ++i) {
1820 			for (int a = 0; a < 20; ++a) {
1821 				p[i][a] = f[i][a];
1822 			}
1823 		}
1824 	} else {
1825 		cs::CountProfile<cs::AA> cp(L);
1826 		fillCountProfile(&cp);
1827 		cs::Profile<cs::AA> profile = pc->AddTo(cp, *admix);
1828 		for (int i = 1; i <= L; ++i) {
1829 			for (int a = 0; a < 20; ++a) {
1830 				p[i][a] = profile[i - 1][a];
1831 			}
1832 		}
1833 	}
1834 }
1835 
1836 /////////////////////////////////////////////////////////////////////////////////////
1837 // Fill CountProfile with actual aa-frequencies
1838 /////////////////////////////////////////////////////////////////////////////////////
fillCountProfile(cs::CountProfile<cs::AA> * csProfile)1839 void HMM::fillCountProfile(cs::CountProfile<cs::AA> *csProfile) {
1840 	for (int i = 0; i < L; ++i) {
1841 		csProfile->neff[i] = Neff_M[i + 1];
1842 		for (int a = 0; a < 20; ++a)
1843 			csProfile->counts[i][a] = f[i + 1][a] * Neff_M[i + 1];
1844 	}
1845 }
1846 
1847 /////////////////////////////////////////////////////////////////////////////////////
1848 // Calculate amino acid background frequencies in HMM
1849 /////////////////////////////////////////////////////////////////////////////////////
CalculateAminoAcidBackground(const float * pb)1850 void HMM::CalculateAminoAcidBackground(const float* pb) {
1851 	int a, i;
1852 	// initialize vector of average aa freqs with pseudocounts
1853 	for (a = 0; a < 20; ++a)
1854 		pav[a] = pb[a] * 100.0f / Neff_HMM;
1855 	// calculate averages
1856 	for (i = 1; i <= L; ++i)
1857 		for (a = 0; a < 20; ++a)
1858 			pav[a] += p[i][a];
1859 	// Normalize vector of average aa frequencies pav[a]
1860 	NormalizeTo1(pav, NAA);
1861 	for (a = 0; a < 20; ++a)
1862 		p[0][a] = p[L + 1][a] = pav[a];
1863 
1864 }
1865 
1866 /////////////////////////////////////////////////////////////////////////////////////
1867 // Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a]
1868 // Pseudocounts: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1869 /////////////////////////////////////////////////////////////////////////////////////
AddAminoAcidPseudocounts(char pcm,float pca,float pcb,float pcc)1870 void HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc) {
1871 	int i;               //position in HMM
1872 	int a;               //amino acid (0..19)
1873 	float sum;
1874 	float tau;           //tau = pseudocount admixture
1875 
1876 	if (has_pseudocounts) {
1877 		pcm = 0;
1878 	}
1879 
1880 	// Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a]
1881 	switch (pcm) {
1882 	case 0: //no pseudocounts whatsoever: tau=0
1883 		for (i = 1; i <= L; ++i)
1884 			for (a = 0; a < 20; ++a)
1885 				p[i][a] = f[i][a];
1886 		break;
1887 	case 1: //constant pseudocounts (for optimization): tau = pca
1888 		tau = pca;
1889 		for (i = 1; i <= L; ++i)
1890 			for (a = 0; a < 20; ++a)
1891 				p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1892 		break;
1893 	case 2: //divergence-dependent pseudocounts and rate matrix rescaling
1894 		if (pcc == 1.0f)
1895 			for (i = 1; i <= L; ++i) {
1896 				tau = fmin(1.0, pca / (1. + Neff_M[i] / pcb));
1897 				for (a = 0; a < 20; ++a)
1898 					p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1899 			}
1900 		else
1901 			for (i = 1; i <= L; ++i) {
1902 				tau = fmin(1.0, pca / (1. + pow((Neff_M[i]) / pcb, pcc)));
1903 				for (a = 0; a < 20; ++a)
1904 					p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1905 			}
1906 		break;
1907 	case 3: // constant-diversity pseudocounts // Is this still used? => scrap? (JS)
1908 		for (i = 1; i <= L; ++i) {
1909 			float x = Neff_M[i] / pcb;
1910 			pca = 0.793 + 0.048 * (pcb - 10.0);
1911 			tau = fmax(0.0, pca * (1 - x + pcc * x * (1 - x)));
1912 			for (a = 0; a < 20; ++a)
1913 				p[i][a] = (1. - tau) * f[i][a] + tau * g[i][a];
1914 		}
1915 		HH_LOG(DEBUG) << "Divergence before / after addition of amino acid pseudocounts: " << Neff_HMM << " / " << CalcNeff() << std::endl;
1916 		break;
1917 	} //end switch (pcm)
1918 
1919 	//turn on pseudocount switch to indicate that HMM contains pseudocounts
1920 	if (pcm != 0)
1921 		has_pseudocounts = true;
1922 
1923 	// DEBUGGING output
1924 	if (Log::reporting_level() >= DEBUG) {
1925 		switch (pcm) {
1926 		case 0:
1927 			HH_LOG(DEBUG) << "No pseudocounts added (-pcm 0)\n";
1928 			return;
1929 		case 1:
1930 			HH_LOG(DEBUG) << "Adding constant AA pseudocount admixture of " << pca
1931 					<< " to HMM " << name << "\n";
1932 			break;
1933 		case 2:
1934 			HH_LOG(DEBUG)
1935 					<< "Adding divergence-dependent AA pseudocounts (-pcm 2) with admixture of "
1936 					<< fmin(1.0, pca / (1. + Neff_HMM / pcb)) << " to HMM "
1937 					<< name << "\n";
1938 			break;
1939 		} //end switch (pcm)
1940 
1941 		if ( Log::reporting_level() >= DEBUG1) {
1942 			HH_LOG(DEBUG1) << "Amino acid frequencies WITHOUT pseudocounts:\n       A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V" << std::endl;
1943 			for (i=1; i<=L; ++i)
1944 			{
1945 			  HH_LOG(DEBUG1) << i << ": " << f[i][0] * 100 << " " << f[i][1] * 100 << " " << f[i][2] * 100 << " " << f[i][3] * 100 << " " << f[i][4] * 100 << " "
1946 			      << f[i][5] * 100 << " " << f[i][6] * 100 << " " << f[i][7] * 100 << " " << f[i][8] * 100 << " " << f[i][9] * 100 << " " << f[i][10] * 100 << " "
1947 			      << f[i][11] * 100 << " " << f[i][12] * 100 << " " << f[i][13] * 100 << " " << f[i][14] * 100 << " " << f[i][15] * 100 << " "
1948 			      << f[i][16] * 100 << " " << f[i][17] * 100 << " " << f[i][18] * 100 << " " << f[i][19] * 100 << " sum=" << sum << std::endl;
1949 			}
1950       HH_LOG(DEBUG1) << "Amino acid frequencies WITH pseudocounts:\n       A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V" << std::endl;
1951       for (i=1; i<=L; ++i)
1952       {
1953         HH_LOG(DEBUG1) << std::setprecision(1) << i << ": " << p[i][0] * 100 << " " << p[i][1] * 100 << " " << p[i][2] * 100 << " " << p[i][3] * 100 << " " << p[i][4] * 100 << " "
1954             << p[i][5] * 100 << " " << p[i][6] * 100 << " " << p[i][7] * 100 << " " << p[i][8] * 100 << " " << p[i][9] * 100 << " " << p[i][10] * 100 << " "
1955             << p[i][11] * 100 << " " << p[i][12] * 100 << " " << p[i][13] * 100 << " " << p[i][14] * 100 << " " << p[i][15] * 100 << " "
1956             << p[i][16] * 100 << " " << p[i][17] * 100 << " " << p[i][18] * 100 << " " << p[i][19] * 100 << " sum=" << sum << std::endl;
1957       }
1958 		}
1959 	}
1960 }
1961 
1962 
computeScoreSSMode(HMM * q,HMM * t)1963 int HMM::computeScoreSSMode( HMM *q,  HMM *t){
1964 	int returnMode = HMM::NO_SS_INFORMATION;
1965     returnMode |= (q->nss_pred>=0 && t->nss_dssp>=0) ? HMM::PRED_DSSP : 0;
1966     returnMode |= (q->nss_dssp>=0 && t->nss_pred>=0) ? HMM::DSSP_PRED : 0;
1967     returnMode |= (q->nss_pred>=0 && t->nss_pred>=0) ? HMM::PRED_PRED : 0;
1968 	return returnMode;
1969 }
1970 
1971 /////////////////////////////////////////////////////////////////////////////////////
1972 // Divide aa probabilties by square root of locally averaged background frequencies
1973 // !!!!! ATTENTION!!!!!!!  after this p is not the same as after adding pseudocounts !!!
1974 /////////////////////////////////////////////////////////////////////////////////////
DivideBySqrtOfLocalBackgroundFreqs(const int D,const float * pb)1975 void HMM::DivideBySqrtOfLocalBackgroundFreqs(const int D, const float* pb) // 2*D+1 is window size
1976 		{
1977 	if (divided_by_local_bg_freqs) {
1978 	  HH_LOG(WARNING) << "Already divided probs by local aa frequencies!" << std::endl;
1979 		return;
1980 	}
1981 	divided_by_local_bg_freqs = 1;
1982 
1983 	int i;                     // query and template match state indices
1984 	int a;                     // amino acid index
1985 	const float pc = 10.0; // amount of pseudocounts on local amino acid frequencies
1986 	float fac = 1.0 / (2.0 * (float) D + 1.0 + pc);  // 1 / window size
1987 
1988 	float** pnul = new float*[L + 1];  // null model probabilities
1989 	for (i = 0; i <= L; ++i)
1990 		pnul[i] = new float[NAA];
1991 	for (a = 0; a < NAA; ++a)
1992 		pnul[0][a] = pc * pb[a];
1993 
1994 	// HMM shorter than window length? => average over entire length L
1995 	if (L <= 2 * D + 1) {
1996 		for (i = 1; i <= L; ++i)
1997 			for (a = 0; a < NAA; ++a)
1998 				pnul[0][a] += p[i][a];
1999 		for (i = 1; i <= L; ++i)
2000 			for (a = 0; a < NAA; ++a)
2001 				pnul[i][a] = pnul[0][a];
2002 		fac = 1.0 / ((float) L + pc);
2003 	}
2004 	// HMM longer than window length? => average over window size 2*D+1
2005 	else {
2006 		// Calculate local amino acid background frequencies in leftmost window (1,..,2*D+1)
2007 		for (i = 1; i <= 2 * D + 1; ++i)
2008 			for (a = 0; a < NAA; ++a)
2009 				pnul[0][a] += p[i][a];
2010 
2011 		// Copy local amino acid background frequencies in leftmost window to positions 1,..,D+1
2012 		for (i = 1; i <= D + 1; ++i)
2013 			for (a = 0; a < NAA; ++a)
2014 				pnul[i][a] = pnul[0][a];
2015 
2016 		// Calculate local amino acid background frequencies in window of size 2*D+1 around each residue
2017 		for (i = D + 2; i <= L - D; ++i)
2018 			for (a = 0; a < NAA; ++a)
2019 				pnul[i][a] = pnul[i - 1][a] + p[i + D][a] - p[i - 1 - D][a];
2020 
2021 		// Copy local amino acid background frequencies from pos. L-D to positions L-D+1,..,L
2022 		for (i = L - D + 1; i <= L; ++i)
2023 			for (a = 0; a < NAA; ++a)
2024 				pnul[i][a] = pnul[L - D][a];
2025 	}
2026 
2027 	// Divide amino acid probs by sqrt of local amino acid background frequencies
2028 	for (i = 1; i <= L; ++i)
2029 		for (a = 0; a < NAA; ++a)
2030 			p[i][a] /= sqrt(fac * pnul[i][a]);
2031 
2032 	if (Log::reporting_level() >= DEBUG1) {
2033 		HH_LOG(DEBUG1) << "\nLocal amino acid background frequencies\n";
2034 		HH_LOG(DEBUG1) << "         A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V  sum\n";
2035 		for (i = 1; i <= L; ++i) {
2036 			HH_LOG(DEBUG1) <<  i << " ";
2037 			float sum = 0.0;
2038 			for (a = 0; a < 20; ++a) {
2039 				HH_LOG(DEBUG1) << 100 * fac * pnul[i][a] << " ";
2040 				sum += fac * pnul[i][a];
2041 			}
2042 			HH_LOG(DEBUG1) << 100 * sum << std::endl;
2043 		}
2044 	}
2045 
2046 	for (i = 0; i <= L; ++i)
2047 		delete[] pnul[i];
2048 	delete[] pnul;
2049 }
2050 
2051 /////////////////////////////////////////////////////////////////////////////////////
2052 // Factor Null model into HMM t
2053 // !!!!! ATTENTION!!!!!!!  after this t->p is not the same as after adding pseudocounts !!!
2054 /////////////////////////////////////////////////////////////////////////////////////
IncludeNullModelInHMM(HMM * q,HMM * t,int columnscore,const int half_window_size_local_aa_bg_freqs,const float * pb)2055 void HMM::IncludeNullModelInHMM(HMM* q, HMM* t, int columnscore,
2056 		const int half_window_size_local_aa_bg_freqs, const float* pb) {
2057 
2058 	int i, j;         //query and template match state indices
2059 	int a;           //amino acid index
2060 
2061 	// Multiply template frequencies with amino acid weights = 1/background_freq(a) (for all but SOP scores)
2062 	switch (columnscore) {
2063 	default:
2064 	case 0: // Null model with background prob. from database
2065 		for (j = 0; j <= t->L + 1; ++j)
2066 			for (a = 0; a < 20; ++a)
2067 				t->p[j][a] /= pb[a];
2068 		break;
2069 
2070 	case 1: // Null model with background prob. equal average from query and template
2071 		float pnul[NAA]; // null model probabilities used in comparison (only set in template/db HMMs)
2072 		for (a = 0; a < 20; ++a)
2073 			pnul[a] = 0.5 * (q->pav[a] + t->pav[a]);
2074 		for (j = 0; j <= t->L + 1; ++j)
2075 			for (a = 0; a < 20; ++a)
2076 				t->p[j][a] /= pnul[a];
2077 		break;
2078 
2079 	case 2: // Null model with background prob. from template protein
2080 		for (j = 0; j <= t->L + 1; ++j)
2081 			for (a = 0; a < 20; ++a)
2082 				t->p[j][a] /= t->pav[a];
2083 		break;
2084 
2085 	case 3: // Null model with background prob. from query protein
2086 		for (j = 0; j <= t->L + 1; ++j)
2087 			for (a = 0; a < 20; ++a)
2088 				t->p[j][a] /= q->pav[a];
2089 		break;
2090 
2091 	case 5: // Null model with local background prob. from template and query protein
2092 		//      if (!q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs);
2093 		if (!q->divided_by_local_bg_freqs)
2094 			InternalError("No local amino acid bias correction on query HMM!",
2095 					__FILE__, __LINE__, __func__);
2096 		if (!t->divided_by_local_bg_freqs)
2097 			t->DivideBySqrtOfLocalBackgroundFreqs(
2098 					half_window_size_local_aa_bg_freqs, pb);
2099 		break;
2100 
2101 	case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??)
2102 		for (i = 0; i <= q->L + 1; ++i) {
2103 			float sum = 0.0;
2104 			for (a = 0; a < 20; ++a)
2105 				sum += pb[a] * q->p[i][a];
2106 			sum = 1.0 / sqrt(sum);
2107 			for (a = 0; a < 20; ++a)
2108 				q->p[i][a] *= sum;
2109 		}
2110 		for (j = 0; j <= t->L + 1; j++) {
2111 			float sum = 0.0;
2112 			for (a = 0; a < 20; ++a)
2113 				sum += pb[a] * t->p[j][a];
2114 			sum = 1.0 / sqrt(sum);
2115 			for (a = 0; a < 20; ++a)
2116 				t->p[j][a] *= sum;
2117 		}
2118 		break;
2119 
2120 	case 11:  // log co-emission probability (no null model)
2121 		for (a = 0; a < 20; ++a)
2122 			pnul[a] = 0.05;
2123 		break;
2124 
2125 	}
2126 
2127 	if (Log::reporting_level() >= DEBUG1) {
2128 		HH_LOG(DEBUG1) << "\nAverage amino acid frequencies\n";
2129 		HH_LOG(DEBUG1) << "         A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
2130 		HH_LOG(DEBUG1) << "Q:    ";
2131 		for (a = 0; a < 20; ++a)
2132 			HH_LOG(DEBUG1) << 100 * q->pav[a] << " ";
2133 		HH_LOG(DEBUG1) << "\nT:    ";
2134 		for (a = 0; a < 20; ++a)
2135 			HH_LOG(DEBUG1) << 100 * t->pav[a] << " ";
2136 		HH_LOG(DEBUG1) << "\npb:   ";
2137 		for (a = 0; a < 20; ++a)
2138 			HH_LOG(DEBUG1) << 100 * pb[a] << " ";
2139 	}
2140 }
2141 
WriteToFile(char * outfile,const char append,const int max_seqid,const int coverage,const int qid,const int Ndiff,const float qsc,const int argc,const char ** argv,const float * pb)2142 void HMM::WriteToFile(char* outfile, const char append, const int max_seqid,
2143 		const int coverage, const int qid, const int Ndiff, const float qsc,
2144 		const int argc, const char** argv, const float* pb) {
2145 	std::stringstream out;
2146 	WriteToFile(out, max_seqid, coverage, qid, Ndiff, qsc, argc, argv, pb);
2147 
2148 	if (strcmp(outfile, "stdout") == 0) {
2149 		std::cout << out.str();
2150 	} else {
2151 		std::fstream outf;
2152 		if (append)
2153 			outf.open(outfile, std::ios::out | std::ios::app);
2154 		else
2155 			outf.open(outfile, std::ios::out);
2156 
2157 		if (!outf.good())
2158 			OpenFileError(outfile, __FILE__, __LINE__, __func__);
2159 
2160 		outf << out.str();
2161 
2162 		outf.close();
2163 	}
2164 }
2165 
2166 /////////////////////////////////////////////////////////////////////////////////////
2167 // Write HMM to output file
2168 /////////////////////////////////////////////////////////////////////////////////////
WriteToFile(std::stringstream & out,const int max_seqid,const int coverage,const int qid,const int Ndiff,const float qsc,const int argc,const char ** argv,const float * pb)2169 void HMM::WriteToFile(std::stringstream& out, const int max_seqid,
2170 		const int coverage, const int qid, const int Ndiff, const float qsc,
2171 		const int argc, const char** argv, const float* pb) {
2172 	const int SEQLEN = 100; // number of residues per line for sequences to be displayed
2173 	char line[LINELEN];
2174 
2175 	if (trans_lin == 1)
2176 		InternalError(
2177 				"tried to write HMM file with transition pseudocounts in linear representation",
2178 				__FILE__, __LINE__, __func__);
2179 	if (divided_by_local_bg_freqs)
2180 		InternalError(
2181 				"tried to write HMM file with amino acid probabilities divided by sqrt of local background frequencies\n",
2182 				__FILE__, __LINE__, __func__);
2183 
2184 	// format specification
2185 	out << "HHsearch 1.5" << std::endl;
2186 	out << "NAME  " << longname << std::endl;    // name of first sequence
2187 	out << "FAM   " << fam << std::endl;         // family name
2188 
2189 	//TODO
2190 //  char file_nopath[NAMELEN];
2191 //  RemoveExtension(file, outfile);
2192 //  RemovePath(file_nopath, file);
2193 //  fprintf(outf,"FILE  %s\n",file_nopath); // base name of alignment file
2194 
2195 	// Print command line
2196 	out << "COM   ";
2197 	for (int i = 0; i < argc; i++)
2198 		if (strlen(argv[i]) <= 100)
2199 			out << argv[i] << " ";
2200 		else
2201 			out << "<" << strlen(argv[i]) << " characters> ";
2202 	out << std::endl;
2203 
2204 	// print out date stamp
2205 	time_t* tp = new time_t;
2206 	*tp = time(NULL);
2207 	out << "DATE  " << ctime(tp);
2208 	delete tp;
2209 
2210 	// Print out some statistics of alignment
2211 	out << "LENG  " << L << " match states, " << l[L]
2212 			<< " columns in multiple alignment\n" << std::endl;
2213 	out << "FILT  " << N_filtered << " out of " << N_in
2214 			<< " sequences passed filter (-id " << max_seqid << " -cov "
2215 			<< coverage << " -qid " << qid << " -qsc " << qsc << " -diff "
2216 			<< Ndiff << ")" << std::endl;
2217 	sprintf(line, "NEFF  %-4.1f\n", Neff_HMM);
2218 	out << line;
2219 
2220 	if (has_pseudocounts) {
2221 		out << "PCT   true\n";
2222 	}
2223 
2224 	// Print selected sequences from alignment (including secondary structure and confidence values, if known)
2225 	out << "SEQ" << std::endl;
2226 	for (int n = 0; n < n_display; n++) {
2227 		out << ">" << sname[n] << std::endl;
2228 		//first sequence character starts at 1; 0 not used.
2229 		for (unsigned int j = 0; j < strlen(seq[n] + 1); j += SEQLEN) {
2230 			sprintf(line, "%-.*s\n", SEQLEN, seq[n] + 1 + j);
2231 			out << line;
2232 		}
2233 	}
2234 	out << "#" << std::endl;
2235 
2236 	// print null model background probabilities from substitution matrix
2237 	out << "NULL   ";
2238 	for (int a = 0; a < 20; ++a)
2239 		sout(out, -iround(fast_log2(pb[s2a[a]]) * HMMSCALE));
2240 	out << std::endl;
2241 
2242 	// print table header line with amino acids
2243 	out << "HMM    ";
2244 	for (int a = 0; a < 20; ++a) {
2245 		out << i2aa(s2a[a]) << "\t";
2246 	}
2247 	out << std::endl;
2248 
2249 	// print table header line with state transitions
2250 	out
2251 			<< "       M->M\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D"
2252 			<< std::endl;
2253 
2254 	// print out transition probabilities from begin state (virtual match state)
2255 	out << "       ";
2256 	for (int a = 0; a <= D2D; ++a)
2257 		sout(out, -iround(tr[0][a] * HMMSCALE));
2258 
2259 	sout(out, iround(Neff_M[0] * HMMSCALE));
2260 	sout(out, iround(Neff_I[0] * HMMSCALE));
2261 	sout(out, iround(Neff_D[0] * HMMSCALE));
2262 	out << std::endl;
2263 
2264 	// Start loop for printing HMM columns
2265 	int h = 1;
2266 	for (int i = 1; i <= L; ++i) {
2267 		while (islower(seq[nfirst][h]) && seq[nfirst][h])
2268 			h++;
2269 
2270 		sprintf(line, "%1c %-4i ", seq[nfirst][h++], i);
2271 		out << line;
2272 
2273 		// Print emission probabilities for match state
2274 		for (int a = 0; a < 20; ++a) {
2275 			sout(out, -iround(fast_log2(p[i][s2a[a]]) * HMMSCALE));
2276 		}
2277 
2278 		sprintf(line, "%-i", l[i]);
2279 		out << line;
2280 		out << std::endl;
2281 
2282 		// Print transition probabilities
2283 		out << "       ";
2284 		for (int a = 0; a <= D2D; ++a) {
2285 			sout(out, -iround(tr[i][a] * HMMSCALE));
2286 		}
2287 		sout(out, iround(Neff_M[i] * HMMSCALE));
2288 		sout(out, iround(Neff_I[i] * HMMSCALE));
2289 		sout(out, iround(Neff_D[i] * HMMSCALE));
2290 
2291 		out << std::endl << std::endl;
2292 	}
2293 	out << "//" << std::endl;
2294 }
2295 
2296 
2297 /////////////////////////////////////////////////////////////////////////////////////
2298 // Transform log to lin transition probs
2299 /////////////////////////////////////////////////////////////////////////////////////
Log2LinTransitionProbs(float beta)2300 void HMM::Log2LinTransitionProbs(float beta) {
2301 	if (trans_lin == 1)
2302 		return;
2303 	trans_lin = 1;
2304 	for (int i = 0; i <= L; ++i) {
2305 		for (int a = 0; a < NTRANS; ++a)
2306 			tr[i][a] = pow(2.0f,beta * tr[i][a]);
2307 	}
2308 }
2309 
2310 
2311 /////////////////////////////////////////////////////////////////////////////////////
2312 // Set query columns in His-tags etc to Null model distribution
2313 /////////////////////////////////////////////////////////////////////////////////////
NeutralizeTags(const float * pb)2314 void HMM::NeutralizeTags(const float* pb) {
2315 	char* qseq = seq[nfirst];
2316 	char* pt;
2317 	int a, i;
2318 
2319 	// Neutralize His tag
2320 	if ((pt = strstr(qseq, "HHHHH"))) {
2321 		int i0 = pt - qseq + 1;
2322 		for (i = imax(i0 - 8, 1); i < i0; ++i)   // neutralize leading 5 columns
2323 			for (a = 0; a < NAA; ++a)
2324 				p[i][a] = f[i][a] = pb[a];
2325 		for (; (*pt) == 'H'; ++i, ++pt)      // neutralize His columns
2326 			for (a = 0; a < NAA; ++a)
2327 				p[i][a] = f[i][a] = pb[a];
2328 		int i1 = i;
2329 		for (; i < imin(i1 + 8, L + 1); ++i)    // neutralize trailing 5 columns
2330 			for (a = 0; a < NAA; ++a)
2331 				p[i][a] = f[i][a] = pb[a];
2332 		HH_LOG(INFO) << "Neutralized His-tag between positions " << imax(i0 - 8, 1) << " and " << i-1 << std::endl;
2333 	}
2334 
2335 	// Neutralize C-myc tag
2336 	if ((pt = strstr(qseq, "EQKLISEEDL"))) {
2337 		HH_LOG(INFO) << "Neutralized C-myc-tag at position " << int(pt - qseq) + 1 << std::endl;
2338 		for (i = pt - qseq + 1; i <= pt - qseq + 10; ++i)
2339 			for (a = 0; a < NAA; ++a)
2340 				p[i][a] = f[i][a] = pb[a];
2341 	}
2342 	// Neutralize FLAG tag
2343 	if ((pt = strstr(qseq, "DYKDDDDK"))) {
2344 		HH_LOG(INFO) << "Neutralized FLAG-tag at position " << int(pt - qseq) + 1;
2345 		for (i = pt - qseq + 1; i <= pt - qseq + 8; ++i)
2346 			for (a = 0; a < NAA; ++a)
2347 				p[i][a] = f[i][a] = pb[a];
2348 	}
2349 }
2350 
2351 /////////////////////////////////////////////////////////////////////////////////////
2352 // Calculate effective number of sequences using profiles INCLUDING pseudocounts
2353 /////////////////////////////////////////////////////////////////////////////////////
CalcNeff()2354 float HMM::CalcNeff() {
2355 	float Neff = 0;
2356 	for (int i = 1; i <= L; ++i)
2357 		for (int a = 0; a < 20; ++a)
2358 			if (p[i][a] > 1E-10)
2359 				Neff -= p[i][a] * fast_log2(p[i][a]);
2360 	return fpow2(Neff / L);
2361 }
2362 
2363 /////////////////////////////////////////////////////////////////////////////////////
2364 // Add secondary structure prediction to alignment (overwrite existing prediction)
2365 /////////////////////////////////////////////////////////////////////////////////////
AddSSPrediction(char seq_pred[],char seq_conf[])2366 void HMM::AddSSPrediction(char seq_pred[], char seq_conf[]) {
2367 	unsigned int i;
2368 
2369 	if ((int) strlen(seq_pred) != L + 1) {
2370 	  HH_LOG(WARNING) << "Could not add secondary struture prediction - unequal length!" << std::endl;
2371 		return;
2372 	}
2373 
2374 	if (nss_pred >= 0 && nss_conf >= 0) {
2375 		strcpy(seq[nss_pred], seq_pred);
2376 		for (i = 0; i < strlen(seq_pred); i++)
2377 			ss_pred[i] = ss2i(seq_pred[i]);
2378 		strcpy(seq[nss_conf], seq_conf);
2379 		for (i = 0; i < strlen(seq_conf); i++)
2380 			ss_conf[i] = cf2i(seq_conf[i]);
2381 	} else {
2382 		// Shift existing sequences two positions down
2383 		for (int k = imin(n_display - 1, MAXSEQDIS - 3); k >= 0; --k) {
2384 			seq[k + 2] = seq[k];
2385 			sname[k + 2] = sname[k];
2386 		}
2387 		if (nss_dssp >= 0) {
2388 			nss_dssp += 2;
2389 		}
2390 		if (nsa_dssp >= 0) {
2391 			nsa_dssp += 2;
2392 		}
2393 		if (ncons >= 0) {
2394 			ncons += 2;
2395 		}
2396 		if (nfirst >= 0) {
2397 			nfirst += 2;
2398 		}
2399 
2400 		nss_pred = 0;
2401 		seq[nss_pred] = new char[L + 2];
2402 		strcpy(seq[nss_pred], seq_pred);
2403 		for (i = 0; i < strlen(seq_pred); i++)
2404 			ss_pred[i] = ss2i(seq_pred[i]);
2405 		sname[nss_pred] = new char[50];
2406 		strcpy(sname[nss_pred],
2407 				"ss_pred PSIPRED predicted secondary structure");
2408 
2409 		nss_conf = 1;
2410 		seq[nss_conf] = new char[L + 2];
2411 		strcpy(seq[nss_conf], seq_conf);
2412 		for (i = 0; i < strlen(seq_conf); i++)
2413 			ss_conf[i] = cf2i(seq_conf[i]);
2414 		sname[nss_conf] = new char[50];
2415 		strcpy(sname[nss_conf], "ss_conf PSIPRED confidence values");
2416 		n_display += 2;
2417 		n_seqs += 2;
2418 	}
2419 }
2420 
2421